Commit 8f294aa6 authored by Luis Pedro Coelho's avatar Luis Pedro Coelho
Browse files

ENH Create a single master script, metaSNV.py

parent b1598f33
......@@ -114,42 +114,12 @@ Workflow:
## Optional Files:
* **'db\_ann'** = a gene annotation file for the reference database.
## 1. Initiate a new project
## 2. Run metaSNV
metaSNV_NEW project_dirname
metaSNV.py project_dir/ all_samples ref_db ref_fasta [options]
Generates a structured results directory for your project.
## 2. Part I: Pre-processing [optional]
Note: Subsequent SNP filtering depends on these coverage estimations.
This part can be skipped if you only want to use the 'raw' SNP output and do not intend to balance the workload or if you already performed the pre-processing for the dataset.
### a) run metaSNV_COV
metaSNV_COV project_dirname all_samples
The script generates a list of command-line jobs. Run each of these jobs
before proceeding with the next step.
### b) run metaSNV_OPT
Helper script for workload balancing (reference genome splitting).
metaSNV_OPT project_dir/ genome_def nr_splits[int]
Note: This step requires an appropriate reference genome_definitions file (included in our database).
## 3. Part II: SNV calling
Note: Can be run as a single job, independently of the pre-processing, or using an already existing genome split.
### a) metaSNV_SNV
metaSNV_SNV project_dir/ all_samples ref_db [options]
The script generates a list of command-line jobs. Submit each command line to
your HPC cluster or run it locally.
## 4. Part III: Post-Processing (Filtering & Analysis)
## 3. Part II: Post-Processing (Filtering & Analysis)
### a) Filtering:
Note: requires SNP calling (Part II) to be done!
......
#!/usr/bin/env python
import os
from sys import stderr, exit
from os import path
import argparse
import subprocess
import multiprocessing
basedir = os.path.dirname(os.path.abspath(__file__))
def exit_worker(signum, frame):
    """Signal handler installed in pool workers: translate SIGINT into a
    RuntimeError so the parent process sees an ordinary exception."""
    raise RuntimeError("Keyboard Interrupt")
def init_worker():
    """Pool initializer: make each worker raise on SIGINT (via exit_worker)
    instead of dying silently."""
    from signal import SIGINT, signal as install_handler
    install_handler(SIGINT, exit_worker)
def run_sample(sample, command):
    """Run `command` via subprocess.call and return the triple
    (sample, command, returncode) so a failure can be reported with
    full context by the caller."""
    returncode = subprocess.call(command)
    return sample, command, returncode
def main():
    """Command-line entry point: compute per-sample coverage files.

    Runs qaCompute once per BAM file listed in `all_samples`, writing a
    '<sample>.cov' file into '<project_dir>/cov'.  Jobs are fanned out over
    a multiprocessing pool; with --print-commands the command lines are
    printed instead of executed.  Exits with status 1 on any failure.
    """
    parser = argparse.ArgumentParser(description='Compute coverages')
    parser.add_argument('project_dir', metavar='DIR',
                        help='A metaSNP initialized project directory (metaSNP_NEW)')
    parser.add_argument('all_samples', metavar='FILE',
                        help='File with an input list of bam files, one file per line')
    parser.add_argument('--print-commands', default=False, action='store_true',
                        help='Instead of executing the commands, simply print them out')
    parser.add_argument('--threads', metavar='INT', default=1, type=int,
                        help='Number of jobs to run simultaneously')
    args = parser.parse_args()
    if not path.isdir(args.project_dir):
        stderr.write("Cannot find project directory '{}'".format(args.project_dir))
        # BUG FIX: argparse.Namespace has no print_help(); call it on the
        # parser, and actually abort instead of continuing with a bad path.
        parser.print_help()
        exit(1)
    out_dir = path.join(args.project_dir, 'cov')
    try:
        os.makedirs(out_dir)
    except OSError:
        # 'mkdir -p' semantics: the directory may already exist.
        pass
    p = multiprocessing.Pool(args.threads, init_worker)
    results = []
    for line in open(args.all_samples):
        line = line.rstrip()
        name = path.basename(line)
        cmd = ['{}/src/qaTools/qaCompute'.format(basedir),
               '-c', '10', '-d',
               '-i', line, '{}/{}.cov'.format(out_dir, name)]
        if args.print_commands:
            print(" ".join(cmd))
        else:
            results.append(p.apply_async(run_sample, (name, cmd)))
    p.close()
    p.join()
    for r in results:
        sample, command, ret = r.get()
        if ret:
            stderr.write("Failure in sample {}".format(sample))
            # BUG FIX: report the failing job's own command; `cmd` still
            # holds whatever the last loop iteration assigned.
            stderr.write("Call to {} failed.".format(' '.join(command)))
            exit(1)

if __name__ == '__main__':
    main()
#!/usr/bin/env python
# This code is part of the metagenomic SNV calling pipeline (metaSNV)
# Helper script to initiate a new project file structure.
import argparse
from sys import stderr, exit
from os import path
def mkdir_p(dirname):
    """Equivalent to 'mkdir -p' on the command line.

    Creates `dirname` (including missing parents) and silently succeeds
    when the directory already exists; any other OSError (e.g. permission
    denied) is re-raised instead of being swallowed.
    """
    from os import makedirs
    try:
        makedirs(dirname)
    except OSError:
        # BUG FIX: the previous bare 'pass' also hid genuine failures such
        # as EACCES; only ignore the error when the directory is present.
        if not path.isdir(dirname):
            raise
def create_directories(basedir):
    """Lay out the standard metaSNP project structure below `basedir`."""
    layout = ('cov', 'bestsplits', 'snpCaller', 'filtered',
              'filtered/pop', 'filtered/ind', 'distances')
    mkdir_p(basedir)
    for sub in layout:
        mkdir_p(path.join(basedir, sub))
def init_project():
    """Parse the command line and create a fresh project directory tree.

    Refuses to run when the target directory already exists, so an
    existing project is never clobbered.
    """
    parser = argparse.ArgumentParser(description='Create project')
    parser.add_argument('project_dir', metavar='PROJECT_DIR',
                        help='Name of the project (name of output directory to create)')
    args = parser.parse_args()
    if path.exists(args.project_dir):
        stderr.write("Project directory '{}' already exists".format(args.project_dir))
        # BUG FIX: argparse.Namespace has no print_help(); use the parser.
        parser.print_help()
        exit(1)
    create_directories(args.project_dir)

if __name__ == '__main__':
    init_project()
#!/usr/bin/env python
#########################################
# metaSNP Step II: `Database Indexing` #
#########################################
#
# Helper script
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch, 2017 Luis Pedro Coelho
# Licenced under the GNU General Public License (see LICENSE)
import subprocess
from sys import stderr, exit
from os import path
import argparse
from glob import glob
import os
basedir = os.path.dirname(os.path.abspath(__file__))
def split_opt():
    """Summarize per-sample coverage and compute an optimized genome split.

    Summarizes every '<sample>.cov' file in '<project_dir>/cov', collapses
    them into project-wide coverage tables, then runs
    src/createOptimumSplit.py to partition the reference genomes into
    `n_splits` balanced chunks under '<project_dir>/bestsplits'.
    """
    parser = argparse.ArgumentParser(description='Optimize project splits')
    parser.add_argument('project_dir', metavar='PROJECT_DIR',
                        help='A metaSNP initialized project directory (metaSNP_NEW)')
    parser.add_argument('db_ann', metavar='BED_FILE',
                        help='Database annotation (BED file).')
    parser.add_argument('n_splits', metavar='INT', default=10, type=int, nargs='?',
                        help='Number of genome splits to create (default: 10)')
    args = parser.parse_args()
    if not path.isdir(args.project_dir):
        stderr.write("Cannot find project directory '{}'".format(args.project_dir))
        # BUG FIX: argparse.Namespace has no print_help(); use the parser.
        parser.print_help()
        exit(1)
    if args.n_splits > 100:
        stderr.write("Maximum number of splits is 100.\n")
        exit(1)
    # Remove results of any previous run so stale splits cannot linger.
    older_files = glob(args.project_dir + '/bestsplits/*')
    if older_files:
        stderr.write("\nremoving old splits.\n")
        for f in older_files:
            os.unlink(f)
    cov_dir = path.join(args.project_dir, 'cov')
    cov_files = glob(cov_dir + '/*.cov')
    project_name = path.basename(args.project_dir)
    if not cov_files:
        # BUG FIX: stderr is a file object, not a callable; use .write().
        stderr.write("Cov files not found. Please run metaSNP_COV.\n")
        exit(1)
    # Per-sample coverage summaries.
    for f in cov_files:
        cmd = ['python',
               path.join(basedir, 'src/computeGenomeCoverage.py'),
               f,
               f + '.detail',
               f + '.summary']
        subprocess.call(cmd)
    print("\nCoverage summary here: {}".format(args.project_dir))
    print(" Average vertical genome coverage: '{}/{}.all_cov.tab'".format(args.project_dir, project_name))
    print(" Horizontal genome coverage (1X): '{}/{}.all_perc.tab'".format(args.project_dir, project_name))
    # Collapse the per-sample summaries into the project-wide tables above.
    cmd = ['python',
           '{}/src/collapse_coverages.py'.format(basedir),
           args.project_dir]
    subprocess.call(cmd)
    print("\nGenome Splitting:")
    # usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
    cmd = ['python',
           '{}/src/createOptimumSplit.py'.format(basedir),
           "{}/{}.all_cov.tab".format(args.project_dir, project_name),
           "{}/{}.all_perc.tab".format(args.project_dir, project_name),
           args.db_ann,
           str(args.n_splits),
           path.join(args.project_dir, "bestsplits", "best_split")]
    subprocess.call(cmd)

if __name__ == '__main__':
    split_opt()
#!/usr/bin/env python
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
import os
# This code is part of the metagenomic SNV calling pipeline (metaSNV)
# Helper script to initiate a new project file structure.
import argparse
from sys import stderr, exit
from os import path
import argparse
import subprocess
from glob import glob
import os
import subprocess
import multiprocessing
basedir = os.path.dirname(os.path.abspath(__file__))
def mkdir_p(dirname):
    """Equivalent to 'mkdir -p' on the command line.

    Creates `dirname` with all missing parents; succeeds quietly when the
    directory already exists, but re-raises any other OSError (permission
    denied, a file in the way, ...) instead of swallowing it.
    """
    from os import makedirs
    try:
        makedirs(dirname)
    except OSError:
        # BUG FIX: unconditionally passing here also hid real failures;
        # only ignore the error when the directory actually exists.
        if not path.isdir(dirname):
            raise
def create_directories(basedir):
    """Create `basedir` and every subdirectory the pipeline writes into."""
    mkdir_p(basedir)
    for sub in ['cov', 'bestsplits', 'snpCaller', 'filtered',
                'filtered/pop', 'filtered/ind', 'distances']:
        mkdir_p(path.join(basedir, sub))
def exit_worker(signum, frame):
    """Worker-side SIGINT handler: surface the interrupt as a RuntimeError
    that multiprocessing can propagate back to the parent."""
    raise RuntimeError("Keyboard Interrupt")
def init_worker():
    """Pool initializer: route SIGINT in worker processes to exit_worker
    so interrupts raise instead of killing workers silently."""
    from signal import SIGINT, signal as install_handler
    install_handler(SIGINT, exit_worker)
def run_sample(sample, command):
    """Execute `command` with subprocess.call; return (sample, command,
    returncode) so the caller can build a useful error message."""
    returncode = subprocess.call(command)
    return sample, command, returncode
def compute_opt(args):
    """Run qaCompute for every BAM file listed in ``args.all_samples``.

    Writes one '<sample>.cov' file per BAM into '<project_dir>/cov',
    distributing the jobs over ``args.threads`` worker processes.  With
    --print-commands the command lines are printed instead of executed.
    Exits with status 1 if any coverage job fails.
    """
    out_dir = path.join(args.project_dir, 'cov')
    try:
        os.makedirs(out_dir)
    except OSError:
        # 'mkdir -p' semantics: the directory may already exist.
        pass
    p = multiprocessing.Pool(args.threads, init_worker)
    results = []
    for line in open(args.all_samples):
        line = line.rstrip()
        name = path.basename(line)
        cmd = ['{}/src/qaTools/qaCompute'.format(basedir),
               '-c', '10', '-d',
               '-i', line, '{}/{}.cov'.format(out_dir, name)]
        if args.print_commands:
            print(" ".join(cmd))
        else:
            results.append(p.apply_async(run_sample, (name, cmd)))
    p.close()
    p.join()
    for r in results:
        sample, command, ret = r.get()
        if ret:
            stderr.write("Failure in sample {}".format(sample))
            # BUG FIX: report the failed job's own command; `cmd` still
            # holds whatever the last loop iteration assigned.
            stderr.write("Call to {} failed.".format(' '.join(command)))
            exit(1)
def split_opt(args):
    """Summarize coverage files and compute an optimized genome split.

    Summarizes every '<sample>.cov' file in '<project_dir>/cov', collapses
    them into project-wide tables, then invokes createOptimumSplit.py to
    partition the reference genomes into ``args.n_splits`` balanced chunks
    under '<project_dir>/bestsplits'.  Exits with status 1 when the inputs
    are missing or the requested split count is out of range.
    """
    if args.n_splits > 100:
        stderr.write("Maximum number of splits is 100.\n")
        exit(1)
    # Remove results of any previous run so stale splits cannot linger.
    older_files = glob(args.project_dir + '/bestsplits/*')
    if older_files:
        stderr.write("\nremoving old splits.\n")
        for f in older_files:
            os.unlink(f)
    cov_dir = path.join(args.project_dir, 'cov')
    cov_files = glob(cov_dir + '/*.cov')
    project_name = path.basename(args.project_dir)
    if not cov_files:
        # BUG FIX: stderr is a file object, not a callable; use .write().
        stderr.write("Cov files not found. Please run metaSNP_COV.\n")
        exit(1)
    # Per-sample coverage summaries.
    for f in cov_files:
        cmd = ['python',
               path.join(basedir, 'src/computeGenomeCoverage.py'),
               f,
               f + '.detail',
               f + '.summary']
        subprocess.call(cmd)
    print("\nCoverage summary here: {}".format(args.project_dir))
    print(" Average vertical genome coverage: '{}/{}.all_cov.tab'".format(args.project_dir, project_name))
    print(" Horizontal genome coverage (1X): '{}/{}.all_perc.tab'".format(args.project_dir, project_name))
    # Collapse the per-sample summaries into the project-wide tables above.
    cmd = ['python',
           '{}/src/collapse_coverages.py'.format(basedir),
           args.project_dir]
    subprocess.call(cmd)
    print("\nGenome Splitting:")
    # usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
    cmd = ['python',
           '{}/src/createOptimumSplit.py'.format(basedir),
           "{}/{}.all_cov.tab".format(args.project_dir, project_name),
           "{}/{}.all_perc.tab".format(args.project_dir, project_name),
           args.db_ann,
           str(args.n_splits),
           path.join(args.project_dir, "bestsplits", "best_split")]
    subprocess.call(cmd)
def execute_snp_call(args, snpCaller, ifile, ofile, split):
db_ann_args = []
if args.db_ann:
......@@ -45,45 +147,13 @@ def execute_snp_call(args, snpCaller, ifile, ofile, split):
return snpcaller_call.wait()
def snp_call():
parser = argparse.ArgumentParser(description='Call SNPs')
parser.add_argument('project_dir', metavar='PROJECT_DIR',
help='A metaSNP initialized project directory (metaSNP_NEW)')
parser.add_argument('all_samples', metavar='SAMPLES_FILE',
help='File with an input list of bam files, one file per line')
parser.add_argument("ref_db", metavar='REF_DB_FILE',
help='reference multi-sequence FASTA file used for the alignments.')
parser.add_argument('db_ann', metavar='DB_ANN', default='', action='store', nargs='?',
help='Database annotation.')
parser.add_argument('--print-commands', default=False, action='store_true',
help='Instead of executing the commands, simply print them out')
parser.add_argument('--splits', default=None, action='store',
help='Directory where split files are found')
parser.add_argument('--threads', metavar='INT', default=1, type=int,
help='Number of jobs to run simmultaneously')
args = parser.parse_args()
if not path.isdir(args.project_dir):
stderr.write("Cannot find project directory '{}'".format(args.project_dir))
args.print_help()
exit(1)
def snp_call(args):
out_dir = path.join(args.project_dir, 'snpCaller')
try:
os.makedirs(out_dir)
except:
pass
if not path.isfile(args.ref_db):
stderr.write('''
ERROR: No reference database or annotation file found!"
ERROR: '{}' is not a file."
SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP caller
'''.format(args.ref_db))
parser.print_help()
exit(1)
snpCaller = basedir + "/src/snpCaller/snpCall"
......@@ -99,8 +169,8 @@ SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP cal
threads = (args.threads if not args.print_commands else 1)
p = multiprocessing.Pool(threads, init_worker)
results = []
if args.splits:
splits = glob('{}/best_split_*'.format(args.splits))
if args.n_splits:
splits = glob('{}/bestsplits/best_split_*'.format(args.project_dir))
for split in splits:
results.append(p.apply_async(execute_snp_call,
(args,
......@@ -111,9 +181,58 @@ SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP cal
p.close()
p.join()
for r in results:
r.wait()
v = r.wait()
if v > 0:
stderr.write("SNV calling failed")
exit(1)
else:
execute_snp_call(args, snpCaller, indiv_out, called_SNP, None)
v = execute_snp_call(args, snpCaller, indiv_out, called_SNP, None)
if v > 0:
stderr.write("SNV calling failed")
exit(1)
def main():
    """Master entry point for metaSNV.py.

    Drives the whole pipeline: project initialization, per-sample coverage
    (compute_opt), genome-split optimization (split_opt) and SNV calling
    (snp_call).  --print-commands prints the coverage jobs instead of
    running them; --continue-after-opt resumes a project after those jobs
    were executed externally.
    """
    parser = argparse.ArgumentParser(description='Compute SNV profiles')
    parser.add_argument('project_dir', metavar='DIR',
                        help='A metaSNP initialized project directory (metaSNP_NEW)')
    parser.add_argument('all_samples', metavar='FILE',
                        help='File with an input list of bam files, one file per line')
    parser.add_argument('db_ann', metavar='BED_FILE',
                        help='Database annotation (BED file).')
    parser.add_argument("ref_db", metavar='REF_DB_FILE',
                        help='reference multi-sequence FASTA file used for the alignments.')
    parser.add_argument('--print-commands', default=False, action='store_true',
                        help='Instead of executing the commands, simply print them out')
    parser.add_argument('--continue-after-opt', default=False, action='store_true',
                        help='Continue after print-commands')
    parser.add_argument('--threads', metavar='INT', default=1, type=int,
                        help='Number of jobs to run simultaneously')
    parser.add_argument('--n_splits', metavar='INT', default=10, type=int, nargs='?',
                        help='Number of genome splits to create (default: 10)')
    args = parser.parse_args()

    if not path.isfile(args.ref_db):
        # BUG FIX: removed the stray trailing quotes that leaked into the
        # error message.
        stderr.write('''
ERROR:	No reference database or annotation file found!
ERROR:	'{}' is not a file.

SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP caller
'''.format(args.ref_db))
        parser.print_help()
        exit(1)

    if not args.continue_after_opt:
        # Fresh run: create the project tree and compute coverage.
        if path.exists(args.project_dir):
            stderr.write("Project directory '{}' already exists\n\n\n".format(args.project_dir))
            parser.print_help()
            exit(1)
        create_directories(args.project_dir)
        compute_opt(args)
    # With --print-commands only the coverage jobs were printed, so the
    # downstream steps (which need real .cov files) are skipped.
    # NOTE(review): grouping reconstructed from the flag semantics; the
    # scrape lost the original indentation — confirm against the repo.
    if not args.print_commands:
        split_opt(args)
        snp_call(args)

if __name__ == '__main__':
    # BUG FIX: the diff residue invoked the old zero-argument snp_call()
    # before main(); only main() is the entry point of the merged script.
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment