Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (15)
@@ -9,7 +9,7 @@ Download

 Via Git:

-    git clone git@git.embl.de:costea/metaSNV.git
+    git clone https://git.embl.de/costea/metaSNV.git

 or [download](https://git.embl.de/costea/metaSNV/repository/archive.zip?ref=master) a zip file of the repository.

@@ -86,9 +86,10 @@ Note: requires SNP calling (Part II) to be done!

 Example Tutorial
 ================

 ## 1. Run the setup & compilation steps and download the provided reference database.

-    ./getRefDB.sh
+    $ ./getRefDB.sh
+    Select freeze9, as the tutorial files have been mapped against this freeze.

 ## 2. Go to the EXAMPLE directory and download the samples with the getSamplesScript.sh

...
@@ -51,10 +51,7 @@ def run_sample(sample, command):

 def compute_opt(args):
     out_dir = path.join(args.project_dir, 'cov')
-    try:
-        os.makedirs(out_dir)
-    except:
-        pass
+    mkdir_p(out_dir)
     p = multiprocessing.Pool(args.threads, init_worker)
     results = []
     for line in open(args.all_samples):
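The bare try/except around os.makedirs silently swallowed every OSError, not just "directory already exists". The new code delegates to a mkdir_p helper defined elsewhere in the script; a minimal sketch of the usual EEXIST-tolerant pattern (the helper body below is an assumption, not taken from this diff):

    import errno
    import os

    def mkdir_p(dirname):
        """Create a directory like `mkdir -p`: ignore 'already exists',
        but re-raise any other OSError (permissions, bad path, ...)."""
        try:
            os.makedirs(dirname)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise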
@@ -79,16 +76,17 @@ def compute_opt(args):

 def get_header(args):
     use = open(args.all_samples).readline().rstrip()
-    o = subprocess.check_output(["samtools","view","-H",use])
+    o = subprocess.check_output(["samtools","view","-H",use]).decode("utf-8")
     f = open(args.project_dir + '/bed_header','w')
     for line in o.split('\n')[1:]:
         line = line.rstrip().split('\t')
         if len(line) != 3:
             continue
-        line[1] = line[1].replace('SN:','')
-        line[2] = line[2].replace('LN:','')
-        f.write(line[1]+'\t1\t'+line[2]+'\n')
+        if line[0] == "@SQ":
+            line[1] = line[1].replace('SN:','')
+            line[2] = line[2].replace('LN:','')
+            f.write(line[1]+'\t1\t'+line[2]+'\n')
     f.close()
     args.ctg_len = args.project_dir + '/bed_header'

 def compute_summary(args):
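Two fixes in get_header: subprocess.check_output returns bytes on Python 3, so the samtools output is decoded before splitting, and only @SQ header lines are turned into BED intervals, since other three-column header lines (for example @PG) would previously slip through. A self-contained sketch of the fixed parsing on a toy SAM header (the header text is invented for illustration):

    # Toy SAM header; in metaSNV this comes from `samtools view -H <bam>`.
    sam_header = "@HD\tVN:1.6\n@SQ\tSN:contig1\tLN:1500\n@PG\tID:bwa\tPN:bwa\n"

    with open("bed_header", "w") as f:
        for line in sam_header.split('\n')[1:]:
            fields = line.rstrip().split('\t')
            if len(fields) != 3:
                continue
            if fields[0] == "@SQ":  # only @SQ lines carry SN:<name> and LN:<length>
                name = fields[1].replace('SN:', '')
                length = fields[2].replace('LN:', '')
                f.write(name + '\t1\t' + length + '\n')  # name, start=1, end=length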
@@ -99,14 +97,14 @@ def compute_summary(args):

     cov_files = glob(cov_dir + '/*.cov')
     if not cov_files:
         if not args.print_commands:
             stderr.write("Coverage files not found.\n")
         else:
             stderr.write("Coverage files not found.\nFinish running the commands printed above and then run this command again.\n")
         exit(1)
     for f in cov_files:
         cmd = ['python',
                path.join(basedir, 'src/computeGenomeCoverage.py'),
                f,
                f + '.detail',
                f + '.summary']
@@ -118,13 +116,13 @@ def compute_summary(args):
         cmd = ['python',
                '{}/src/collapse_coverages.py'.format(basedir),
                args.project_dir]
         subprocess.call(cmd)

 def split_opt(args):
     if args.n_splits > 100:
         stderr.write("Maximum number of splits is 100.\n")
-        exit(1)
+        args.n_splits = 100
     older_files = glob(args.project_dir + '/bestsplits/*')
     if older_files:
@@ -132,7 +130,7 @@ def split_opt(args):
         for f in older_files:
             os.unlink(f)

     project_name = path.basename(args.project_dir)
     print("\nCalculating best database split:")
     # usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
@@ -249,28 +247,27 @@ SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP cal
         exit(1)

     if not path.isfile(basedir+"/src/qaTools/qaCompute") or not path.isfile(basedir+"/src/snpCaller/snpCall"):
         stderr.write('''
 ERROR: No binaries found
 SOLUTION: make\n\n'''.format(basedir))
         exit(1)

     if args.threads > 1 and args.n_splits==1:
         args.n_splits=args.threads

     if path.exists(args.project_dir) and not args.print_commands:
         stderr.write("Project directory '{}' already exists\n\n\n".format(args.project_dir))
         exit(1)
     create_directories(args.project_dir)

     compute_opt(args)
     compute_summary(args)
     get_header(args)
     if args.n_splits > 1:
         split_opt(args)
     snp_call(args)

 if __name__ == '__main__':
     main()
...

 #!/usr/bin/env python
 import os
 import sys
-import time
 import argparse
 import glob
 try:
     import numpy as np
 except ImportError:
     sys.stderr.write("Numpy is necessary to run postfiltering.\n")
     sys.exit(1)
 try:
     import pandas as pd
 except ImportError:
     sys.stderr.write("Pandas is necessary to run postfiltering.\n")
     sys.exit(1)

 basedir = os.path.dirname(os.path.abspath(__file__))
@@ -24,101 +23,101 @@ basedir = os.path.dirname(os.path.abspath(__file__))

 #############################
 # Parse Commandline Arguments
 #############################
 def get_arguments():
     '''
     Get commandline arguments and return namespace
     '''
     ## Initialize Parser
     parser = argparse.ArgumentParser(prog='metaSNV_post.py', description='metaSNV post processing', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

     # Not Showns:
     parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
     parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)

     # TODO: POPULATION STATISTICS
     ## Population Statistics (Fst - ):
     # parser.add_argument("-f", "--fst", action="store_true")

     ## pNpS ratio:
     # parser.add_argument("-n", "--pnps", action="store_true")

     # OPTIONAL arguments:
-    parser.add_argument('-b', metavar='FLOAT', type=float, help="Coverage breadth: Horizontal genome coverage percentage per sample per species", default=40.0)
-    parser.add_argument('-d', metavar='FLOAT', type=float, help="Coverage depth: Average vertical genome coverage per sample per species", default=5.0)
+    parser.add_argument('-b', metavar='FLOAT', type=float, default=40.0,
+                        help="Coverage breadth: minimal horizontal genome coverage percentage per sample per species")
+    parser.add_argument('-d', metavar='FLOAT', type=float, default=5.0,
+                        help="Coverage depth: minimal average vertical genome coverage per sample per species")
     parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
     parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
     parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
     parser.add_argument('-div',action='store_true',help="Compute diversity measures")

     # REQUIRED arguments:
     parser.add_argument('projdir', help='project name', metavar='Proj')

     return parser.parse_args()
 ##############################
 def debugging(taxids_of_interest, header_cov):
     '''Sanity check for the Dict of TaxID:Samples'''
     nr_keys = 0
     taxids_of_interest = []
     for key in samples_of_interest:
         nr_keys += 1
         taxids_of_interest.append(key)
     taxids_of_interest.sort()
     print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
     print(taxids_of_interest)
 #    print("\n")
 #    print("nr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
-    header = "\t".join(header_cov)
     print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
+    # header = "\t".join(header_cov)
     # print("header: {}".format(header))
     sys.exit("only filter 1")
 def file_check():
     ''' Check if required files exist (True / False)'''
     args.projdir = args.projdir.rstrip('/')
-    args.coverage_file = args.projdir+'/'+args.projdir+'.all_cov.tab'
-    args.percentage_file = args.projdir+'/'+args.projdir+'.all_perc.tab'
+    args.coverage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_cov.tab'
+    args.percentage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_perc.tab'
     args.all_samples = args.projdir+'/'+'all_samples'

     print("Checking for necessary input files...")
     if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
         print("found: '{}' \nfound:'{}'".format(args.coverage_file, args.percentage_file))
     else:
         sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))

     if os.path.isfile(args.all_samples):
         print("found: '{}'\n".format(args.all_samples))
     else:
         sys.exit("\nERROR: No such file '{}'".format(args.all_samples))
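The old filename construction concatenated the full project path twice, so any project directory given as a nested path pointed at a file that does not exist. A small illustration of the fix (the path is hypothetical):

    projdir = "runs/myproject".rstrip('/')  # hypothetical nested project path

    # Before: the whole path is repeated inside the filename.
    old = projdir + '/' + projdir + '.all_cov.tab'
    # -> 'runs/myproject/runs/myproject.all_cov.tab'  (wrong)

    # After: only the basename is used.
    new = projdir + '/' + projdir.split('/')[-1] + '.all_cov.tab'
    # -> 'runs/myproject/myproject.all_cov.tab'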
 def print_arguments():
     ## Print Defined Thresholds:
     print("Options:")
     if args.b:
         print("threshold: percentage covered (breadth) {}".format(args.b) )
     if args.d:
         print("threshold: average coverage (depth) {}".format(args.d) )
     if args.m:
         print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m) )
     if args.c:
         print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c) )
     if args.p:
         print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p) )
     print("")

 ###############################
 ## FILTER I: Count SAMPLES_OF_INTEREST per TAXON_OF_INTEREST
 # Example: "Which taxon has at least 10 SoI?"
 # --Coverage Conditions:
 #   Sample of Interest:
 #       1. breadth > 40 %
 #       2. depth > 5 X
 # --Min. number Samples:
 #       3. min samples/TaxID > 10
@@ -137,9 +136,9 @@ def relevant_taxa(args):
         sys.exit("ERROR: Coverage file headers do not match!") #Exit with error message

     # Read taxon by taxon (line by line) check coverage conditions (thresholds)
     for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
         cstring = cov.split()
         pstring = perc.split()
         cov_taxID = cstring.pop(0)
         perc_taxID = pstring.pop(0)
         coverage = list(map(float, cstring)) # convert list_of_strings 2 list_of_floats (TaxID=INT!)
@@ -152,10 +151,10 @@ def relevant_taxa(args):
             if cov_taxID != perc_taxID: # check if taxIDs match!
                 sys.exit("ERROR: TaxIDs in the coverage files are not in the same order!")

             for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
                 sample_count += 1

                 if c >= args.d and p >= args.b:
                     sample_names.append(header_cov[sample_count-1]) # starting at 0 in array
@@ -176,7 +175,7 @@ def header_comparison(header_cov):
     '''Comparing the sample order in the coverage and all_samples file'''

     # sort snp_indices by sample_of_interest names
-    header = "\t".join(header_cov)
+    #header = "\t".join(header_cov)
     #print("\nNr.Samples_cov_header: {}".format(len(header_cov)))
     #print("cov_header: {}\n\n".format(header_cov[:10]))
     #print("\ncov_header: {}\n {}\n\n".format(len(header_cov),header))
@@ -185,21 +184,21 @@ def header_comparison(header_cov):
     all_samples = open(args.all_samples,'r')
     snp_header = all_samples.read().splitlines()
     snp_header = [i.split('/')[-1] for i in snp_header] #get name /trim/off/path/to/sample.name.bam
-    snp_header_joined = "\t".join(snp_header)

     #print("Nr.Samples in (all_samples): {}".format(len(snp_header)))
     #print("all_samples: {}\n".format(snp_header[:10]))

     # Compare the sample order in ALL.COV/ALL.PERC and all_samples
     #print("Comparing headers..")
+    #snp_header_joined = "\t".join(snp_header)
     #if header == snp_header_joined:
     #    print("Headers match nicely\n")
     #else:
     #    print("CAUTION: Header in COV_FILE does not match the order of samples in the SNP_FILES,\n\t no problem, we took care of it!\n")

     return snp_header

 #################################
 ## FILTER II: ACQUIRE SNPs WITH SUFFICIENT OCCURRENCE WITHIN SAMPLES_OF_INTEREST
 #  --SNP Conditions (Default):
@@ -208,21 +207,21 @@ def header_comparison(header_cov):
 def filter_two(args, snp_header, snp_files, outdir):
     '''position wise filtering'''
     header_taxID = ''
     snp_taxID = '_'
 #    print(locals())

     for best_split_x in snp_files:
         with open(best_split_x, 'r') as file:
             for snp_line in file: #position wise loop
                 snp_taxID = snp_line.split()[0].split('.')[0]#Name of Genome change from . to ]

                 ## SPECIES FILTER:
                 if snp_taxID not in samples_of_interest.keys():#Check if Genome is of interest
                     continue #Taxon is not relevant, NEXT!
                 else:
                     ## SAMPLE FILTER: only load samples with enough coverage
                     sample_list = samples_of_interest[snp_taxID]#Load Sample List
@@ -246,9 +245,9 @@ def filter_two(args, snp_header, snp_files, outdir):
                             nr_good += 1

                     #FILTER: Position incidence with sufficient coverage:
                     if float(nr_good)/len(sample_indices) < args.p:#if snp_incidence < x % drop SNP
                         continue#mainly uninformative position, SNP incidence < x %, SNP droped!
                     else:
                         # CALCULATE SNP ALLELE FREQUENCY:
                         # If at least one position passed cutoffs open file (if new TaxID):
@@ -259,42 +258,42 @@ def filter_two(args, snp_header, snp_files, outdir):
                             outfile = open(outdir+'/'+'%s.filtered.freq' % snp_taxID, 'w')
                             print("Generating: {}".format(outdir+'/'+'%s.filtered.freq' % snp_taxID))
                             outfile.write('\t' + "\t".join(sample_list) + '\n')
                             header_taxID = snp_taxID#Jump to current TaxID

                         #Loop through alternative alleles [5](comma separated):
                         line_id = ":".join(whole_line[:4])#line_id composed of CHROM:REFGENE:POS:REFBASE
                         reference_base = whole_line[3]
                         alt_bases_totalcov = []#total coverage per alt allele
                         #VCF format

                         #LOOP Start:
                         for snp in whole_line[5].split(','):
                             xS = snp.split('|')
                             snp_coverage = list(map(float, xS[3:]))#coverage string (unfiltered!)

                             #Sanity check:
                             if len(site_coverage) != len(snp_coverage):
                                 print("ERROR: SNP FILE {} is corrupted".format(best_split_x))
                                 sys.exit("ERROR: Site coverage and SNP coverage string have uneven length!")

                             #Compute allele frequency tables:
                             alt_base = snp.split('|')[1]#alternative base

                             #FREQUENCY Computation
                             total_reads = 0
                             snp_frq = []#frequencies for SNPs (pos >5X in at least 50% of the SoIs)
                             for index in sample_indices:
                                 #prevent division by zero! TODO:Notify usr about Nonsense Value (f.i. -c = 0!
                                 if site_coverage[index] >= args.c and site_coverage[index] != 0:
                                     snp_frq.append(snp_coverage[index]/site_coverage[index])
                                 else:
                                     snp_frq.append(-1)

                             # WRITE OUTPUT Allele Frequencies (Default)
                             outfile.write(":".join(snp_line.split()[:4])+'>'+alt_base +':'+ xS[2] + '\t' + "\t".join(str(x) for x in snp_frq) + '\n')
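For each alternative allele, the per-sample frequency is the allele-supporting coverage divided by the total site coverage, with -1 as a sentinel for samples whose site coverage falls below the -c cutoff (masked rather than counted as zero). A toy computation with invented numbers:

    site_coverage = [10.0, 4.0, 20.0]  # total reads per sample at this position
    snp_coverage  = [ 3.0, 1.0,  5.0]  # reads supporting the alternative base
    c_cutoff = 5.0                     # metaSNV_post.py -c

    snp_frq = [snp / site if site >= c_cutoff else -1
               for site, snp in zip(site_coverage, snp_coverage)]
    # -> [0.3, -1, 0.25]; the second sample is masked, not counted as zero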
@@ -311,20 +310,20 @@ def alleledist(d1,d2, threshold=.6):

 def computeAllDist(args):
-    print "Computing distances"
+    print("Computing distances")
     allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
     for f in allFreq:
         data = pd.read_table(f, index_col=0, na_values=['-1']).T

         dist = [[l1nonans(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
         dist = pd.DataFrame(dist, index=data.index, columns=data.index)
         dist.to_csv(args.projdir+'/distances/'+'%s.mann.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')

         dist = [[alleledist(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
         dist = pd.DataFrame(dist, index=data.index, columns=data.index)
         dist.to_csv(args.projdir+'/distances/'+'%s.allele.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
@@ -366,11 +365,9 @@ def genetic_distance(sample1, sample2):

 def computeAllDiv(args):
-    print "Computing diversities"
-    cov_perc = pd.read_table(args.percentage_file, skiprows=[1], index_col=False)
-    cov_perc = cov_perc.set_index(cov_perc.loc[:, 'Unnamed: 0'])
-    cov_perc = cov_perc.drop('Unnamed: 0', 1)
+    # Load external info : Coverage, genomes size, genes size
+    horizontal_coverage = pd.read_table(args.percentage_file, skiprows=[1], index_col=0)
+    vertical_coverage = pd.read_table(args.coverage_file, skiprows=[1], index_col=0)

     bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
     bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
@@ -383,11 +380,18 @@ def computeAllDiv(args):
         data = pd.read_table(f, index_col=0, na_values=['-1'])
         pre_index = [i.split(':') for i in list(data.index)]
-        pos_index = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
-        data = data.set_index(pd.Index(pos_index))
+        data = data.set_index(pd.Index([item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]))
+        data = data.sort_index()
+
+        ########
         # Correcting the coverage by the genome length observed in each pairwise comparison
-        correction_cov = [[(min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
-        dist = [[genetic_distance(data.iloc[:, i], data.iloc[:, j]) / correction_cov[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
+        correction_coverage = [[(min(horizontal_coverage.loc[species, i], horizontal_coverage.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
+
+        ########
+        # Vertical coverage in pi within : AvgCov / (AvgCov - 1)
+        for j,i in enumerate(data.columns):
+            correction_within = vertical_coverage.loc[species,i] / (vertical_coverage.loc[species,i] - 1)
+            correction_coverage[j][j] = correction_coverage[j][j] / correction_within
+
+        dist = [[genetic_distance(data.iloc[:, i], data.iloc[:, j]) / correction_coverage[j][i] for i in range(j + 1)] for j in range(len(data.columns))]

         FST = [[(1-(dist[i][i]+dist[j][j])/(2*dist[j][i])) for i in range(j + 1)] for j in range(len(dist))]
         dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
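The added loop corrects the within-sample diversity on the diagonal by AvgCov / (AvgCov - 1), a finite-coverage correction, and FST is then derived from the lower-triangular dist matrix as FST = 1 - (pi_within_i + pi_within_j) / (2 * pi_between). A toy two-sample check with made-up values:

    # Lower-triangular dist, as built above (values invented):
    dist = [[0.010],          # dist[0][0]: diversity within sample A
            [0.040, 0.012]]   # dist[1][0]: A vs B; dist[1][1]: within sample B

    fst = 1 - (dist[0][0] + dist[1][1]) / (2 * dist[1][0])
    # = 1 - 0.022 / 0.080 = 0.725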
...
@@ -28,7 +28,7 @@ def write_matrix(cov, header, ofile):
         out.write('TaxId\t')
         out.write('\t'.join([header for _ in bamfiles]))
         out.write('\n')
-        for taxid in sorted(avg_cov.keys(), key=int):
+        for taxid in sorted(avg_cov.keys()):
             c = cov[taxid]
             out.write('{}\t'.format(taxid))
             out.write('\t'.join([c[bf] for bf in bamfiles]))

...
@@ -526,13 +526,37 @@ int main(int argc, char *argv[])
                     ++duplicates;
                 } else {
                     if (!userOpt.spanCov) {
                         //All entries in SAM file are represented on the forward strand! (See specs of SAM format for details)
-                        ++entireChr[core->pos];
-                        ++usedReads;
-                        if ((uint32_t)(core->pos+core->l_qseq) >= chrSize)
-                            --entireChr[chrSize-1];
-                        else
-                            --entireChr[core->pos+core->l_qseq];
+                        uint32_t* cigar = bam_get_cigar(b);
+                        uint32_t pp = core->pos+1;
+                        int i = 0;
+                        if (((*cigar) & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP || ((*cigar) & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {//BWA is actively fucking with me.
+                            //The start of the alignment is reported without this clip.
+                            ++cigar; ++i;
+                        }
+                        while (i<core->n_cigar) {
+                            ++i;
+                            if (((*cigar) & BAM_CIGAR_MASK) != BAM_CMATCH) {
+                                pp = pp + ((*cigar) >> BAM_CIGAR_SHIFT);
+                            } else {
+                                ++entireChr[pp];
+                                pp = pp + ((*cigar) >> BAM_CIGAR_SHIFT);
+                                if (pp >= chrSize) {//These are mismatches that actually hang over the end. Silly mapper!
+                                    //fprintf(stderr,"you are fucked %d %d\n",pp,chrSize);
+                                    --entireChr[chrSize-1];
+                                } else {
+                                    --entireChr[pp];
+                                }
+                            }
+                            ++cigar;
+                        }
+                        //++entireChr[core->pos];
+                        ++usedReads;
+                        //if ((uint32_t)(core->pos+core->l_qseq) >= chrSize)
+                        //    --entireChr[chrSize-1];
+                        //else
+                        //    --entireChr[core->pos+core->l_qseq];
                     } else {
                         //Computing span coverage.
                         //Only consider first read in pair! and extend a bit to the end of the insert
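The replaced code assumed a read covers the reference from core->pos to core->pos + l_qseq, which miscounts clipped or gapped alignments. The new code walks the CIGAR string, skips a leading soft/hard clip (BWA reports the alignment start without it), and adds coverage only over match segments. A simplified Python sketch of the same difference-array idea, not a line-for-line port (the CIGAR tuples are invented):

    def add_read_coverage(delta, pos, cigar, chr_size):
        """Record one read in a difference array: coverage[i] == sum(delta[:i+1])."""
        pp = pos
        for op, length in cigar:          # e.g. [('S', 5), ('M', 60), ('D', 2), ('M', 35)]
            if op in ('S', 'H'):          # clips do not consume the reference
                continue
            if op == 'M':                 # aligned block: covered on the reference
                delta[pp] += 1
                pp += length
                delta[min(pp, chr_size - 1)] -= 1  # guard reads hanging over the end
            else:                         # D/N: advance on the reference, uncovered
                pp += length

    delta = [0] * 200
    add_read_coverage(delta, 10, [('S', 5), ('M', 60), ('D', 2), ('M', 35)], 200)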
...
@@ -130,7 +130,6 @@ bool indexGenomeAndGenes(FILE* refGenome, FILE* refGenes) {
     filePosStart = strlen(line);//This includes the \n
     filePosition p1;
     while (fgets(line,10000,refGenes)) {
-        fileConsumed += strlen(line);
         int pos = 0;
         const char* rest = toksplit(line,'\t',tok,10000);
         while (*rest) {
@@ -152,6 +151,7 @@ bool indexGenomeAndGenes(FILE* refGenome, FILE* refGenes) {
             ++pos;
             rest = toksplit(rest,'\t',tok,10000);
         }
+        fileConsumed += strlen(line);
         lineCount += 1;
     }
     //Add the last one!

...
@@ -248,7 +248,7 @@ bool loadGenome(std::string gName, FILE* refGenes, bool* hasGenes) {
     }
     if (pos == 2) {//This is the name!
         if (gName.compare(tok) != 0) {
-            fprintf(stderr,"Reading wrong gene defintions");
+            fprintf(stderr,"Reading wrong gene defintion for %s\n. Scafold supposed to be %s, but is %s\n.",geneName.c_str(),tok,gName.c_str());
             break;
         }
     } else if (pos == 6) {//Start

...