Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (19)
@@ -9,7 +9,7 @@ Download
 Via Git:
-git clone git@git.embl.de:costea/metaSNV.git
+git clone https://git.embl.de/costea/metaSNV.git
 or [download](https://git.embl.de/costea/metaSNV/repository/archive.zip?ref=master) a zip file of the repository.
@@ -86,9 +86,10 @@ Note: requires SNP calling (Part II) to be done!
 Example Tutorial
 ================
-## 1. Run the setup & compilation steps and download the provided reference database.
-./getRefDB.sh
+## 1. Run the setup & compilation steps and download the provided reference database.
+$ ./getRefDB.sh
+Select freeze9, as the tutorial files have been mapped against this freeze.
 ## 2. Go to the EXAMPLE directory and download the samples with the getSamplesScript.sh
...
@@ -51,10 +51,7 @@ def run_sample(sample, command):
 def compute_opt(args):
     out_dir = path.join(args.project_dir, 'cov')
-    try:
-        os.makedirs(out_dir)
-    except:
-        pass
+    mkdir_p(out_dir)
     p = multiprocessing.Pool(args.threads, init_worker)
     results = []
     for line in open(args.all_samples):
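The four-line try/except around os.makedirs is replaced by a single mkdir_p(out_dir) call. The helper itself is not shown in this hunk; a minimal sketch of what such a helper typically looks like (assuming it only swallows the directory-already-exists case, unlike the bare except it replaces):

```python
import errno
import os

def mkdir_p(dirname):
    # Behave like `mkdir -p`: an already-existing directory is fine,
    # but real failures (e.g. permissions) still raise.
    try:
        os.makedirs(dirname)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
```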
@@ -79,51 +76,53 @@ def compute_opt(args):
 def get_header(args):
     use = open(args.all_samples).readline().rstrip()
-    o = subprocess.check_output(["samtools","view","-H",use])
+    o = subprocess.check_output(["samtools","view","-H",use]).decode("utf-8")
     f = open(args.project_dir + '/bed_header','w')
     for line in o.split('\n')[1:]:
-        line = line.rstrip().split('\t')
-        if len(line) != 3:
-            continue
-        line[1] = line[1].replace('SN:','')
-        line[2] = line[2].replace('LN:','')
-        f.write(line[1]+'\t1\t'+line[2]+'\n')
-    f.close()
+        line = line.rstrip().split('\t')
+        if len(line) != 3:
+            continue
+        if line[0] == "@SQ":
+            line[1] = line[1].replace('SN:','')
+            line[2] = line[2].replace('LN:','')
+            f.write(line[1]+'\t1\t'+line[2]+'\n')
+    f.close()
     args.ctg_len = args.project_dir + '/bed_header'

 def compute_summary(args):
     '''This information is required by metaSNV_post.py'''
+    project_name = path.basename(args.project_dir)
     cov_dir = path.join(args.project_dir, 'cov')
     cov_files = glob(cov_dir + '/*.cov')
-    # project_name = path.basename(args.project_dir)
     if not cov_files:
-        if not args.print_commands:
-            stderr.write("Coverage files not found.\n")
-        else:
-            stderr.write("Coverage files not found.\nFinish running the commands printed above and then run this command again.\n")
+        if not args.print_commands:
+            stderr.write("Coverage files not found.\n")
+        else:
+            stderr.write("Coverage files not found.\nFinish running the commands printed above and then run this command again.\n")
         exit(1)
     for f in cov_files:
         cmd = ['python',
-               path.join(basedir, 'src/computeGenomeCoverage.py'),
+               path.join(basedir, 'src/computeGenomeCoverage.py'),
               f,
              f + '.detail',
              f + '.summary']
         subprocess.call(cmd)
     print("\nCoverage summary here: {}".format(args.project_dir))
-    print(" Average vertical genome coverage: '{}/{}.all_cov.tab'".format(args.project_dir, args.project_dir))
-    print(" Horizontal genome coverage (1X): '{}/{}.all_perc.tab'".format(args.project_dir, args.project_dir))
+    print(" Average vertical genome coverage: '{}/{}.all_cov.tab'".format(args.project_dir, project_name))
+    print(" Horizontal genome coverage (1X): '{}/{}.all_perc.tab'".format(args.project_dir, project_name))
     print("")
     cmd = ['python',
            '{}/src/collapse_coverages.py'.format(basedir),
            args.project_dir]
-    subprocess.call(cmd)
+    subprocess.call(cmd)

 def split_opt(args):
     if args.n_splits > 100:
         stderr.write("Maximum number of splits is 100.\n")
-        exit(1)
+        args.n_splits = 100
     older_files = glob(args.project_dir + '/bestsplits/*')
     if older_files:
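get_header gains two fixes: subprocess.check_output returns bytes under Python 3, hence the .decode("utf-8"), and only @SQ header records carry SN:/LN: fields, so other header lines that happen to have three columns must be skipped. A self-contained sketch of the same parsing (contig_lengths is a hypothetical helper, not part of metaSNV; like the script, it assumes the conventional SN-before-LN tag order):

```python
import subprocess

def contig_lengths(bamfile):
    # `samtools view -H` prints the SAM header; check_output returns
    # bytes under Python 3, so decode before splitting into lines.
    header = subprocess.check_output(
        ["samtools", "view", "-H", bamfile]).decode("utf-8")
    lengths = {}
    for line in header.splitlines():
        fields = line.rstrip().split('\t')
        # Only @SQ records describe reference sequences; @HD/@RG/@PG
        # lines may also have three columns but no SN:/LN: fields.
        if len(fields) < 3 or fields[0] != '@SQ':
            continue
        name = fields[1].replace('SN:', '')
        length = int(fields[2].replace('LN:', ''))
        lengths[name] = length
    return lengths
```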
@@ -131,12 +130,14 @@ def split_opt(args):
         for f in older_files:
             os.unlink(f)
+    project_name = path.basename(args.project_dir)
     print("\nCalculating best database split:")
     # usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
     cmd = ['python',
            '{}/src/createOptimumSplit.py'.format(basedir),
-           "{}/{}.all_cov.tab".format(args.project_dir, args.project_dir),
-           "{}/{}.all_perc.tab".format(args.project_dir, args.project_dir),
+           "{}/{}.all_cov.tab".format(args.project_dir, project_name),
+           "{}/{}.all_perc.tab".format(args.project_dir, project_name),
            args.ctg_len,
            str(args.n_splits),
            path.join(args.project_dir, "bestsplits", "best_split")]
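Both this hunk and compute_summary above replace args.project_dir with project_name = path.basename(args.project_dir) when building the .all_cov.tab / .all_perc.tab file names. That matters as soon as the project directory is given as a path; an illustration with a hypothetical value:

```python
from os import path

project_dir = 'runs/mouse_gut'              # hypothetical project directory
project_name = path.basename(project_dir)   # 'mouse_gut'

# Old: '{}/{}.all_cov.tab'.format(project_dir, project_dir)
#   -> 'runs/mouse_gut/runs/mouse_gut.all_cov.tab'  (broken path)
# New: '{}/{}.all_cov.tab'.format(project_dir, project_name)
#   -> 'runs/mouse_gut/mouse_gut.all_cov.tab'
print("{}/{}.all_cov.tab".format(project_dir, project_name))
```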
@@ -224,7 +225,7 @@ def main():
                         help='File with an input list of bam files, one file per line')
     parser.add_argument("ref_db", metavar='REF_DB_FILE',
                         help='reference multi-sequence FASTA file used for the alignments.')
-    parser.add_argument('--db_ann', metavar='BED_FILE',default='',
+    parser.add_argument('--db_ann', metavar='DB_ANN_FILE',default='',
                         help='Database gene annotation.')
     parser.add_argument('--print-commands', default=False, action='store_true',
                         help='Instead of executing the commands, simply print them out')
@@ -246,28 +247,27 @@ SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP cal
         exit(1)
     if not path.isfile(basedir+"/src/qaTools/qaCompute") or not path.isfile(basedir+"/src/snpCaller/snpCall"):
-        stderr.write('''
+        stderr.write('''
 ERROR: No binaries found
-SOLUTION: make\n\n'''.format(basedir))
-        exit(1)
+SOLUTION: make\n\n'''.format(basedir))
+        exit(1)
     if args.threads > 1 and args.n_splits==1:
-        args.n_splits=args.threads
+        args.n_splits=args.threads
     if path.exists(args.project_dir) and not args.print_commands:
-        stderr.write("Project directory '{}' already exists\n\n\n".format(args.project_dir))
+        stderr.write("Project directory '{}' already exists\n\n\n".format(args.project_dir))
         exit(1)
     create_directories(args.project_dir)
     compute_opt(args)
     compute_summary(args)
     get_header(args)
     if args.n_splits > 1:
-        split_opt(args)
+        split_opt(args)
     snp_call(args)

 if __name__ == '__main__':
     main()
...

 #!/usr/bin/env python
 import os
 import sys
 import time
 import argparse
 import glob
 try:
-    import numpy as np
+    import numpy as np
 except ImportError:
-    sys.stderr.write("Numpy is necessary to run postfiltering.\n")
-    sys.exit(1)
+    sys.stderr.write("Numpy is necessary to run postfiltering.\n")
+    sys.exit(1)
 try:
-    import pandas as pd
+    import pandas as pd
 except ImportError:
-    sys.stderr.write("Pandas is necessary to run postfiltering.\n")
-    sys.exit(1)
+    sys.stderr.write("Pandas is necessary to run postfiltering.\n")
+    sys.exit(1)

 basedir = os.path.dirname(os.path.abspath(__file__))
@@ -24,101 +23,101 @@ basedir = os.path.dirname(os.path.abspath(__file__))

 # Parse Commandline Arguments
 #############################
 def get_arguments():
-    '''
-    Get commandline arguments and return namespace
-    '''
-    ## Initialize Parser
-    parser = argparse.ArgumentParser(prog='metaSNV_post.py', description='metaSNV post processing', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    '''
+    Get commandline arguments and return namespace
+    '''
+    ## Initialize Parser
+    parser = argparse.ArgumentParser(prog='metaSNV_post.py', description='metaSNV post processing', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

     # Not Showns:
-    parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
-    parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)
+    parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
+    parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)

     # TODO: POPULATION STATISTICS
     ## Population Statistics (Fst - ):
     # parser.add_argument("-f", "--fst", action="store_true")
     ## pNpS ratio:
     # parser.add_argument("-n", "--pnps", action="store_true")

     # OPTIONAL arguments:
-    parser.add_argument('-b', metavar='FLOAT', type=float, help="Coverage breadth: Horizontal genome coverage percentage per sample per species", default=40.0)
-    parser.add_argument('-d', metavar='FLOAT', type=float, help="Coverage depth: Average vertical genome coverage per sample per species", default=5.0)
-    parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
-    parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
-    parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
-    parser.add_argument('-div',action='store_true',help="Compute diversity measures")
+    parser.add_argument('-b', metavar='FLOAT', type=float, default=40.0,
+                        help="Coverage breadth: minimal horizontal genome coverage percentage per sample per species")
+    parser.add_argument('-d', metavar='FLOAT', type=float, default=5.0,
+                        help="Coverage depth: minimal average vertical genome coverage per sample per species")
+    parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
+    parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
+    parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
+    parser.add_argument('-div',action='store_true',help="Compute diversity measures")

     # REQUIRED arguments:
-    parser.add_argument('projdir', help='project name', metavar='Proj')
-    return parser.parse_args()
+    parser.add_argument('projdir', help='project name', metavar='Proj')
+    return parser.parse_args()

 ##############################
 def debugging(taxids_of_interest, header_cov):
-    '''Sanity check for the Dict of TaxID:Samples'''
-    nr_keys = 0
-    taxids_of_interest = []
-    for key in samples_of_interest:
-        nr_keys += 1
-        taxids_of_interest.append(key)
-    taxids_of_interest.sort()
-    print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
-    print(taxids_of_interest)
+    '''Sanity check for the Dict of TaxID:Samples'''
+    nr_keys = 0
+    taxids_of_interest = []
+    for key in samples_of_interest:
+        nr_keys += 1
+        taxids_of_interest.append(key)
+    taxids_of_interest.sort()
+    print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
+    print(taxids_of_interest)
     # print("\n")
     # print("nr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
     header = "\t".join(header_cov)
-    print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
+    print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
     # header = "\t".join(header_cov)
     # print("header: {}".format(header))
-    sys.exit("only filter 1")
+    sys.exit("only filter 1")

 def file_check():
-    ''' Check if required files exist (True / False)'''
-    args.projdir = args.projdir.rstrip('/')
-    args.coverage_file = args.projdir+'/'+args.projdir+'.all_cov.tab'
-    args.percentage_file = args.projdir+'/'+args.projdir+'.all_perc.tab'
-    args.all_samples = args.projdir+'/'+'all_samples'
-    print("Checking for necessary input files...")
-    if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
-        print("found: '{}' \nfound:'{}'".format(args.coverage_file, args.percentage_file))
-    else:
-        sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))
-    if os.path.isfile(args.all_samples):
-        print("found: '{}'\n".format(args.all_samples))
-    else:
-        sys.exit("\nERROR: No such file '{}'".format(args.all_samples))
+    ''' Check if required files exist (True / False)'''
+    args.projdir = args.projdir.rstrip('/')
+    args.coverage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_cov.tab'
+    args.percentage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_perc.tab'
+    args.all_samples = args.projdir+'/'+'all_samples'
+    print("Checking for necessary input files...")
+    if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
+        print("found: '{}' \nfound:'{}'".format(args.coverage_file, args.percentage_file))
+    else:
+        sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))
+    if os.path.isfile(args.all_samples):
+        print("found: '{}'\n".format(args.all_samples))
+    else:
+        sys.exit("\nERROR: No such file '{}'".format(args.all_samples))

 def print_arguments():
-    ## Print Defined Thresholds:
-    print("Options:")
-    if args.b:
-        print("threshold: percentage covered (breadth) {}".format(args.b) )
-    if args.d:
-        print("threshold: average coverage (depth) {}".format(args.d) )
-    if args.m:
-        print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m) )
-    if args.c:
-        print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c) )
-    if args.p:
-        print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p) )
-    print("")
+    ## Print Defined Thresholds:
+    print("Options:")
+    if args.b:
+        print("threshold: percentage covered (breadth) {}".format(args.b) )
+    if args.d:
+        print("threshold: average coverage (depth) {}".format(args.d) )
+    if args.m:
+        print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m) )
+    if args.c:
+        print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c) )
+    if args.p:
+        print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p) )
+    print("")

 ###############################
 ## FILTER I: Count SAMPLES_OF_INTEREST per TAXON_OF_INTEREST
 # Example: "Which taxon has at least 10 SoI?"
 # --Coverage Conditions:
 #   Sample of Interest:
-#   1. breadth > 40 %
+#   1. breadth > 40 %
 #   2. depth   > 5 X
 # --Min. number Samples:
 #   3. min samples/TaxID > 10
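Filter I, per the comment block above, admits a sample for a taxon only when both coverage conditions hold; the defaults correspond to -d 5 and -b 40, and a taxon is kept when at least -m samples pass. A minimal sketch of the predicate (hypothetical helper; the script applies the same test inline, as the next hunks show):

```python
def is_sample_of_interest(depth, breadth, min_depth=5.0, min_breadth=40.0):
    # Both Filter I conditions: enough average vertical coverage
    # (depth, -d) and enough horizontal coverage (breadth in %, -b).
    return depth >= min_depth and breadth >= min_breadth

# Taxon-level check against the -m threshold, e.g.:
# sum(is_sample_of_interest(c, p) for c, p in zip(coverage, percentage)) >= args.m
```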
@@ -137,9 +136,9 @@ def relevant_taxa(args):
         sys.exit("ERROR: Coverage file headers do not match!") #Exit with error message
     # Read taxon by taxon (line by line) check coverage conditions (thresholds)
-    for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
+    for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
         cstring = cov.split()
-        pstring = perc.split()
+        pstring = perc.split()
         cov_taxID = cstring.pop(0)
         perc_taxID = pstring.pop(0)
         coverage = list(map(float, cstring)) # convert list_of_strings 2 list_of_floats (TaxID=INT!)
@@ -152,10 +151,10 @@ def relevant_taxa(args):
         if cov_taxID != perc_taxID: # check if taxIDs match!
             sys.exit("ERROR: TaxIDs in the coverage files are not in the same order!")
-        for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
+        for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
             sample_count += 1
-            if c >= args.d and p >= args.b:
+            if c >= args.d and p >= args.b:
                 sample_names.append(header_cov[sample_count-1]) # starting at 0 in array
@@ -176,7 +175,7 @@ def header_comparison(header_cov):
     '''Comparing the sample order in the coverage and all_samples file'''
     # sort snp_indices by sample_of_interest names
-    header = "\t".join(header_cov)
+    #header = "\t".join(header_cov)
     #print("\nNr.Samples_cov_header: {}".format(len(header_cov)))
     #print("cov_header: {}\n\n".format(header_cov[:10]))
     #print("\ncov_header: {}\n {}\n\n".format(len(header_cov),header))
@@ -185,21 +184,21 @@ def header_comparison(header_cov):
     all_samples = open(args.all_samples,'r')
     snp_header = all_samples.read().splitlines()
     snp_header = [i.split('/')[-1] for i in snp_header] #get name /trim/off/path/to/sample.name.bam
-    snp_header_joined = "\t".join(snp_header)
     #print("Nr.Samples in (all_samples): {}".format(len(snp_header)))
     #print("all_samples: {}\n".format(snp_header[:10]))

     # Compare the sample order in ALL.COV/ALL.PERC and all_samples
     #print("Comparing headers..")
+    #snp_header_joined = "\t".join(snp_header)
     #if header == snp_header_joined:
     #    print("Headers match nicely\n")
     #else:
     #    print("CAUTION: Header in COV_FILE does not match the order of samples in the SNP_FILES,\n\t no problem, we took care of it!\n")

-    return snp_header
+    return snp_header
@@ -208,21 +207,21 @@ def header_comparison(header_cov):
 def filter_two(args, snp_header, snp_files, outdir):
     '''position wise filtering'''
     header_taxID = ''
     snp_taxID = '_'
     # print(locals())

     for best_split_x in snp_files:
         with open(best_split_x, 'r') as file:
             for snp_line in file: #position wise loop
                 snp_taxID = snp_line.split()[0].split('.')[0]#Name of Genome change from . to ]

                 ## SPECIES FILTER:
                 if snp_taxID not in samples_of_interest.keys():#Check if Genome is of interest
                     continue #Taxon is not relevant, NEXT!
                 else:
                     ## SAMPLE FILTER: only load samples with enough coverage
                     sample_list = samples_of_interest[snp_taxID]#Load Sample List
@@ -246,9 +245,9 @@ def filter_two(args, snp_header, snp_files, outdir):
                         nr_good += 1
                 #FILTER: Position incidence with sufficient coverage:
                 if float(nr_good)/len(sample_indices) < args.p:#if snp_incidence < x % drop SNP
                     continue#mainly uninformative position, SNP incidence < x %, SNP droped!
                 else:
                     # CALCULATE SNP ALLELE FREQUENCY:
                     # If at least one position passed cutoffs open file (if new TaxID):
@@ -259,42 +258,42 @@ def filter_two(args, snp_header, snp_files, outdir):
                         outfile = open(outdir+'/'+'%s.filtered.freq' % snp_taxID, 'w')
                         print("Generating: {}".format(outdir+'/'+'%s.filtered.freq' % snp_taxID))
                         outfile.write('\t' + "\t".join(sample_list) + '\n')
                         header_taxID = snp_taxID#Jump to current TaxID

                     #Loop through alternative alleles [5](comma separated):
                     line_id = ":".join(whole_line[:4])#line_id composed of CHROM:REFGENE:POS:REFBASE
                     reference_base = whole_line[3]
                     alt_bases_totalcov = []#total coverage per alt allele
                     #VCF format

                     #LOOP Start:
                     for snp in whole_line[5].split(','):
                         xS = snp.split('|')
                         snp_coverage = list(map(float, xS[3:]))#coverage string (unfiltered!)

-                        #Sanity check:
+                        #Sanity check:
                         if len(site_coverage) != len(snp_coverage):
                             print("ERROR: SNP FILE {} is corrupted".format(best_split_x))
                             sys.exit("ERROR: Site coverage and SNP coverage string have uneven length!")

                         #Compute allele frequency tables:
-                        alt_base = snp.split('|')[1]#alternative base
+                        alt_base = snp.split('|')[1]#alternative base

                         #FREQUENCY Computation
                         total_reads = 0
                         snp_frq = []#frequencies for SNPs (pos >5X in at least 50% of the SoIs)
                         for index in sample_indices:
-                            #prevent division by zero! TODO:Notify usr about Nonsense Value (f.i. -c = 0!
+                            #prevent division by zero! TODO:Notify usr about Nonsense Value (f.i. -c = 0!
                             if site_coverage[index] >= args.c and site_coverage[index] != 0:
                                 snp_frq.append(snp_coverage[index]/site_coverage[index])
                             else:
-                                snp_frq.append(-1)
+                                snp_frq.append(-1)

                 # WRITE OUTPUT Allele Frequencies (Default)
                 outfile.write(":".join(snp_line.split()[:4])+'>'+alt_base +':'+ xS[2] + '\t' + "\t".join(str(x) for x in snp_frq) + '\n')
@@ -311,20 +310,20 @@ def alleledist(d1,d2, threshold=.6):
 def computeAllDist(args):
-    print "Computing distances"
+    print("Computing distances")
     allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
     for f in allFreq:
-        data = pd.read_table(f, index_col=0, na_values=['-1']).T
-        dist = [[l1nonans(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
-        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
+        data = pd.read_table(f, index_col=0, na_values=['-1']).T
+        dist = [[l1nonans(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
+        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
-        dist.to_csv(args.projdir+'/distances/'+'%s.mann.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
+        dist.to_csv(args.projdir+'/distances/'+'%s.mann.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
-        dist = [[alleledist(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
-        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
+        dist = [[alleledist(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
+        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
-        dist.to_csv(args.projdir+'/distances/'+'%s.allele.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
+        dist.to_csv(args.projdir+'/distances/'+'%s.allele.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
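l1nonans and alleledist are defined earlier in the script (only alleledist's signature is visible in the hunk header above). Purely as an illustration of what a NaN-aware L1 distance of this kind might look like; this is an assumption, not the verified implementation:

```python
import numpy as np

def l1_nonans(a, b):
    # Mean absolute difference over positions where both samples have
    # data; NaNs (the -1 sentinels after na_values=['-1']) are ignored.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    valid = ~(np.isnan(a) | np.isnan(b))
    if not valid.any():
        return np.nan
    return float(np.abs(a[valid] - b[valid]).mean())
```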
@@ -366,11 +365,9 @@ def genetic_distance(sample1, sample2):
 def computeAllDiv(args):
-    print "Computing diversities"
-    cov_perc = pd.read_table(args.percentage_file, skiprows=[1], index_col=False)
-    cov_perc = cov_perc.set_index(cov_perc.loc[:, 'Unnamed: 0'])
-    cov_perc = cov_perc.drop('Unnamed: 0', 1)
+    print("Computing diversities")
+    # Load external info : Coverage, genomes size, genes size
+    horizontal_coverage = pd.read_table(args.percentage_file, skiprows=[1], index_col=0)
+    vertical_coverage = pd.read_table(args.coverage_file, skiprows=[1], index_col=0)
     bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
     bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
@@ -383,11 +380,18 @@ def computeAllDiv(args):
         data = pd.read_table(f, index_col=0, na_values=['-1'])
         pre_index = [i.split(':') for i in list(data.index)]
-        pos_index = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
-        data = data.set_index(pd.Index(pos_index))
+        data = data.set_index(pd.Index([item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]))
         data = data.sort_index()

         ########
         # Correcting the coverage by the genome length observed in each pairwise comparison
-        correction_cov = [[(min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
-        dist = [[genetic_distance(data.iloc[:, i], data.iloc[:, j]) / correction_cov[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
+        correction_coverage = [[(min(horizontal_coverage.loc[species, i], horizontal_coverage.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
+
+        ########
+        # Vertical coverage in pi within : AvgCov / (AvgCov - 1)
+        for j,i in enumerate(data.columns):
+            correction_within = vertical_coverage.loc[species,i] / (vertical_coverage.loc[species,i] - 1)
+            correction_coverage[j][j] = correction_coverage[j][j] / correction_within
+
+        dist = [[genetic_distance(data.iloc[:, i], data.iloc[:, j]) / correction_coverage[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
         FST = [[(1-(dist[i][i]+dist[j][j])/(2*dist[j][i])) for i in range(j + 1)] for j in range(len(dist))]
         dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
...
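Two corrections shape the distance matrix here: off the diagonal, each pairwise distance is normalised by the genome length both samples actually cover; on the diagonal (pi within), the new code additionally divides by AvgCov / (AvgCov - 1). FST then follows as 1 - (d_ii + d_jj) / (2 * d_ij). A numeric illustration with hypothetical values:

```python
# Hypothetical values, purely to illustrate the two corrections above.
genome_length = 3000000                # bp
breadth_i, breadth_j = 80.0, 60.0      # horizontal coverage (%) of i and j
avg_cov_i = 5.0                        # average vertical coverage of i

# Off-diagonal (i vs j): the comparable genome length is set by the
# less covered sample: min(80, 60) * 3e6 / 100 = 1.8e6 bp.
correction_pair = min(breadth_i, breadth_j) * genome_length / 100

# Diagonal (pi within): divide by AvgCov / (AvgCov - 1) = 5/4 = 1.25,
# damping the self-comparison bias at low coverage.
correction_within = avg_cov_i / (avg_cov_i - 1)
correction_diag = (breadth_i * genome_length / 100) / correction_within
```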
@@ -28,7 +28,7 @@ def write_matrix(cov, header, ofile):
     out.write('TaxId\t')
     out.write('\t'.join([header for _ in bamfiles]))
     out.write('\n')
-    for taxid in sorted(avg_cov.keys(), key=int):
+    for taxid in sorted(avg_cov.keys()):
         c = cov[taxid]
         out.write('{}\t'.format(taxid))
         out.write('\t'.join([c[bf] for bf in bamfiles]))
...
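The key=int is dropped from the TaxId sort, presumably because identifiers need not be purely numeric; int() would raise on composite IDs. For illustration (hypothetical identifiers):

```python
ids = ['557722.PRJNA58943', '1280.SAMN02604235']   # hypothetical composite IDs

# sorted(ids, key=int) raises ValueError on such names; a plain
# lexicographic sort handles numeric and composite IDs alike, at the
# cost of ordering '10' before '9' when IDs are purely numeric.
print(sorted(ids))
```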
@@ -526,13 +526,37 @@ int main(int argc, char *argv[])
           ++duplicates;
         } else {
           if (!userOpt.spanCov) {
-            //All entries in SAM file are represented on the forward strand! (See specs of SAM format for details)
-            ++entireChr[core->pos];
-            ++usedReads;
-            if ((uint32_t)(core->pos+core->l_qseq) >= chrSize)
-              --entireChr[chrSize-1];
-            else
-              --entireChr[core->pos+core->l_qseq];
+            //All entries in SAM file are represented on the forward strand! (See specs of SAM format for details)
+            uint32_t* cigar = bam_get_cigar(b);
+            uint32_t pp = core->pos+1;
+            int i = 0;
+            if (((*cigar) & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP || ((*cigar) & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {//BWA is actively fucking with me.
+              //The start of the alignment is reported without this clip.
+              ++cigar; ++i;
+            }
+            while (i<core->n_cigar) {
+              ++i;
+              if (((*cigar) & BAM_CIGAR_MASK) != BAM_CMATCH) {
+                pp = pp + ((*cigar) >> BAM_CIGAR_SHIFT);
+              } else {
+                ++entireChr[pp];
+                pp = pp + ((*cigar) >> BAM_CIGAR_SHIFT);
+                if (pp >= chrSize) {//These are mismatches that actually hang over the end. Silly mapper!
+                  //fprintf(stderr,"you are fucked %d %d\n",pp,chrSize);
+                  --entireChr[chrSize-1];
+                } else {
+                  --entireChr[pp];
+                }
+              }
+              ++cigar;
+            }
+            //++entireChr[core->pos];
+            ++usedReads;
+            //if ((uint32_t)(core->pos+core->l_qseq) >= chrSize)
+            //  --entireChr[chrSize-1];
+            //else
+            //  --entireChr[core->pos+core->l_qseq];
           } else {
             //Computing span coverage.
             //Only consider first read in pair! and extend a bit to the end of the insert
...
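The rewritten block walks the CIGAR string instead of assuming one contiguous stretch of core->l_qseq reference bases, so clips and insertions no longer inflate coverage, and blocks overhanging the chromosome end are clamped. A simplified Python sketch of the idea (illustration only, not a port; real CIGARs also use '=', 'X' and other operations):

```python
def add_read_coverage(diff, pos, cigar, chr_size):
    # diff is a per-chromosome difference array of length chr_size;
    # prefix-summing it afterwards yields per-base coverage.
    # cigar is a list of (op, length) tuples.
    p = pos
    for op, length in cigar:
        if op in ('S', 'H', 'I', 'P'):
            continue                      # no reference bases consumed
        if op == 'M':                     # covers reference [p, p+length)
            if p >= chr_size:
                break                     # fully past the end; ignore
            diff[p] += 1
            end = p + length
            if end >= chr_size:
                diff[chr_size - 1] -= 1   # clamp overhanging alignments
            else:
                diff[end] -= 1
            p = end
        else:                             # 'D'/'N': advance, uncovered
            p += length

diff = [0] * 10
add_read_coverage(diff, 2, [('S', 3), ('M', 4)], 10)   # soft clip + 4M
# prefix sum of diff -> coverage 1 over positions 2..5, 0 elsewhere
```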
@@ -130,7 +130,6 @@ bool indexGenomeAndGenes(FILE* refGenome, FILE* refGenes) {
   filePosStart = strlen(line);//This includes the \n
   filePosition p1;
   while (fgets(line,10000,refGenes)) {
-    fileConsumed += strlen(line);
     int pos = 0;
     const char* rest = toksplit(line,'\t',tok,10000);
     while (*rest) {
@@ -152,6 +151,7 @@ bool indexGenomeAndGenes(FILE* refGenome, FILE* refGenes) {
       ++pos;
       rest = toksplit(rest,'\t',tok,10000);
     }
+    fileConsumed += strlen(line);
     lineCount += 1;
   }
   //Add the last one!
...
@@ -248,7 +248,7 @@ bool loadGenome(std::string gName, FILE* refGenes, bool* hasGenes) {
       }
       if (pos == 2) {//This is the name!
         if (gName.compare(tok) != 0) {
-          fprintf(stderr,"Reading wrong gene defintions");
+          fprintf(stderr,"Reading wrong gene defintion for %s\n. Scafold supposed to be %s, but is %s\n.",geneName.c_str(),tok,gName.c_str());
           break;
         }
       } else if (pos == 6) {//Start
...