Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (19)
@@ -9,7 +9,7 @@ Download
 Via Git:
-git clone git@git.embl.de:costea/metaSNV.git
+git clone https://git.embl.de/costea/metaSNV.git
 or [download](https://git.embl.de/costea/metaSNV/repository/archive.zip?ref=master) a zip file of the repository.
@@ -86,9 +86,10 @@ Note: requires SNP calling (Part II) to be done!
 Example Tutorial
 ================
-## 1. Run the setup & compilation steps and download the provided reference database.
-./getRefDB.sh
+## 1. Run the setup & compilation steps and download the provided reference database.
+$ ./getRefDB.sh
+Select freeze9, as the tutorial files have been mapped against this freeze.
 ## 2. Go to the EXAMPLE directory and download the samples with the getSamplesScript.sh
...
@@ -51,10 +51,7 @@ def run_sample(sample, command):
 def compute_opt(args):
     out_dir = path.join(args.project_dir, 'cov')
-    try:
-        os.makedirs(out_dir)
-    except:
-        pass
+    mkdir_p(out_dir)
     p = multiprocessing.Pool(args.threads, init_worker)
     results = []
     for line in open(args.all_samples):
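The four-line try/except around os.makedirs is replaced by a single mkdir_p(out_dir) call. The helper itself is not shown in this hunk; a minimal sketch of what such a helper typically looks like (assuming it only swallows the directory-already-exists case, unlike the bare except it replaces):

```python
import errno
import os

def mkdir_p(dirname):
    # Behave like `mkdir -p`: an already-existing directory is fine,
    # but real failures (e.g. permissions) still raise.
    try:
        os.makedirs(dirname)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
```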
@@ -79,51 +76,53 @@ def compute_opt(args):
 def get_header(args):
     use = open(args.all_samples).readline().rstrip()
-    o = subprocess.check_output(["samtools","view","-H",use])
+    o = subprocess.check_output(["samtools","view","-H",use]).decode("utf-8")
     f = open(args.project_dir + '/bed_header','w')
     for line in o.split('\n')[1:]:
-        line = line.rstrip().split('\t')
-        if len(line) != 3:
-            continue
-        line[1] = line[1].replace('SN:','')
-        line[2] = line[2].replace('LN:','')
-        f.write(line[1]+'\t1\t'+line[2]+'\n')
-    f.close()
+        line = line.rstrip().split('\t')
+        if len(line) != 3:
+            continue
+        if line[0] == "@SQ":
+            line[1] = line[1].replace('SN:','')
+            line[2] = line[2].replace('LN:','')
+            f.write(line[1]+'\t1\t'+line[2]+'\n')
+    f.close()
     args.ctg_len = args.project_dir + '/bed_header'

 def compute_summary(args):
     '''This information is required by metaSNV_post.py'''
+    project_name = path.basename(args.project_dir)
     cov_dir = path.join(args.project_dir, 'cov')
     cov_files = glob(cov_dir + '/*.cov')
-    # project_name = path.basename(args.project_dir)
     if not cov_files:
-        if not args.print_commands:
-            stderr.write("Coverage files not found.\n")
-        else:
-            stderr.write("Coverage files not found.\nFinish running the commands printed above and then run this command again.\n")
+        if not args.print_commands:
+            stderr.write("Coverage files not found.\n")
+        else:
+            stderr.write("Coverage files not found.\nFinish running the commands printed above and then run this command again.\n")
         exit(1)
     for f in cov_files:
         cmd = ['python',
-               path.join(basedir, 'src/computeGenomeCoverage.py'),
+               path.join(basedir, 'src/computeGenomeCoverage.py'),
               f,
              f + '.detail',
              f + '.summary']
         subprocess.call(cmd)
     print("\nCoverage summary here: {}".format(args.project_dir))
-    print(" Average vertical genome coverage: '{}/{}.all_cov.tab'".format(args.project_dir, args.project_dir))
-    print(" Horizontal genome coverage (1X): '{}/{}.all_perc.tab'".format(args.project_dir, args.project_dir))
+    print(" Average vertical genome coverage: '{}/{}.all_cov.tab'".format(args.project_dir, project_name))
+    print(" Horizontal genome coverage (1X): '{}/{}.all_perc.tab'".format(args.project_dir, project_name))
     print("")
     cmd = ['python',
            '{}/src/collapse_coverages.py'.format(basedir),
            args.project_dir]
-    subprocess.call(cmd)
+    subprocess.call(cmd)

 def split_opt(args):
     if args.n_splits > 100:
         stderr.write("Maximum number of splits is 100.\n")
-        exit(1)
+        args.n_splits = 100
     older_files = glob(args.project_dir + '/bestsplits/*')
     if older_files:
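get_header gains two fixes: subprocess.check_output returns bytes under Python 3, hence the .decode("utf-8"), and only @SQ header records carry SN:/LN: fields, so other header lines that happen to have three columns must be skipped. A self-contained sketch of the same parsing (contig_lengths is a hypothetical helper, not part of metaSNV; like the script, it assumes the conventional SN-before-LN tag order):

```python
import subprocess

def contig_lengths(bamfile):
    # `samtools view -H` prints the SAM header; check_output returns
    # bytes under Python 3, so decode before splitting into lines.
    header = subprocess.check_output(
        ["samtools", "view", "-H", bamfile]).decode("utf-8")
    lengths = {}
    for line in header.splitlines():
        fields = line.rstrip().split('\t')
        # Only @SQ records describe reference sequences; @HD/@RG/@PG
        # lines may also have three columns but no SN:/LN: fields.
        if len(fields) < 3 or fields[0] != '@SQ':
            continue
        name = fields[1].replace('SN:', '')
        length = int(fields[2].replace('LN:', ''))
        lengths[name] = length
    return lengths
```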
@@ -131,12 +130,14 @@ def split_opt(args):
         for f in older_files:
             os.unlink(f)
+    project_name = path.basename(args.project_dir)
     print("\nCalculating best database split:")
     # usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
     cmd = ['python',
            '{}/src/createOptimumSplit.py'.format(basedir),
-           "{}/{}.all_cov.tab".format(args.project_dir, args.project_dir),
-           "{}/{}.all_perc.tab".format(args.project_dir, args.project_dir),
+           "{}/{}.all_cov.tab".format(args.project_dir, project_name),
+           "{}/{}.all_perc.tab".format(args.project_dir, project_name),
            args.ctg_len,
            str(args.n_splits),
            path.join(args.project_dir, "bestsplits", "best_split")]
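Both this hunk and compute_summary above replace args.project_dir with project_name = path.basename(args.project_dir) when building the .all_cov.tab / .all_perc.tab file names. That matters as soon as the project directory is given as a path; an illustration with a hypothetical value:

```python
from os import path

project_dir = 'runs/mouse_gut'              # hypothetical project directory
project_name = path.basename(project_dir)   # 'mouse_gut'

# Old: '{}/{}.all_cov.tab'.format(project_dir, project_dir)
#   -> 'runs/mouse_gut/runs/mouse_gut.all_cov.tab'  (broken path)
# New: '{}/{}.all_cov.tab'.format(project_dir, project_name)
#   -> 'runs/mouse_gut/mouse_gut.all_cov.tab'
print("{}/{}.all_cov.tab".format(project_dir, project_name))
```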
@@ -224,7 +225,7 @@ def main():
                         help='File with an input list of bam files, one file per line')
     parser.add_argument("ref_db", metavar='REF_DB_FILE',
                         help='reference multi-sequence FASTA file used for the alignments.')
-    parser.add_argument('--db_ann', metavar='BED_FILE',default='',
+    parser.add_argument('--db_ann', metavar='DB_ANN_FILE',default='',
                         help='Database gene annotation.')
     parser.add_argument('--print-commands', default=False, action='store_true',
                         help='Instead of executing the commands, simply print them out')
@@ -246,28 +247,27 @@ SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP cal
         exit(1)
     if not path.isfile(basedir+"/src/qaTools/qaCompute") or not path.isfile(basedir+"/src/snpCaller/snpCall"):
-        stderr.write('''
+        stderr.write('''
 ERROR: No binaries found
-SOLUTION: make\n\n'''.format(basedir))
-        exit(1)
+SOLUTION: make\n\n'''.format(basedir))
+        exit(1)
     if args.threads > 1 and args.n_splits==1:
-        args.n_splits=args.threads
+        args.n_splits=args.threads
     if path.exists(args.project_dir) and not args.print_commands:
-        stderr.write("Project directory '{}' already exists\n\n\n".format(args.project_dir))
+        stderr.write("Project directory '{}' already exists\n\n\n".format(args.project_dir))
         exit(1)
     create_directories(args.project_dir)
     compute_opt(args)
     compute_summary(args)
     get_header(args)
     if args.n_splits > 1:
-        split_opt(args)
+        split_opt(args)
     snp_call(args)

 if __name__ == '__main__':
     main()
...

 #!/usr/bin/env python
 import os
 import sys
 import time
 import argparse
 import glob
 try:
-    import numpy as np
+    import numpy as np
 except ImportError:
-    sys.stderr.write("Numpy is necessary to run postfiltering.\n")
-    sys.exit(1)
+    sys.stderr.write("Numpy is necessary to run postfiltering.\n")
+    sys.exit(1)
 try:
-    import pandas as pd
+    import pandas as pd
 except ImportError:
-    sys.stderr.write("Pandas is necessary to run postfiltering.\n")
-    sys.exit(1)
+    sys.stderr.write("Pandas is necessary to run postfiltering.\n")
+    sys.exit(1)

 basedir = os.path.dirname(os.path.abspath(__file__))
@@ -24,101 +23,101 @@ basedir = os.path.dirname(os.path.abspath(__file__))

 # Parse Commandline Arguments
 #############################
 def get_arguments():
-    '''
-    Get commandline arguments and return namespace
-    '''
-    ## Initialize Parser
-    parser = argparse.ArgumentParser(prog='metaSNV_post.py', description='metaSNV post processing', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    '''
+    Get commandline arguments and return namespace
+    '''
+    ## Initialize Parser
+    parser = argparse.ArgumentParser(prog='metaSNV_post.py', description='metaSNV post processing', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

     # Not Showns:
-    parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
-    parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)
+    parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
+    parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)

     # TODO: POPULATION STATISTICS
     ## Population Statistics (Fst - ):
     # parser.add_argument("-f", "--fst", action="store_true")
     ## pNpS ratio:
     # parser.add_argument("-n", "--pnps", action="store_true")

     # OPTIONAL arguments:
-    parser.add_argument('-b', metavar='FLOAT', type=float, help="Coverage breadth: Horizontal genome coverage percentage per sample per species", default=40.0)
-    parser.add_argument('-d', metavar='FLOAT', type=float, help="Coverage depth: Average vertical genome coverage per sample per species", default=5.0)
-    parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
-    parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
-    parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
-    parser.add_argument('-div',action='store_true',help="Compute diversity measures")
+    parser.add_argument('-b', metavar='FLOAT', type=float, default=40.0,
+                        help="Coverage breadth: minimal horizontal genome coverage percentage per sample per species")
+    parser.add_argument('-d', metavar='FLOAT', type=float, default=5.0,
+                        help="Coverage depth: minimal average vertical genome coverage per sample per species")
+    parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
+    parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
+    parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
+    parser.add_argument('-div',action='store_true',help="Compute diversity measures")

     # REQUIRED arguments:
-    parser.add_argument('projdir', help='project name', metavar='Proj')
-    return parser.parse_args()
+    parser.add_argument('projdir', help='project name', metavar='Proj')
+    return parser.parse_args()

 ##############################
 def debugging(taxids_of_interest, header_cov):
-    '''Sanity check for the Dict of TaxID:Samples'''
-    nr_keys = 0
-    taxids_of_interest = []
-    for key in samples_of_interest:
-        nr_keys += 1
-        taxids_of_interest.append(key)
-    taxids_of_interest.sort()
-    print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
-    print(taxids_of_interest)
+    '''Sanity check for the Dict of TaxID:Samples'''
+    nr_keys = 0
+    taxids_of_interest = []
+    for key in samples_of_interest:
+        nr_keys += 1
+        taxids_of_interest.append(key)
+    taxids_of_interest.sort()
+    print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
+    print(taxids_of_interest)
     # print("\n")
     # print("nr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
     header = "\t".join(header_cov)
-    print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
+    print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
     # header = "\t".join(header_cov)
     # print("header: {}".format(header))
-    sys.exit("only filter 1")
+    sys.exit("only filter 1")

 def file_check():
-    ''' Check if required files exist (True / False)'''
-    args.projdir = args.projdir.rstrip('/')
-    args.coverage_file = args.projdir+'/'+args.projdir+'.all_cov.tab'
-    args.percentage_file = args.projdir+'/'+args.projdir+'.all_perc.tab'
-    args.all_samples = args.projdir+'/'+'all_samples'
-    print("Checking for necessary input files...")
-    if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
-        print("found: '{}' \nfound:'{}'".format(args.coverage_file, args.percentage_file))
-    else:
-        sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))
-    if os.path.isfile(args.all_samples):
-        print("found: '{}'\n".format(args.all_samples))
-    else:
-        sys.exit("\nERROR: No such file '{}'".format(args.all_samples))
+    ''' Check if required files exist (True / False)'''
+    args.projdir = args.projdir.rstrip('/')
+    args.coverage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_cov.tab'
+    args.percentage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_perc.tab'
+    args.all_samples = args.projdir+'/'+'all_samples'
+    print("Checking for necessary input files...")
+    if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
+        print("found: '{}' \nfound:'{}'".format(args.coverage_file, args.percentage_file))
+    else:
+        sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))
+    if os.path.isfile(args.all_samples):
+        print("found: '{}'\n".format(args.all_samples))
+    else:
+        sys.exit("\nERROR: No such file '{}'".format(args.all_samples))

 def print_arguments():
-    ## Print Defined Thresholds:
-    print("Options:")
-    if args.b:
-        print("threshold: percentage covered (breadth) {}".format(args.b) )
-    if args.d:
-        print("threshold: average coverage (depth) {}".format(args.d) )
-    if args.m:
-        print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m) )
-    if args.c:
-        print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c) )
-    if args.p:
-        print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p) )
-    print("")
+    ## Print Defined Thresholds:
+    print("Options:")
+    if args.b:
+        print("threshold: percentage covered (breadth) {}".format(args.b) )
+    if args.d:
+        print("threshold: average coverage (depth) {}".format(args.d) )
+    if args.m:
+        print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m) )
+    if args.c:
+        print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c) )
+    if args.p:
+        print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p) )
+    print("")

 ###############################
 ## FILTER I: Count SAMPLES_OF_INTEREST per TAXON_OF_INTEREST
 # Example: "Which taxon has at least 10 SoI?"
 # --Coverage Conditions:
 #   Sample of Interest:
-#   1. breadth > 40 %
+#   1. breadth > 40 %
 #   2. depth   > 5 X
 # --Min. number Samples:
 #   3. min samples/TaxID > 10
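Filter I, per the comment block above, admits a sample for a taxon only when both coverage conditions hold; the defaults correspond to -d 5 and -b 40, and a taxon is kept when at least -m samples pass. A minimal sketch of the predicate (hypothetical helper; the script applies the same test inline, as the next hunks show):

```python
def is_sample_of_interest(depth, breadth, min_depth=5.0, min_breadth=40.0):
    # Both Filter I conditions: enough average vertical coverage
    # (depth, -d) and enough horizontal coverage (breadth in %, -b).
    return depth >= min_depth and breadth >= min_breadth

# Taxon-level check against the -m threshold, e.g.:
# sum(is_sample_of_interest(c, p) for c, p in zip(coverage, percentage)) >= args.m
```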
@@ -137,9 +136,9 @@ def relevant_taxa(args):
         sys.exit("ERROR: Coverage file headers do not match!") #Exit with error message
     # Read taxon by taxon (line by line) check coverage conditions (thresholds)
-    for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
+    for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
         cstring = cov.split()
-        pstring = perc.split()
+        pstring = perc.split()
         cov_taxID = cstring.pop(0)
         perc_taxID = pstring.pop(0)
         coverage = list(map(float, cstring)) # convert list_of_strings 2 list_of_floats (TaxID=INT!)
@@ -152,10 +151,10 @@ def relevant_taxa(args):
         if cov_taxID != perc_taxID: # check if taxIDs match!
             sys.exit("ERROR: TaxIDs in the coverage files are not in the same order!")
-        for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
+        for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
             sample_count += 1
-            if c >= args.d and p >= args.b:
+            if c >= args.d and p >= args.b:
                 sample_names.append(header_cov[sample_count-1]) # starting at 0 in array
@@ -176,7 +175,7 @@ def header_comparison(header_cov):
     '''Comparing the sample order in the coverage and all_samples file'''
     # sort snp_indices by sample_of_interest names
-    header = "\t".join(header_cov)
+    #header = "\t".join(header_cov)
     #print("\nNr.Samples_cov_header: {}".format(len(header_cov)))
     #print("cov_header: {}\n\n".format(header_cov[:10]))
     #print("\ncov_header: {}\n {}\n\n".format(len(header_cov),header))
@@ -185,21 +184,21 @@ def header_comparison(header_cov):
     all_samples = open(args.all_samples,'r')
     snp_header = all_samples.read().splitlines()
     snp_header = [i.split('/')[-1] for i in snp_header] #get name /trim/off/path/to/sample.name.bam
-    snp_header_joined = "\t".join(snp_header)
     #print("Nr.Samples in (all_samples): {}".format(len(snp_header)))
     #print("all_samples: {}\n".format(snp_header[:10]))

     # Compare the sample order in ALL.COV/ALL.PERC and all_samples
     #print("Comparing headers..")
+    #snp_header_joined = "\t".join(snp_header)
     #if header == snp_header_joined:
     #    print("Headers match nicely\n")
     #else:
     #    print("CAUTION: Header in COV_FILE does not match the order of samples in the SNP_FILES,\n\t no problem, we took care of it!\n")

-    return snp_header
+    return snp_header
@@ -208,21 +207,21 @@ def header_comparison(header_cov):
 def filter_two(args, snp_header, snp_files, outdir):
     '''position wise filtering'''
     header_taxID = ''
     snp_taxID = '_'
     # print(locals())

     for best_split_x in snp_files:
         with open(best_split_x, 'r') as file:
             for snp_line in file: #position wise loop
                 snp_taxID = snp_line.split()[0].split('.')[0]#Name of Genome change from . to ]

                 ## SPECIES FILTER:
                 if snp_taxID not in samples_of_interest.keys():#Check if Genome is of interest
                     continue #Taxon is not relevant, NEXT!
                 else:
                     ## SAMPLE FILTER: only load samples with enough coverage
                     sample_list = samples_of_interest[snp_taxID]#Load Sample List
@@ -246,9 +245,9 @@ def filter_two(args, snp_header, snp_files, outdir):
                         nr_good += 1
                 #FILTER: Position incidence with sufficient coverage:
                 if float(nr_good)/len(sample_indices) < args.p:#if snp_incidence < x % drop SNP
                     continue#mainly uninformative position, SNP incidence < x %, SNP droped!
                 else:
                     # CALCULATE SNP ALLELE FREQUENCY:
                     # If at least one position passed cutoffs open file (if new TaxID):
@@ -259,42 +258,42 @@ def filter_two(args, snp_header, snp_files, outdir):
                         outfile = open(outdir+'/'+'%s.filtered.freq' % snp_taxID, 'w')
                         print("Generating: {}".format(outdir+'/'+'%s.filtered.freq' % snp_taxID))
                         outfile.write('\t' + "\t".join(sample_list) + '\n')
                         header_taxID = snp_taxID#Jump to current TaxID

                     #Loop through alternative alleles [5](comma separated):
                     line_id = ":".join(whole_line[:4])#line_id composed of CHROM:REFGENE:POS:REFBASE
                     reference_base = whole_line[3]
                     alt_bases_totalcov = []#total coverage per alt allele
                     #VCF format

                     #LOOP Start:
                     for snp in whole_line[5].split(','):
                         xS = snp.split('|')
                         snp_coverage = list(map(float, xS[3:]))#coverage string (unfiltered!)

-                        #Sanity check:
+                        #Sanity check:
                         if len(site_coverage) != len(snp_coverage):
                             print("ERROR: SNP FILE {} is corrupted".format(best_split_x))
                             sys.exit("ERROR: Site coverage and SNP coverage string have uneven length!")

                         #Compute allele frequency tables:
-                        alt_base = snp.split('|')[1]#alternative base
+                        alt_base = snp.split('|')[1]#alternative base

                         #FREQUENCY Computation
                         total_reads = 0
                         snp_frq = []#frequencies for SNPs (pos >5X in at least 50% of the SoIs)
                         for index in sample_indices:
-                            #prevent division by zero! TODO:Notify usr about Nonsense Value (f.i. -c = 0!
+                            #prevent division by zero! TODO:Notify usr about Nonsense Value (f.i. -c = 0!
                             if site_coverage[index] >= args.c and site_coverage[index] != 0:
                                 snp_frq.append(snp_coverage[index]/site_coverage[index])
                             else:
-                                snp_frq.append(-1)
+                                snp_frq.append(-1)

                 # WRITE OUTPUT Allele Frequencies (Default)
                 outfile.write(":".join(snp_line.split()[:4])+'>'+alt_base +':'+ xS[2] + '\t' + "\t".join(str(x) for x in snp_frq) + '\n')
@@ -311,20 +310,20 @@ def alleledist(d1,d2, threshold=.6):
 def computeAllDist(args):
-    print "Computing distances"
+    print("Computing distances")
     allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
     for f in allFreq:
-        data = pd.read_table(f, index_col=0, na_values=['-1']).T
-        dist = [[l1nonans(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
-        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
+        data = pd.read_table(f, index_col=0, na_values=['-1']).T
+        dist = [[l1nonans(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
+        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
-        dist.to_csv(args.projdir+'/distances/'+'%s.mann.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
+        dist.to_csv(args.projdir+'/distances/'+'%s.mann.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
-        dist = [[alleledist(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
-        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
+        dist = [[alleledist(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
+        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
-        dist.to_csv(args.projdir+'/distances/'+'%s.allele.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
+        dist.to_csv(args.projdir+'/distances/'+'%s.allele.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
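l1nonans and alleledist are defined earlier in the script (only alleledist's signature is visible in the hunk header above). Purely as an illustration of what a NaN-aware L1 distance of this kind might look like; this is an assumption, not the verified implementation:

```python
import numpy as np

def l1_nonans(a, b):
    # Mean absolute difference over positions where both samples have
    # data; NaNs (the -1 sentinels after na_values=['-1']) are ignored.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    valid = ~(np.isnan(a) | np.isnan(b))
    if not valid.any():
        return np.nan
    return float(np.abs(a[valid] - b[valid]).mean())
```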
@@ -366,11 +365,9 @@ def genetic_distance(sample1, sample2):
 def computeAllDiv(args):
-    print "Computing diversities"
-    cov_perc = pd.read_table(args.percentage_file, skiprows=[1], index_col=False)
-    cov_perc = cov_perc.set_index(cov_perc.loc[:, 'Unnamed: 0'])
-    cov_perc = cov_perc.drop('Unnamed: 0', 1)
+    print("Computing diversities")
+    # Load external info : Coverage, genomes size, genes size
+    horizontal_coverage = pd.read_table(args.percentage_file, skiprows=[1], index_col=0)
+    vertical_coverage = pd.read_table(args.coverage_file, skiprows=[1], index_col=0)
     bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
     bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
@@ -383,11 +380,18 @@ def computeAllDiv(args):
         data = pd.read_table(f, index_col=0, na_values=['-1'])
         pre_index = [i.split(':') for i in list(data.index)]
-        pos_index = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
-        data = data.set_index(pd.Index(pos_index))
+        data = data.set_index(pd.Index([item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]))
         data = data.sort_index()

         ########
         # Correcting the coverage by the genome length observed in each pairwise comparison
-        correction_cov = [[(min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
-        dist = [[genetic_distance(data.iloc[:, i], data.iloc[:, j]) / correction_cov[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
+        correction_coverage = [[(min(horizontal_coverage.loc[species, i], horizontal_coverage.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
+
+        ########
+        # Vertical coverage in pi within : AvgCov / (AvgCov - 1)
+        for j,i in enumerate(data.columns):
+            correction_within = vertical_coverage.loc[species,i] / (vertical_coverage.loc[species,i] - 1)
+            correction_coverage[j][j] = correction_coverage[j][j] / correction_within
+
+        dist = [[genetic_distance(data.iloc[:, i], data.iloc[:, j]) / correction_coverage[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
         FST = [[(1-(dist[i][i]+dist[j][j])/(2*dist[j][i])) for i in range(j + 1)] for j in range(len(dist))]
         dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
...
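Two corrections shape the distance matrix here: off the diagonal, each pairwise distance is normalised by the genome length both samples actually cover; on the diagonal (pi within), the new code additionally divides by AvgCov / (AvgCov - 1). FST then follows as 1 - (d_ii + d_jj) / (2 * d_ij). A numeric illustration with hypothetical values:

```python
# Hypothetical values, purely to illustrate the two corrections above.
genome_length = 3000000                # bp
breadth_i, breadth_j = 80.0, 60.0      # horizontal coverage (%) of i and j
avg_cov_i = 5.0                        # average vertical coverage of i

# Off-diagonal (i vs j): the comparable genome length is set by the
# less covered sample: min(80, 60) * 3e6 / 100 = 1.8e6 bp.
correction_pair = min(breadth_i, breadth_j) * genome_length / 100

# Diagonal (pi within): divide by AvgCov / (AvgCov - 1) = 5/4 = 1.25,
# damping the self-comparison bias at low coverage.
correction_within = avg_cov_i / (avg_cov_i - 1)
correction_diag = (breadth_i * genome_length / 100) / correction_within
```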
@@ -28,7 +28,7 @@ def write_matrix(cov, header, ofile):
     out.write('TaxId\t')
     out.write('\t'.join([header for _ in bamfiles]))
     out.write('\n')
-    for taxid in sorted(avg_cov.keys(), key=int):
+    for taxid in sorted(avg_cov.keys()):
         c = cov[taxid]
         out.write('{}\t'.format(taxid))
         out.write('\t'.join([c[bf] for bf in bamfiles]))
...
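The key=int is dropped from the TaxId sort, presumably because identifiers need not be purely numeric; int() would raise on composite IDs. For illustration (hypothetical identifiers):

```python
ids = ['557722.PRJNA58943', '1280.SAMN02604235']   # hypothetical composite IDs

# sorted(ids, key=int) raises ValueError on such names; a plain
# lexicographic sort handles numeric and composite IDs alike, at the
# cost of ordering '10' before '9' when IDs are purely numeric.
print(sorted(ids))
```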
@@ -526,13 +526,37 @@ int main(int argc, char *argv[])
           ++duplicates;
         } else {
           if (!userOpt.spanCov) {
-            //All entries in SAM file are represented on the forward strand! (See specs of SAM format for details)
-            ++entireChr[core->pos];
-            ++usedReads;
-            if ((uint32_t)(core->pos+core->l_qseq) >= chrSize)
-              --entireChr[chrSize-1];
-            else
-              --entireChr[core->pos+core->l_qseq];
+            //All entries in SAM file are represented on the forward strand! (See specs of SAM format for details)
+            uint32_t* cigar = bam_get_cigar(b);
+            uint32_t pp = core->pos+1;
+            int i = 0;
+            if (((*cigar) & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP || ((*cigar) & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {//BWA is actively fucking with me.
+              //The start of the alignment is reported without this clip.
+              ++cigar; ++i;
+            }
+            while (i<core->n_cigar) {
+              ++i;
+              if (((*cigar) & BAM_CIGAR_MASK) != BAM_CMATCH) {
+                pp = pp + ((*cigar) >> BAM_CIGAR_SHIFT);
+              } else {
+                ++entireChr[pp];
+                pp = pp + ((*cigar) >> BAM_CIGAR_SHIFT);
+                if (pp >= chrSize) {//These are mismatches that actually hang over the end. Silly mapper!
+                  //fprintf(stderr,"you are fucked %d %d\n",pp,chrSize);
+                  --entireChr[chrSize-1];
+                } else {
+                  --entireChr[pp];
+                }
+              }
+              ++cigar;
+            }
+            //++entireChr[core->pos];
+            ++usedReads;
+            //if ((uint32_t)(core->pos+core->l_qseq) >= chrSize)
+            //  --entireChr[chrSize-1];
+            //else
+            //  --entireChr[core->pos+core->l_qseq];
           } else {
             //Computing span coverage.
             //Only consider first read in pair! and extend a bit to the end of the insert
...
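The rewritten block walks the CIGAR string instead of assuming one contiguous stretch of core->l_qseq reference bases, so clips and insertions no longer inflate coverage, and blocks overhanging the chromosome end are clamped. A simplified Python sketch of the idea (illustration only, not a port; real CIGARs also use '=', 'X' and other operations):

```python
def add_read_coverage(diff, pos, cigar, chr_size):
    # diff is a per-chromosome difference array of length chr_size;
    # prefix-summing it afterwards yields per-base coverage.
    # cigar is a list of (op, length) tuples.
    p = pos
    for op, length in cigar:
        if op in ('S', 'H', 'I', 'P'):
            continue                      # no reference bases consumed
        if op == 'M':                     # covers reference [p, p+length)
            if p >= chr_size:
                break                     # fully past the end; ignore
            diff[p] += 1
            end = p + length
            if end >= chr_size:
                diff[chr_size - 1] -= 1   # clamp overhanging alignments
            else:
                diff[end] -= 1
            p = end
        else:                             # 'D'/'N': advance, uncovered
            p += length

diff = [0] * 10
add_read_coverage(diff, 2, [('S', 3), ('M', 4)], 10)   # soft clip + 4M
# prefix sum of diff -> coverage 1 over positions 2..5, 0 elsewhere
```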
@@ -130,7 +130,6 @@ bool indexGenomeAndGenes(FILE* refGenome, FILE* refGenes) {
   filePosStart = strlen(line);//This includes the \n
   filePosition p1;
   while (fgets(line,10000,refGenes)) {
-    fileConsumed += strlen(line);
     int pos = 0;
     const char* rest = toksplit(line,'\t',tok,10000);
     while (*rest) {
@@ -152,6 +151,7 @@ bool indexGenomeAndGenes(FILE* refGenome, FILE* refGenes) {
       ++pos;
       rest = toksplit(rest,'\t',tok,10000);
     }
+    fileConsumed += strlen(line);
     lineCount += 1;
   }
   //Add the last one!
...
@@ -248,7 +248,7 @@ bool loadGenome(std::string gName, FILE* refGenes, bool* hasGenes) {
       }
       if (pos == 2) {//This is the name!
         if (gName.compare(tok) != 0) {
-          fprintf(stderr,"Reading wrong gene defintions");
+          fprintf(stderr,"Reading wrong gene defintion for %s\n. Scafold supposed to be %s, but is %s\n.",geneName.c_str(),tok,gName.c_str());
           break;
         }
       } else if (pos == 6) {//Start
...