Commits on Source (37)
*.o
......@@ -9,9 +9,9 @@ Download
Via Git:
git clone git@git.embl.de:rmuench/metaSNP.git
git clone https://git.embl.de/costea/metaSNV.git
or [download](https://git.embl.de/rmuench/metaSNP/repository/archive.zip?ref=master) a zip file of the repository.
or [download](https://git.embl.de/costea/metaSNV/repository/archive.zip?ref=master) a zip file of the repository.
Dependencies
============
......@@ -21,6 +21,8 @@ Dependencies
* [htslib](http://www.htslib.org/)
* Python-2.7 or above
* numpy
* pandas
#### Installing dependencies on Ubuntu/debian
......@@ -31,7 +33,6 @@ universe repository before):
sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe"
sudo apt-get update
sudo apt-get install libhts-dev libboost-dev
### Installing dependencies using anaconda
......@@ -39,19 +40,17 @@ universe repository before):
If you use [anaconda](https://www.continuum.io/downloads), you can create an
environment with all necessary dependencies using the following commands:
conda create --name metaSNV boost htslib pkg-config
conda create --name metaSNV boost htslib pkg-config numpy pandas
source activate metaSNV
export CFLAGS=-I$CONDA_ENV_PATH/include
export LD_LIBRARY_PATH=$CONDA_ENV_PATH/lib:$LD_LIBRARY_PATH
If you do not have a C++ compiler, anaconda can also install G++:
conda create --name metaSNV boost htslib pkg-config
conda create --name metaSNV boost htslib pkg-config numpy pandas
source activate metaSNV
# Add this command:
conda install gcc
export CFLAGS=-I$CONDA_ENV_PATH/include
export LD_LIBRARY_PATH=$CONDA_ENV_PATH/lib:$LD_LIBRARY_PATH
......@@ -64,11 +63,12 @@ Workflow:
=========
## Required Files:
* **'all\_samples'** = a list of all BAM files, one /path/2/sample.bam per line (no duplicates)
* **'ref\_db'** = the reference database in fasta format (f.i. multi-sequence fasta)
* **'all\_samples'** = a list of all BAM files, one /path/2/sample.bam per line (no duplicates)
* **'ref\_db'** = the reference database in fasta format (f.i. multi-sequence fasta)
* **'gen\_pos'** = a list with start and end positions for each sequence in the reference (format: `sequence_id start end`)
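For illustration, a minimal 'gen\_pos' file contains one line per reference sequence; the sequence names below are made up, the fields follow the `sequence_id start end` format above:

    refGenome1.contig1 1 250000
    refGenome1.contig2 1 180000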
## Optional Files:
* **'db\_ann'** = a gene annotation file for the reference database.
* **'db\_ann'** = a gene annotation file for the reference database (format: ).
## To use one of the provided reference databases:
......@@ -76,184 +76,59 @@ Workflow:
## 2. Run metaSNV
metaSNV.py project_dir/ all_samples ref_db ref_fasta [options]
metaSNV.py project_dir/ all_samples ref_db [options]
## 3. Part II: Post-Processing (Filtering & Analysis)
### a) Filtering:
Note: requires the SNP calling step above to be completed.
Caution: Perform this step separately for individual SNPs and population SNPs.
usage: metaSNP_filtering.py
positional arguments:
perc_FILE input file with horizontal genome (taxon) coverage (breadth) per sample (percentage covered)
cov_FILE input file with average genome (taxon) coverage
(depth) per sample (average number reads per site)
snp_FILE input files from SNP calling
all_samples list of input BAM files, one per line
output_dir/ output folder
optional arguments:
-h, --help show this help message and exit
-p PERC, --perc PERC Coverage breadth: Horizontal coverage cutoff
(percentage taxon covered) per sample (default: 40.0)
-c COV, --cov COV Coverage depth: Average vertical coverage cutoff per
taxon, per sample (default: 5.0)
-m MINSAMPLES, --minsamples MINSAMPLES
Minimum number of samples required to pass the coverage
cutoffs per genome (default: 2)
-s SNPC, --snpc SNPC FILTERING STEP II: SNP coverage cutoff (default: 5.0)
-i SNPI, --snpi SNPI  FILTERING STEP II: SNP occurrence (incidence) cutoff
within samples_of_interest (default: 0.5)
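To make the two filtering stages above more concrete, here is a minimal, illustrative sketch of the decision logic. It is not the shipped implementation and the helper name is hypothetical; it only mirrors the cutoffs described by the options above:

    # Illustrative sketch of the two filtering stages (hypothetical helper, not the shipped code).
    def passes_filters(breadth, depth, site_coverage,
                       perc=40.0, cov=5.0, minsamples=2, snpc=5.0, snpi=0.5):
        """breadth/depth: per-sample genome coverage values for one taxon;
        site_coverage: per-sample coverage at one SNP position."""
        # FILTER I: samples of interest = samples with enough horizontal and vertical coverage
        soi = [i for i in range(len(breadth)) if breadth[i] >= perc and depth[i] >= cov]
        if len(soi) < minsamples:
            return False  # the whole genome (taxon) is skipped
        # FILTER II: keep the position only if enough samples of interest cover it sufficiently
        well_covered = sum(1 for i in soi if site_coverage[i] >= snpc)
        return well_covered / float(len(soi)) >= snpi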
### b) Analysis:
> TODO: include R scripts for computing pairwise distances and visualization
metaSNV_post.py project_dir [options]
Example Tutorial
================
## 1. Run the setup & compilation steps and download the provided reference database.
## 1. Run the setup & compilation steps and download the provided reference database.
./getRefDB.sh
$ ./getRefDB.sh
Select freeze9, as the tutorial files have been mapped against this freeze.
## 2. Go to the EXAMPLE directory and download the samples with the getSamplesScript.sh
$ cd EXAMPLE
$ ./getSamplesScript.sh
## 3. Initiate a new project in the parent directory
$ metaSNV_New tutorial
## 4. Generate the 'all_samples' file
$ find `pwd`/EXAMPLE/samples -name "*.bam" > tutorial/all_samples
## 5. Prepare and run the coverage estimation
## 3. Make sample list
$ metaSNV_COV tutorial/ tutorial/all_samples > runCoverage
$ bash runCoverage
$ find `pwd`/EXAMPLE/samples -name "*.bam" > sample_list
## 6. Perform a work load balancing step for run time optimization.
## 4. Run the SNV calling step
$ metaSNV_OPT tutorial/ db/Genomev9_definitions 5
$ bash runCoverage
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --threads 8
## 7. Prepare and run the SNV calling step
## 5. Run filtering and post processing
$ metaSNV_SNP tutorial/ tutorial/all_samples db/RepGenomesv9.fna -a db/RefOrganismDB_v9_gene.clean -l tutorial/bestsplits/ > runSNPcall
$ bash runSNPcall
## 8. Run the post processing / filtering steps
### a) Compute allele frequencies for each position that passes the given thresholds.
$ metaSNV_filtering.py tutorial/tutorial.all_perc.tab tutorial/tutorial.all_cov.tab tutorial/snpCaller/called_SNVs.best_split_* tutorial/all_samples tutorial/filtered/pop/
### b) Compute pairwise distances between samples based on their SNP profiles and create a PCoA plot.
$ python metaSNV_post.py tutorial
Voila! Your distances will be in the tutorial/distances folder. Enjoy!
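For intuition, the pairwise dissimilarities written to the distances folder are Manhattan-style (L1) distances over allele frequencies, computed only at positions observed in both samples. The sketch below is illustrative; the exact normalization used by metaSNV_post.py may differ:

    import numpy as np

    # Sketch: L1-style distance over allele-frequency profiles, ignoring missing values.
    def l1_distance_nonan(freqs_a, freqs_b):
        a = np.asarray(freqs_a, dtype=float)
        b = np.asarray(freqs_b, dtype=float)
        valid = ~(np.isnan(a) | np.isnan(b))       # positions observed in both samples
        if not valid.any():
            return np.nan
        return np.abs(a[valid] - b[valid]).mean()  # mean absolute frequency difference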
Advanced usage (tools and scripts)
Advanced usage
==================================
If you are interested in using the pipeline in a more manual way (for example
the metaSNV caller standalone), you will find the executables for the
individual steps in the `src/` directory.
metaSNV caller
--------------
Calls SNVs from samtools pileup format and generates two outputs.
usage: ./snpCall [options] < stdin.mpileup > std.out.popSNPs
Options:
-f, faidx indexed reference genome.
-g, gene annotation file.
-i, individual SNVs.
Note: Expecting samtools mpileup as standard input
### __Output__
1. Population SNVs (pSNVs):
Population-wide variants that occur with a frequency of 1 % at positions with at least 4x coverage.
2. Individual specific SNVs (iSNVs):
Non-population variants that occur with a frequency of 10 % at positions with at least 10x coverage.
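As a rough illustration of these two cutoffs (the actual caller is the C++ program under `src/snpCaller/`, so treat this as a sketch rather than its implementation; the helper names are hypothetical):

    # Sketch of the pSNV/iSNV cutoffs described above.
    def is_population_snv(pop_depth, pop_alt_count):
        # evaluated on counts pooled across the whole population of samples
        return pop_depth >= 4 and pop_alt_count / float(pop_depth) >= 0.01

    def is_individual_snv(sample_depth, sample_alt_count, already_population_snv):
        # evaluated per sample; iSNVs are positions that are not population SNVs
        return (not already_population_snv
                and sample_depth >= 10
                and sample_alt_count / float(sample_depth) >= 0.10)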
[qaCompute](https://github.com/CosteaPaul/qaTools)
-------------------------------------------------
Computes normal and span coverage from a bam/sam file.
Also counts unmapped and sub-par quality reads.
### __Parameters:__
m - Compute median coverage for each contig/chromosome.
Will make running a bit slower. Off by default.
q [INT] - Quality threshold. Any read with a mapping quality under
INT will be ignored when computing the coverage.
NOTE: bwa outputs mapping quality 0 for reads that map with
equal quality in multiple places. If you want to consider these reads,
set q to 0.
d - Print coverage histogram over each individual contig/chromosome.
These details will be printed in the file <output>.detail
p [INT] - Print coverage profile to bed file, averaged over given window size.
i - Silent run. Will not print running info to stdout.
If you want to run a lot of samples and would like to use the power of your cluster, we will print out the commands you need to
run and you can decide on how to schedule and manage them.
## 1. Get the first set of commands
s [INT] - Compute span coverage. (Use for mate-pair libraries.)
Instead of actual read coverage, this option will consider
the entire span of the insert as a read, if the insert size is
lower than INT.
For an accurate estimation of span coverage, I recommend
setting an insert size limit INT around 3*std_dev of your lib's
insert size distribution.
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --print-commands
c [INT] - Maximum X coverage to consider in histogram.
Note the addition of "--print-commands". This will print out the one-liners that you need to run. When done, run the same command again.
## 2. Get the second set of commands
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --print-commands
h [STR] - Use different header.
Because mappers sometimes break the headers or simply don't output them,
this is provided as a non-kosher way around it. Use with care!
This will calculate the "load balancing" and give you the commands for running the SNV calling.
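Conceptually, the load balancing assigns reference contigs to the requested number of splits so that the amount of work per split is roughly even. The greedy sketch below only illustrates the idea; the actual split computed by metaSNV may use different criteria (e.g. observed coverage rather than contig length):

    # Conceptual sketch of load balancing: greedily assign contigs to n_splits bins
    # so that the summed length (a proxy for work) per bin stays roughly even.
    def balance_contigs(contig_lengths, n_splits):
        bins = [[] for _ in range(n_splits)]
        loads = [0] * n_splits
        for name, length in sorted(contig_lengths.items(), key=lambda kv: kv[1], reverse=True):
            i = loads.index(min(loads))   # put the next-largest contig into the lightest bin
            bins[i].append(name)
            loads[i] += length
        return bins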
For more info on the parameters, try ./qaCompute
metaSNV_filtering.py
--------------------
usage: metaSNV filtering [-h] [-p PERC] [-c COV] [-m MINSAMPLES] [-s SNPC]
[-i SNPI]
perc_FILE cov_FILE snp_FILE [snp_FILE ...]
all_samples output_dir/
metaSNV filtering
positional arguments:
perc_FILE input file with horizontal genome (taxon) coverage
(breadth) per sample (percentage covered)
cov_FILE input file with average genome (taxon) coverage
(depth) per sample (average number reads per site)
snp_FILE input files from SNP calling
all_samples list of input BAM files, one per line
output_dir/ output folder
optional arguments:
-h, --help show this help message and exit
-p PERC, --perc PERC Coverage breadth: Horizontal coverage cutoff
(percentage taxon covered) per sample (default: 40.0)
-c COV, --cov COV Coverage depth: Average vertical coverage cutoff per
taxon, per sample (default: 5.0)
-m MINSAMPLES, --minsamples MINSAMPLES
Minimum number of samples that have to pass the
filtering criteria in order to write an output for the
representative genome (default: 2)
-s SNPC, --snpc SNPC FILTERING STEP II: SNP coverage cutoff (default: 5.0)
-i SNPI, --snpi SNPI  FILTERING STEP II: SNP occurrence (incidence) cutoff
within samples_of_interest (default: 0.5)
## 3. Run post-processing as usual
$ python metaSNV_post.py tutorial
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s5.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s2.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s1.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s6.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s3.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s4.sorted.bam
......@@ -4,12 +4,12 @@ download_9() {
[ -d "db" ] || mkdir -p db
cd db
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f9.tar.bz2
wget vm-lux.embl.de/~rmuench/files/freeze9.tar.bz2
echo "This might take up to 10 minutes."
echo "Extracting files!"
echo "Patience please..."
tar xjvf db_f9.tar.bz2
rm db_f9.tar.bz2
tar xjvf freeze9.tar.bz2
rm freeze9.tar.bz2
exit
}
......@@ -17,12 +17,12 @@ download_11() {
[ -d "db" ] || mkdir -p db
cd db
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f11.tar.bz2
wget vm-lux.embl.de/~rmuench/files/freeze11.tar.bz2
echo "This might take up to 20 minutes."
echo "Extracting files!"
echo "Patience please..."
tar xjvf db_f11.tar.bz2
rm db_f11.tar.bz2
tar xjvf freeze11.tar.bz2
rm freeze11.tar.bz2
exit
}
......
......@@ -51,10 +51,7 @@ def run_sample(sample, command):
def compute_opt(args):
out_dir = path.join(args.project_dir, 'cov')
try:
os.makedirs(out_dir)
except:
pass
mkdir_p(out_dir)
p = multiprocessing.Pool(args.threads, init_worker)
results = []
for line in open(args.all_samples):
......@@ -76,30 +73,38 @@ def compute_opt(args):
stderr.write("Failure in sample {}".format(sample))
stderr.write("Call to {} failed.".format(' '.join(cmd)))
exit(1)
def split_opt(args):
if args.n_splits > 100:
stderr.write("Maximum number of splits is 100.\n")
exit(1)
older_files = glob(args.project_dir + '/bestsplits/*')
if older_files:
stderr.write("\nremoving old splits.\n")
for f in older_files:
os.unlink(f)
def get_header(args):
use = open(args.all_samples).readline().rstrip()
o = subprocess.check_output(["samtools","view","-H",use]).decode("utf-8")
f = open(args.project_dir + '/bed_header','w')
for line in o.split('\n')[1:]:
line = line.rstrip().split('\t')
if len(line) != 3:
continue
if line[0] == "@SQ":
line[1] = line[1].replace('SN:','')
line[2] = line[2].replace('LN:','')
f.write(line[1]+'\t1\t'+line[2]+'\n')
f.close()
args.ctg_len = args.project_dir + '/bed_header'
def compute_summary(args):
'''This information is required by metaSNV_post.py'''
project_name = path.basename(args.project_dir)
cov_dir = path.join(args.project_dir, 'cov')
cov_files = glob(cov_dir + '/*.cov')
project_name = path.basename(args.project_dir)
if not cov_files:
if not args.print_commands:
stderr.write("Coverage files not found.\n")
else:
stderr.write("Coverage files not found.\nFinish running the commands prinnted above and then run this command again.\n")
if not args.print_commands:
stderr.write("Coverage files not found.\n")
else:
stderr.write("Coverage files not found.\nFinish running the commands printed above and then run this command again.\n")
exit(1)
for f in cov_files:
cmd = ['python',
path.join(basedir, 'src/computeGenomeCoverage.py'),
path.join(basedir, 'src/computeGenomeCoverage.py'),
f,
f + '.detail',
f + '.summary']
......@@ -107,11 +112,25 @@ def split_opt(args):
print("\nCoverage summary here: {}".format(args.project_dir))
print(" Average vertical genome coverage: '{}/{}.all_cov.tab'".format(args.project_dir, project_name))
print(" Horizontal genome coverage (1X): '{}/{}.all_perc.tab'".format(args.project_dir, project_name))
print("")
cmd = ['python',
'{}/src/collapse_coverages.py'.format(basedir),
args.project_dir]
subprocess.call(cmd)
def split_opt(args):
if args.n_splits > 100:
stderr.write("Maximum number of splits is 100.\n")
args.n_splits = 100
older_files = glob(args.project_dir + '/bestsplits/*')
if older_files:
stderr.write("\nremoving old splits.\n")
for f in older_files:
os.unlink(f)
project_name = path.basename(args.project_dir)
print("\nCalculating best database split:")
# usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
......@@ -206,7 +225,7 @@ def main():
help='File with an input list of bam files, one file per line')
parser.add_argument("ref_db", metavar='REF_DB_FILE',
help='reference multi-sequence FASTA file used for the alignments.')
parser.add_argument('--db_ann', metavar='BED_FILE',default='',
parser.add_argument('--db_ann', metavar='DB_ANN_FILE',default='',
help='Database gene annotation.')
parser.add_argument('--print-commands', default=False, action='store_true',
help='Instead of executing the commands, simply print them out')
......@@ -214,9 +233,9 @@ def main():
help='Number of jobs to run simultaneously. Will create same number of splits, unless n_splits set differently.')
parser.add_argument('--n_splits', metavar='INT', default=1,type=int,
help='Number of bins to split ref into')
parser.add_argument('--ctg_len',metavar='BED_FILE',default='',
help='Per contig lengths, required for splitting.')
args = parser.parse_args()
args.project_dir = args.project_dir.rstrip('/')
if not path.isfile(args.ref_db):
stderr.write('''
ERROR: No reference database or annotation file found!"
......@@ -228,31 +247,27 @@ SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP cal
exit(1)
if not path.isfile(basedir+"/src/qaTools/qaCompute") or not path.isfile(basedir+"/src/snpCaller/snpCall"):
stderr.write('''
stderr.write('''
ERROR: No binaries found
SOLUTION: make\n\n'''.format(basedir))
exit(1)
if (args.n_splits > 1 or args.threads > 1) and args.ctg_len=='':
stderr.write('''
ERROR: No contig information supplied. Cannot compute splits
SOLUTION: Supply contig information through --ctg_len\n\n'''.format(basedir))
exit(1)
SOLUTION: make\n\n'''.format(basedir))
exit(1)
if args.threads > 1 and args.n_splits==1:
args.n_splits=args.threads
args.n_splits=args.threads
if path.exists(args.project_dir) and not args.print_commands:
stderr.write("Project directory '{}' already exists\n\n\n".format(args.project_dir))
stderr.write("Project directory '{}' already exists\n\n\n".format(args.project_dir))
exit(1)
create_directories(args.project_dir)
compute_opt(args)
compute_summary(args)
get_header(args)
if args.n_splits > 1:
split_opt(args)
split_opt(args)
snp_call(args)
if __name__ == '__main__':
main()
#!/usr/bin/env python
import os # interacting with operating system
import sys # interact with system commandline
import time # print current date/time
import argparse # manage command line options and arguments
import os
import sys
import argparse
import glob
import numpy as np
import pandas as pd
try:
import numpy as np
except ImportError:
sys.stderr.write("Numpy is necessary to run postfiltering.\n")
sys.exit(1)
try:
import pandas as pd
except ImportError:
sys.stderr.write("Pandas is necessary to run postfiltering.\n")
sys.exit(1)
basedir = os.path.dirname(os.path.abspath(__file__))
......@@ -16,98 +23,101 @@ basedir = os.path.dirname(os.path.abspath(__file__))
# Parse Commandline Arguments
#############################
def get_arguments():
'''
Get commandline arguments and return namespace
'''
## Initialize Parser
parser = argparse.ArgumentParser(prog='metaSNV_post.py', description='metaSNV post processing', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
'''
Get commandline arguments and return namespace
'''
## Initialize Parser
parser = argparse.ArgumentParser(prog='metaSNV_post.py', description='metaSNV post processing', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Not Showns:
parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)
parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)
# TODO: POPULATION STATISTICS
## Population Statistics (Fst - ):
# parser.add_argument("-f", "--fst", action="store_true")
## pNpS ratio:
# parser.add_argument("-n", "--pnps", action="store_true")
# OPTIONAL arguments:
parser.add_argument('-b', metavar='FLOAT', type=float, help="Coverage breadth: Horizontal genome coverage percentage per sample per species", default=40.0)
parser.add_argument('-d', metavar='FLOAT', type=float, help="Coverage depth: Average vertical genome coverage per sample per species", default=5.0)
parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
parser.add_argument('-b', metavar='FLOAT', type=float, default=40.0,
help="Coverage breadth: minimal horizontal genome coverage percentage per sample per species")
parser.add_argument('-d', metavar='FLOAT', type=float, default=5.0,
help="Coverage depth: minimal average vertical genome coverage per sample per species")
parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
parser.add_argument('-div',action='store_true',help="Compute diversity measures")
# REQUIRED arguments:
parser.add_argument('projdir', help='project name', metavar='Proj')
return parser.parse_args()
parser.add_argument('projdir', help='project name', metavar='Proj')
return parser.parse_args()
##############################
def debugging(taxids_of_interest, header_cov):
'''Sanity check for the Dict of TaxID:Samples'''
nr_keys = 0
taxids_of_interest = []
for key in samples_of_interest:
nr_keys += 1
taxids_of_interest.append(key)
taxids_of_interest.sort()
print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
print(taxids_of_interest)
'''Sanity check for the Dict of TaxID:Samples'''
nr_keys = 0
taxids_of_interest = []
for key in samples_of_interest:
nr_keys += 1
taxids_of_interest.append(key)
taxids_of_interest.sort()
print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
print(taxids_of_interest)
# print("\n")
# print("nr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
header = "\t".join(header_cov)
print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
# header = "\t".join(header_cov)
# print("header: {}".format(header))
sys.exit("only filter 1")
sys.exit("only filter 1")
def file_check():
''' Check if required files exist (True / False)'''
args.coverage_file = args.projdir+'/'+args.projdir+'.all_cov.tab'
args.percentage_file = args.projdir+'/'+args.projdir+'.all_perc.tab'
args.all_samples = args.projdir+'/'+'all_samples'
print("Checking for necessary input files...")
if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
print("found: '{}' \nfound:'{}'".format(args.coverage_file, args.percentage_file))
else:
sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))
if os.path.isfile(args.all_samples):
print("found: '{}'\n".format(args.all_samples))
else:
sys.exit("\nERROR: No such file '{}'".format(args.all_samples))
''' Check if required files exist (True / False)'''
args.projdir = args.projdir.rstrip('/')
args.coverage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_cov.tab'
args.percentage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_perc.tab'
args.all_samples = args.projdir+'/'+'all_samples'
print("Checking for necessary input files...")
if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
print("found: '{}' \nfound:'{}'".format(args.coverage_file, args.percentage_file))
else:
sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))
if os.path.isfile(args.all_samples):
print("found: '{}'\n".format(args.all_samples))
else:
sys.exit("\nERROR: No such file '{}'".format(args.all_samples))
def print_arguments():
## Print Defined Thresholds:
print("Options:")
if args.b:
print("threshold: percentage covered (breadth) {}".format(args.b) )
if args.d:
print("threshold: average coverage (depth) {}".format(args.d) )
if args.m:
print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m) )
if args.c:
print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c) )
if args.p:
print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p) )
print("")
## Print Defined Thresholds:
print("Options:")
if args.b:
print("threshold: percentage covered (breadth) {}".format(args.b) )
if args.d:
print("threshold: average coverage (depth) {}".format(args.d) )
if args.m:
print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m) )
if args.c:
print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c) )
if args.p:
print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p) )
print("")
###############################
## FILTER I: Count SAMPLES_OF_INTEREST per TAXON_OF_INTEREST
# Example: "Which taxon has at least 10 SoI?"
# --Coverage Conditions:
# Sample of Interest:
# 1. breadth > 40 %
# 1. breadth > 40 %
# 2. depth > 5 X
# --Min. number Samples:
# 3. min samples/TaxID > 10
......@@ -126,9 +136,9 @@ def relevant_taxa(args):
sys.exit("ERROR: Coverage file headers do not match!") #Exit with error message
# Read taxon by taxon (line by line) check coverage conditions (thresholds)
for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
cstring = cov.split()
pstring = perc.split()
pstring = perc.split()
cov_taxID = cstring.pop(0)
perc_taxID = pstring.pop(0)
coverage = list(map(float, cstring)) # convert list_of_strings 2 list_of_floats (TaxID=INT!)
......@@ -141,10 +151,10 @@ def relevant_taxa(args):
if cov_taxID != perc_taxID: # check if taxIDs match!
sys.exit("ERROR: TaxIDs in the coverage files are not in the same order!")
for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
sample_count += 1
if c >= args.d and p >= args.b:
if c >= args.d and p >= args.b:
sample_names.append(header_cov[sample_count-1]) # starting at 0 in array
......@@ -165,7 +175,7 @@ def header_comparison(header_cov):
'''Comparing the sample order in the coverage and all_samples file'''
# sort snp_indices by sample_of_interest names
header = "\t".join(header_cov)
#header = "\t".join(header_cov)
#print("\nNr.Samples_cov_header: {}".format(len(header_cov)))
#print("cov_header: {}\n\n".format(header_cov[:10]))
#print("\ncov_header: {}\n {}\n\n".format(len(header_cov),header))
......@@ -174,21 +184,21 @@ def header_comparison(header_cov):
all_samples = open(args.all_samples,'r')
snp_header = all_samples.read().splitlines()
snp_header = [i.split('/')[-1] for i in snp_header] #get name /trim/off/path/to/sample.name.bam
snp_header_joined = "\t".join(snp_header)
#print("Nr.Samples in (all_samples): {}".format(len(snp_header)))
#print("all_samples: {}\n".format(snp_header[:10]))
# Compare the sample order in ALL.COV/ALL.PERC and all_samples
#print("Comparing headers..")
#snp_header_joined = "\t".join(snp_header)
#if header == snp_header_joined:
# print("Headers match nicely\n")
#else:
# print("CAUTION: Header in COV_FILE does not match the order of samples in the SNP_FILES,\n\t no problem, we took care of it!\n")
return snp_header
return snp_header
#################################
## FILTER II: ACQUIRE SNPs WITH SUFFICIENT OCCURRENCE WITHIN SAMPLES_OF_INTEREST
# --SNP Conditions (Default):
......@@ -197,21 +207,21 @@ def header_comparison(header_cov):
def filter_two(args, snp_header, snp_files, outdir):
'''position wise filtering'''
header_taxID = ''
snp_taxID = '_'
# print(locals())
for best_split_x in snp_files:
with open(best_split_x, 'r') as file:
for snp_line in file: #position wise loop
snp_taxID = snp_line.split()[0].split('.')[0]#Name of Genome change from . to ]
## SPECIES FILTER:
if snp_taxID not in samples_of_interest.keys():#Check if Genome is of interest
continue #Taxon is not relevant, NEXT!
else:
## SAMPLE FILTER: only load samples with enough coverage
sample_list = samples_of_interest[snp_taxID]#Load Sample List
......@@ -235,9 +245,9 @@ def filter_two(args, snp_header, snp_files, outdir):
nr_good += 1
#FILTER: Position incidence with sufficient coverage:
if float(nr_good)/len(sample_indices) < args.p:#if snp_incidence < x % drop SNP
continue#mainly uninformative position, SNP incidence < x %, SNP droped!
else:
# CALCULATE SNP ALLELE FREQUENCY:
# If at least one position passed cutoffs open file (if new TaxID):
......@@ -248,42 +258,42 @@ def filter_two(args, snp_header, snp_files, outdir):
outfile = open(outdir+'/'+'%s.filtered.freq' % snp_taxID, 'w')
print("Generating: {}".format(outdir+'/'+'%s.filtered.freq' % snp_taxID))
outfile.write('\t' + "\t".join(sample_list) + '\n')
header_taxID = snp_taxID#Jump to current TaxID
#Loop through alternative alleles [5](comma separated):
line_id = ":".join(whole_line[:4])#line_id composed of CHROM:REFGENE:POS:REFBASE
reference_base = whole_line[3]
alt_bases_totalcov = []#total coverage per alt allele
#VCF format
#LOOP Start:
for snp in whole_line[5].split(','):
xS = snp.split('|')
snp_coverage = list(map(float, xS[3:]))#coverage string (unfiltered!)
#Sanity check:
#Sanity check:
if len(site_coverage) != len(snp_coverage):
print("ERROR: SNP FILE {} is corrupted".format(best_split_x))
sys.exit("ERROR: Site coverage and SNP coverage string have uneven length!")
#Compute allele frequency tables:
alt_base = snp.split('|')[1]#alternative base
alt_base = snp.split('|')[1]#alternative base
#FREQUENCY Computation
total_reads = 0
snp_frq = []#frequencies for SNPs (pos >5X in at least 50% of the SoIs)
for index in sample_indices:
#prevent division by zero! TODO:Notify usr about Nonsense Value (f.i. -c = 0!
#prevent division by zero! TODO:Notify usr about Nonsense Value (f.i. -c = 0!
if site_coverage[index] >= args.c and site_coverage[index] != 0:
snp_frq.append(snp_coverage[index]/site_coverage[index])
else:
snp_frq.append(-1)
snp_frq.append(-1)
# WRITE OUTPUT Allele Frequencies (Default)
outfile.write(":".join(snp_line.split()[:4])+'>'+alt_base +':'+ xS[2] + '\t' + "\t".join(str(x) for x in snp_frq) + '\n')
......@@ -300,20 +310,95 @@ def alleledist(d1,d2, threshold=.6):
def computeAllDist(args):
print "Computing distances"
print("Computing distances")
allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
for f in allFreq:
data = pd.read_table(f, index_col=0, na_values=['-1']).T
dist = [[l1nonans(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
dist = pd.DataFrame(dist, index=data.index, columns=data.index)
data = pd.read_table(f, index_col=0, na_values=['-1']).T
dist = [[l1nonans(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
dist = pd.DataFrame(dist, index=data.index, columns=data.index)
dist.to_csv(args.projdir+'/distances/'+'%s.mann.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
dist = [[alleledist(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
dist = pd.DataFrame(dist, index=data.index, columns=data.index)
dist.to_csv(args.projdir+'/distances/'+'%s.allele.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
dist.to_csv(args.projdir+'/distances/'+'%s.mann.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
dist = [[alleledist(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
dist = pd.DataFrame(dist, index=data.index, columns=data.index)
dist.to_csv(args.projdir+'/distances/'+'%s.allele.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
def genetic_distance(sample1, sample2):
'''Pairwise genetic distance'''
# The expression used to compute dist_nd would compute all the necessary
# values if applied to sample[12]. However, the case where there are no
# duplicates is the majority (often >90% of cases) and can be done much
# faster, so it is special cased here:
sample1nd = sample1.reset_index().drop_duplicates(subset='index', keep=False).set_index('index')
sample2nd = sample2.reset_index().drop_duplicates(subset='index', keep=False).set_index('index')
sample2nd = sample2nd.reindex(index=sample1nd.index)
s1 = sample1nd.values
s2 = sample2nd.values
valid = ~(np.isnan(s1) | np.isnan(s2))
s1 = s1[valid]
s2 = s2[valid]
s1 = np.vstack([s1, 1 - s1])
s2 = np.vstack([s2, 1 - s2])
dist_nd = (s1[0]*s2[1]+s1[1]*s2[0]).sum()
def compute_diversity(x):
out = np.outer(x.s1.values, x.s2.values)
return np.nansum(out) - np.nansum(out.diagonal())
sample1d = sample1.ix[sample1.index[sample1.index.duplicated()]]
sample2d = sample2.ix[sample2.index[sample2.index.duplicated()]]
if not len(sample1d) or not len(sample2d):
# No duplicates
return dist_nd
both = pd.DataFrame({'s1' : sample1d, 's2' : sample2d})
both = both.reset_index()
both = pd.concat([both,(1. - both.groupby('index').sum()).reset_index()])
dist_d = both.groupby('index', group_keys=False).apply(compute_diversity).sum()
return dist_d + dist_nd
def computeAllDiv(args):
# Load external info : Coverage, genomes size, genes size
horizontal_coverage = pd.read_table(args.percentage_file, skiprows=[1], index_col=0)
vertical_coverage = pd.read_table(args.coverage_file, skiprows=[1], index_col=0)
bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
bedfile_tab = bedfile_tab.set_index(pd.Index(bed_index))
allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
for f in allFreq:
species = int(f.split('/')[-1].split('.')[0])
genome_length = bedfile_tab.loc[str(species), 2].sum()
data = pd.read_table(f, index_col=0, na_values=['-1'])
pre_index = [i.split(':') for i in list(data.index)]
data = data.set_index(pd.Index([item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]))
data = data.sort_index()
########
# Correcting the coverage by the genome length observed in each pairwise comparison
correction_coverage = [[(min(horizontal_coverage.loc[species, i], horizontal_coverage.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
########
# Vertical coverage in pi within : AvgCov / (AvgCov - 1)
for j,i in enumerate(data.columns):
correction_within = vertical_coverage.loc[species,i] / (vertical_coverage.loc[species,i] - 1)
correction_coverage[j][j] = correction_coverage[j][j] / correction_within
dist = [[genetic_distance(data.iloc[:, i], data.iloc[:, j]) / correction_coverage[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
FST = [[(1-(dist[i][i]+dist[j][j])/(2*dist[j][i])) for i in range(j + 1)] for j in range(len(dist))]
dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
FST = pd.DataFrame(FST, index=data.columns, columns=data.columns)
dist.to_csv(args.projdir + '/distances/' + '%s.diversity' % species, sep='\t')
FST.to_csv(args.projdir + '/distances/' + '%s.FST' % species, sep='\t')
if __name__ == "__main__":
# print("globals: {}".format(globals()))
......@@ -346,3 +431,6 @@ if __name__ == "__main__":
computeAllDist(args)
if args.div:
computeAllDiv(args)
......@@ -28,7 +28,7 @@ def write_matrix(cov, header, ofile):
out.write('TaxId\t')
out.write('\t'.join([header for _ in bamfiles]))
out.write('\n')
for taxid in sorted(avg_cov.keys(), key=int):
for taxid in sorted(avg_cov.keys()):
c = cov[taxid]
out.write('{}\t'.format(taxid))
out.write('\t'.join([c[bf] for bf in bamfiles]))
......
computeInsertSizeHistogram
doBWAQualTrimming
qaCompute
removeUnmapped
......@@ -526,13 +526,37 @@ int main(int argc, char *argv[])
++duplicates;
} else {
if (!userOpt.spanCov) {
//All entries in SAM file are represented on the forward strand! (See specs of SAM format for details)
++entireChr[core->pos];
++usedReads;
if ((uint32_t)(core->pos+core->l_qseq) >= chrSize)
--entireChr[chrSize-1];
else
--entireChr[core->pos+core->l_qseq];
//All entries in SAM file are represented on the forward strand! (See specs of SAM format for details)
uint32_t* cigar = bam_get_cigar(b);
uint32_t pp = core->pos+1;
int i = 0;
if(((*cigar) & BAM_CIGAR_MASK) == BAM_CSOFT_CLIP || ((*cigar) & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) {//BWA is actively fucking with me.
//The start of the alignment is reported without this clip.
++cigar; ++i;
}
while (i<core->n_cigar) {
++i;
if (((*cigar) & BAM_CIGAR_MASK) != BAM_CMATCH) {
pp = pp + ((*cigar) >> BAM_CIGAR_SHIFT);
} else {
++entireChr[pp];
pp = pp + ((*cigar) >> BAM_CIGAR_SHIFT);
if (pp >= chrSize) {//These are mismatches that actually hang over the end. Silly mapper!
//fprintf(stderr,"you are fucked %d %d\n",pp,chrSize);
--entireChr[chrSize-1];
} else {
--entireChr[pp];
}
}
++cigar;
}
//++entireChr[core->pos];
++usedReads;
//if ((uint32_t)(core->pos+core->l_qseq) >= chrSize)
// --entireChr[chrSize-1];
//else
// --entireChr[core->pos+core->l_qseq];
} else {
//Computing span coverage.
//Only consider first read in pair! and extend a bit to the end of the insert
......@@ -593,23 +617,23 @@ int main(int argc, char *argv[])
bam_destroy1(b);
free(entireChr);
fprintf(stdout,"\nDuplicates:%u \n", duplicates);
//fprintf(stdout,"\nDuplicates:%u \n", duplicates);
//Print header for next table in output file
fprintf(outputFile,"\nCov*X\tPercentage\tNr. of bases\n");
fprintf(stdout,"Total genome lenght %lu \n", totalGenomeLength);
//fprintf(stdout,"Total genome lenght %lu \n", totalGenomeLength);
//Compute procentages of genome cover!.
int i;
for (i=0; i<=userOpt.maxCoverage; ++i) {
if (i == 0) {
//Non-covered!
fprintf(stdout,"%3.2f of genome has not been covered\n", (double)(coverageHist[i])/totalGenomeLength*100);
//fprintf(stdout,"%3.2f of genome has not been covered\n", (double)(coverageHist[i])/totalGenomeLength*100);
} else {
uint64_t coverage = 0;
//All that has been covered i, had been covered i+1, i+2 and so on times. Thus, do this addition
for (int x = i; x<=userOpt.maxCoverage; ++x) coverage += coverageHist[x];
fprintf(stdout,"%3.2f of genome has been covered at least %dX \n", (double)(coverage)/totalGenomeLength*100, i);
//fprintf(stdout,"%3.2f of genome has been covered at least %dX \n", (double)(coverage)/totalGenomeLength*100, i);
fprintf(outputFile,"%d\t%3.5f\t%lu\n",i, (double)(coverage)/totalGenomeLength*100, coverage);
}
......@@ -632,7 +656,7 @@ int main(int argc, char *argv[])
fprintf(outputFile, "Number of interchromosomal pairs: %u\n",interChr);
}
printf("Out of %u reads, you have %3.5f unmapped reads\n and %3.5f sub-par quality mappings\n", totalNumberOfReads ,procentageOfUnmapped, procentageOfZeroQuality);
//printf("Out of %u reads, you have %3.5f unmapped reads\n and %3.5f sub-par quality mappings\n", totalNumberOfReads ,procentageOfUnmapped, procentageOfZeroQuality);
free(coverageHist);
......
snpCall
......@@ -130,7 +130,6 @@ bool indexGenomeAndGenes(FILE* refGenome, FILE* refGenes) {
filePosStart = strlen(line);//This includes the \n
filePosition p1;
while (fgets(line,10000,refGenes)) {
fileConsumed += strlen(line);
int pos = 0;
const char* rest = toksplit(line,'\t',tok,10000);
while (*rest) {
......@@ -152,6 +151,7 @@ bool indexGenomeAndGenes(FILE* refGenome, FILE* refGenes) {
++pos;
rest = toksplit(rest,'\t',tok,10000);
}
fileConsumed += strlen(line);
lineCount += 1;
}
//Add the last one!
......@@ -248,7 +248,7 @@ bool loadGenome(std::string gName, FILE* refGenes, bool* hasGenes) {
}
if (pos == 2) {//This is the name!
if (gName.compare(tok) != 0) {
fprintf(stderr,"Reading wrong gene defintions");
fprintf(stderr,"Reading wrong gene definition for %s.\n Scaffold supposed to be %s, but is %s.\n",geneName.c_str(),tok,gName.c_str());
break;
}
} else if (pos == 6) {//Start
......