Commit 0f98a7f9 authored by Paul Costea

moveing
#!/bin/bash
#
wget http://vm-lux.embl.de/~rmuench/files/samples.bz2
tar xfvj samples.bz2
rm samples.bz2
exit
metaSNV --- A metagenomic SNV calling pipeline
Copyright (C) 2017 P. I. Costea & R. Munch
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
all: qaTools snpCaller
clean:
cd src/qaTools && $(MAKE) clean
cd src/snpCaller && $(MAKE) clean
qaTools:
cd src/$@ && $(MAKE)
snpCaller:
cd src/$@ && $(MAKE)
.PHONY: all clean
# MetaSNV, a metagenomic SNV calling pipeline
The metaSNV pipeline performs variant calling on aligned metagenomic samples.
Download
========
Via Git:
git clone git@git.embl.de:rmuench/metaSNP.git
or [download](https://git.embl.de/rmuench/metaSNP/repository/archive.zip?ref=master) a zip file of the repository.
Dependencies
============
* [Boost-1.53.0 or above](http://www.boost.org/users/download/)
* [htslib](http://www.htslib.org/)
* Python-2.7 or above
### Installing dependencies on Ubuntu/Debian
On an Ubuntu/Debian system, the following sequence of commands will install all
required packages (the first two are only necessary if you have not enabled the
universe repository before):
sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe"
sudo apt-get update
sudo apt-get install libhts-dev libboost-dev
### Installing dependencies using anaconda
If you use [anaconda](https://www.continuum.io/downloads), you can create an
environment with all necessary dependencies using the following commands:
conda create --name metaSNV boost htslib pkg-config
source activate metaSNV
export CFLAGS=-I$CONDA_ENV_PATH/include
export LD_LIBRARY_PATH=$CONDA_ENV_PATH/lib:$LD_LIBRARY_PATH
If you do not have a C++ compiler, anaconda can also install G++:
conda create --name metaSNV boost htslib pkg-config
source activate metaSNV
# Add this command:
conda install gcc
export CFLAGS=-I$CONDA_ENV_PATH/include
export LD_LIBRARY_PATH=$CONDA_ENV_PATH/lib:$LD_LIBRARY_PATH
Setup & Compilation
===================
make
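If pkg-config cannot locate htslib or Boost, the compiler flags can be overridden on the make command line. This is only a sketch, assuming GNU make and the HTSLIB_CFLAGS/HTSLIB_LIBS/BOOST_CFLAGS variables defined by the build files; the paths below are placeholders for your own installation:

    make HTSLIB_CFLAGS=-I/opt/htslib/include HTSLIB_LIBS="-L/opt/htslib/lib -lhts" BOOST_CFLAGS=-I/opt/boost/include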
Workflow:
=========
## Required Files:
* **'all\_samples'** = a list of all BAM files, one /path/2/sample.bam per line (no duplicates); see the example below
* **'ref\_db'** = the reference database in FASTA format (e.g. a multi-sequence FASTA)
## Optional Files:
* **'db\_ann'** = a gene annotation file for the reference database.
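An 'all\_samples' file is plain text with one BAM path per line. A minimal illustration (the paths are placeholders; the tutorial below generates such a file with `find`):

    /path/to/project/samples/sample1.sorted.bam
    /path/to/project/samples/sample2.sorted.bam
    /path/to/project/samples/sample3.sorted.bam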
## 1. To use one of the provided reference databases:
./getRefDB.sh
## 2. Run metaSNV
metaSNV.py project_dir/ all_samples ref_db [options]
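An illustrative invocation, assuming the Freeze9 database downloaded by getRefDB.sh and the option names listed by `metaSNV.py --help` (paths and thread count are examples only):

    metaSNV.py tutorial/ tutorial/all_samples db/RepGenomesv9.fna --db_ann db/RefOrganismDB_v9_gene.clean --ctg_len db/Genomev9_definitions --threads 8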
## 3. Part II: Post-Processing (Filtering & Analysis)
### a) Filtering:
Note: requires the SNV calling step (step 2) to be completed first!
Caution: Perform this step separately for individual SNPs and population SNPs.
usage: metaSNP_filtering.py
positional arguments:
perc_FILE input file with horizontal genome (taxon) coverage (breadth) per sample (percentage covered)
cov_FILE input file with average genome (taxon) coverage
(depth) per sample (average number reads per site)
snp_FILE input files from SNP calling
all_samples list of input BAM files, one per line
output_dir/ output folder
optional arguments:
-h, --help show this help message and exit
-p PERC, --perc PERC Coverage breadth: Horizontal coverage cutoff
(percentage taxon covered) per sample (default: 40.0)
-c COV, --cov COV Coverage depth: Average vertical coverage cutoff per
taxon, per sample (default: 5.0)
-m MINSAMPLES, --minsamples MINSAMPLES
Minimum number of samples required to pass the coverage
cutoffs per genome (default: 2)
-s SNPC, --snpc SNPC FILTERING STEP II: SNP coverage cutoff (default: 5.0)
-i SNPI, --snpi SNPI SNP occurrence (incidence) cutoff
within samples_of_interest (default: 0.5)
### b) Analysis:
> TODO: include R scripts for computing pairwise distances and visualization
Example Tutorial
================
## 1. Run the setup & compilation steps and download the provided reference database.
./getRefDB.sh
## 2. Go to the EXAMPLE directory and download the samples with the getSamplesScript.sh
$ cd EXAMPLE
$ ./getSamplesScript.sh
## 3. Initiate a new project in the parent directory
$ metaSNV_New tutorial
## 4. Generate the 'all_samples' file
$ find `pwd`/EXAMPLE/samples -name "*.bam" > tutorial/all_samples
## 5. Prepare and run the coverage estimation
$ metaSNV_COV tutorial/ tutorial/all_samples > runCoverage
$ bash runCoverage
## 6. Perform a workload-balancing step for runtime optimization.
$ metaSNV_OPT tutorial/ db/Genomev9_definitions 5
$ bash runCoverage
## 7. Prepare and run the SNV calling step
$ metaSNV_SNP tutorial/ tutorial/all_samples db/RepGenomesv9.fna -a db/RefOrganismDB_v9_gene.clean -l tutorial/bestsplits/ > runSNPcall
$ bash runSNPcall
## 8. Run the post processing / filtering steps
### a) Compute allele frequencies for each position that passes the given thresholds.
$ metaSNV_filtering.py tutorial/tutorial.all_perc.tab tutorial/tutorial.all_cov.tab tutorial/snpCaller/called_SNVs.best_split_* tutorial/all_samples tutorial/filtered/pop/
### b) Compute pairwise distances between samples based on their SNP profiles and create a PCoA plot.
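There is no wrapper command for this step yet (see the TODO above); a sketch of a manual invocation, following the example documented in the header of src/computeDistances.R (the frequency file name is a placeholder):

    $ R --no-save --file=src/computeDistances.R --args tutorial/filtered/pop/<taxonID>.filtered.freq tutorial/distances/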
Advanced usage (tools and scripts)
==================================
If you are interested in using the pipeline in a more manual way (for example
the metaSNV caller stand alone), you will find the executables for the
individual steps in the `src/` directory.
metaSNV caller
--------------
Calls SNVs from samtools pileup format and generates two outputs.
usage: ./snpCall [options] < stdin.mpileup > std.out.popSNPs
Options:
-f, faidx-indexed reference genome (FASTA).
-g, gene annotation file.
-i, output file for individual SNVs.
Note: Expecting samtools mpileup as standard input
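As a concrete illustration, the pipeline drives the caller roughly as follows (compare execute_snp_call in metaSNV.py; all file names are placeholders):

    samtools mpileup -f ref_db.fna -B -b all_samples | ./snpCall -f ref_db.fna -g db_annotations.bed -i indiv_snvs.tab > called_snvs.tab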
### __Output__
1. Population SNVs (pSNVs):
Population-wide variants that occur with a frequency of 1% at positions with at least 4x coverage.
2. Individual specific SNVs (iSNVs):
Non-population variants that occur with a frequency of 10% at positions with at least 10x coverage.
[qaCompute](https://github.com/CosteaPaul/qaTools)
-------------------------------------------------
Computes normal and span coverage from a BAM/SAM file.
Also counts unmapped and sub-par quality reads.
### __Parameters:__
m - Compute median coverage for each contig/chromosome.
Will make running a bit slower. Off by default.
q [INT] - Quality threshold. Any read with a mapping quality under
INT will be ignored when computing the coverage.
NOTE: bwa outputs mapping quality 0 for reads that map with
equal quality in multiple places. If you want to consider these reads,
set q to 0.
d - Print coverage histogram over each individual contig/chromosome.
These details will be printed in file <output>.detail
p [INT] - Print coverage profile to bed file, averaged over given window size.
i - Silent run. Will not print running info to stdout.
s [INT] - Compute span coverage. (Use for mate pair libs)
Instead of actual read coverage, using this option will consider
the entire span of the insert as a read, if insert size is
lower than INT.
For an accurate estimation of span coverage, I recommend
setting an insert size limit INT around 3*std_dev of your lib's
insert size distribution.
c [INT] - Maximum X coverage to consider in histogram.
h [STR] - Use different header.
Because mappers sometimes break the headers or simply don't output them,
this is provided as a non-kosher way around it. Use with care!
For more info on the parameters, try running ./qaCompute.
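As a concrete illustration, the coverage step of the pipeline invokes the tool roughly like this (compare compute_opt in metaSNV.py; file names are placeholders):

    ./qaCompute -c 10 -d -i sample.sorted.bam sample.sorted.bam.cov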
metaSNV_filtering.py
--------------------
usage: metaSNV filtering [-h] [-p PERC] [-c COV] [-m MINSAMPLES] [-s SNPC]
[-i SNPI]
perc_FILE cov_FILE snp_FILE [snp_FILE ...]
all_samples output_dir/
metaSNV filtering
positional arguments:
perc_FILE input file with horizontal genome (taxon) coverage
(breadth) per sample (percentage covered)
cov_FILE input file with average genome (taxon) coverage
(depth) per sample (average number reads per site)
snp_FILE input files from SNP calling
all_samples list of input BAM files, one per line
output_dir/ output folder
optional arguments:
-h, --help show this help message and exit
-p PERC, --perc PERC Coverage breadth: Horizontal coverage cutoff
(percentage taxon covered) per sample (default: 40.0)
-c COV, --cov COV Coverage depth: Average vertical coverage cutoff per
taxon, per sample (default: 5.0)
-m MINSAMPLES, --minsamples MINSAMPLES
Minimum number of samples that have to pass the
filtering criteria in order to write an output for the
representative genome (default: 2)
-s SNPC, --snpc SNPC FILTERING STEP II: SNP coverage cutoff (default: 5.0)
-i SNPI, --snpi SNPI FILTERING STEP II: SNP occurrence (incidence) cutoff
within samples_of_interest (default: 0.5)
# Required Variables for metaSNP compilation
HTSLIB_CFLAGS := $(shell pkg-config htslib --cflags)
HTSLIB_LIBS := $(shell pkg-config htslib --libs)
# For boost, only includes are necessary:
BOOST_CFLAGS :=
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s5.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s2.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s1.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s6.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s3.sorted.bam
/net/mraid08/export/jafar/Microbiome/Analyses/Costea/metaSNP/EXAMPLE/samples/s4.sorted.bam
#!/bin/bash
download_9() {
[ -d "db" ] || mkdir -p db
cd db
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f9.tar.bz2
echo "This might take up to 10 minutes."
echo "Extracting files!"
echo "Patience please..."
tar xjvf db_f9.tar.bz2
rm db_f9.tar.bz2
exit
}
download_11() {
[ -d "db" ] || mkdir -p db
cd db
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f11.tar.bz2
echo "This might take up to 20 minutes."
echo "Extracting files!"
echo "Patience please..."
tar xjvf db_f11.tar.bz2
rm db_f11.tar.bz2
exit
}
echo "You are about to download the metaSNP database"
echo ""
echo "Requirements:"
echo " - Freeze9: 12G available disc space (1753 genomes)"
echo " - Freeze11: 33G available disc space (5278 genomes)"
echo ""
echo "Please make a selection by entering the right number:"
select yn in "f9" "f11" "cancel"; do
case $yn in
f9 ) download_9 ; break;;
f11 ) download_11; break;;
cancel ) exit;;
esac
done
#!/usr/bin/env python
# This code is part of the metagenomic SNV calling pipeline (metaSNV)
# Helper script to initiate a new project file structure.
import argparse
from sys import stderr, exit
from os import path
from glob import glob
import os
import shutil
import subprocess
import multiprocessing
basedir = os.path.dirname(os.path.abspath(__file__))
def mkdir_p(dirname):
'''Equivalent to 'mkdir -p' on the command line'''
from os import makedirs
try:
makedirs(dirname)
except OSError:
pass
def create_directories(basedir):
mkdir_p(basedir)
for sub in ['cov',
'bestsplits',
'snpCaller',
'filtered',
'filtered/pop',
'filtered/ind',
'distances']:
mkdir_p(path.join(basedir, sub))
def exit_worker(signum, frame):
raise RuntimeError("Keyboard Interrupt")
def init_worker():
import signal
signal.signal(signal.SIGINT, exit_worker)
def run_sample(sample, command):
'''Simply wraps subprocess.call and returns contextual information in order
to provide a good error message (if necessary)'''
ret = subprocess.call(command)
return sample, command, ret
def compute_opt(args):
out_dir = path.join(args.project_dir, 'cov')
    try:
        os.makedirs(out_dir)
    except OSError:
        pass
p = multiprocessing.Pool(args.threads, init_worker)
results = []
for line in open(args.all_samples):
line = line.rstrip()
name = path.basename(line)
cmd = ['{}/src/qaTools/qaCompute'.format(basedir)
,'-c' ,'10', '-d',
'-i', line, '{}/{}.cov'.format(out_dir, name)]
if args.print_commands:
print(" ".join(cmd))
else:
results.append(p.apply_async(run_sample, (name, cmd)))
p.close()
p.join()
for r in results:
sample, command, ret = r.get()
if ret:
stderr.write("Failure in sample {}".format(sample))
stderr.write("Call to {} failed.".format(' '.join(cmd)))
exit(1)
def split_opt(args):
if args.n_splits > 100:
stderr.write("Maximum number of splits is 100.\n")
exit(1)
older_files = glob(args.project_dir + '/bestsplits/*')
if older_files:
stderr.write("\nremoving old splits.\n")
for f in older_files:
os.unlink(f)
cov_dir = path.join(args.project_dir, 'cov')
cov_files = glob(cov_dir + '/*.cov')
project_name = path.basename(args.project_dir)
if not cov_files:
if not args.print_commands:
stderr.write("Coverage files not found.\n")
else:
stderr.write("Coverage files not found.\nFinish running the commands prinnted above and then run this command again.\n")
exit(1)
for f in cov_files:
cmd = ['python',
path.join(basedir, 'src/computeGenomeCoverage.py'),
f,
f + '.detail',
f + '.summary']
subprocess.call(cmd)
print("\nCoverage summary here: {}".format(args.project_dir))
print(" Average vertical genome coverage: '{}/{}.all_cov.tab'".format(args.project_dir, project_name))
print(" Horizontal genome coverage (1X): '{}/{}.all_perc.tab'".format(args.project_dir, project_name))
cmd = ['python',
'{}/src/collapse_coverages.py'.format(basedir),
args.project_dir]
subprocess.call(cmd)
print("\nCalculating best database split:")
    # usage: createOptimumSplit.py <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <outfile_prefix>
cmd = ['python',
'{}/src/createOptimumSplit.py'.format(basedir),
"{}/{}.all_cov.tab".format(args.project_dir, project_name),
"{}/{}.all_perc.tab".format(args.project_dir, project_name),
args.ctg_len,
str(args.n_splits),
path.join(args.project_dir, "bestsplits", "best_split")]
subprocess.call(cmd)
def execute_snp_call(args, snpCaller, ifile, ofile, split):
db_ann_args = []
if args.db_ann != '':
db_ann_args = ['-g', args.db_ann]
split_args = []
if split:
split_args = ['-l', str(split)]
samtools_cmd = ['samtools',
'mpileup',
'-f', args.ref_db
] + split_args + [
'-B',
'-b', args.all_samples]
snpcaller_cmd = [
snpCaller, '-f', args.ref_db] + db_ann_args + [
'-i', ifile]
if args.print_commands:
print(" ".join(samtools_cmd + ['|'] + snpcaller_cmd + ['>', ofile]))
else:
with open(ofile, 'wt') as ofile:
samtools_call = subprocess.Popen(samtools_cmd, stdout=subprocess.PIPE)
snpcaller_call = subprocess.Popen(snpcaller_cmd, stdin=samtools_call.stdout, stdout=ofile)
samtools_call.stdout.close()
return snpcaller_call.wait()
def snp_call(args):
out_dir = path.join(args.project_dir, 'snpCaller')
    try:
        os.makedirs(out_dir)
    except OSError:
        pass
shutil.copy(args.all_samples,args.project_dir+'/all_samples')
snpCaller = basedir + "/src/snpCaller/snpCall"
indiv_out = path.join(out_dir, "indiv_called")
called_SNP = path.join(out_dir, "called_SNPs")
## ACTUAL COMMANDLINE
# Note: Due to a bug in samtools v0.1.18, -Q 20 might be erroneous to use.
# Note: Different phred score scales might be disregarded.
# Note: If samtools > v0.1.18 is used -Q 20 filtering is highly recommended.
threads = (args.threads if not args.print_commands else 1)
p = multiprocessing.Pool(threads, init_worker)
results = []
if args.n_splits > 1:
splits = glob('{}/bestsplits/best_split_*'.format(args.project_dir))
for split in splits:
results.append(p.apply_async(execute_snp_call,
(args,
snpCaller,
'{}.{}'.format(indiv_out, path.basename(split)),
'{}.{}'.format(called_SNP, path.basename(split)),
split)))
p.close()
p.join()
for r in results:
            v = r.get()
            if v > 0:
                stderr.write("SNV calling failed\n")
                exit(1)
else:
v = execute_snp_call(args, snpCaller, indiv_out, called_SNP, None)
if v > 0:
stderr.write("SNV calling failed")
exit(1)
def main():
parser = argparse.ArgumentParser(description='Compute SNV profiles')
parser.add_argument('project_dir', metavar='DIR',
help='A metaSNP initialized project directory')
parser.add_argument('all_samples', metavar='FILE',
help='File with an input list of bam files, one file per line')
parser.add_argument("ref_db", metavar='REF_DB_FILE',
help='reference multi-sequence FASTA file used for the alignments.')
parser.add_argument('--db_ann', metavar='BED_FILE',default='',
help='Database gene annotation.')
parser.add_argument('--print-commands', default=False, action='store_true',
help='Instead of executing the commands, simply print them out')
parser.add_argument('--threads', metavar='INT', default=1,type=int,
                        help='Number of jobs to run simultaneously. Will create the same number of splits, unless n_splits is set differently.')
parser.add_argument('--n_splits', metavar='INT', default=1,type=int,
help='Number of bins to split ref into')
parser.add_argument('--ctg_len',metavar='BED_FILE',default='',
help='Per contig lengths, required for splitting.')
args = parser.parse_args()
if not path.isfile(args.ref_db):
stderr.write('''
ERROR: No reference database or annotation file found!
ERROR: '{}' is not a file.
SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP caller
'''.format(args.ref_db))
parser.print_help()
exit(1)
if not path.isfile(basedir+"/src/qaTools/qaCompute") or not path.isfile(basedir+"/src/snpCaller/snpCall"):
stderr.write('''
ERROR: No binaries found
SOLUTION: make\n\n'''.format(basedir))
exit(1)
if (args.n_splits > 1 or args.threads > 1) and args.ctg_len=='':
stderr.write('''
ERROR: No contig information supplied. Cannot compute splits
SOLUTION: Supply contig information through --ctg_len\n\n'''.format(basedir))
exit(1)
if args.threads > 1 and args.n_splits==1:
args.n_splits=args.threads
if path.exists(args.project_dir) and not args.print_commands:
stderr.write("Project directory '{}' already exists\n\n\n".format(args.project_dir))
exit(1)
create_directories(args.project_dir)
compute_opt(args)
if args.n_splits > 1:
split_opt(args)
snp_call(args)
if __name__ == '__main__':
main()
#!/usr/bin/env python
import sys
from glob import glob
from collections import defaultdict
from os import path
# Per-taxon coverage dictionaries, keyed by taxon then by BAM file:
# average coverage (depth) and fraction of positions covered at 1x (breadth)
avg_cov = defaultdict(lambda : {})
per_cov = defaultdict(lambda : {})
project_dir = sys.argv[1]
project_name = path.basename(project_dir)
bamfiles = []
for f in sorted(glob(project_dir + '/cov/*.summary')):
bamfile = path.basename(f)[:-len('.cov.summary')]
for i,line in enumerate(open(f)):
if i == 0:
continue
tokens = line.rstrip().split()
avg_cov[tokens[0]][bamfile] = tokens[1]
per_cov[tokens[0]][bamfile] = tokens[2]
bamfiles.append(bamfile)
def write_matrix(cov, header, ofile):
out = open(ofile, 'wt')
out.write('\t')
out.write('\t'.join(bamfiles))
out.write('\n')
out.write('TaxId\t')
out.write('\t'.join([header for _ in bamfiles]))
out.write('\n')
for taxid in sorted(avg_cov.keys(), key=int):
c = cov[taxid]
out.write('{}\t'.format(taxid))
out.write('\t'.join([c[bf] for bf in bamfiles]))
out.write('\n')
out.close()
write_matrix(avg_cov, 'Average_cov', path.join(project_dir, '{}.all_cov.tab'.format(project_name)))
write_matrix(per_cov, 'Percentage_1x', path.join(project_dir, '{}.all_perc.tab'.format(project_name)))
###############################################
# metaSNP : Step III `Post-processing` #
###############################################
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
#
# This script computes SNP profile distances between sample pairs.
# - distances are computed over the SNP space.
# - default distance measure: manhattan distance.
# - PCoA visualization to gain a quick overview of the SNP space.
## INVOCATION (manually):
#
# Example:
# $ R --no-save --file=/path/2/src/computeDistances.R --args infile.freq outfolder/
#
# Loop (all):
# for i in $(ls project_dir/filtered/pop/*.freq); \
# do echo R --no-save --file=../src/subspeciesClustering.R \
# --args $i project_dir/distances/ \
# ;done
#############
## Functions
# Per-position major-allele call: drop missing values (-1), binarise frequencies at 50% and take the median across samples
majorAllel = function(d) {
apply(d,1,function(x) {
x <- x[x!=-1]
x[x <50] <- 0
x[x >=50] <- 1
return(median(x))
})
}
# Per-position median frequency across samples, ignoring missing values (-1)
rowMedians = function(d) {
apply(d,1,function(x) median(x[x!=-1]))
}
# Per-position mean frequency across samples, ignoring missing values (-1)
rMeans = function(d) {
apply(d,1,function(x) mean(x[x!=-1]))
}
# Per-sample pN/pS: ratio of non-synonymous ('N') to synonymous ('S') annotations among positions with frequency > 50%
getPnPs = function(s,d) {
s <- as.character(s$V2)
s <- substr(s,1,1)
m <- apply(d,2,function(x) {
a <- s[which(x > 50)]
return(sum(a=='N')/sum(a=='S'))
})
return(m)
}
#############
## libraries
library(ape) #pcoa
library(ggplot2)
library(gridExtra)
#############
#############
## Arguments
#argv <- c("/g/bork3/home/costea/R-3.0.2/bin/R", "--no-save", "--file=src/computeDistances.R", "--args", "tutorial/filtered/pop/420247.filtered.freq", "tutorial/distances/")
argv <- commandArgs(trailingOnly = F) # All arguments including --file
scriptPath <- dirname(sub("--file=","",argv[grep("--file",argv)])) # directory containing this script
dist_location <- paste(scriptPath,'/distances/',sep='')
####
# Source helper functions
source(paste(dist_location,'Strain_functions.R',sep=''))
args <- commandArgs(trailingOnly = TRUE) #--args
print(args)
file <- args[1]
outf <- args[2]
print(file)
print(outf)
data <- read.table(file,header=T,row.names=1,check.names=F)
species <- strsplit(sapply(strsplit(file,"\\/"),tail,1),'\\.')[[1]][1]