Commit 70ce9fa7 authored by Robin Erich Muench's avatar Robin Erich Muench
Browse files

remove duplicate

parent 1425bbbf
#!/bin/bash
download_9() {
[ -d "db" ] || mkdir -p db
cd db
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f9.tar.bz2
echo "This might take up to 10 minutes."
echo "Extracting files!"
echo "Patience please..."
tar xjvf db_f9.tar.bz2
exit
}
download_11() {
[ -d "db" ] || mkdir -p db
cd db
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f11.tar.bz2
echo "This might take up to 20 minutes."
echo "Extracting files!"
echo "Patience please..."
tar xjvf db_f11.tar.bz2
exit
}
echo "You are about to download the metaSNP database"
echo ""
echo "Requirements:"
echo " - Freeze9: 12G available disc space (1753 genomes)"
echo " - Freeze11: 33G available disc space (5278 genomes)"
echo ""
echo "Please make a selection by entering the right number:"
select yn in "f9" "f11" "cancel"; do
case $yn in
f9 ) download_9 ; break;;
f11 ) download_11; break;;
cancel ) exit;;
esac
done
#!/bin/bash
display_usage() {
echo >&2 ""
echo >&2 " Usage: $(basename $0) project_dir/ all_samples"
echo >&2 ""
echo >&2 "Parameters:"
echo >&2 " Required:"
echo >&2 " project_dir/ DIR A metaSNP initialized project directory (metaSNP_NEW)"
echo >&2 " all_samples FILE Input list of bam files, one file per line."
echo >&2 ""
}
missing() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
display_usage
exit 1
}
make_dir() {
echo >&2 ""
echo >&2 "ERROR: '$1' is not a metaSNP project. Subdirectory '$2' is missing."
echo >&2 ""
echo >&2 "Please initialize a new metaSNP project first!"
echo >&2 ""
echo >&2 " RUN: metaSNP_NEW projectName"
echo >&2 "or"
echo >&2 " RUN: mkdir $2"
echo >&2 ""
# display_usage
# mkdir $OUT_DIR
exit 1
}
# print error message if no argument is supplied
if [ $# -ne 2 ]
then
display_usage
exit 1
fi
# assign arguments to variables
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
all_samples="$2"
PROJECT_DIR="$1"
OUT_DIR="$PROJECT_DIR/cov"
# getopt to use -h flag
ARGS=$(getopt -o h -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
while true; do
case "$1" in
# Shift before to throw away option
# Shift after if option has a required positional argument
-h)
shift
display_usage
exit 1
;;
--)
shift
break
;;
esac
done
## Control
[ -d "$PROJECT_DIR" ] || missing "$PROJECT_DIR"
[ -f "$all_samples" ] || missing "$all_samples"
# Check required output directories
[ -d "$OUT_DIR" ] || make_dir "$PROJECT_DIR" "$OUT_DIR"
## Generate command lines
while read file
do
# name=$(echo $file | awk -F"/" '{split($NF,a,"."); print a[1]}')
# name=$(echo $file | awk -F"/" '{split($NF,a,"/"); print a[1]}')
name=$(basename $file)
echo $DIR/src/qaTools/qaCompute -c 10 -d -i $file $OUT_DIR/$name.cov
done < $all_samples
exit
#!/bin/bash
#########################################
# metaSNP v1.1 - Initiate New Project #
#########################################
#
# Helper script to initiate a new project file structure.
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
# Variables
PROJECT_NAME="$1"
# Usage Message
display_usage(){
echo >&2 ""
echo >&2 " Usage: $(basename $0) project_name"
echo >&2 ""
}
required_parameter() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required parameter"
display_usage
exit 1
}
make_dir(){
echo >&2 ""
echo >&2 " Project name is available. New project directory created here: $PROJECT_NAME"
echo >&2 ""
mkdir -p $PROJECT_NAME/{cov,bestsplits,snpCaller,filtered/{pop,ind},distances}
}
file_exists(){
echo >&2 ""
echo >&2 " ERROR: The directory '$PROJECT_NAME' exists. Please choose a unique project name."
echo >&2 ""
}
# Check required parameters
[ -n "$PROJECT_NAME" ] || required_parameter "project_name"
# Make
if [ ! -d "$PROJECT_NAME" ]
then
make_dir
else
file_exists
fi
exit
#!/bin/bash
#########################################
# metaSNP Step II: `Genome Splitting` #
#########################################
#
# Helper script
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
# Abort on any errors
set -e
# Variables
wd=`pwd`
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
arg1="$1"
arg2="$2"
arg3="$3"
PROJECT_DIR=$arg1
GENOME_DEF=$arg2
NUM_SPLITS=10
SPLIT_LIMIT=100
COV_FILES=$PROJECT_DIR"/cov"
date=$(date +%Y-%m-%d)
#Usage Messages
display_usage() {
echo >&2 ""
echo >&2 ""
echo >&2 "Usage: $(basename $0) project_dir/ genome_def nr_splits[int]"
echo >&2 ""
echo >&2 "Parameters"
echo >&2 " Required:"
echo >&2 " project_dir/ = the metaSNP project directory"
echo >&2 " genome_def FILE = Contig ranges in BED format. (Fields: Contig_id, contigStart, contigEnd)"
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " nr_splits INT = INT for job parallelization (range: 1-100) [10]"
echo >&2 ""
echo >&2 "Note: Expecting 'metaSNP_COV' to be completed!"
echo >&2 ""
}
required_parameter() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required argument"
display_usage
exit 1
}
no_such_file() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
display_usage
exit 1
}
split_limit() {
echo >&2 ""
echo >&2 "ERROR: nr_splits[$1] exceeds the limit of 100"
echo >&2 ""
display_usage
exit 1
}
missing_input_dir() {
echo >&2 ""
echo >&2 "ERROR: '$1' missing in the project directory '$2'"
echo >&2 "ERROR: '$1' contains the input files (metaSNP_COV output)"
echo >&2 "SUGGESTION: Double check if '$2' is the intended project directory, or"
# echo >&2 "SUGGESTION: At this point the directory should have the coverage files in '$1'"
echo >&2 "SOLUTION: Initialize a new project (metaSNP_NEW) and rerun the coverage estimation (metaSNP_COV)."
echo >&2 ""
exit 1
}
rerun_cov() {
echo >&2 ""
echo >&2 "ERROR: Requires coverage estimations. Run 'metaSNP_COV' first"
echo >&2 ""
exit 1
}
make_dir() {
echo >&2 ""
echo >&2 "WARNING '$1' is not an initialized metaSNP project. Subdirectory '$(basename $2)' (output DIR) is missing."
echo >&2 "WARNING: If you used '$1' for the coverage estimation (metaSNP_COV) RUN:"
echo >&2 ""
echo >&2 " mkdir $2"
echo >&2 ""
echo >&2 "SUGGESTION: Double check if '$1' has the correct cov/ folder!"
echo >&2 ""
# display_usage
# mkdir $OUT_DIR
exit 1
}
# getopt to use -h flag
ARGS=$(getopt -o h -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
while true; do
case "$1" in
# Shift before to throw away option
# Shift after if option has a required positional argument
-h)
shift
display_usage
exit 1
;;
--)
shift
break
;;
esac
done
## Test
# Required parameters:
[ -z "$arg1" ] && required_parameter "project_dir"
[ -z "$arg2" ] && required_parameter "genome_def"
[ -n "$arg3" ] && NUM_SPLITS=$arg3
# Nr_Splits
[ "$NUM_SPLITS" -gt $SPLIT_LIMIT ] && split_limit $NUM_SPLITS
# Required files and directories
[ -d "$PROJECT_DIR" ] || no_such_file "$PROJECT_DIR"
[ -d "$COV_FILES" ] || missing_input_dir "$COV_FILES" "$PROJECT_DIR"
[ -f "$GENOME_DEF" ] || no_such_file "$GENOME_DEF"
# Required output DIR
[ -d "$PROJECT_DIR/bestsplits/" ] || make_dir "$PROJECT_DIR" "$PROJECT_DIR/bestsplits/"
# Get project name
#PROJECT_NAME=$(echo $PROJECT_DIR | awk -F"/" '{print $(NF-1)}')
PROJECT_NAME=$(basename $PROJECT_DIR)
# Get a summary for the coverage computation (Step I) from the .cov and .detail files
# List of coverage files for all samples
cd $COV_FILES
[ "$(ls *.cov)" ] || rerun_cov
allFile=$(ls *.cov)
for file in $allFile
do
echo $file
python $DIR/src/computeGenomeCoverage.py $file $file.detail $file.summary
done
echo -e "\nCoverage summary here: $PROJECT_DIR"
echo " Average vertical genome coverage: '$PROJECT_DIR$PROJECT_NAME.all_cov.tab'"
echo " Horizontal genome coverage (1X): '$PROJECT_DIR$PROJECT_NAME.all_perc.tab'"
ls *.summary | xargs awk -f $DIR/src/collapse_AvgCov.awk > ../$PROJECT_NAME.all_cov.tab
ls *.summary | xargs awk -f $DIR/src/collapse_PercCov.awk > ../$PROJECT_NAME.all_perc.tab
cd $wd
## DATABASE INDEXING - (Genome Splitting)
# Parameters:
cov="$PROJECT_DIR/$PROJECT_NAME.all_cov.tab"
perc="$PROJECT_DIR/$PROJECT_NAME.all_perc.tab"
genDef=$GENOME_DEF
outFile=$PROJECT_DIR"/bestsplits/best_split"
nr=$NUM_SPLITS
# Clear bestplits directory
echo -e >&2 "\nremoving old splits in:"
echo -e >&2 $PROJECT_DIR"/bestsplits/*"
remove=$PROJECT_DIR"/bestsplits/*"
#echo $remove
rm -rf $remove
echo -e >&2 "\nGenome Splitting:"
# usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
python $DIR/src/createOptimumSplit.py $cov $perc $genDef $nr $outFile
# Remove second Row - CAUTION: change in filtering.py as well
#awk '!(NR == 2)' $PROJECT_DIR$PROJECT_NAME.all_cov.tab > temp
#mv temp $PROJECT_DIR$PROJECT_NAME.all_cov.tab
#awk '!(NR == 2)' $PROJECT_DIR$PROJECT_NAME.all_perc.tab > temp
#mv temp $PROJECT_DIR$PROJECT_NAME.all_perc.tab
exit
\ No newline at end of file
#! /bin/bash
#########################################
# metaSNP Step II: `SNP calling` #
#########################################
#
# Helper script to set up the metaSNP caller
# -- The script will generate either one or several commandline jobs (parallelization)
# -- each job has to be run in order to compute SNPs for the entire population
#
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
# Abort on any errors
set -e
# Variables
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PROJECT_DIR="$1"
SAMPLES="$2"
REF_DB="$3"
# Optional
SPLITS=""
DB_ANNO=""
DEBUG=0
display_usage() {
echo >&2 ""
echo >&2 " Usage: $(basename $0) project_dir/ all_samples ref_db [options]"
echo >&2 ""
echo >&2 "Parameters (positional):"
echo >&2 " Required:"
echo >&2 " project_dir DIR = the project directory."
echo >&2 " all_samples FILE = list of bam files, one file per line."
echo >&2 " ref_db FILE = reference multi-sequence FASTA file used for the alignments."
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " -a db_ann FILE = database annotation."
echo >&2 " -l splits/ DIR = genome splits DIR for job parallelization (pre-processing)."
echo >&2 ""
echo >&2 "Note: Alternatively, use the '-l splits' option to call SNPs for specific species, contig regions (BED) or single positions (contig_id pos). Unlisted contigs/pos are skipped."
echo >&2 ""
}
required_parameter() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required parameter"
display_usage
exit 1
}
dir_missing() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
display_usage
exit 1
}
db_missing() {
echo >&2 ""
echo >&2 "ERROR: No reference database or annotation file found!"
echo >&2 "ERROR: '$1' is not a file."
echo >&2 "SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP caller"
display_usage
exit 1
}
make_dir() {
echo >&2 ""
echo >&2 "WARNING '$1' is not an initialized metaSNP project. Subdirectory '$(basename $2)' (output DIR) is missing."
echo >&2 "RUN:"
echo >&2 " mkdir $2"
echo >&2 ""
echo >&2 "Then rerun '$0 $ARGA'"
echo >&2 ""
# display_usage
# mkdir $OUT_DIR
exit 1
}
# Parse args using getopt (instead of getopts) to allow arguments before options
ARGA=$@
ARGS=$(getopt -o l:a:d:h -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
while true; do
case "$1" in
# Shift before to throw away option
# Shift after if option has a required positional argument
-l)
shift
SPLITS="$1/"
[ -d "$SPLITS" ] || dir_missing "$SPLITS"
best_splt=$(find $SPLITS -name "best_split_*")
best_splt="$best_splt"
shift
;;
-a)
shift
DB_ANNO="$1"
[ -f "$DB_ANNO" ] || db_missing "$DB_ANNO"
DB_ANNO="-g $DB_ANNO"
shift
;;
-d)
shift
DEBUG="$1"
shift
;;
-h)
shift
display_usage
exit 1
;;
--)
shift
break
;;
esac
done
# Check Files
# Required:
[ -n "$PROJECT_DIR" ] || required_parameter "project_dir/"
[ -d "$PROJECT_DIR" ] || dir_missing "$PROJECT_DIR"
[ -d "$PROJECT_DIR/snpCaller" ] || make_dir "$PROJECT_DIR" "$PROJECT_DIR/snpCaller"
[ -n "$SAMPLES" ] || required_parameter "all_samples"
[ -f "$SAMPLES" ] || dir_missing "$SAMPLES"
[ -n "$REF_DB" ] || required_parameter "ref_db"
[ -f "$REF_DB" ] || db_missing "$REF_DB"
REF_DB="-f $REF_DB"
# SNP-CALLER + I/O FILES
snpCaller="$DIR/src/snpCaller/snpCall"
indiv_out=$PROJECT_DIR"/snpCaller/indiv_called"
called_SP=$PROJECT_DIR"/snpCaller/called_SNPs"
## ACTUAL COMMANDLINE
# Note: Due to a bug in samtools v0.1.18, -Q 20 might be erroneous to use.
# Note: Different phred score scales might be disregarded.
# Note: If samtools > v0.1.18 is used -Q 20 filtering is highly recommended.
# Generate Commandline(s)
if [ "$SPLITS" ]
then
# Job parallelization (one command-line per split)
for split in $best_splt;
do
echo "samtools mpileup -l $split $REF_DB -B -b $SAMPLES | $snpCaller $REF_DB $DB_ANNO -i $indiv_out.$(echo $split | rev | cut -f1 -d/ | rev) > $called_SP.${split##*/}"
done
else
# Single batch jobs
echo "samtools mpileup $REF_DB -B -b $SAMPLES | $snpCaller $REF_DB $DB_ANNO -i $indiv_out > $called_SP"
fi
exit
#!/bin/env python
import os # interacting with operating system
import sys # interact with system commandline
import time # print current date/time
import argparse # manage command line options and arguments
#directory = os.path.realpath(__file__))
cwdir = os.getcwd()
print("\n")
#############################
# Parse Commandline Arguments
#############################
def get_arguments():
'''
Get commandline arguments and return namespace
'''
## Initialize Parser
parser = argparse.ArgumentParser(prog='metaSNP filtering', description='metaSNP filtering', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Not Showns:
parser.add_argument('--version', action='version', version='%(prog)s 1.0', help=argparse.SUPPRESS)
parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-v", "--vcf", action="store_true", help=argparse.SUPPRESS)#"output variants in variant call format (vcf4.2)"
# TODO: POPULAITON STATISTICS
## Population Statistics (Fst - ):
# parser.add_argument("-f", "--fst", action="store_true")
## pNpS ratio:
# parser.add_argument("-n", "--pnps", action="store_true")
# OPTIONAL arguments:
parser.add_argument('-b', metavar='FLOAT', type=float, help="Coverage breadth: Horizontal genome coverage percentage per sample per species", default=40.0)
parser.add_argument('-d', metavar='FLOAT', type=float, help="Coverage depth: Average vertical genome coverage per sample per species", default=5.0)
parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
# REQUIRED arguments:
parser.add_argument('percentage_file', help='input file with horizontal genome (taxon) coverage (breadth) per sample (percentage covered)', metavar='perc_FILE')
parser.add_argument('coverage_file', help='input file with average genome (taxon) coverage (depth) per sample (average number reads per site)', metavar='cov_FILE')
parser.add_argument('snp_files', nargs='+', help='input files from SNP calling', metavar='snp_FILE')
parser.add_argument('all_samples', help='list of input BAM files, one per line', metavar='all_samples')
parser.add_argument('outdir', help='output folder', action='store', metavar='output_dir/')
return parser.parse_args()
##############################
def debugging(taxids_of_interest, header_cov):
'''SanityCheck for the Dict of TaxID:Samples'''
nr_keys = 0
taxids_of_interest = []
for key in samples_of_interest:
nr_keys += 1
taxids_of_interest.append(key)
taxids_of_interest.sort()
print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
print(taxids_of_interest)
# print("\n")
# print("nr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
header = "\t".join(header_cov)
print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
# print("header: {}".format(header))
sys.exit("only filter 1")
def file_check():
''' Check if required files exist (True / False)'''
print("\nchecking for files...")
if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
print("found: '{}' \nfound:'{}'".format(args.coverage_file, args.percentage_file))
else:
sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))