Commit 667b6991 authored by Robin Erich Muench's avatar Robin Erich Muench
Browse files

updated error messages + metaSNP_distances.sh

parent 57049711
#!/bin/bash
download() {
mkdir -p db
download_9() {
[ -d "db" ] || mkdir -p db
cd db
#wget http://vm-lux.embl.de/~rmuench/db_f9.tar.xz
wget http://vm-lux.embl.de/~rmuench/files/db_f9.tar.bz2
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f9.tar.bz2
echo "This might take up to 10 minutes."
echo "Extracting files!"
echo "Patience please..."
#tar xfvJ db_f9.tar.xz
tar xfvj db_f9.tar.bz2
tar xjvf db_f9.tar.bz2
exit
}
download_11() {
[ -d "db" ] || mkdir -p db
cd db
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f11.tar.bz2
echo "This might take up to 20 minutes."
echo "Extracting files!"
echo "Patience please..."
tar xjvf db_f11.tar.bz2
exit
}
echo "You are about to download the metaSNP database"
echo ""
echo "Requirements:"
echo " - Freeze9: 12G available disc space (1753 genomes)"
echo " - Freeze11: 33G available disc space (5278 genomes)"
echo ""
echo "Requirement: 12G available disc space"
echo "Do you wish to download our standard database?"
echo "Please make a selection by entering the right number:"
select yn in "Yes" "No"; do
select yn in "f9" "f11" "cancel"; do
case $yn in
Yes ) download ; break;;
No ) exit;;
f9 ) download_9 ; break;;
f11 ) download_11; break;;
cancel ) exit;;
esac
done
exit
#!/bin/bash
display_usage() {
echo -e "\n\tUsage: $(basename $0) project_dir/ all_samples > toRunCov\n"
#echo -e "\tall_samples FILE --- list of input BAM filenames, on per line"
echo >&2 ""
echo >&2 " Usage: $(basename $0) project_dir/ all_samples"
echo >&2 ""
echo >&2 "Parameters:"
echo >&2 " Required:"
echo >&2 " project_dir/ DIR A metaSNP initialized project directory (metaSNP_NEW)"
echo >&2 " all_samples FILE Input list of bam files, one file per line."
echo >&2 ""
}
missing() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
display_usage
exit 1
}
make_dir() {
echo >&2 ""
echo >&2 "ERROR: '$1' is not a metaSNP project. Subdirectory '$2' is missing."
echo >&2 ""
echo >&2 "Please initialize a new metaSNP project first!"
echo >&2 ""
echo >&2 " RUN: metaSNP_NEW projectName"
echo >&2 "or"
echo >&2 " RUN: mkdir $2"
echo >&2 ""
# display_usage
# mkdir $OUT_DIR
exit 1
}
# print error message if no argument is supplied
if [ $# -ne 2 ]
then
......@@ -14,32 +43,45 @@ fi
# assign arguments to variables
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
all_samples=$2
project_name=$1
location=$project_name"/cov"
if [ ! -d "$project_name" ];
then
# Control will enter here if project $DIRECTORY doesn't exist.
(>&2 echo -e "\nSTDERR: project_dir/ does not exist. Initiate a new project first (metaSNP_NEW")
display_usage
exit 1
else
if [ ! -d "$location" ];
then
# Control will enter here if coverage $DIRECTORY doesn't exist.
(>&2 echo -e "\nSTDERR: cov directory does not exist. It will be created")
mkdir $location
fi
fi
all_samples="$2"
PROJECT_DIR="$1"
OUT_DIR="$PROJECT_DIR/cov"
# getopt to use -h flag
ARGS=$(getopt -o h -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
while true; do
case "$1" in
# Shift before to throw away option
# Shift after if option has a required positional argument
-h)
shift
display_usage
exit 1
;;
--)
shift
break
;;
esac
done
## Control
[ -d "$PROJECT_DIR" ] || missing "$PROJECT_DIR"
[ -f "$all_samples" ] || missing "$all_samples"
# Check required output directories
[ -d "$OUT_DIR" ] || make_dir "$PROJECT_DIR" "$OUT_DIR"
## Generate command lines
while read file
do
#Get name of sample -- TODO: problem later on if file name has no filtered
# name=$(echo $file | awk -F"/" '{split($NF,a,".filtered"); print a[1]}')
# name=$(echo $file | awk -F"/" '{split($NF,a,"."); print a[1]}')
name=$(echo $file | awk -F"/" '{split($NF,a,"/"); print a[1]}')
echo time $DIR/src/qaTools/qaCompute -c 10 -d -i $file $location/$name.cov
# name=$(echo $file | awk -F"/" '{split($NF,a,"/"); print a[1]}')
name=$(basename $file)
echo $DIR/src/qaTools/qaCompute -c 10 -d -i $file $OUT_DIR/$name.cov
done < $all_samples
exit
......@@ -3,43 +3,54 @@
# metaSNP v1.1 - Initiate New Project #
#########################################
#
# Helper script to initiate a new project file structure with a 'uniq_project_name'
# Helper script to initiate a new project file structure.
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Code is (c) Copyright ???
# This code is released under LICENSE
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
# Variables
# directory=results.$1
PROJECTPATH=$1
directory=$PROJECTPATH
#printf '%s\n' "${PROJECTPATH##*/}"
PROJECT_NAME="$1"
# Usage Message
display_usage(){
echo -e "\nUsage: $0 project_dir\n"
echo >&2 ""
echo >&2 " Usage: $(basename $0) project_name"
echo >&2 ""
}
# print error message if no PROJECTNAME argument is supplied
if [ $# -eq 0 ]
then
required_parameter() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required parameter"
display_usage
exit 1
fi
}
# Check if the Project Directory already exists.
if [ ! -d "$directory" ]; then
# If $DIRECTORY doesn't exist.
echo -e "\nProject name is available. New project directory will be created here: $PROJECTPATH"
make_dir(){
mkdir -p $directory/{cov,bestsplits,snpCaller,filtered/{pop,ind},distances}
echo >&2 ""
echo >&2 " Project name is available. New project directory created here: $PROJECT_NAME"
echo >&2 ""
mkdir -p $PROJECT_NAME/{cov,bestsplits,snpCaller,filtered/{pop,ind},distances}
else
echo -e "\nCAUTION: The directory '$PROJECTPATH' already exists.\n\t Use a uniq project name or delet the existing directory.\n"
fi
}
file_exists(){
echo >&2 ""
echo >&2 " ERROR: The directory '$PROJECT_NAME' exists. Please choose a unique project name."
echo >&2 ""
}
# Check required parameters
[ -n "$PROJECT_NAME" ] || required_parameter "project_name"
# Make
if [ ! -d "$PROJECT_NAME" ]
then
make_dir
else
file_exists
fi
exit
......@@ -6,8 +6,8 @@
# Helper script
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 ???
# Licenced under LICENSE
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
# Abort on any errors
......@@ -16,48 +16,49 @@ set -e
# Variables
wd=`pwd`
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
#echo "WD: $wd"
#echo "Dir: $DIR"
#echo "$(basename $0)"
#echo "$0"
#echo "$(dirname $0)"
#exit
arg1="$1"
arg2="$2"
arg3="$3"
genome_def=$arg2
PROJECT_DIR=$arg1
GENOME_DEF=$arg2
NUM_SPLITS=10
SPLIT_LIMIT=100
#PROJECT_NAME=$(echo $PROJECT_DIR | awk -F"." '{split($2,a,"/"); print a[1]}')
#PROJECT_NAME=$(echo $PROJECT_DIR | awk -F"/" '{print $(NF-1)}')
#PROJECT_NAME=$(basename $PROJECT_DIR)
COV_FILES=$PROJECT_DIR"/cov"
date=$(date +%Y-%m-%d)
#Usage Messages
display_usage() {
display_usage() {
echo >&2 ""
echo >&2 ""
echo >&2 "Usage: $(basename $0) project_dir/ genome_def nr_splits[int]"
echo >&2 ""
echo >&2 "Parameters"
echo >&2 " Required:"
echo >&2 " project_dir/ = the projects result directory"
echo >&2 " project_dir/ = the metaSNP project directory"
echo >&2 " genome_def FILE = Contig ranges in BED format. (Fields: Contig_id, contigStart, contigEnd)"
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " nr_splits INT = INT for job parallelization (range: 1-100) [10]"
echo >&2 ""
echo >&2 "Note: Expecting 'metaSNP_COV' to be completed!"
echo >&2 ""
}
required_parameter() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required argument"
display_usage
exit 1
}
no_such_file() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
display_usage
exit 1
}
split_limit() {
echo >&2 ""
......@@ -67,89 +68,112 @@ split_limit() {
exit 1
}
required_project_name() {
missing_input_dir() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required argument"
display_usage
echo >&2 "ERROR: '$1' missing in the project directory '$2'"
echo >&2 "ERROR: '$1' contains the input files (metaSNP_COV output)"
echo >&2 "SUGGESTION: Double check if '$2' is the intended project directory, or"
# echo >&2 "SUGGESTION: At this point the directory should have the coverage files in '$1'"
echo >&2 "SOLUTION: Initialize a new project (metaSNP_NEW) and rerun the coverage estimation (metaSNP_COV)."
echo >&2 ""
exit 1
}
required_genome_definition_file() {
rerun_cov() {
echo >&2 ""
echo >&2 "ERROR: Requires coverage estimations. Run 'metaSNP_COV' first"
echo >&2 ""
echo >&2 "ERROR: '$1' is a required argument"
display_usage
exit 1
}
make_dir() {
echo >&2 ""
echo >&2 "WARNING '$1' is not an initialized metaSNP project. Subdirectory '$(basename $2)' (output DIR) is missing."
echo >&2 "WARNING: If you used '$1' for the coverage estimation (metaSNP_COV) RUN:"
echo >&2 ""
echo >&2 " mkdir $2"
echo >&2 ""
echo >&2 "SUGGESTION: Double check if '$1' has the correct cov/ folder!"
echo >&2 ""
# display_usage
# mkdir $OUT_DIR
exit 1
}
# Conditions:
[ -z "$arg1" ] && required_project_name "project_dir"
[ -z "$arg2" ] && required_genome_definition_file "genome_def"
[ -n "$arg3" ] && NUM_SPLITS=$arg3
# getopt to use -h flag
ARGS=$(getopt -o h -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
while true; do
case "$1" in
# Shift before to throw away option
# Shift after if option has a required positional argument
-h)
shift
display_usage
exit 1
;;
--)
shift
break
;;
esac
done
# Set PROJECT_NAME
#PROJECT_NAME=$(echo $PROJECT_DIR | awk -F"." '{split($2,a,"/"); print a[1]}')
#PROJECT_NAME=$(echo $PROJECT_DIR | awk -F"/" '{print $(NF-1)}')
PROJECT_NAME=$(basename $PROJECT_DIR)
## Test
# Required parameters:
[ -z "$arg1" ] && required_parameter "project_dir"
[ -z "$arg2" ] && required_parameter "genome_def"
[ -n "$arg3" ] && NUM_SPLITS=$arg3
# Check Nr_Splits
# Nr_Splits
[ "$NUM_SPLITS" -gt $SPLIT_LIMIT ] && split_limit $NUM_SPLITS
# Check if directory exists
if [ ! -d "$PROJECT_DIR" ];
then
# Check if results $DIRECTORY exists.
(>&2 echo -e "\nSTDERR: '$PROJECT_DIR/cov' does not exist. Check if the project_name: '$PROJECT_DIR' is correct")
display_usage
exit 1
else
# Required files and directories
[ -d "$PROJECT_DIR" ] || no_such_file "$PROJECT_DIR"
[ -d "$COV_FILES" ] || missing_input_dir "$COV_FILES" "$PROJECT_DIR"
[ -f "$GENOME_DEF" ] || no_such_file "$GENOME_DEF"
cd $COV_FILES
# Required output DIR
[ -d "$PROJECT_DIR/bestsplits/" ] || make_dir "$PROJECT_DIR" "$PROJECT_DIR/bestsplits/"
fi
# Get project name
#PROJECT_NAME=$(echo $PROJECT_DIR | awk -F"/" '{print $(NF-1)}')
PROJECT_NAME=$(basename $PROJECT_DIR)
#
# Get a summary for the coverage computation (Step I) from the .cov and .detail files
#
# List of all coverage Files
# List of coverage files for all samples
cd $COV_FILES
[ "$(ls *.cov)" ] || rerun_cov
allFile=$(ls *.cov)
for file in $allFile
do
echo $file
# exit 1
python $DIR/src/computeGenomeCoverage.py $file $file.detail $file.summary
done
# TODO: Nicer? add date?
echo -e "\nCoverage summary here: $PROJECT_DIR"
echo " Average vertical genome coverage: '$PROJECT_DIR$PROJECT_NAME.all_cov.tab'"
echo " Horizontal genome coverage (1X): '$PROJECT_DIR$PROJECT_NAME.all_perc.tab'"
ls *.summary | xargs awk -f $DIR/src/collapse_AvgCov.awk > ../$PROJECT_NAME.all_cov.tab
ls *.summary | xargs awk -f $DIR/src/collapse_PercCov.awk > ../$PROJECT_NAME.all_perc.tab
#Remove second Row
#awk '!(NR == 2)' ../all_cov.tab > ../temp
#mv ../temp ../all_cov.tab
# Back to initial directory
cd $wd
## DATABASE INDEXING - (Genome Splitting)
# Genome Splitting
# Parameters:
cov="$PROJECT_DIR/$PROJECT_NAME.all_cov.tab"
perc="$PROJECT_DIR/$PROJECT_NAME.all_perc.tab"
geneDef=$genome_def
genDef=$GENOME_DEF
outFile=$PROJECT_DIR"/bestsplits/best_split"
nr=$NUM_SPLITS
# Empty bestplits directory
# Clear bestplits directory
echo -e >&2 "\nremoving old splits in:"
echo -e >&2 $PROJECT_DIR"/bestsplits/*"
remove=$PROJECT_DIR"/bestsplits/*"
......@@ -157,6 +181,12 @@ remove=$PROJECT_DIR"/bestsplits/*"
rm -rf $remove
echo -e >&2 "\nGenome Splitting:"
# usage createOptimumSplit.sh <all_cov.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
python $DIR/src/createOptimumSplit.py $cov $perc $geneDef $nr $outFile
# usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
python $DIR/src/createOptimumSplit.py $cov $perc $genDef $nr $outFile
# Remove second Row - CAUTION: change in filtering.py as well
#awk '!(NR == 2)' $PROJECT_DIR$PROJECT_NAME.all_cov.tab > temp
#mv temp $PROJECT_DIR$PROJECT_NAME.all_cov.tab
#awk '!(NR == 2)' $PROJECT_DIR$PROJECT_NAME.all_perc.tab > temp
#mv temp $PROJECT_DIR$PROJECT_NAME.all_perc.tab
exit
\ No newline at end of file
......@@ -8,7 +8,7 @@
# -- each job has to be run in order to compute SNPs for the entire population
#
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Muench
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
# Abort on any errors
......@@ -26,9 +26,9 @@ DEBUG=0
display_usage() {
echo >&2 ""
echo >&2 " Usage: $(basename $0) [options] project_dir/ all_samples ref_db"
echo >&2 " Usage: $(basename $0) project_dir/ all_samples ref_db [options]"
echo >&2 ""
echo >&2 "Parameters:"
echo >&2 "Parameters (positional):"
echo >&2 " Required:"
echo >&2 " project_dir DIR = the project directory."
echo >&2 " all_samples FILE = list of bam files, one file per line."
......@@ -36,7 +36,6 @@ display_usage() {
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " -a db_ann FILE = database annotation."
# echo >&2 " -f ref_db FILE = reference sequence FASTA file (db/RepGenomesv9.fna)"
echo >&2 " -l splits/ DIR = genome splits DIR for job parallelization (pre-processing)."
echo >&2 ""
echo >&2 "Note: Alternatively, use the '-l splits' option to call SNPs for specific species, contig regions (BED) or single positions (contig_id pos). Unlisted contigs/pos are skipped."
......@@ -61,13 +60,26 @@ db_missing() {
echo >&2 ""
echo >&2 "ERROR: No reference database or annotation file found!"
echo >&2 "ERROR: '$1' is not a file."
echo >&2 "SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP caller"
echo >&2 "SOLUTION: run getRefDB.sh or set up a custom database before running metaSNP caller"
display_usage
exit 1
}
make_dir() {
echo >&2 ""
echo >&2 "WARNING '$1' is not an initialized metaSNP project. Subdirectory '$(basename $2)' (output DIR) is missing."
echo >&2 "RUN:"
echo >&2 " mkdir $2"
echo >&2 ""
echo >&2 "Then rerun '$0 $ARGA'"
echo >&2 ""
# display_usage
# mkdir $OUT_DIR
exit 1
}
# Parse args using getopt (instead of getopts) to allow arguments before options
ARGA=$@
ARGS=$(getopt -o l:a:d:h -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
......@@ -84,13 +96,6 @@ while true; do
best_splt="$best_splt"
shift
;;
# -f)
# shift
# REF_DB="$1"
# [ -f "$REF_DB" ] || db_missing "$REF_DB"
# REF_DB="-f $REF_DB"
# shift
# ;;
-a)
shift
DB_ANNO="$1"
......@@ -121,6 +126,8 @@ done
[ -n "$PROJECT_DIR" ] || required_parameter "project_dir/"
[ -d "$PROJECT_DIR" ] || dir_missing "$PROJECT_DIR"
[ -d "$PROJECT_DIR/snpCaller" ] || make_dir "$PROJECT_DIR" "$PROJECT_DIR/snpCaller"
[ -n "$SAMPLES" ] || required_parameter "all_samples"
[ -f "$SAMPLES" ] || dir_missing "$SAMPLES"
......@@ -144,12 +151,12 @@ if [ "$SPLITS" ]
# Job parallelization (one command-line per split)
for split in $best_splt;
do
echo "time samtools mpileup -l $split $REF_DB -B -b $SAMPLES | $snpCaller $REF_DB $DB_ANNO -i $indiv_out.$(echo $split | rev | cut -f1 -d/ | rev) > $called_SP.${split##*/}"
echo "samtools mpileup -l $split $REF_DB -B -b $SAMPLES | $snpCaller $REF_DB $DB_ANNO -i $indiv_out.$(echo $split | rev | cut -f1 -d/ | rev) > $called_SP.${split##*/}"
done
else
# Single batch jobs
echo "time samtools mpileup $REF_DB -B -b $SAMPLES | $snpCaller $REF_DB $DB_ANNO -i $indiv_out > $called_SP"
echo "samtools mpileup $REF_DB -B -b $SAMPLES | $snpCaller $REF_DB $DB_ANNO -i $indiv_out > $called_SP"
fi
exit
......@@ -269,40 +269,27 @@ def filter_two(args, snp_header):
# print(snp_line)
snp_taxID = snp_line.split()[0].split('.')[0]#Name of Genome change from . to ]
# print(snp_taxID)
## SPECIES FILTER:
if snp_taxID not in samples_of_interest.keys():#Check if Genome is of interest
continue #Taxon is not relevant, NEXT!
else:
## SAMPLE FILTER: only load samples with enough coverage
sample_list = samples_of_interest[snp_taxID]#Load Sample List
# !!! Get indices - INDICES based on ORDER in COV/PERC file!!!
# !!! Sample order, get indices - INDICES based on ORDER in COV/PERC file!!!
sample_indices = []
for name in sample_list:
sample_indices.append(snp_header.index(name))
if header_taxID != snp_taxID:
outfile = open(args.outdir+'/'+'%s.filtered.freq' % snp_taxID, 'w')
print("Generating: {}".format(args.outdir+'/'+'%s.filtered.freq' % snp_taxID))
outfile.write('\t' + "\t".join(sample_list) + '\n')
if args.vcf == True:#VCF format
outfile_vcf = open(args.outdir+'/'+'%s.filtered.vcf' % snp_taxID, 'w')
print("Generating: {}".format(args.outdir+'/'+'%s.filtered.vcf' % snp_taxID))
outfile_vcf.write(write_vcf_meta() + '\n')#Write VCF meta header
outfile_vcf.write(write_vcf_header()+'\n')#Write VCF line header
header_taxID = snp_taxID#Jump to current TaxID
# CONDITIONS:
#..coverage at position > cX
#..occurrence < p% within SoIs
## POSITION FILTER:
# Positions with sufficient coverage (c) and support (p, proportion) in the discovery set (covered samples).
# Discovery set: sufficiently covered samples
#..site coverage >= cX
#..proportion >= p% (SoIs)
whole_line = snp_line.split()
site_coverage = list(map(int, whole_line[4].split('|'))) # Site coverages as list of ints
#FILTER: Count position below threshold:
nr_good = 0
for index in sample_indices:
if site_coverage[index] < args.c or site_coverage[index] == 0:
......@@ -316,15 +303,29 @@ def filter_two(args, snp_header):
else:# CALCULATE SNP ALLELE FREQUENCY: