Commit 9dbc0e7e authored by Robin Erich Muench

renamed scripts

parent 78a22ff6
#!/bin/bash
# usage: ./createCoverageComputationRun project_dir/ all_samples > toRunCov
display_usage() {
# ToDo: maybe ask for a unique project name parameter to make it easier to find the corresponding cov files later on
echo -e "\n\tUsage: $0 project_dir/ all_samples > toRunCov\n"
echo -e "\n\tUsage: $(basename $0) project_dir/ all_samples > toRunCov\n"
#echo -e "\tall_samples FILE --- list of input BAM filenames, one per line"
}
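# Hypothetical example of the all_samples input file (one BAM path per line):
#   /data/myproject/sample_A.bam
#   /data/myproject/sample_B.bam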
#echo $#
# print error message if no argument is supplied
if [ $# -ne 2 ]
then
@@ -18,21 +12,16 @@ then
exit 1
fi
# assign arguments to variables
wd=$(pwd)
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
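# DIR resolves to the directory containing this script, so the bundled tools
# (e.g. src/qaTools/qaCompute) are found regardless of the caller's working directory.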
all_samples=$2
project_name=$1
#results="results".$project_name
#location="results".$project_name"/cov"
results=$project_name
location=$project_name"/cov"
if [ ! -d "$results" ];
if [ ! -d "$project_name" ];
then
# Control will enter here if results $DIRECTORY doesn't exist.
(>&2 echo -e "\nSTDERR: cov directory does not exist. Initiate a NewProject first")
# Control will enter here if project $DIRECTORY doesn't exist.
(>&2 echo -e "\nSTDERR: project_dir/ does not exist. Initiate a new project first (metaSNP_NEW)")
display_usage
exit 1
else
@@ -42,22 +31,15 @@ else
# Control will enter here if coverage $DIRECTORY doesn't exist.
(>&2 echo -e "\nSTDERR: cov directory does not exist. It will be created")
mkdir $location
# else
# (>&2 echo -e "\nSTDERR: writing results to $location\n")
fi
fi
while read file
do
# Get name of sample -- TODO: problematic later on if the file name does not contain '.filtered'
# name=$(echo $file | awk -F"/" '{split($NF,a,".filtered"); print a[1]}')
# name=$(echo $file | awk -F"/" '{split($NF,a,"."); print a[1]}')
name=$(echo $file | awk -F"/" '{split($NF,a,"/"); print a[1]}')
# echo $file
# echo $name
echo $wd/src/qaTools/qaCompute -c 10 -d -i $file $location/$name.cov
name=$(echo $file | awk -F"/" '{split($NF,a,"/"); print a[1]}')
echo time $DIR/src/qaTools/qaCompute -c 10 -d -i $file $location/$name.cov
done < $all_samples
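# Each emitted line is one coverage job. With the hypothetical input above, a line would look like:
#   time /path/to/metaSNP/src/qaTools/qaCompute -c 10 -d -i /data/myproject/sample_A.bam project_dir/cov/sample_A.bam.cov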
#!/bin/bash
#########################################
# metaSNP v1.1 - Initiate New Project #
#########################################
#
# Helper script to initiate a new project file structure with a 'uniq_project_name'
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Code is (c) Copyright ???
# This code is released under LICENSE
# Variables
# directory=results.$1
PROJECTPATH=$1
directory=$PROJECTPATH
#printf '%s\n' "${PROJECTPATH##*/}"
# Usage Message
display_usage(){
echo -e "\nUsage: $0 project_dir\n"
}
# print error message if no PROJECTNAME argument is supplied
if [ $# -eq 0 ]
then
display_usage
exit 1
fi
# Check if the Project Directory already exists.
if [ ! -d "$directory" ]; then
# If $DIRECTORY doesn't exist.
echo -e "\nProject name is available. New project directory will be created here: $PROJECTPATH"
mkdir -p $directory/{cov,bestsplits,snpCaller,filtered/{pop,ind},distances}
else
echo -e "\nCAUTION: The directory '$PROJECTPATH' already exists.\n\t Use a unique project name or delete the existing directory.\n"
fi
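# For reference, the project layout created by the mkdir -p call above:
#   project_dir/{cov, bestsplits, snpCaller, filtered/{pop, ind}, distances}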
@@ -6,15 +6,22 @@
# Helper script
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Code is (c) Copyright ???
# This code is released under LICENSE
# Copyright (c) 2016 ???
# Licenced under LICENSE
# Abort on any errors
set -e
# Variables
wd=$(pwd)
wd=`pwd`
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
#echo "WD: $wd"
#echo "Dir: $DIR"
#echo "$(basename $0)"
#echo "$0"
#echo "$(dirname $0)"
#exit
arg1="$1"
arg2="$2"
arg3="$3"
@@ -39,14 +46,15 @@ display_usage() {
echo >&2 ""
echo >&2 ""
echo >&2 "Usage: $0 project_dir/ genome_def nr_splits[int]"
echo >&2 "Usage: $(basename $0) project_dir/ genome_def nr_splits[int]"
echo >&2 ""
# echo -e >&2 "\tParameters\n"
echo >&2 "Required Input:"
echo >&2 " project_dir/ = the project's result directory"
echo >&2 " genome_def FILE = Genome file (contig<tab>start_pos<tab>end_pos)\n"
echo >&2 "Optional Input:"
echo >&2 " nr_splits INT = INT for job parallelization (range: 1-100) [10] \n"
echo >&2 "Parameters"
echo >&2 " Required:"
echo >&2 " project_dir/ = the project's result directory"
echo >&2 " genome_def FILE = Contig ranges in BED format. (Fields: Contig_id, contigStart, contigEnd)"
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " nr_splits INT = INT for job parallelization (range: 1-100) [10]"
echo >&2 ""
}
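# Hypothetical genome_def example (BED-style, tab-separated, one contig range per line):
#   Contig_1    0    245000
#   Contig_2    0    113500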
@@ -115,13 +123,13 @@ for file in $allFile
do
echo $file
# exit 1
python $wd/src/computeGenomeCoverage.py $file $file.detail $file.summary
python $DIR/src/computeGenomeCoverage.py $file $file.detail $file.summary
done
# TODO: Nicer? add date?
ls *.summary | xargs awk -f $wd"/src/collapse_AvgCov.awk" > ../$PROJECT_NAME.all_cov.tab
ls *.summary | xargs awk -f $wd"/src/collapse_PercCov.awk" > ../$PROJECT_NAME.all_perc.tab
ls *.summary | xargs awk -f $DIR/src/collapse_AvgCov.awk > ../$PROJECT_NAME.all_cov.tab
ls *.summary | xargs awk -f $DIR/src/collapse_PercCov.awk > ../$PROJECT_NAME.all_perc.tab
#Remove second Row
@@ -135,6 +143,7 @@ cd $wd
# Genome Splitting
cov="$PROJECT_DIR/$PROJECT_NAME.all_cov.tab"
perc="$PROJECT_DIR/$PROJECT_NAME.all_perc.tab"
geneDef=$genome_def
outFile=$PROJECT_DIR"/bestsplits/best_split"
nr=$NUM_SPLITS
@@ -149,5 +158,5 @@ rm -rf $remove
echo -e >&2 "\nGenome Splitting:"
# usage: createOptimumSplit.py <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <outfile>
python $wd/src/createOptimumSplit.py $cov $geneDef $nr $outFile
python $DIR/src/createOptimumSplit.py $cov $perc $geneDef $nr $outFile
@@ -4,40 +4,43 @@
#########################################
#
# Helper script to set up the metaSNP caller
# -- The script will generate multiple commandline jobs
# -- The script will generate either one or several commandline jobs (parallelization)
# -- each job has to be run in order to compute SNPs for the entire population
#
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Code is (c) Copyright ???
# This code is released under LICENSE
# Copyright (c) 2016 Robin Muench
# Licenced under the GNU General Public License (see LICENSE)
# Abort on any errors
set -e
# Variables
DIR="$1"
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PROJECT_DIR="$1"
SAMPLES="$2"
# Default Variables
SPLITS=$DIR"/bestsplits/"
REF_DB="db/RepGenomesv9.fna"
DB_ANNO="db/RefOrganismDB_v9_gene.clean"
REF_DB="$3"
# Optional
SPLITS=""
DB_ANNO=""
DEBUG=0
display_usage() {
echo >&2 ""
echo >&2 " Usage: $0 [options] project_dir/ all_samples"
echo >&2 " Usage: $(basename $0) [options] project_dir/ all_samples ref_db"
echo >&2 ""
echo >&2 "Parameters:"
echo >&2 " Required:"
echo >&2 " project_dir DIR = the project directory."
echo >&2 " all_samples FILE = list of bam files, one file per line"
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " -l splits/ DIR = skip pre-processing, use existing bestsplits DIR"
echo >&2 " -f ref_db FILE = reference sequence FASTA file"
echo >&2 " -a db_ann FILE = database annotation"
echo >&2 ""
echo >&2 " all_samples FILE = list of bam files, one file per line."
echo >&2 " ref_db FILE = reference multi-sequence FASTA file used for the alignments."
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " -a db_ann FILE = database annotation."
# echo >&2 " -f ref_db FILE = reference sequence FASTA file (db/RepGenomesv9.fna)"
echo >&2 " -l splits/ DIR = genome splits DIR for job parallelization (pre-processing)."
echo >&2 ""
echo >&2 "Note: Alternatively, use the '-l splits' option to call SNPs for specific species, contig regions (BED) or single positions (contig_id pos). Unlisted contigs/pos are skipped."
echo >&2 ""
}
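# Hypothetical invocation (script and file names are placeholders):
#   <this_script> -a db_ann.tab -l project_dir/bestsplits/ project_dir/ all_samples ref_db.fna > toRunSNPcall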
required_parameter() {
@@ -65,7 +68,7 @@ db_missing() {
# Parse args using getopt (instead of getopts) to allow arguments before options
ARGS=$(getopt -o l:f:a:d:h -n "$0" -- "$@")
ARGS=$(getopt -o l:a:d:h -n "$0" -- "$@")
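# Options parsed below: -l splits/ DIR, -a db_ann FILE, -d (debug), -h (help).
# The reference FASTA is no longer passed via -f; it is now the third positional argument (ref_db).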
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
@@ -76,16 +79,23 @@ while true; do
-l)
shift
SPLITS="$1/"
[ -d "$SPLITS" ] || dir_missing "$SPLITS"
best_splt=$(find $SPLITS -name "best_split_*")
best_splt="$best_splt"
shift
;;
-f)
shift
REF_DB="$1"
shift
;;
# -f)
# shift
# REF_DB="$1"
# [ -f "$REF_DB" ] || db_missing "$REF_DB"
# REF_DB="-f $REF_DB"
# shift
# ;;
-a)
shift
DB_ANNO="$1"
[ -f "$DB_ANNO" ] || db_missing "$DB_ANNO"
DB_ANNO="-g $DB_ANNO"
shift
;;
-d)
@@ -108,33 +118,38 @@ done
# Check Files
# Required:
[ -n "$DIR" ] || required_parameter "project_dir/"
[ -d "$DIR" ] || dir_missing "$DIR"
[ -n "$PROJECT_DIR" ] || required_parameter "project_dir/"
[ -d "$PROJECT_DIR" ] || dir_missing "$PROJECT_DIR"
[ -n "$SAMPLES" ] || required_parameter "all_samples"
[ -f "$SAMPLES" ] || dir_missing "$SAMPLES"
# Optional:
[ -n "$REF_DB" ] || required_parameter "ref_db"
[ -f "$REF_DB" ] || db_missing "$REF_DB"
[ -f "$DB_ANNO" ] || db_missing "$DB_ANNO"
[ -d "$SPLITS" ] || dir_missing "$SPLITS"
REF_DB="-f $REF_DB"
# SNP-CALLER + I/O FILES
best_splt=$(find $SPLITS -name "best_split_*")
snpCaller="src/snpCaller/snpCall"
indiv_out=$DIR"/snpCaller/indiv_called"
called_SP=$DIR"/snpCaller/called_SNPs"
snpCaller="$DIR/src/snpCaller/snpCall"
indiv_out=$PROJECT_DIR"/snpCaller/indiv_called"
called_SP=$PROJECT_DIR"/snpCaller/called_SNPs"
## ACTUAL COMMANDLINE
# Note: Due to a bug in samtools v0.1.18, -Q 20 might be erroneous to use.
# Note: Different Phred score scales might be disregarded.
# Note: If samples are properly aligned, -Q 20 filtering is highly recommended.
# Generate Commandline
for split in $best_splt
do
echo "samtools mpileup -l $split -f $REF_DB -B -b $SAMPLES | $snpCaller -f $REF_DB -g $DB_ANNO -i $indiv_out.$(echo $split | rev | cut -f1 -d/ | rev) > $called_SP.${split##*/}"
done
# Note: If samtools > v0.1.18 is used, -Q 20 filtering is highly recommended.
# Generate Commandline(s)
if [ "$SPLITS" ]
then
# Job parallelization (one command-line per split)
for split in $best_splt;
do
echo "time samtools mpileup -l $split $REF_DB -B -b $SAMPLES | $snpCaller $REF_DB $DB_ANNO -i $indiv_out.$(echo $split | rev | cut -f1 -d/ | rev) > $called_SP.${split##*/}"
done
else
# Single batch jobs
echo "time samtools mpileup $REF_DB -B -b $SAMPLES | $snpCaller $REF_DB $DB_ANNO -i $indiv_out > $called_SP"
fi
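# Hypothetical example of one emitted job (with -l splits/ given; paths are placeholders):
#   time samtools mpileup -l project_dir/bestsplits/best_split_1 -f ref_db.fna -B -b all_samples | /path/to/metaSNP/src/snpCaller/snpCall -f ref_db.fna -g db_ann.tab -i project_dir/snpCaller/indiv_called.best_split_1 > project_dir/snpCaller/called_SNPs.best_split_1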
exit
@@ -35,7 +35,7 @@ def get_arguments():
# OPTIONAL arguments:
parser.add_argument('-p','--perc', type=float, help="Coverage breadth: Horizontal coverage cutoff (percentage taxon covered) per sample", default=40.0)
parser.add_argument('-c','--cov', type=float, help="Coverage depth: Average vertical coverage cutoff per taxon, per sample", default=5.0)
parser.add_argument('-m','--minsamples', type=int, help="Minimum number of samples that have to pass the filtering criteria in order to write an output for the representative genome", default=2)
parser.add_argument('-m','--minsamples', type=int, help="Minimum number of samples required to pass the coverage cutoffs per genome", default=2)
parser.add_argument('-s','--snpc', type=float, help="FILTERING STEP II: SNP coverage cutoff", default=5.0)
parser.add_argument('-i','--snpi', type=float, help="FILTERING STEP II: SNP occurrence (incidence) cutoff within samples_of_interest", default=0.50)
@@ -267,7 +267,7 @@ def filter_two(args, snp_header):
for snp_line in file: #position wise loop
# print(snp_line)
snp_taxID = snp_line.split()[0].split('.')[0]#Name of Genome
snp_taxID = snp_line.split()[0].split(']')[0] # Name of genome (field delimiter changed from '.' to ']')
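# Assumed record layout (hypothetical example): the first whitespace-separated column
# begins with the genome/taxon ID followed by ']', e.g. "12345]Contig_1:42".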
# print(snp_taxID)
if snp_taxID not in samples_of_interest.keys():#Check if Genome is of interest