Commit e5bc3190 authored by Luis Pedro Coelho's avatar Luis Pedro Coelho
Browse files

RFCT Rewrite metaSNP_OPT in Python

At this point, there is no longer any bash scripts.
parent 3b3d86b5
#!/bin/bash
#!/usr/bin/env python
#########################################
# metaSNP Step II: `Database Indexing` #
#########################################
......@@ -6,192 +6,78 @@
# Helper script
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch
# Copyright (c) 2016 Robin Munch, 2017 Luis Pedro Coelho
# Licenced under the GNU General Public License (see LICENSE)
# Abort on any errors
set -e
# Variables
wd=`pwd`
DIR="$( dirname $( readlink -f "${BASH_SOURCE[0]}" ) )"
arg1="$1"
arg2="$2"
NUM_SPLITS="10"
date=$(date +%Y-%m-%d)
#Usage Messages
display_usage() {
echo >&2 ""
echo >&2 ""
echo >&2 "Usage: $(basename $0) project_dir/ genome_def nr_splits[int]"
echo >&2 ""
echo >&2 "Parameters"
echo >&2 " Required:"
echo >&2 " project_dir/ = the metaSNP project directory"
echo >&2 " genome_def FILE = Contig ranges in BED format. (Fields: Contig_id, contigStart, contigEnd)"
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " -n INT = split the workload into INT jobs for SNP call parallelization [10]"
echo >&2 ""
echo >&2 "Note: Expecting 'metaSNP_COV' to be completed!"
echo >&2 ""
}
required_parameter() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required argument"
display_usage
exit 1
}
param_non_integer() {
echo >&2 ""
echo >&2 "ERROR: '$2' is not an integer. Parameter ($1)."
display_usage
exit 1
}
no_such_file() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
display_usage
exit 1
}
split_limit() {
echo >&2 ""
echo >&2 "ERROR: nr_splits[$1] exceeds the limit of 100"
echo >&2 ""
display_usage
exit 1
}
missing_input_dir() {
echo >&2 ""
echo >&2 "ERROR: '$1' missing in the project directory '$2'"
echo >&2 "ERROR: '$1' contains the input files (metaSNP_COV output)"
echo >&2 "SUGGESTION: Double check if '$2' is the intended project directory, or"
# echo >&2 "SUGGESTION: At this point the directory should have the coverage files in '$1'"
echo >&2 "SOLUTION: Initialize a new project (metaSNP_NEW) and rerun the coverage estimation (metaSNP_COV)."
echo >&2 ""
exit 1
}
rerun_cov() {
echo >&2 ""
echo >&2 "ERROR: Requires coverage estimations. Run 'metaSNP_COV' first"
echo >&2 ""
exit 1
}
make_dir() {
echo >&2 ""
echo >&2 "WARNING '$1' is not an initialized metaSNP project. Subdirectory '$(basename $2)' (output DIR) is missing."
echo >&2 "WARNING: If you used '$1' for the coverage estimation (metaSNP_COV) RUN:"
echo >&2 ""
echo >&2 " mkdir $2"
echo >&2 ""
echo >&2 "SUGGESTION: Double check if '$1' has the correct cov/ folder!"
echo >&2 ""
# display_usage
# mkdir $OUT_DIR
exit 1
}
# getopt to use -h flag
ARGS=$(getopt -o hn: -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
while true; do
case "$1" in
# Shift before to throw away option
# Shift after if option has a required positional argument
-h)
shift
display_usage
exit 1
;;
-n)
shift
NUM_SPLITS="$1"
[[ $NUM_SPLITS =~ ^[0-9]+$ ]] || param_non_integer "-n INT" "$NUM_SPLITS"
shift
;;
--)
shift
break
;;
esac
done
## Test
# Required parameters:
[ -z "$arg1" ] && required_parameter "project_dir"
[ -z "$arg2" ] && required_parameter "genome_def"
# Read links
PROJECT_DIR="$(readlink -f $arg1)"
GENOME_DEF="$(readlink -f $arg2)"
COV_FILES="$PROJECT_DIR/cov"
# Required files and directories
[ -d "$PROJECT_DIR" ] || no_such_file "$PROJECT_DIR"
[ -d "$COV_FILES" ] || missing_input_dir "$COV_FILES" "$PROJECT_DIR"
[ -f "$GENOME_DEF" ] || no_such_file "$GENOME_DEF"
# Required output DIR
[ -d "$PROJECT_DIR/bestsplits/" ] || make_dir "$PROJECT_DIR" "$PROJECT_DIR/bestsplits/"
# Get project name
#PROJECT_NAME=$(echo $PROJECT_DIR | awk -F"/" '{print $(NF-1)}')
PROJECT_NAME=$(basename $PROJECT_DIR)
##
# Generate a summary for the coverage computation (Step I) from the .cov and .detail files
cd $COV_FILES
# List of coverage files for all samples
[ "$(ls *.cov)" ] || rerun_cov
allFile=$(ls *.cov)
for file in $allFile
do
echo $file
python $DIR/src/computeGenomeCoverage.py $file $file.detail $file.summary
done
echo -e "\nCoverage summary here: $PROJECT_DIR"
echo " Average vertical genome coverage: '$PROJECT_DIR/$PROJECT_NAME.all_cov.tab'"
echo " Horizontal genome coverage (1X): '$PROJECT_DIR/$PROJECT_NAME.all_perc.tab'"
ls *.summary | xargs awk -f $DIR/src/collapse_AvgCov.awk > ../$PROJECT_NAME.all_cov.tab
ls *.summary | xargs awk -f $DIR/src/collapse_PercCov.awk > ../$PROJECT_NAME.all_perc.tab
cd $wd
## DATABASE INDEXING - (Genome Splitting)
# Parameters:
cov="$PROJECT_DIR/$PROJECT_NAME.all_cov.tab"
perc="$PROJECT_DIR/$PROJECT_NAME.all_perc.tab"
genDef=$GENOME_DEF
outFile=$PROJECT_DIR"/bestsplits/best_split"
nr=$NUM_SPLITS
# Clear bestplits directory
echo -e >&2 "\nremoving old splits in:"
echo -e >&2 $PROJECT_DIR"/bestsplits/*"
remove=$PROJECT_DIR"/bestsplits/*"
#echo $remove
rm -rf $remove
echo -e >&2 "\nGenome Splitting:"
# usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
python $DIR/src/createOptimumSplit.py $cov $perc $genDef $nr $outFile
exit
import subprocess
from sys import stderr, exit
from os import path
import argparse
from glob import glob
import os
basedir = os.path.dirname(os.path.abspath(__file__))
def split_opt():
parser = argparse.ArgumentParser(description='Optimize project splits')
parser.add_argument('project_dir', metavar='PROJECT_DIR',
help='A metaSNP initialized project directory (metaSNP_NEW)')
parser.add_argument('db_ann', metavar='BED_FILE',
help='Database annotation (BED file).')
parser.add_argument('n_splits', metavar='INT', default=10, type=int, nargs='?',
help='Number of jobs to run simmultaneously')
args = parser.parse_args()
if not path.isdir(args.project_dir):
stderr.write("Cannot find project directory '{}'".format(args.project_dir))
args.print_help()
exit(1)
if args.n_splits > 100:
stderr.write("Maximum number of splits is 100.\n")
exit(1)
older_files = glob(args.project_dir + '/bestsplits/*')
if older_files:
stderr.write("\nremoving old splits.\n")
for f in older_files:
os.unlink(f)
cov_dir = path.join(args.project_dir, 'cov')
cov_files = glob(cov_dir + '/*.cov')
project_name = path.basename(args.project_dir)
if not cov_files:
stderr("Cov files not found. Please run metaSNP_COV.\n")
exit(1)
for f in cov_files:
cmd = ['python',
path.join(basedir, 'src/computeGenomeCoverage.py'),
f,
f + '.detail',
f + '.summary']
subprocess.call(cmd)
print("\nCoverage summary here: {}".format(args.project_dir))
print(" Average vertical genome coverage: '{}/{}.all_cov.tab'".format(args.project_dir, project_name))
print(" Horizontal genome coverage (1X): '{}/{}.all_perc.tab'".format(args.project_dir, project_name))
cmd = ['python',
'{}/src/collapse_coverages.py'.format(basedir),
args.project_dir]
subprocess.call(cmd)
print("\nGenome Splitting:")
# usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
cmd = ['python',
'{}/src/createOptimumSplit.py'.format(basedir),
"{}/{}.all_cov.tab".format(args.project_dir, project_name),
"{}/{}.all_perc.tab".format(args.project_dir, project_name),
args.db_ann,
str(args.n_splits),
path.join(args.project_dir, "bestsplits", "best_split")]
subprocess.call(cmd)
if __name__ == '__main__':
split_opt()
{
if( length(_[FNR]) < 1) {
_[FNR]=_[FNR] sprintf("%s\t",$1)
}
_[FNR]=_[FNR] sprintf("%s\t",$2)
}
END{
for(x=1; x < ARGC; x++) {
split(ARGV[x],a,".cov")
printf("\t%s",a[1])
}
printf("\n")
for(x=1; x < NR/(ARGC-1)+1; x++) {
print _[x]
}
}
\ No newline at end of file
{
if( length(_[FNR]) < 1) {
_[FNR]=_[FNR] sprintf("%s\t",$1)
}
_[FNR]=_[FNR] sprintf("%s\t",$3)
}
END{
for(x=1; x < ARGC; x++) {
split(ARGV[x],a,".cov")
printf("\t%s",a[1])
}
printf("\n")
for(x=1; x < NR/(ARGC-1)+1; x++) {
print _[x]
}
}
\ No newline at end of file
#!/usr/bin/env python
import sys
from glob import glob
from collections import defaultdict
from os import path
avg_cov = defaultdict(lambda : {})
per_cov = defaultdict(lambda : {})
project_dir = sys.argv[1]
project_name = path.basename(project_dir)
bamfiles = []
for f in sorted(glob(project_dir + '/cov/*.summary')):
bamfile = path.basename(f)[:-len('.cov.summary')]
for i,line in enumerate(open(f)):
if i == 0:
continue
tokens = line.rstrip().split()
avg_cov[tokens[0]][bamfile] = tokens[1]
per_cov[tokens[0]][bamfile] = tokens[2]
bamfiles.append(bamfile)
def write_matrix(cov, header, ofile):
out = open(ofile, 'wt')
out.write('\t')
out.write('\t'.join(bamfiles))
out.write('\n')
out.write('TaxId\t')
out.write('\t'.join([header for _ in bamfiles]))
out.write('\t') # backwards compatibility
out.write('\n')
for taxid in sorted(avg_cov.keys(), key=int):
c = cov[taxid]
out.write('{}\t'.format(taxid))
out.write('\t'.join([c[bf] for bf in bamfiles]))
out.write('\t') # backwards compatibility
out.write('\n')
out.close()
write_matrix(avg_cov, 'Average_cov', path.join(project_dir, '{}.all_cov.tab'.format(project_name)))
write_matrix(per_cov, 'Percentage_1x', path.join(project_dir, '{}.all_perc.tab'.format(project_name)))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment