Commit be897d95 authored by Robin Erich Muench's avatar Robin Erich Muench
Browse files

fix and update: genome splitting routine + helper script

parent ece0afa1
#!/bin/bash
#########################################
# metaSNP Step II: `Genome Splitting` #
# metaSNP Step II: `Database Indexing` #
#########################################
#
# Helper script
......@@ -15,17 +15,10 @@ set -e
# Variables
wd=`pwd`
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
DIR="$( dirname $( readlink -f "${BASH_SOURCE[0]}" ) )"
arg1="$1"
arg2="$2"
arg3="$3"
PROJECT_DIR=$arg1
GENOME_DEF=$arg2
NUM_SPLITS=10
SPLIT_LIMIT=100
COV_FILES=$PROJECT_DIR"/cov"
NUM_SPLITS="10"
date=$(date +%Y-%m-%d)
#Usage Messages
......@@ -40,7 +33,7 @@ display_usage() {
echo >&2 " genome_def FILE = Contig ranges in BED format. (Fields: Contig_id, contigStart, contigEnd)"
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " nr_splits INT = INT for job parallelization (range: 1-100) [10]"
echo >&2 " -n INT = split the workload into INT jobs for SNP call parallelization [10]"
echo >&2 ""
echo >&2 "Note: Expecting 'metaSNP_COV' to be completed!"
echo >&2 ""
......@@ -53,6 +46,13 @@ required_parameter() {
exit 1
}
# Abort with an error when a numeric option received a non-integer value.
#   $1 - option description shown in the message (e.g. "-n INT")
#   $2 - the offending value
# Prints to stderr, shows usage, and exits non-zero.
param_non_integer() {
  printf '\n' >&2
  printf "ERROR: '%s' is not an integer. Parameter (%s).\n" "$2" "$1" >&2
  display_usage
  exit 1
}
no_such_file() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
......@@ -102,7 +102,7 @@ make_dir() {
# getopt to use -h flag
ARGS=$(getopt -o h -n "$0" -- "$@")
ARGS=$(getopt -o hn: -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
......@@ -115,6 +115,13 @@ while true; do
display_usage
exit 1
;;
-n)
shift
NUM_SPLITS="$1"
[[ $NUM_SPLITS =~ ^[0-9]+$ ]] || param_non_integer "-n INT" "$NUM_SPLITS"
shift
;;
--)
shift
break
......@@ -126,10 +133,11 @@ done
# Required parameters:
[ -z "$arg1" ] && required_parameter "project_dir"
[ -z "$arg2" ] && required_parameter "genome_def"
[ -n "$arg3" ] && NUM_SPLITS=$arg3
# Nr_Splits
[ "$NUM_SPLITS" -gt $SPLIT_LIMIT ] && split_limit $NUM_SPLITS
# Read links
PROJECT_DIR="$(readlink -f $arg1)"
GENOME_DEF="$(readlink -f $arg2)"
COV_FILES="$PROJECT_DIR/cov"
# Required files and directories
[ -d "$PROJECT_DIR" ] || no_such_file "$PROJECT_DIR"
......@@ -139,14 +147,16 @@ done
# Required output DIR
[ -d "$PROJECT_DIR/bestsplits/" ] || make_dir "$PROJECT_DIR" "$PROJECT_DIR/bestsplits/"
# Get project name
# Get project name
#PROJECT_NAME=$(echo $PROJECT_DIR | awk -F"/" '{print $(NF-1)}')
PROJECT_NAME=$(basename $PROJECT_DIR)
# Get a summary for the coverage computation (Step I) from the .cov and .detail files
# List of coverage files for all samples
##
# Generate a summary for the coverage computation (Step I) from the .cov and .detail files
cd $COV_FILES
# List of coverage files for all samples
[ "$(ls *.cov)" ] || rerun_cov
allFile=$(ls *.cov)
......@@ -157,8 +167,8 @@ do
done
echo -e "\nCoverage summary here: $PROJECT_DIR"
echo " Average vertical genome coverage: '$PROJECT_DIR$PROJECT_NAME.all_cov.tab'"
echo " Horizontal genome coverage (1X): '$PROJECT_DIR$PROJECT_NAME.all_perc.tab'"
echo " Average vertical genome coverage: '$PROJECT_DIR/$PROJECT_NAME.all_cov.tab'"
echo " Horizontal genome coverage (1X): '$PROJECT_DIR/$PROJECT_NAME.all_perc.tab'"
ls *.summary | xargs awk -f $DIR/src/collapse_AvgCov.awk > ../$PROJECT_NAME.all_cov.tab
ls *.summary | xargs awk -f $DIR/src/collapse_PercCov.awk > ../$PROJECT_NAME.all_perc.tab
......@@ -184,9 +194,4 @@ echo -e >&2 "\nGenome Splitting:"
# usage createOptimumSplit.sh <all_cov.tab> <all_perc.tab> <geneDefinitions> <INT_NrSplits> <.outfile>
python $DIR/src/createOptimumSplit.py $cov $perc $genDef $nr $outFile
# Remove second Row - CAUTION: change in filtering.py as well
#awk '!(NR == 2)' $PROJECT_DIR$PROJECT_NAME.all_cov.tab > temp
#mv temp $PROJECT_DIR$PROJECT_NAME.all_cov.tab
#awk '!(NR == 2)' $PROJECT_DIR$PROJECT_NAME.all_perc.tab > temp
#mv temp $PROJECT_DIR$PROJECT_NAME.all_perc.tab
exit
\ No newline at end of file
exit
import pdb
import sys
import operator
from collections import defaultdict
import numpy as np
cov = open(sys.argv[1],'r')
perc = open(sys.argv[2],'r')
......@@ -7,25 +11,22 @@ genomes = open(sys.argv[3],'r')
nrToSplit = int(sys.argv[4])
outf = sys.argv[5]
genomeDict = dict()
contigDict = dict()
genomeLen = defaultdict(int)
genomeContigs = defaultdict(list)
for line in genomes:
genome = line.split('\t')[0].split('.')[0]
leng = int(line.rstrip().split('\t')[2])
if genome not in genomeDict:
genomeDict[genome] = 0
contigDict[genome] = list()
genomeDict[genome] += leng
contigDict[genome].append(line)
genomeLen[genome] += leng
genomeContigs[genome].append(line)
print 'Found %d genomes'%(len(genomeDict))
print('Found {0} genomes'.format(len(genomeLen)))
#drop header
cov.readline()
cov.readline()
covDict = dict()
coverage = dict()
for line in cov:
#Get the sum coverage
......@@ -33,57 +34,28 @@ for line in cov:
l = line.rstrip().split('\t')
for i in range(1,len(l)):
s += float(l[i])
covDict[l[0]] = s
print 'Found %d genomes again. Hope they match'%(len(covDict))
coverage[l[0]] = s
perc.readline()
perc.readline()
percDict = dict()
for line in perc:
#Get the sum coverage
s = 0.0
l = line.rstrip().split('\t')
for i in range(1,len(l)):
s += float(l[i])
percDict[l[0]] = s
print 'Found %d genomes again. Hope they match'%(len(percDict))
table = []
# Get an approximation of how many reads hit each genome. This is as close as
# you will get to figuring out how long running it is going to take
for k in genomeDict.keys():
read = genomeDict[k]*covDict[k]/100 #*percDict[k]/100
table.append((k,read))
#Now, sort the table
t = sorted(table,key=operator.itemgetter(1),reverse=True)#must be True
#print(head(t))
#Great, now get these into X bins
res = [0]*nrToSplit
names = dict()
for k in range(len(t)):
#Put this in the smallest one!
pos = res.index(min(res))
res[pos] += table[k][1]
names[t[k][0]] = pos
#outf = sys.argv[5]
#Open as many files as bins
fileDict = dict()
for i in xrange(nrToSplit):
fileDict[i] = open(outf+'_'+str(i),'w')
for k in names.keys():
genomeName = k
for c in contigDict[k]:
fileDict[names[k]].write(c)
# you will get to figuring out how long running it is going to take:
# Workload proxy per genome: total contig length times the summed coverage
# value read earlier for that genome (see the `coverage` dict filled above).
for k in genomeLen.keys():
    reads = genomeLen[k]*coverage[k]
    table.append((reads, k))
# We greedily assign each genome in descending order to the less heavily used
# bin:
# One output file per bin: <outf>_0 .. <outf>_{nrToSplit-1}.
outputs = [open('{}_{}'.format(outf, i), 'w') for i in range(nrToSplit)]
weight = [0 for _ in range(nrToSplit)]
# Largest-first greedy bin packing: each (weight, genome) pair goes into the
# currently lightest bin, and all of that genome's contig lines (kept verbatim
# from the genome-definition file in `genomeContigs`) are written to that bin.
for (w, g) in sorted(table, reverse=True):
    pos = np.argmin(weight)
    weight[pos] += w
    for c in genomeContigs[g]:
        outputs[pos].write(c)
# Flush and close every split file so downstream SNP-calling jobs see
# complete inputs.
for o in outputs:
    o.close()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment