Commit 86879e42 authored by Robin Erich Muench
Browse files

update createOptSplit.py

parent 444b593a
@@ -2,8 +2,10 @@ import sys
import operator
cov = open(sys.argv[1],'r')
genomes = open(sys.argv[2],'r')
nrToSplit = int(sys.argv[3])
perc = open(sys.argv[2],'r')
genomes = open(sys.argv[3],'r')
nrToSplit = int(sys.argv[4])
outf = sys.argv[5]
genomeDict = dict()
contigDict = dict()
@@ -35,15 +37,33 @@ for line in cov:
print 'Found %d genomes again. Hope they match'%(len(covDict))
perc.readline()
perc.readline()
percDict = dict()
for line in perc:
# Sum every value column (all tab-separated fields after the first) for this row
s = 0.0
l = line.rstrip().split('\t')
for i in range(1,len(l)):
s += float(l[i])
percDict[l[0]] = s
print 'Found %d genomes again. Hope they match'%(len(percDict))
table = []
# Approximate how many reads hit each genome. This is the closest available estimate of how long each genome will take to run.
for k in genomeDict.keys():
read = genomeDict[k]*covDict[k]/100
read = genomeDict[k]*covDict[k]/100 #*percDict[k]/100
table.append((k,read))
#Now, sort the table
t = sorted(table,key=operator.itemgetter(1),reverse=True)
#Now, sort the table
t = sorted(table,key=operator.itemgetter(1),reverse=True)#must be True
#print(head(t))
#Great, now get these into X bins
res = [0]*nrToSplit
@@ -55,7 +75,7 @@ for k in range(len(t)):
res[pos] += table[k][1]
names[t[k][0]] = pos
outf = sys.argv[4]
#outf = sys.argv[5]
#Open as many files as bins
fileDict = dict()
for i in xrange(nrToSplit):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment