### Refactor to make code simpler

`This code has fewer repetitions and is more readable`
parent 70ce9fa7
 import sys #We assume that contings are sorted within the genome, so that a contig won't show up #randomly in the file, but together with its friends. # We assume that contings are sorted within the genome, so that a contig won't # show up randomly in the file, but together with its friends. cov = open(sys.argv,'r') xcov = open(sys.argv,'r') ... ... @@ -15,42 +15,39 @@ while True: xcovL = xcov.readline() if not xcovL: break name = covL.split('\t') namex = xcovL.split('\t') covL = covL.split('\t') xcovL = xcovL.split('\t') name = covL namex = xcovL if name != namex: print name print namex print 'oups' print("Mismatch in names {} != {}".format(name, namex)) taxId = name.split('.') if taxId not in genomeMap: genomeMap[taxId] = [0.0,0.0,0.0,0.0] #why not [0,0.0,0.0,0.0] since 1st is int? genomeMap[taxId] = [0.0, 0.0, 0.0, 0.0] #Add the length genomeMap[taxId] += int(covL.split('\t')) #genomeMap[taxId] += 1 #Add the average coverage weighted with the length genomeMap[taxId] += float(covL.split('\t')) * int(covL.split('\t')) #genomeMap[taxId] += int(float(covL.split('\t'))) #Add the number of bases covered at 1x genomeMap[taxId] += int(xcovL.split('\t')) #Add the number of bases covered at 2x genomeMap[taxId] += int(xcovL.split('\t')) genomeMap[taxId] += int(covL) outf = open(sys.argv,'w') #Write header outf.write('TaxId\tAverage_cov\tPercentage_1x\tPercentage_2x\n') #Add the average coverage weighted with the length genomeMap[taxId] += float(covL) * int(covL) moreThan10x = 0 #Print the average coverage over the taxId's for k in genomeMap.keys(): if genomeMap[k]/genomeMap[k] > 10: moreThan10x += 1 outf.write('%s\t%f\t%f\t%f\n'%(k,genomeMap[k]/genomeMap[k],genomeMap[k]/genomeMap[k]*100,genomeMap[k]/genomeMap[k]*100)) #Add the number of bases covered at 1x genomeMap[taxId] += int(xcovL) outf.close() #Add the number of bases covered at 2x genomeMap[taxId] += int(xcovL) cov.close() xcov.close() #print moreThan10x print 'Done' with open(sys.argv,'w') as outf: #Write header outf.write('TaxId\tAverage_cov\tPercentage_1x\tPercentage_2x\n') #Print the average coverage over the taxId's for k in genomeMap: outf.write('%s\t%f\t%f\t%f\n'%(k, genomeMap[k]/genomeMap[k], genomeMap[k]/genomeMap[k]*100, genomeMap[k]/genomeMap[k]*100))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!