Commit f573c72a authored by Robin Erich Muench's avatar Robin Erich Muench
Browse files

update filterin.py

parent 66a4392d
......@@ -188,26 +188,31 @@ def relevant_taxa(args):
sys.exit("ERROR: Coverage file headers do not match!") #Exit with error message
# Read taxon by taxon (line by line) check coverage conditions (thresholds)
for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
coverage = list(map(float, cov.split())) # convert list_of_strings 2 list_of_floats (TaxID=INT!)
percentage = list(map(float, perc.split())) # convert list_of_strings 2 list_of_floats
cov_taxID = int(coverage.pop(0)) # get and remove taxID (TaxID = INT)
perc_taxID = int(percentage.pop(0)) # get and remove taxID (TaxID = INT)
for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
cstring = cov.split()
pstring = perc.split()
cov_taxID = cstring.pop(0)
perc_taxID = pstring.pop(0)
coverage = list(map(float, cstring)) # convert list_of_strings 2 list_of_floats (TaxID=INT!)
percentage = list(map(float, pstring)) # convert list_of_strings 2 list_of_floats
sample_count = 0
sample_names = []
# print(cov_taxID,perc_taxID)
if cov_taxID != perc_taxID: # check if taxIDs match!
sys.exit("ERROR: TaxIDs in the coverage files are not in the same order!")
else:
for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
sample_count += 1
for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
sample_count += 1
if c >= args.cov and p >= args.perc:
sample_names.append(header_cov[sample_count-1]) # starting at 0 in array
if c >= args.cov and p >= args.perc:
if sample_count == len(header_cov) and len(sample_names) >= args.minsamples:
samples_of_interest[cov_taxID] = sample_names
sample_names.append(header_cov[sample_count-1]) # starting at 0 in array
if sample_count == len(header_cov) and len(sample_names) >= args.minsamples:
samples_of_interest[cov_taxID] = sample_names
return {'SoI':samples_of_interest, 'h':header_cov}#return dict()
COV.close()
......@@ -255,38 +260,37 @@ def header_comparison(header_cov):
def filter_two(args, snp_header):
'''position wise filtering'''
header_taxID = 0
header_taxID = ''
for best_split_x in args.snp_files:
with open(best_split_x, 'r') as file:
for snp_line in file: #position wise loop
snp_taxID = int(snp_line.split()[0].split('.')[0])#Name of Genome
# print(snp_line)
snp_taxID = snp_line.split()[0].split('.')[0]#Name of Genome
# print(snp_taxID)
if snp_taxID not in samples_of_interest.keys():#Check if Genome is of interest
continue #Taxon is not relevant, NEXT!
else:
sample_list = samples_of_interest[snp_taxID]#Load Sample List
# !!! Get indices - INDICES based on ORDER in COV/PERC file!!!
# !!! Get indices - INDICES based on ORDER in COV/PERC file!!!
sample_indices = []
for name in sample_list:
sample_indices.append(snp_header.index(name))
# print(snp_taxID)
# print("args.outdir")
# One outfile per Genome
if header_taxID != snp_taxID:#first SNP of new taxID
outfile = open(args.outdir+'%i.filtered.freq' % snp_taxID, 'w')
if header_taxID != snp_taxID:
outfile = open(args.outdir+'/'+'%s.filtered.freq' % snp_taxID, 'w')
print("Generating: {}".format(args.outdir+'/'+'%s.filtered.freq' % snp_taxID))
outfile.write('\t' + "\t".join(sample_list) + '\n')
if args.vcf == True:#VCF format
outfile_vcf = open(args.outdir+'%i.filtered.vcf' % snp_taxID, 'w')
print("Generating: {}".format(args.outdir+'%i.filtered.vcf' % snp_taxID))
outfile_vcf = open(args.outdir+'/'+'%s.filtered.vcf' % snp_taxID, 'w')
print("Generating: {}".format(args.outdir+'/'+'%s.filtered.vcf' % snp_taxID))
outfile_vcf.write(write_vcf_meta() + '\n')#Write VCF meta header
outfile_vcf.write(write_vcf_header()+'\n')#Write VCF line header
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment