Commit 646ddda7 authored by Robin Erich Muench

inconsistent use of tabs and spaces in indentation

parent 2c1fc3c7
#!/usr/bin/env python
import os
import sys
import time
import argparse
import glob
try:
    import numpy as np
except ImportError:
    sys.stderr.write("Numpy is necessary to run postfiltering.\n")
    sys.exit(1)
try:
    import pandas as pd
except ImportError:
    sys.stderr.write("Pandas is necessary to run postfiltering.\n")
    sys.exit(1)
basedir = os.path.dirname(os.path.abspath(__file__))
@@ -24,103 +23,101 @@ basedir = os.path.dirname(os.path.abspath(__file__))
# Parse Commandline Arguments
#############################
def get_arguments():
    '''
    Get commandline arguments and return namespace
    '''
    ## Initialize Parser
    parser = argparse.ArgumentParser(prog='metaSNV_post.py', description='metaSNV post processing', epilog='''Note:''', formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Not shown in --help:
    parser.add_argument('--version', action='version', version='%(prog)s 2.0', help=argparse.SUPPRESS)
    parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)

    # TODO: POPULATION STATISTICS
    ## Population Statistics (Fst - ):
    # parser.add_argument("-f", "--fst", action="store_true")
    ## pNpS ratio:
    # parser.add_argument("-n", "--pnps", action="store_true")

    # OPTIONAL arguments:
    parser.add_argument('-b', metavar='FLOAT', type=float, default=40.0,
                        help="Coverage breadth: minimal horizontal genome coverage percentage per sample per species")
    parser.add_argument('-d', metavar='FLOAT', type=float, default=5.0,
                        help="Coverage depth: minimal average vertical genome coverage per sample per species")
    parser.add_argument('-m', metavar='INT', type=int, help="Minimum number of samples per species", default=2)
    parser.add_argument('-c', metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
    parser.add_argument('-p', metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
    parser.add_argument('-div', action='store_true', help="Compute diversity measures")

    # REQUIRED arguments:
    parser.add_argument('projdir', help='project name', metavar='Proj')
    return parser.parse_args()
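
# A minimal usage sketch for the parser above ('myproj' is a hypothetical
# project directory; the flag values simply restate the defaults):
#
#   $ python metaSNV_post.py myproj -b 40.0 -d 5.0 -m 2 -c 5.0 -p 0.50 -div
#
# get_arguments() then returns a namespace such as
#   Namespace(b=40.0, d=5.0, m=2, c=5.0, p=0.5, div=True, projdir='myproj')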
##############################
def debugging(taxids_of_interest, header_cov):
    '''Sanity check for the Dict of TaxID:Samples'''
    nr_keys = 0
    taxids_of_interest = []
    for key in samples_of_interest:
        nr_keys += 1
        taxids_of_interest.append(key)
    taxids_of_interest.sort()
    print("DEBUGGING:\nNr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
    print(taxids_of_interest)
    # print("\n")
    # print("nr_TaxIDs_of_Interest.keys: {}".format(nr_keys))
    header = "\t".join(header_cov)
    print("DEBUGGING: no. samples in header: {}\n".format(len(header_cov)))
    # header = "\t".join(header_cov)
    # print("header: {}".format(header))
    sys.exit("only filter 1")
def file_check():
    ''' Check if required files exist (True / False)'''
    args.projdir = args.projdir.rstrip('/')
    args.coverage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_cov.tab'
    args.percentage_file = args.projdir+'/'+args.projdir.split('/')[-1]+'.all_perc.tab'
    args.all_samples = args.projdir+'/'+'all_samples'
    print("Checking for necessary input files...")
    if os.path.isfile(args.coverage_file) and os.path.isfile(args.percentage_file):
        print("found: '{}'\nfound: '{}'".format(args.coverage_file, args.percentage_file))
    else:
        sys.exit("\nERROR: No such file '{}',\nERROR: No such file '{}'".format(args.coverage_file, args.percentage_file))
    if os.path.isfile(args.all_samples):
        print("found: '{}'\n".format(args.all_samples))
    else:
        sys.exit("\nERROR: No such file '{}'".format(args.all_samples))
def print_arguments():
    ## Print Defined Thresholds:
    print("Options:")
    if args.b:
        print("threshold: percentage covered (breadth) {}".format(args.b))
    if args.d:
        print("threshold: average coverage (depth) {}".format(args.d))
    if args.m:
        print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m))
    if args.c:
        print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c))
    if args.p:
        print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p))
    print("")
###############################
## FILTER I: Count SAMPLES_OF_INTEREST per TAXON_OF_INTEREST
# Example: "Which taxon has at least 10 SoI?"
# --Coverage Conditions:
# Sample of Interest:
# 1. breadth > 40 %
# 2. depth > 5 X
# --Min. number Samples:
# 3. min samples/TaxID > 10
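
# A self-contained sketch of the Filter I conditions above, with illustrative
# sample names and values (not taken from this script):
#
#   samples = ['S1', 'S2', 'S3']
#   breadth = [45.0, 12.0, 80.0]  # horizontal coverage in %
#   depth   = [ 6.0,  9.0,  2.0]  # average vertical coverage in X
#   soi = [s for s, p, c in zip(samples, breadth, depth) if p >= 40.0 and c >= 5.0]
#   # soi == ['S1']: only S1 clears both the breadth (>40%) and depth (>5X) cutoffs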
@@ -139,9 +136,9 @@ def relevant_taxa(args):
sys.exit("ERROR: Coverage file headers do not match!") #Exit with error message
# Read taxon by taxon (line by line) check coverage conditions (thresholds)
for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
for cov,perc in zip(COV,PER): # Line_cov: TaxID \t cov_valueS1 \t cov_value_S2[...]\n (no white spaces)
cstring = cov.split()
pstring = perc.split()
pstring = perc.split()
cov_taxID = cstring.pop(0)
perc_taxID = pstring.pop(0)
coverage = list(map(float, cstring)) # convert list_of_strings 2 list_of_floats (TaxID=INT!)
@@ -154,10 +151,10 @@ def relevant_taxa(args):
        if cov_taxID != perc_taxID: # check if taxIDs match!
            sys.exit("ERROR: TaxIDs in the coverage files are not in the same order!")
        for c, p in zip(coverage, percentage): # two float arrays (taxID popped off above)
            sample_count += 1
            if c >= args.d and p >= args.b:
                sample_names.append(header_cov[sample_count-1]) # starting at 0 in array
@@ -178,7 +175,7 @@ def header_comparison(header_cov):
    '''Comparing the sample order in the coverage and all_samples file'''
    # sort snp_indices by sample_of_interest names
    header = "\t".join(header_cov)
    #header = "\t".join(header_cov)
    #print("\nNr.Samples_cov_header: {}".format(len(header_cov)))
    #print("cov_header: {}\n\n".format(header_cov[:10]))
    #print("\ncov_header: {}\n {}\n\n".format(len(header_cov),header))
@@ -187,21 +184,21 @@ def header_comparison(header_cov):
    all_samples = open(args.all_samples, 'r')
    snp_header = all_samples.read().splitlines()
    snp_header = [i.split('/')[-1] for i in snp_header] # get name /trim/off/path/to/sample.name.bam
    snp_header_joined = "\t".join(snp_header)
    #print("Nr.Samples in (all_samples): {}".format(len(snp_header)))
    #print("all_samples: {}\n".format(snp_header[:10]))

    # Compare the sample order in ALL.COV/ALL.PERC and all_samples
    #print("Comparing headers..")
    #snp_header_joined = "\t".join(snp_header)
    #if header == snp_header_joined:
    #    print("Headers match nicely\n")
    #else:
    #    print("CAUTION: Header in COV_FILE does not match the order of samples in the SNP_FILES,\n\t no problem, we took care of it!\n")

    return snp_header
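
# For reference, the name trimming above works like this (path is illustrative):
#   '/path/to/sample1.bam'.split('/')[-1]  ->  'sample1.bam'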
#################################
## FILTER II: ACQUIRE SNPs WITH SUFFICIENT OCCURRENCE WITHIN SAMPLES_OF_INTEREST
# --SNP Conditions (Default):
@@ -210,21 +207,21 @@ def header_comparison(header_cov):
def filter_two(args, snp_header, snp_files, outdir):
    '''position wise filtering'''
    header_taxID = ''
    snp_taxID = '_'
    # print(locals())
    for best_split_x in snp_files:
        with open(best_split_x, 'r') as file:
            for snp_line in file: # position-wise loop
                snp_taxID = snp_line.split()[0].split('.')[0] # name of genome (change from . to ])
                ## SPECIES FILTER:
                if snp_taxID not in samples_of_interest.keys(): # check if genome is of interest
                    continue # taxon is not relevant, NEXT!
                else:
                    ## SAMPLE FILTER: only load samples with enough coverage
                    sample_list = samples_of_interest[snp_taxID] # load sample list
@@ -248,9 +245,9 @@ def filter_two(args, snp_header, snp_files, outdir):
                    nr_good += 1
                    # FILTER: position incidence with sufficient coverage:
                    if float(nr_good)/len(sample_indices) < args.p: # if snp_incidence < x%, drop SNP
                        continue # mainly uninformative position, SNP incidence < x%, SNP dropped!
                    else:
                        # CALCULATE SNP ALLELE FREQUENCY:
                        # If at least one position passed cutoffs, open file (if new TaxID):
@@ -261,42 +258,42 @@ def filter_two(args, snp_header, snp_files, outdir):
                            outfile = open(outdir+'/'+'%s.filtered.freq' % snp_taxID, 'w')
                            print("Generating: {}".format(outdir+'/'+'%s.filtered.freq' % snp_taxID))
                            outfile.write('\t' + "\t".join(sample_list) + '\n')
                            header_taxID = snp_taxID # jump to current TaxID

                        # Loop through alternative alleles [5] (comma separated):
                        line_id = ":".join(whole_line[:4]) # line_id composed of CHROM:REFGENE:POS:REFBASE
                        reference_base = whole_line[3]
                        alt_bases_totalcov = [] # total coverage per alt allele
                        # VCF format
                        # LOOP start:
                        for snp in whole_line[5].split(','):
                            xS = snp.split('|')
                            snp_coverage = list(map(float, xS[3:])) # coverage string (unfiltered!)

                            # Sanity check:
                            if len(site_coverage) != len(snp_coverage):
                                print("ERROR: SNP FILE {} is corrupted".format(best_split_x))
                                sys.exit("ERROR: Site coverage and SNP coverage string have uneven length!")

                            # Compute allele frequency tables:
                            alt_base = snp.split('|')[1] # alternative base

                            # FREQUENCY computation
                            total_reads = 0
                            snp_frq = [] # frequencies for SNPs (pos > 5X in at least 50% of the SoIs)
                            for index in sample_indices:
                                # prevent division by zero! TODO: notify user about nonsense values (e.g. -c = 0)
                                if site_coverage[index] >= args.c and site_coverage[index] != 0:
                                    snp_frq.append(snp_coverage[index]/site_coverage[index])
                                else:
                                    snp_frq.append(-1)

                            # WRITE OUTPUT Allele Frequencies (Default)
                            outfile.write(":".join(snp_line.split()[:4])+'>'+alt_base+':'+xS[2]+'\t'+"\t".join(str(x) for x in snp_frq)+'\n')
@@ -313,20 +310,20 @@ def alleledist(d1,d2, threshold=.6):
def computeAllDist(args):
    print("Computing distances")
    allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
    for f in allFreq:
        data = pd.read_table(f, index_col=0, na_values=['-1']).T
        dist = [[l1nonans(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
        dist.to_csv(args.projdir+'/distances/'+'%s.mann.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')

        dist = [[alleledist(data.iloc[i], data.iloc[j]) for i in range(len(data))] for j in range(len(data))]
        dist = pd.DataFrame(dist, index=data.index, columns=data.index)
        dist.to_csv(args.projdir+'/distances/'+'%s.allele.dist' % f.split('/')[-1].replace('.freq',''), sep='\t')
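
# l1nonans() and alleledist() are defined elsewhere in this file (the hunk
# header above shows alleledist(d1, d2, threshold=.6)); a plausible sketch of
# the two distance helpers used here, assuming NaN-masked pandas frequency
# vectors (pandas skips NaNs in .mean()):
#
#   def l1nonans(d1, d2):
#       # mean absolute allele-frequency difference over shared, non-NA positions
#       return np.abs(d1 - d2).mean()
#
#   def alleledist(d1, d2, threshold=.6):
#       # fraction of positions whose frequencies differ by more than the threshold
#       return (np.abs(d1 - d2) > threshold).mean()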