Commit 379c46b9 authored by Robin Erich Muench's avatar Robin Erich Muench
Browse files

delimiter metaSNP_filtering.py

parent 6677c7ab
......@@ -22,7 +22,7 @@ def get_arguments():
# Not Showns:
parser.add_argument('--version', action='version', version='%(prog)s 1.0', help=argparse.SUPPRESS)
parser.add_argument("-d", "--debug", action="store_true", help=argparse.SUPPRESS)
parser.add_argument("--debug", action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-v", "--vcf", action="store_true", help=argparse.SUPPRESS)#"output variants in variant call format (vcf4.2)"
# TODO: POPULAITON STATISTICS
......@@ -33,11 +33,11 @@ def get_arguments():
# OPTIONAL arguments:
parser.add_argument('-p','--perc', type=float, help="Coverage breadth: Horizontal coverage cutoff (percentage taxon covered) per sample", default=40.0)
parser.add_argument('-c','--cov', type=float, help="Coverage depth: Average vertical coverage cutoff per taxon, per sample", default=5.0)
parser.add_argument('-m','--minsamples', type=int, help="Minimum number of sample required to pass the coverage cutoffs per genome", default=2)
parser.add_argument('-s','--snpc', type=float, help="FILTERING STEP II: SNP coverage cutoff", default=5.0)
parser.add_argument('-i','--snpi', type=float, help="FILTERING STEP II: SNP occurence (incidence) cutoff within samples_of_interest", default=0.50)
parser.add_argument('-b', metavar='FLOAT', type=float, help="Coverage breadth: Horizontal genome coverage percentage per sample per species", default=40.0)
parser.add_argument('-d', metavar='FLOAT', type=float, help="Coverage depth: Average vertical genome coverage per sample per species", default=5.0)
parser.add_argument('-m',metavar='INT', type=int, help="Minimum number of samples per species", default=2)
parser.add_argument('-c',metavar='FLOAT', type=float, help="FILTERING STEP II: minimum coverage per position per sample per species", default=5.0)
parser.add_argument('-p',metavar='FLOAT', type=float, help="FILTERING STEP II: required proportion of informative samples (coverage non-zero) per position", default=0.50)
# REQUIRED arguments:
parser.add_argument('percentage_file', help='input file with horizontal genome (taxon) coverage (breadth) per sample (percentage covered)', metavar='perc_FILE')
......@@ -94,16 +94,16 @@ def file_check():
def print_arguments():
## Print Defined Thresholds:
print("Options:")
if args.perc:
print("threshold: percentage covered (breadth) {}".format(args.perc) )
if args.cov:
print("threshold: average coverage (depth) {}".format(args.cov) )
if args.minsamples:
print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.minsamples) )
if args.snpc:
print("threshold: Min. coverage per position within a sample_of_interest {}".format(args.snpc) )
if args.snpi:
print("threshold: Min. position incidence among samples_of_interest {}".format(args.snpi) )
if args.b:
print("threshold: percentage covered (breadth) {}".format(args.b) )
if args.d:
print("threshold: average coverage (depth) {}".format(args.d) )
if args.m:
print("threshold: Min. number samples_of_interest per taxid_of_interest {}".format(args.m) )
if args.c:
print("threshold: Min. position coverage per sample within samples_of_interest {}".format(args.c) )
if args.p:
print("threshold: Min. proportion of covered samples in samples_of_interest {}".format(args.p) )
if args.vcf:
print("threshold: vcf {}".format(args.vcf) )
if args.outdir:
......@@ -141,7 +141,7 @@ def write_vcf_meta():
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Reads with ALT base">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Position Read Depth">
##FORMAT=<ID=AF,Number=2,Type=Integer,Description="Haplotype Quality">""".format(time.strftime("%Y%m%d"),args.perc,args.cov,args.minsamples,args.snpc,args.snpi)
##FORMAT=<ID=AF,Number=2,Type=Integer,Description="Haplotype Quality">""".format(time.strftime("%Y%m%d"),args.b,args.d,args.m,args.c,args.p)
return vcf_meta
......@@ -206,11 +206,11 @@ def relevant_taxa(args):
for c,p in zip(coverage,percentage): # two arrays with floats (taxID pop[removed])
sample_count += 1
if c >= args.cov and p >= args.perc:
if c >= args.d and p >= args.b:
sample_names.append(header_cov[sample_count-1]) # starting at 0 in array
if sample_count == len(header_cov) and len(sample_names) >= args.minsamples:
if sample_count == len(header_cov) and len(sample_names) >= args.m:
samples_of_interest[cov_taxID] = sample_names
......@@ -267,7 +267,7 @@ def filter_two(args, snp_header):
for snp_line in file: #position wise loop
# print(snp_line)
snp_taxID = snp_line.split()[0].split(']')[0]#Name of Genome change from . to ]
snp_taxID = snp_line.split()[0].split('.')[0]#Name of Genome change from . to ]
# print(snp_taxID)
if snp_taxID not in samples_of_interest.keys():#Check if Genome is of interest
......@@ -305,12 +305,12 @@ def filter_two(args, snp_header):
#FILTER: Count position below threshold:
nr_good = 0
for index in sample_indices:
if site_coverage[index] < args.snpc or site_coverage[index] == 0:
if site_coverage[index] < args.c or site_coverage[index] == 0:
continue#NOT enough coverage depth at position in sample, no increment
else:
nr_good += 1
#FILTER: Position incidence with sufficient coverage:
if float(nr_good)/len(sample_indices) < args.snpi:#if snp_incidence < x % drop SNP
if float(nr_good)/len(sample_indices) < args.p:#if snp_incidence < x % drop SNP
continue#mainly uninformative position, SNP incidence < x %, SNP droped!
......@@ -342,7 +342,7 @@ def filter_two(args, snp_header):
#For Frequency Tables:
alt_base = snp.split('|')[1]#alternative base
alt_bases.append(alt_base)#list of alt_bases
alt_bases_totalcov.append(sum([snp_coverage[i] for i in sample_indices if site_coverage[i] > args.snpc]))#total nr.reads with alt_bases
alt_bases_totalcov.append(sum([snp_coverage[i] for i in sample_indices if site_coverage[i] > args.c]))#total nr.reads with alt_bases
polymorphism = xS[2].split('[')[0]#N or S?
# # PopStats:
# if args.dNdS:
......@@ -365,7 +365,7 @@ def filter_two(args, snp_header):
for index in sample_indices:
#prevent division by zero! TODO:Notify usr about Nonsense Value (f.i. -c = 0!
if site_coverage[index] >= args.snpc and site_coverage[index] != 0:
if site_coverage[index] >= args.c and site_coverage[index] != 0:
snp_frq.append(snp_coverage[index]/site_coverage[index])
......@@ -395,8 +395,8 @@ def filter_two(args, snp_header):
# NS=sum(snp_coverage[i] for i in range(len(site_coverage)) if site_coverage[i] > 0)
PS = len(header_cov)#Pool Size Nr. All samples
NT = len(sample_indices)#Total NR Samples
NS = len([i for i in sample_indices if site_coverage[i] > args.snpc])#Nr.SoI with Data (>5X)
DP = sum([site_coverage[i] for i in sample_indices if site_coverage[i] > args.snpc])#TotalDepth:FilteredData
NS = len([i for i in sample_indices if site_coverage[i] > args.c])#Nr.SoI with Data (>5X)
DP = sum([site_coverage[i] for i in sample_indices if site_coverage[i] > args.c])#TotalDepth:FilteredData
allele_frqq = [round(a/DP,3) for a in alt_bases_totalcov]
AF=",".join([str(a) for a in allele_frqq])#
# MA = Major Allele
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment