From 1c7e1b7e33528afa4174d9e6e6337570324398c4 Mon Sep 17 00:00:00 2001
From: Tobias Marschall <tobias.marschall@0ohm.net>
Date: Thu, 1 Nov 2018 14:02:48 +0100
Subject: [PATCH] Change rule filter_calls to retain all calls that overlap
 with any merged call

---
 Snakefile             |  5 +++--
 utils/apply_filter.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100755 utils/apply_filter.py

diff --git a/Snakefile b/Snakefile
index bdbcbe7..b501014 100644
--- a/Snakefile
+++ b/Snakefile
@@ -617,11 +617,12 @@ rule mosaiClassifier_make_call:
 
 rule filter_calls:
     input: 
-        calls = "sv_calls/{sample}/{window}_fixed_norm.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors}_haplotags{use_haplotags}_gtcutoff{gtcutoff}_regfactor{regfactor}_filterFALSE.txt"
+        inputcalls = "sv_calls/{sample}/{window}_fixed_norm.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors}_haplotags{use_haplotags}_gtcutoff{gtcutoff}_regfactor{regfactor}_filterFALSE.txt",
+        mergedcalls = "postprocessing/merge/{sample}/{window}_fixed_norm.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors}_haplotags{use_haplotags}_gtcutoff{gtcutoff}_regfactor{regfactor}.txt",
     output: 
         calls = "sv_calls/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}_filterTRUE.txt"
     shell:
-        'utils/filter_MosaiCatcher_calls.pl {input.calls} | awk \'BEGIN {{OFS="\\t"}} (NR==1) || ($16=="PASS") {{$16=""; print}}\' > {output.calls}'
+        'utils/apply_filter.py {input.inputcalls} {input.mergedcalls} > {output.calls}'
 
 
 rule mosaiClassifier_calc_probs:
diff --git a/utils/apply_filter.py b/utils/apply_filter.py
new file mode 100755
index 0000000..5e58aa4
--- /dev/null
+++ b/utils/apply_filter.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+
+"""
+Retains calls that overlap with a call after filtering and merging.
+That is, output all calls in the input sets that are present in the 
+provided merged set.
+"""
+
+import sys
+from argparse import ArgumentParser
+import pandas as pd
+
+def main():
+	parser = ArgumentParser(prog='apply_filter.py', description=__doc__)
+
+	parser.add_argument('inputcalls', metavar='INPUTCALLS', help='Input call set')
+	parser.add_argument('mergedcalls', metavar='MERGEDCALLS', help='Merged calls used to decide which input calls to retain')
+
+	args = parser.parse_args()
+
+	input_calls = pd.read_csv(args.inputcalls, sep='\t')
+	merged_calls = pd.read_csv(args.mergedcalls, sep=', ', engine='python')
+
+	input_calls['selected'] = False
+	for chrom, start, end in merged_calls[['chrom','start','end']].values:
+		input_calls.loc[(input_calls.chrom==chrom) & (input_calls.start<end) & (input_calls.end>start), 'selected'] = True
+
+	output_calls = input_calls[input_calls.selected].drop(columns=['selected'])
+	output_calls.to_csv(sys.stdout, sep='\t', index=False)
+
+if __name__ == '__main__':
+	main()
-- 
GitLab