From d536bde8c84c317ca781956bcd13e03945846f42 Mon Sep 17 00:00:00 2001
From: Tobias Marschall <tobias.marschall@0ohm.net>
Date: Fri, 5 Oct 2018 08:46:34 +0200
Subject: [PATCH] Added options --whitelist and --min_whitelist_interval_size
 to fine-tune blacklist by whitelisting known SV sites

---
 utils/merge-blacklist.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/utils/merge-blacklist.py b/utils/merge-blacklist.py
index 8612cc6..70c0bc4 100755
--- a/utils/merge-blacklist.py
+++ b/utils/merge-blacklist.py
@@ -11,6 +11,10 @@ def main():
 	parser = ArgumentParser(prog='merge-blacklist.py', description=__doc__)
 	parser.add_argument('--merge_distance', default=500000, type=int,
 		help='If the distance between two blacklisted intervals is below this threshold, they are merged.')
+	parser.add_argument('--whitelist', default=None, 
+		help='TSV file with intervals to be removed from the blacklist (columns: chrom, start, end).')
+	parser.add_argument('--min_whitelist_interval_size', default=400000, type=int,
+		help='Ignore whitelisted intervals below this size.')
 
 	parser.add_argument('normalization', metavar='NORM', help='File (tsv) with normalization and blacklist data')
 
@@ -21,6 +25,14 @@ def main():
 
 	assert set(norm_table.columns) == set(['chrom', 'start', 'end', 'scalar', 'class'])
 
+	whitelist = None
+	if args.whitelist is not None:
+		whitelist = pd.read_csv(args.whitelist, sep='\t')
+		assert set(whitelist.columns) == set(['chrom', 'start', 'end'])
+		print('Read', len(whitelist), 'whitelisted intervals from', args.whitelist, file=sys.stderr)
+		whitelist = whitelist[whitelist.end - whitelist.start >= args.min_whitelist_interval_size]
+		print('  -->', len(whitelist), 'remained after removing intervals below', args.min_whitelist_interval_size, 'bp', file=sys.stderr)
+
 	additional_blacklist = 0
 	prev_blacklist_index = None
 	prev_blacklist_chrom = None
@@ -42,6 +54,18 @@ def main():
 			prev_blacklist_end = row['end']
 
 	print('Additionally blacklisted', additional_blacklist, 'bp of sequence', file=sys.stderr)
+
+	additional_whitelist = 0
+	if whitelist is not None:
+		for i in range(len(norm_table)):
+			row = norm_table.iloc[i]
+			if row['class'] == 'None':
+				if len(whitelist[(whitelist.chrom == row.chrom) & (row.start<whitelist.end) & (whitelist.start<row.end)]) > 0:
+					norm_table.loc[[i],'class'] = 'good'
+					additional_whitelist += row.end - row.start
+
+	print('White listing: Removed', additional_whitelist, 'bp of sequence for blacklist', file=sys.stderr)
+
 	norm_table.to_csv(sys.stdout, index=False, sep='\t')
 
 	## Identify "complex" intervals
-- 
GitLab