From d536bde8c84c317ca781956bcd13e03945846f42 Mon Sep 17 00:00:00 2001 From: Tobias Marschall <tobias.marschall@0ohm.net> Date: Fri, 5 Oct 2018 08:46:34 +0200 Subject: [PATCH] Added options --whitelist and --min_whitelist_interval_size to fine-tune blacklist by whitelisting known SV sites --- utils/merge-blacklist.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/utils/merge-blacklist.py b/utils/merge-blacklist.py index 8612cc6..70c0bc4 100755 --- a/utils/merge-blacklist.py +++ b/utils/merge-blacklist.py @@ -11,6 +11,10 @@ def main(): parser = ArgumentParser(prog='merge-blacklist.py', description=__doc__) parser.add_argument('--merge_distance', default=500000, type=int, help='If the distance between two blacklisted intervals is below this threshold, they are merged.') + parser.add_argument('--whitelist', default=None, + help='TSV file with intervals to be removed from the blacklist (columns: chrom, start, end).') + parser.add_argument('--min_whitelist_interval_size', default=400000, type=int, + help='Ignore whitelisted intervals below this size.') parser.add_argument('normalization', metavar='NORM', help='File (tsv) with normalization and blacklist data') @@ -21,6 +25,14 @@ def main(): assert set(norm_table.columns) == set(['chrom', 'start', 'end', 'scalar', 'class']) + whitelist = None + if args.whitelist is not None: + whitelist = pd.read_csv(args.whitelist, sep='\t') + assert set(whitelist.columns) == set(['chrom', 'start', 'end']) + print('Read', len(whitelist), 'whitelisted intervals from', args.whitelist, file=sys.stderr) + whitelist = whitelist[whitelist.end - whitelist.start >= args.min_whitelist_interval_size] + print(' -->', len(whitelist), 'remained after removing intervals below', args.min_whitelist_interval_size, 'bp', file=sys.stderr) + additional_blacklist = 0 prev_blacklist_index = None prev_blacklist_chrom = None @@ -42,6 +54,18 @@ def main(): prev_blacklist_end = row['end'] print('Additionally blacklisted', additional_blacklist, 'bp of sequence', file=sys.stderr) + + additional_whitelist = 0 + if whitelist is not None: + for i in range(len(norm_table)): + row = norm_table.iloc[i] + if row['class'] == 'None': + if len(whitelist[(whitelist.chrom == row.chrom) & (row.start<whitelist.end) & (whitelist.start<row.end)]) > 0: + norm_table.loc[[i],'class'] = 'good' + additional_whitelist += row.end - row.start + + print('White listing: Removed', additional_whitelist, 'bp of sequence for blacklist', file=sys.stderr) + norm_table.to_csv(sys.stdout, index=False, sep='\t') ## Identify "complex" intervals -- GitLab