Skip to content
Snippets Groups Projects
Commit 3c7b01e4 authored by Tobias Marschall's avatar Tobias Marschall
Browse files

Merge blacklisted intervals when below 500kb apart

parent eb297e8d
No related branches found
No related tags found
No related merge requests found
......@@ -416,10 +416,22 @@ rule extract_single_cell_counts:
# Normalize counts #
################################################################################
rule merge_blacklist_bins:
input:
norm = "utils/normalization/HGSVC.{window}.txt"
output:
merged = "normalizations/HGSVC.{window}.merged.tsv"
log:
"log/merge_blacklist_bins/{window}.log"
shell:
"""
utils/merge-blacklist.py --merge_distance 500000 {input.norm} > {output.merged} 2> {log}
"""
rule normalize_counts:
input:
counts = "counts/{sample}/{window}_fixed.txt.gz",
norm = "utils/normalization/HGSVC.{window}.txt"
norm = "normalizations/HGSVC.{window}.merged.tsv",
output:
"counts/{sample}/{window}_fixed_norm.txt.gz"
log:
......
#!/usr/bin/env python
'''
Merge two blacklisted intervals if distance between them is below a given distance.
'''
import sys
from argparse import ArgumentParser
import pandas as pd
def main():
parser = ArgumentParser(prog='merge-blacklist.py', description=__doc__)
parser.add_argument('--merge_distance', default=500000, type=int,
help='If the distance between two blacklisted intervals is below this threshold, they are merged.')
parser.add_argument('normalization', metavar='NORM', help='File (tsv) with normalization and blacklist data')
args = parser.parse_args()
print('Reading', args.normalization, file=sys.stderr)
norm_table = pd.read_csv(args.normalization, sep='\t')
assert set(norm_table.columns) == set(['chrom', 'start', 'end', 'scalar', 'class'])
additional_blacklist = 0
prev_blacklist_index = None
prev_blacklist_chrom = None
prev_blacklist_end = None
for i in range(len(norm_table)):
row = norm_table.iloc[i]
#print('Processing row', i, ' -->', tuple(row), file=sys.stderr)
# is row blacklisted?
if row['class'] == 'None':
#print(' --> is black', file=sys.stderr)
if (prev_blacklist_chrom == row['chrom']) and (row['start'] - prev_blacklist_end <= args.merge_distance):
#print(' --> black listing', prev_blacklist_index+1, 'to', i, file=sys.stderr)
for j in range(prev_blacklist_index+1, i):
norm_table.loc[[j],'class'] = 'None'
row_j = norm_table.iloc[j]
additional_blacklist += row_j.end - row_j.start
prev_blacklist_index = i
prev_blacklist_chrom = row['chrom']
prev_blacklist_end = row['end']
print('Additionally blacklisted', additional_blacklist, 'bp of sequence', file=sys.stderr)
norm_table.to_csv(sys.stdout, index=False, sep='\t')
## NOTE(review): dead code below — "complex" interval identification/merging copied from a related script; kept commented out for reference only, never executed.
#segments = calls.groupby(by=['chrom','start','end']).sv_call_name.agg({'is_complex':partial(is_complex, ignore_haplotypes=args.ignore_haplotypes, min_cell_count=args.min_cell_count)}).reset_index().sort_values(['chrom','start','end'])
## merge complex segments if closer than args.merge_distance
#complex_segments = pd.DataFrame(columns=['chrom','start','end'])
#cur_chrom, cur_start, cur_end = None, None, None
#for chrom, start, end in segments[segments.is_complex][['chrom','start','end']].values:
#if cur_chrom is None:
#cur_chrom, cur_start, cur_end = chrom, start, end
#elif (cur_chrom == chrom) and (start - cur_end < args.merge_distance):
#cur_end = end
#else:
#complex_segments = complex_segments.append({'chrom': cur_chrom, 'start': cur_start,'end': cur_end}, ignore_index=True)
#cur_chrom, cur_start, cur_end = chrom, start, end
#if cur_chrom is not None:
#complex_segments = complex_segments.append({'chrom': cur_chrom, 'start': cur_start,'end': cur_end}, ignore_index=True)
#print(complex_segments, file=sys.stderr)
#total_complex = sum(complex_segments.end - complex_segments.start)
#print('Total amount of complex sequence: {}Mbp'.format(total_complex/1000000), file=sys.stderr)
#complex_segments[['chrom','start','end']].to_csv(sys.stdout, index=False, sep='\t')
##print(complex_segments, file=sys.stderr)
# Run only when executed as a script; importing this module has no side effects.
if __name__ == '__main__':
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment