From 4751bc7b7913e411d1a91c5c5032d52ba0f37b37 Mon Sep 17 00:00:00 2001 From: Tobias Marschall <tobias.marschall@0ohm.net> Date: Tue, 18 Sep 2018 11:27:06 +0200 Subject: [PATCH] Use local script to plot SV calls and mark complex regions. --- Snakefile | 21 ++++++++++++------ utils/call-complex-regions.py | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 7 deletions(-) create mode 100755 utils/call-complex-regions.py diff --git a/Snakefile b/Snakefile index 77c671b..88eb6eb 100644 --- a/Snakefile +++ b/Snakefile @@ -250,19 +250,19 @@ rule plot_SV_calls: input: counts = "counts/{sample}/{windows}.txt.gz", calls = "sv_calls/{sample}/{windows}.{bpdens}/{method}.txt", - strand = "strand_states/{sample}/final.txt", + complex = "sv_calls/{sample}/{windows}.{bpdens}/{method}.complex.tsv", + strand = "strand_states/{sample}/{windows}.{bpdens}/final.txt", segments = "segmentation2/{sample}/{windows}.{bpdens}.txt" output: "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/plots/sv_calls/{method}.{chrom}.pdf" log: "log/plot_SV_calls/{sample}/{windows}.{bpdens}.{method}.{chrom}.log" - params: - sv_plot_script = config["sv_plot_script"] shell: """ - Rscript {params.sv_plot_script} \ + Rscript utils/plot-sv-calls.R \ segments={input.segments} \ strand={input.strand} \ + complex={input.complex} \ calls={input.calls} \ {input.counts} \ {wildcards.chrom} \ @@ -280,11 +280,9 @@ rule plot_SV_calls_simulated: "sv_calls/simulation{seed}-{window}/{window}_fixed.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/plots/sv_calls/{method}.{chrom}.pdf" log: "log/plot_SV_calls_simulated/simulation{seed}-{window}/{window}_fixed.{bpdens}.{method}.{chrom}.log" - params: - sv_plot_script = config["sv_plot_script"] shell: """ - Rscript {params.sv_plot_script} \ + Rscript utils/plot-sv-calls.R \ segments={input.segments} \ strand={input.strand} \ truth={input.truth} \ @@ -575,6 +573,15 @@ rule mosaiClassifier_make_call_biallelic: script: "utils/mosaiClassifier_call_biallelic.snakemake.R" +rule call_complex_regions: + input: + calls = "sv_calls/{sample}/{windows}.{bpdens}/{method}.txt", + output: + complex = "sv_calls/{sample}/{windows}.{bpdens}/{method}.complex.tsv", + log: + "log/call_complex_regions/{sample}/{windows}.{bpdens}.{method}.log" + shell: + "utils/call-complex-regions.py {input.calls} > {output.complex} 2>{log}" ################################################################################ diff --git a/utils/call-complex-regions.py b/utils/call-complex-regions.py new file mode 100755 index 0000000..70012ba --- /dev/null +++ b/utils/call-complex-regions.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +import sys +from argparse import ArgumentParser +import pandas as pd +import numpy as np + +def is_complex(x): + s = set(x) + if (len(s) > 1) or ('complex' in s): + return True + else: + return False + + +def main(): + parser = ArgumentParser(prog='call-complex-regions.py', description=__doc__) + parser.add_argument('--window_size', default=5000000, type=int, + help='Window size in bp.') + + parser.add_argument('callset', metavar='CALLSET', help='Callset file (tsv) as output by MosaiClassifier') + + args = parser.parse_args() + + print('Reading', args.callset, file=sys.stderr) + calls = pd.read_csv(args.callset, sep='\t') + + # Identify "complex" intervals + segments = calls.groupby(by=['chrom','start','end']).sv_call_name.agg({'is_complex':is_complex}).reset_index() + + complex_segments = segments[segments.is_complex] + total_complex = sum(complex_segments.end - complex_segments.start) + + print('Total amount of complex sequence: {}Mbp'.format(total_complex/1000000), file=sys.stderr) + complex_segments[['chrom','start','end']].to_csv(sys.stdout, index=False, sep='\t') + #print(complex_segments, file=sys.stderr) + + +if __name__ == '__main__': + main() -- GitLab