From 216dbac54eebe26d018bcfd09f7e843c206cb96b Mon Sep 17 00:00:00 2001 From: Sascha Meiers <meiers@embl.de> Date: Thu, 26 Oct 2017 14:49:28 +0200 Subject: [PATCH] new step in the pipeline to prepare input files for calculation of SV propabilities --- Snake.config.json | 9 +++++ Snakefile | 72 +++++++++++++++++++++++++++------ utils/helper.prepare_segments.R | 6 +++ 3 files changed, 75 insertions(+), 12 deletions(-) create mode 100755 utils/helper.prepare_segments.R diff --git a/Snake.config.json b/Snake.config.json index ae35fab..3c93ebd 100644 --- a/Snake.config.json +++ b/Snake.config.json @@ -6,11 +6,20 @@ "plot_script" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/qc.R", "sce_script" : "/g/korbel/meiers/tools/mosaicatcher/strandsequtils/sce_utils/SCE.R", "samtools" : "module load SAMtools && samtools", + "class_script" : "finalCall_cmd.R", + "class_dir" : "/g/korbel/meiers/tools/mosaicatcher/strandsequtils/maryamCode", "exclude_file" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/hg19.exclude", "variable_bins" : { "50000" : "utils/variable_bins.GRCh38.50kb.bed", "100000" : "utils/variable_bins.GRCh38.100kb.bed" + }, + + + "bp_density" : { + "few" : 0.1, + "medium" : 0.33, + "many" : 0.9 } } diff --git a/Snakefile b/Snakefile index 82845d9..be0aa94 100644 --- a/Snakefile +++ b/Snakefile @@ -11,18 +11,18 @@ BAM, = glob_wildcards("bam/{bam}.bam") rule all: input: - expand("plots/" + config["sample"] + "_{window}_fixed.pdf", window = [50000, 100000, 200000, 500000]), - expand("plots/" + config["sample"] + "_{window}_variable.pdf", window = [50000, 100000]), - expand("segmentation/" + config["sample"] + "_{window}_fixed.txt", window = [50000, 100000, 200000, 500000]), - expand("segmentation/" + config["sample"] + "_{window}_variable.txt", window = [50000, 100000]), + expand("plots/" + config["sample"] + ".{window}_fixed.pdf", window = [50000, 100000, 200000, 500000]), + expand("plots/" + config["sample"] + ".{window}_variable.pdf", window = [50000, 100000]), + expand("segmentation2/" + config["sample"] + ".{window}_fixed.{bpdens}.txt", window = [50000, 100000, 200000, 500000], bpdens = ["few","many"]), + expand("segmentation2/" + config["sample"] + ".{window}_variable.{bpdens}.txt", window = [50000, 100000], bpdens = ["few","many"]), "strand_states/" + config["sample"] + ".txt" rule plot_mosaic_counts: input: - counts = "counts/" + config["sample"] + "_{file_name}.txt.gz", - info = "counts/" + config["sample"] + "_{file_name}.info" + counts = "counts/" + config["sample"] + ".{file_name}.txt.gz", + info = "counts/" + config["sample"] + ".{file_name}.info" output: - "plots/" + config["sample"] + "_{file_name}.pdf" + "plots/" + config["sample"] + ".{file_name}.pdf" params: plot_command = "Rscript " + config["plot_script"] shell: @@ -32,8 +32,8 @@ rule plot_mosaic_counts: rule mosaic_count_fixed: input: - bam = expand("bam/" + config["sample"] + "_{bam}.bam", bam = BAM), - bai = expand("bam/" + config["sample"] + "_{bam}.bam.bai", bam = BAM) + bam = expand("bam/{bam}.bam", bam = BAM), + bai = expand("bam/{bam}.bam.bai", bam = BAM) output: counts = "counts/" + config["sample"] + "_{window}_fixed.txt.gz", info = "counts/" + config["sample"] + "_{window}_fixed.info" @@ -86,11 +86,17 @@ rule determine_strand_states: """ + + +################################################################################ +# Segmentation # +################################################################################ + rule segmentation: input: - "counts/" + config["sample"] + "_{file_name}.txt.gz" + "counts/" + config["sample"] + ".{file_name}.txt.gz" output: - "segmentation/" + config["sample"] + "_{file_name}.txt" + "segmentation/" + config["sample"] + ".{file_name}.txt" params: mc_command = config["mosaicatcher"] shell: @@ -98,4 +104,46 @@ rule segmentation: {params.mc_command} segment \ -o {output} \ {input} - """ \ No newline at end of file + """ + +# Pick a few segmentations and prepare the input files for SV classification +rule prepare_segments: + input: + "segmentation/" + config["sample"] + ".{windows}.txt" + output: + "segmentation2/" + config["sample"] + ".{windows}.{bpdens}.txt" + params: + quantile = lambda wc: config["bp_density"][wc.bpdens] + script: + "utils/helper.prepare_segments.R" + + + +# Run SV classification +rule run_sv_classification: + input: + counts = "counts/" + config["sample"] + ".{windows}.txt.gz", + info = "counts/" + config["sample"] + ".{windows}.info", + states = "strand_states/" + config["sample"] + ".txt", + bp = "segmentation2/" + config["sample"] + ".{windows}.{bpdens}.txt" + output: + outdir = "sv_probabilities/{file_name}.{bpdens}/", + outfile1 = "sv_probabilities/{file_name}.{bpdens}/YYY" + params: + class_dir = config["class_dir"], + class_command = "Rscript " + config["class_script"] + shell: + """ + cd {params.class_dir} + {params.class_command} \ + binRCfile={input.counts} \ + BRfile={input.bp} \ + infoFile={input.info} \ + stateFile={input.states} \ + K=22 \ + maxCN= 4 \ + outputDir={output.outdir} + """ + + + diff --git a/utils/helper.prepare_segments.R b/utils/helper.prepare_segments.R new file mode 100755 index 0000000..e147f10 --- /dev/null +++ b/utils/helper.prepare_segments.R @@ -0,0 +1,6 @@ +library(data.table) +qu = snakemake@params[["quantile"]] +d = fread(snakemake@input[[1]]) +# type = 1 is important to get discrete values! +d = d[, .SD[k == quantile(1:max(k), qu, type = 1)], by=chrom][,.(k, chrom, bps = breakpoint)] +write.table(d, file = snakemake@output[[1]], row.names=F, quote=F, sep="\t") \ No newline at end of file -- GitLab