From 216dbac54eebe26d018bcfd09f7e843c206cb96b Mon Sep 17 00:00:00 2001
From: Sascha Meiers <meiers@embl.de>
Date: Thu, 26 Oct 2017 14:49:28 +0200
Subject: [PATCH] new step in the pipeline to prepare input files for
 calculation of SV propabilities

---
 Snake.config.json               |  9 +++++
 Snakefile                       | 72 +++++++++++++++++++++++++++------
 utils/helper.prepare_segments.R |  6 +++
 3 files changed, 75 insertions(+), 12 deletions(-)
 create mode 100755 utils/helper.prepare_segments.R

diff --git a/Snake.config.json b/Snake.config.json
index ae35fab..3c93ebd 100644
--- a/Snake.config.json
+++ b/Snake.config.json
@@ -6,11 +6,20 @@
     "plot_script"   : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/qc.R",
     "sce_script"    : "/g/korbel/meiers/tools/mosaicatcher/strandsequtils/sce_utils/SCE.R",
     "samtools"      : "module load SAMtools && samtools",
+    "class_script"  : "finalCall_cmd.R",
+    "class_dir"     : "/g/korbel/meiers/tools/mosaicatcher/strandsequtils/maryamCode",
 
 
     "exclude_file"  : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/hg19.exclude",
     "variable_bins" : {
         "50000"     : "utils/variable_bins.GRCh38.50kb.bed",
         "100000"    : "utils/variable_bins.GRCh38.100kb.bed"
+    },
+
+
+    "bp_density"    : {
+        "few"       : 0.1,
+        "medium"    : 0.33,
+        "many"      : 0.9
     }
 }
diff --git a/Snakefile b/Snakefile
index 82845d9..be0aa94 100644
--- a/Snakefile
+++ b/Snakefile
@@ -11,18 +11,18 @@ BAM, = glob_wildcards("bam/{bam}.bam")
 
 rule all:
     input:
-        expand("plots/" + config["sample"] + "_{window}_fixed.pdf", window = [50000, 100000, 200000, 500000]),
-        expand("plots/" + config["sample"] + "_{window}_variable.pdf", window = [50000, 100000]),
-        expand("segmentation/" + config["sample"] + "_{window}_fixed.txt", window = [50000, 100000, 200000, 500000]),
-        expand("segmentation/" + config["sample"] + "_{window}_variable.txt", window = [50000, 100000]),
+        expand("plots/" + config["sample"] + ".{window}_fixed.pdf", window = [50000, 100000, 200000, 500000]),
+        expand("plots/" + config["sample"] + ".{window}_variable.pdf", window = [50000, 100000]),
+        expand("segmentation2/" + config["sample"] + ".{window}_fixed.{bpdens}.txt", window = [50000, 100000, 200000, 500000], bpdens = ["few","many"]),
+        expand("segmentation2/" + config["sample"] + ".{window}_variable.{bpdens}.txt", window = [50000, 100000], bpdens = ["few","many"]),
         "strand_states/" + config["sample"] + ".txt"
 
 rule plot_mosaic_counts:
     input:
-        counts = "counts/" + config["sample"] + "_{file_name}.txt.gz",
-        info   = "counts/" + config["sample"] + "_{file_name}.info"
+        counts = "counts/" + config["sample"] + ".{file_name}.txt.gz",
+        info   = "counts/" + config["sample"] + ".{file_name}.info"
     output:
-        "plots/" + config["sample"] + "_{file_name}.pdf"
+        "plots/" + config["sample"] + ".{file_name}.pdf"
     params: 
         plot_command = "Rscript " + config["plot_script"]
     shell:
@@ -32,8 +32,8 @@ rule plot_mosaic_counts:
 
 rule mosaic_count_fixed:
     input:
-        bam = expand("bam/" + config["sample"] + "_{bam}.bam", bam = BAM),
-        bai = expand("bam/" + config["sample"] + "_{bam}.bam.bai", bam = BAM)
+        bam = expand("bam/{bam}.bam", bam = BAM),
+        bai = expand("bam/{bam}.bam.bai", bam = BAM)
     output:
         counts = "counts/" + config["sample"] + "_{window}_fixed.txt.gz",
         info   = "counts/" + config["sample"] + "_{window}_fixed.info"
@@ -86,11 +86,17 @@ rule determine_strand_states:
         """
 
 
+
+
+################################################################################
+# Segmentation                                                                 #
+################################################################################
+
 rule segmentation:
     input:
-        "counts/" + config["sample"] + "_{file_name}.txt.gz"
+        "counts/" + config["sample"] + ".{file_name}.txt.gz"
     output:
-        "segmentation/" + config["sample"] + "_{file_name}.txt"
+        "segmentation/" + config["sample"] + ".{file_name}.txt"
     params:
         mc_command = config["mosaicatcher"]
     shell:
@@ -98,4 +104,46 @@ rule segmentation:
         {params.mc_command} segment \
         -o {output} \
         {input}
-        """
\ No newline at end of file
+        """
+
+# Pick a few segmentations and prepare the input files for SV classification
+rule prepare_segments:
+    input:
+        "segmentation/" + config["sample"] + ".{windows}.txt"
+    output:
+        "segmentation2/" + config["sample"] + ".{windows}.{bpdens}.txt"
+    params:
+        quantile = lambda wc: config["bp_density"][wc.bpdens]
+    script:
+        "utils/helper.prepare_segments.R"
+
+
+
+# Run SV classification
+rule run_sv_classification:
+    input:
+        counts = "counts/" + config["sample"] + ".{windows}.txt.gz",
+        info   = "counts/" + config["sample"] + ".{windows}.info",
+        states = "strand_states/" + config["sample"] + ".txt",
+        bp     = "segmentation2/" + config["sample"] + ".{windows}.{bpdens}.txt"
+    output:
+        outdir = "sv_probabilities/{file_name}.{bpdens}/",
+        outfile1 = "sv_probabilities/{file_name}.{bpdens}/YYY"
+    params:
+        class_dir     = config["class_dir"],
+        class_command = "Rscript " + config["class_script"]
+    shell:
+        """
+        cd {params.class_dir}
+        {params.class_command} \
+            binRCfile={input.counts} \
+            BRfile={input.bp} \
+            infoFile={input.info} \
+            stateFile={input.states} \
+            K=22 \
+            maxCN= 4 \
+            outputDir={output.outdir}
+        """
+
+
+
diff --git a/utils/helper.prepare_segments.R b/utils/helper.prepare_segments.R
new file mode 100755
index 0000000..e147f10
--- /dev/null
+++ b/utils/helper.prepare_segments.R
@@ -0,0 +1,6 @@
+library(data.table)
+qu = snakemake@params[["quantile"]]
+d = fread(snakemake@input[[1]])
+# type = 1 is important to get discrete values!
+d = d[, .SD[k == quantile(1:max(k), qu, type = 1)], by=chrom][,.(k, chrom, bps = breakpoint)]
+write.table(d, file = snakemake@output[[1]], row.names=F, quote=F, sep="\t")
\ No newline at end of file
-- 
GitLab