Evaluate different parameter settings, including different modes to select a segmentation.

4d7da438 · Tobias Marschall · 5b1b34ec · 4d7da438 · 4d7da438
Commit 4d7da438 authored 6 years ago by Tobias Marschall
--- a/Snake.config.json
+++ b/Snake.config.json
@@ -44,6 +44,13 @@
    "simulation_cell_count" : 100,
    "simulation_alpha" : 0.02,

-    "genome_size" : 3e9
+    "genome_size" : 3e9,

+    "ground_truth_clonal" : {
+        "RPE-BM510" : "/MMCI/TM/scratch/strandseq/input-data/ground_truth/RPE-BM510_manual/clonal-events.tsv"
+    },
+
+    "ground_truth_single_cell" : {
+        "RPE-BM510" : "/MMCI/TM/scratch/strandseq/input-data/ground_truth/RPE-BM510_manual/single-cell-events.tsv"
+    }
 }
--- a/Snakefile
+++ b/Snakefile
@@ -43,6 +43,9 @@ METHODS = [
    "simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.05_regfactor6",
 ]

+BPDENS = [
+	"selected_j{}_s{}".format(joint, single) for joint in [0.5,0.1,0.01] for single in [1.0,0.5,0.1]
+]

 singularity: "docker://smei/mosaicatcher-pipeline:v0.1"

@@ -68,19 +71,24 @@ rule all:
               sample = SAMPLE,
               chrom = config["chromosomes"],
               window = [100000],
-               bpdens = ["selected"],
+               bpdens = BPDENS,
               method = METHODS),
        expand("ploidy/{sample}/ploidy.{chrom}.txt", sample = SAMPLES, chrom = config["chromosomes"]),
        expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{plottype}.pdf",
               sample = SAMPLES,
               window = [100000],
-               bpdens = ["selected"],
+               bpdens = BPDENS,
               method = METHODS,
               plottype = ["byaf","bypos"]),
        expand("halo/{sample}/{window}_{suffix}.json.gz",
               sample = SAMPLES,
               window = [100000],
-               suffix = ["fixed", "fixed_norm"])
+               suffix = ["fixed", "fixed_norm"]),
+        expand("stats/{sample}/{window}_fixed_norm.{bpdens}/{method}.tsv",
+               sample = SAMPLES,
+               window = [100000],
+               bpdens = BPDENS,
+               method = METHODS),


 ################################################################################
@@ -249,7 +257,7 @@ rule plot_SV_calls:
        strand = "strand_states/{sample}/final.txt",
        segments = "segmentation2/{sample}/{windows}.{bpdens}.txt"
    output:
-        "sv_calls/{sample}/{windows}.{bpdens}/plots/sv_calls/{method}.{chrom}.pdf"
+        "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/plots/sv_calls/{method}.{chrom}.pdf"
    log:
        "log/plot_SV_calls/{sample}/{windows}.{bpdens}.{method}.{chrom}.log"
    params:
@@ -273,7 +281,7 @@ rule plot_SV_calls_simulated:
        segments = "segmentation2/simulation{seed}-{window}/{window}_fixed.{bpdens}.txt",
        truth  = "simulation/variants/genome{seed}-{window}.txt"
    output:
-        "sv_calls/simulation{seed}-{window}/{window}_fixed.{bpdens}/plots/sv_calls/{method}.{chrom}.pdf"
+        "sv_calls/simulation{seed}-{window}/{window}_fixed.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/plots/sv_calls/{method}.{chrom}.pdf"
    log:
        "log/plot_SV_calls_simulated/simulation{seed}-{window}/{window}_fixed.{bpdens}.{method}.{chrom}.log"
    params:
@@ -295,8 +303,8 @@ rule plot_SV_consistency_barplot:
    input:
        sv_calls  = "sv_calls/{sample}/{windows}.{bpdens}/{method}.txt",
    output:
-        barplot_bypos = "sv_calls/{sample}/{windows}.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-bypos.pdf",
-        barplot_byaf = "sv_calls/{sample}/{windows}.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-byaf.pdf",
+        barplot_bypos = "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/plots/sv_consistency/{method}.consistency-barplot-bypos.pdf",
+        barplot_byaf = "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/plots/sv_consistency/{method}.consistency-barplot-byaf.pdf",
    log:
        "log/plot_SV_consistency/{sample}/{windows}.{bpdens}.{method}.log"
    script:
@@ -502,15 +510,15 @@ rule segmentation_selection:
        singleseg=lambda wc: ["segmentation-per-cell/{}/{}/{}_{}.txt".format(wc.sample, cell, wc.window, wc.file_name) for cell in CELL_PER_SAMPLE[wc.sample]],
        info="counts/{sample}/{window}_{file_name}.info",
    output:
-        jointseg="segmentation2/{sample}/{window,[0-9]+}_{file_name}.selected.txt",
-        strand_states="strand_states/{sample}/{window,[0-9]+}_{file_name}.intitial_strand_state",
+        jointseg="segmentation2/{sample}/{window,[0-9]+}_{file_name}.selected_j{min_diff_jointseg}_s{min_diff_singleseg}.txt",
+        strand_states="strand_states/{sample}/{window,[0-9]+}_{file_name}.selected_j{min_diff_jointseg}_s{min_diff_singleseg}.intitial_strand_state",
    log:
-        "log/segmentation_selection/{sample}/{window}_{file_name}.log"
+        "log/segmentation_selection/{sample}/{window}_{file_name}.selected_j{min_diff_jointseg}_s{min_diff_singleseg}.log"
    params:
        cellnames = lambda wc: ",".join(cell for cell in CELL_PER_SAMPLE[wc.sample]),
        sce_min_distance = 500000,
    shell:
-        "./utils/detect_strand_states.py --sce_min_distance {params.sce_min_distance} --output_jointseg {output.jointseg} --output_strand_states {output.strand_states} --samplename {wildcards.sample} --cellnames {params.cellnames} {input.info} {input.counts} {input.jointseg} {input.singleseg} > {log} 2>&1"
+        "./utils/detect_strand_states.py --sce_min_distance {params.sce_min_distance} --min_diff_jointseg {wildcards.min_diff_jointseg} --min_diff_singleseg {wildcards.min_diff_singleseg} --output_jointseg {output.jointseg} --output_strand_states {output.strand_states} --samplename {wildcards.sample} --cellnames {params.cellnames} {input.info} {input.counts} {input.jointseg} {input.singleseg} > {log} 2>&1"


 ################################################################################
@@ -525,7 +533,7 @@ rule plot_heatmap:
        info     = "counts/{sample}/{windows}.info",
        bamNames = "sv_probabilities/{sample}/{windows}.{bpdens}/bamNames.txt"
    output:
-        "sv_probabilities/{sample}/{windows}.{bpdens}/final_plots/heatmapPlots.pdf"
+        "sv_probabilities/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/final_plots/heatmapPlots.pdf"
    params:
        r_package_path = "utils/R-packages2"
    log:
@@ -542,7 +550,7 @@ rule mosaiClassifier_make_call:
    input:
        probs = 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens}.Rdata'
    output:
-        "sv_calls/{sample}/{windows}.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}.txt"
+        "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}.txt"
    log:
        "log/mosaiClassifier_make_call/{sample}/{windows}.{bpdens}.llr{llr}.poppriors{pop_priors}.haplotags{use_haplotags}.gtcutoff{gtcutoff}.regfactor{regfactor}.log"
    script:
@@ -555,7 +563,7 @@ rule mosaiClassifier_calc_probs:
        states = "strand_states/{sample}/final.txt",
        bp     = "segmentation2/{sample}/{windows}.{bpdens}.txt"
    output:
-        output = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
+        output = "sv_probabilities/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/probabilities.Rdata"
    log:
        "log/mosaiClassifier_calc_probs/{sample}/{windows}.{bpdens}.log"
    script:
@@ -565,7 +573,7 @@ rule mosaiClassifier_make_call_biallelic:
    input:
        probs = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
    output:
-        "sv_calls/{sample}/{windows}.{bpdens}/biAllelic_llr{llr}.txt"
+        "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/biAllelic_llr{llr}.txt"
    log:
        "log/mosaiClassifier_make_call_biallelic/{sample}/{windows}.{bpdens}.{llr}.log"
    script:
@@ -595,12 +603,12 @@ rule mosaiClassifier_make_call_biallelic:

 rule determine_initial_strand_states:
    input:
-        "strand_states/{sample}/100000_fixed_norm.intitial_strand_state"
+        "strand_states/{sample}/100000_fixed_norm.selected_j0.5_s1.0.intitial_strand_state"
    output:
        "strand_states/{sample}/intitial_strand_state"
    shell:
        """
-        cd strand_states/{wildcards.sample} && ln -s 100000_fixed_norm.intitial_strand_state intitial_strand_state && cd ../..
+        cd strand_states/{wildcards.sample} && ln -s 100000_fixed_norm.selected_j0.5_s1.0.intitial_strand_state intitial_strand_state && cd ../..
        """

 # Strandphaser needs a different input format which contains the path names to
@@ -772,7 +780,7 @@ rule create_haplotag_segment_bed:
    input:
        segments="segmentation2/{sample}/{size}{what}.{bpdens}.txt",
    output:
-        bed="haplotag/bed/{sample}/{size,[0-9]+}{what}.{bpdens}.bed",
+        bed="haplotag/bed/{sample}/{size,[0-9]+}{what}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}.bed",
    shell:
        "awk 'BEGIN {{s={wildcards.size};OFS=\"\\t\"}} $2!=c {{prev=0}} NR>1 {{print $2,prev*s+1,($3+1)*s; prev=$3+1; c=$2}}' {input.segments} > {output.bed}"

@@ -782,7 +790,7 @@ rule create_haplotag_table:
        bai='haplotag/bam/{sample}/{cell}.bam.bai',
        bed = "haplotag/bed/{sample}/{windows}.{bpdens}.bed"
    output:
-        tsv='haplotag/table/{sample}/by-cell/haplotag-counts.{cell}.{windows}.{bpdens}.tsv'
+        tsv='haplotag/table/{sample}/by-cell/haplotag-counts.{cell}.{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}.tsv'
    log:
        "log/create_haplotag_table/{sample}.{cell}.{windows}.{bpdens}.log"
    script:
@@ -792,7 +800,7 @@ rule merge_haplotag_tables:
    input:
        tsvs=lambda wc: ['haplotag/table/{}/by-cell/haplotag-counts.{}.{}.{}.tsv'.format(wc.sample,cell,wc.windows,wc.bpdens) for cell in BAM_PER_SAMPLE[wc.sample]],
    output:
-        tsv='haplotag/table/{sample}/full/haplotag-counts.{windows}.{bpdens}.tsv'
+        tsv='haplotag/table/{sample}/full/haplotag-counts.{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}.tsv'
    shell:
        '(head -n1 {input.tsvs[0]} && tail -q -n +2 {input.tsvs}) > {output.tsv}'

@@ -801,7 +809,7 @@ rule create_haplotag_likelihoods:
    input:
        haplotag_table='haplotag/table/{sample}/full/haplotag-counts.{windows}.{bpdens}.tsv',
        sv_probs_table = 'sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata',
-    output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens}.Rdata'
+    output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}.Rdata'
    log:
        "log/create_haplotag_likelihoods/{sample}.{windows}.{bpdens}.log"
    script:
@@ -903,3 +911,36 @@ rule split_external_snv_calls:
        > {log} 2>&1
        """

+
+################################################################################
+# Summary statistics on sv calls                                               #
+################################################################################
+
+
+rule summary_statistics:
+	input:
+		segmentation = 'segmentation2/{sample}/{windows}.{bpdens}.txt',
+		strandstates = 'strand_states/{sample}/{windows}.{bpdens}.intitial_strand_state',
+		sv_calls = 'sv_calls/{sample}/{windows}.{bpdens}/{method}.txt',
+	output:
+		tsv = 'stats/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/{method}.tsv',
+	log:
+		'log/summary_statistics/{sample}/{windows}.{bpdens}/{method}.log'
+	run:
+		p = []
+		try:
+			f = config["ground_truth_clonal"][wildcards.sample]
+			if len(f) > 0:
+				p.append('--true-events-clonal')
+				p.append(f)
+		except KeyError:
+			pass
+		try:
+			f = config["ground_truth_single_cell"][wildcards.sample]
+			if len(f) > 0:
+				p.append('--true-events-single-cell')
+				p.append(f)
+		except KeyError:
+			pass
+		additional_params = ' '.join(p)
+		shell('utils/callset_summary_stats.py --segmentation {input.segmentation} --strandstates {input.strandstates} {additional_params} {input.sv_calls}  > {output.tsv}')