Pipeline update to use the new SCE detection and segmentation selected...

Pipeline update to use the new SCE detection and segmentation selected (implemented in utils/detect_strand_states.py)

Pipeline update to use the new SCE detection and segmentation selected...
Pipeline update to use the new SCE detection and segmentation selected (implemented in utils/detect_strand_states.py)
898ab903 · Tobias Marschall · e46df4e6 · 898ab903
Commit 898ab903 authored 6 years ago by Tobias Marschall
--- a/Snakefile
+++ b/Snakefile
@@ -6,9 +6,11 @@ configfile: "Snake.config.json"
 SAMPLE,BAM = glob_wildcards("bam/{sample}/selected/{bam}.bam")
 SAMPLES = sorted(set(SAMPLE))

+CELL_PER_SAMPLE= defaultdict(list)
 BAM_PER_SAMPLE = defaultdict(list)
 for sample,bam in zip(SAMPLE,BAM):
    BAM_PER_SAMPLE[sample].append(bam)
+    CELL_PER_SAMPLE[sample].append(bam.replace('.sort.mdup',''))

 ALLBAMS_PER_SAMPLE = defaultdict(list)
 for sample in SAMPLES:
@@ -65,20 +67,16 @@ rule all:
        expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_calls/{method}.{chrom}.pdf",
               sample = SAMPLE,
               chrom = config["chromosomes"],
-               window = [50000, 100000],
-               bpdens = ["few","medium","many"],
+               window = [100000],
+               bpdens = ["selected"],
               method = METHODS),
        expand("ploidy/{sample}/ploidy.{chrom}.txt", sample = SAMPLES, chrom = config["chromosomes"]),
        expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{plottype}.pdf",
               sample = SAMPLES,
-               window = [50000, 100000],
-               bpdens = ["few","medium","many"],
+               window = [100000],
+               bpdens = ["selected"],
               method = METHODS,
               plottype = ["byaf","bypos"]),
-        expand("haplotag/table/{sample}/full/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv",
-               sample = SAMPLES,
-               window = [50000, 100000],
-               bpdens = ["few","medium","many"]),


 ################################################################################
@@ -386,6 +384,14 @@ rule mosaic_count_variable:
        > {log} 2>&1
        """

+rule extract_single_cell_counts:
+    input:
+        "counts/{sample}/{window}_{file_name}.txt.gz"
+    output:
+        "counts-per-cell/{sample}/{cell}/{window,[0-9]+}_{file_name}.txt.gz"
+    shell:
+        "zcat {input} | awk '(NR==1) || $5 ==\"{wildcards.cell}\"' | gzip > {output}"
+

 ################################################################################
 # Normalize counts                                                             #
@@ -446,7 +452,7 @@ rule prepare_segments:
    input:
        "segmentation/{sample}/{windows}.txt"
    output:
-        "segmentation2/{sample}/{windows}.{bpdens}.txt"
+        "segmentation2/{sample}/{windows}.{bpdens,(many|medium|few)}.txt"
    log:
        "log/prepare_segments/{sample}/{windows}.{bpdens}.log"
    params:
@@ -454,6 +460,43 @@ rule prepare_segments:
    script:
        "utils/helper.prepare_segments.R"

+rule segment_one_cell:
+    input:
+        "counts-per-cell/{sample}/{cell}/{window}_{file_name}.txt.gz"
+    output:
+        "segmentation-per-cell/{sample}/{cell}/{window,\d+}_{file_name}.txt"
+    log:
+        "log/segmentation-per-cell/{sample}/{cell}/{window}_{file_name}.log"
+    params:
+        mc_command = config["mosaicatcher"],
+        min_num_segs = lambda wc: math.ceil(200000 / float(wc.window)) # bins to represent 200 kb
+    shell:
+        """
+        {params.mc_command} segment \
+        --remove-none \
+        --forbid-small-segments {params.min_num_segs} \
+        -M 50000000 \
+        -o {output} \
+        {input} > {log} 2>&1
+        """
+
+rule segmentation_selection:
+    input:
+        counts="counts/{sample}/{window}_{file_name}.txt.gz",
+        jointseg="segmentation/{sample}/{window}_{file_name}.txt",
+        singleseg=lambda wc: ["segmentation-per-cell/{}/{}/{}_{}.txt".format(wc.sample, cell, wc.window, wc.file_name) for cell in CELL_PER_SAMPLE[wc.sample]],
+        info="counts/{sample}/{window}_{file_name}.info",
+    output:
+        jointseg="segmentation2/{sample}/{window,[0-9]+}_{file_name}.selected.txt",
+        strand_states="strand_states/{sample}/{window,[0-9]+}_{file_name}.intitial_strand_state",
+    log:
+        "log/segmentation_selection/{sample}/{window}_{file_name}.log"
+    params:
+        cellnames = lambda wc: ",".join(cell for cell in CELL_PER_SAMPLE[wc.sample]),
+        sce_min_distance = 400000,
+    shell:
+        "./utils/detect_strand_states.py --sce_min_distance {params.sce_min_distance} --output_jointseg {output.jointseg} --output_strand_states {output.strand_states} --samplename {wildcards.sample} --cellnames {params.cellnames} {input.info} {input.counts} {input.jointseg} {input.singleseg} > {log} 2>&1"
+

 ################################################################################
 # SV classification                                                            #
@@ -519,18 +562,30 @@ rule mosaiClassifier_make_call_biallelic:
 # Strand states & phasing                                                      #
 ################################################################################

+# DEPRECATED rule for calling SCEs / determining the initial strand states using 
+# the C++ MosaiCatcher code.
+#rule determine_initial_strand_states:
+    #input:
+        #"counts/{sample}/500000_fixed.txt.gz"
+    #output:
+        #"strand_states/{sample}/intitial_strand_state"
+    #log:
+        #"log/determine_initial_strand_states/{sample}.log"
+    #params:
+        #mc_command = config["mosaicatcher"]
+    #shell:
+        #"""
+        #{params.mc_command} states -o {output} {input} 2>&1 > {log}
+        #"""
+
 rule determine_initial_strand_states:
    input:
-        "counts/{sample}/500000_fixed.txt.gz"
+        "strand_states/{sample}/100000_fixed_norm.intitial_strand_state"
    output:
        "strand_states/{sample}/intitial_strand_state"
-    log:
-        "log/determine_initial_strand_states/{sample}.log"
-    params:
-        mc_command = config["mosaicatcher"]
    shell:
        """
-        {params.mc_command} states -o {output} {input} 2>&1 > {log}
+        cd strand_states/{wildcards.sample} && ln -s 100000_fixed_norm.intitial_strand_state intitial_strand_state && cd ../..
        """

 # Strandphaser needs a different input format which contains the path names to