diff --git a/Snakefile b/Snakefile index be0aa945db407a2e0c6b445de1f148c15ce7d010..d047ab959973b24d19f07e08c8e240c980ac16d8 100644 --- a/Snakefile +++ b/Snakefile @@ -15,7 +15,13 @@ rule all: expand("plots/" + config["sample"] + ".{window}_variable.pdf", window = [50000, 100000]), expand("segmentation2/" + config["sample"] + ".{window}_fixed.{bpdens}.txt", window = [50000, 100000, 200000, 500000], bpdens = ["few","many"]), expand("segmentation2/" + config["sample"] + ".{window}_variable.{bpdens}.txt", window = [50000, 100000], bpdens = ["few","many"]), - "strand_states/" + config["sample"] + ".txt" + "strand_states/" + config["sample"] + ".final.txt" + + + +################################################################################ +# Plots # +################################################################################ rule plot_mosaic_counts: input: @@ -30,6 +36,13 @@ rule plot_mosaic_counts: {params.plot_command} {input.counts} {input.info} {output} """ + + + +################################################################################ +# Read counting # +################################################################################ + rule mosaic_count_fixed: input: bam = expand("bam/{bam}.bam", bam = BAM), @@ -73,19 +86,6 @@ rule mosaic_count_variable: -rule determine_strand_states: - input: - "counts/" + config["sample"] + "_500000_fixed.txt.gz" - output: - "strand_states/" + config["sample"] + ".txt" - params: - sce_command = "Rscript " + config["sce_script"] - shell: - """ - {params.sce_command} {input} {output} - """ - - ################################################################################ @@ -124,7 +124,7 @@ rule run_sv_classification: input: counts = "counts/" + config["sample"] + ".{windows}.txt.gz", info = "counts/" + config["sample"] + ".{windows}.info", - states = "strand_states/" + config["sample"] + ".txt", + states = "strand_states/" + config["sample"] + ".final.txt", bp = "segmentation2/" + config["sample"] + ".{windows}.{bpdens}.txt" output: outdir = "sv_probabilities/{file_name}.{bpdens}/", @@ -134,6 +134,7 @@ rule run_sv_classification: class_command = "Rscript " + config["class_script"] shell: """ + # set haplotype cd {params.class_dir} {params.class_command} \ binRCfile={input.counts} \ @@ -142,8 +143,52 @@ rule run_sv_classification: stateFile={input.states} \ K=22 \ maxCN= 4 \ + haplotypeInfo \ outputDir={output.outdir} """ +################################################################################ +# Strand states & phasing # +################################################################################ + +rule determine_initial_strand_states: + input: + "counts/" + config["sample"] + "_500000_fixed.txt.gz" + output: + "strand_states/" + config["sample"] + ".txt" + params: + sce_command = "Rscript " + config["sce_script"] + shell: + """ + {params.sce_command} {input} {output} + """ + + +# Strandphaser needs a different input format which contains the path names to +# the bam files. This rule extracts this information and prepares an input file. +rule convert_strandphaser_input: + input: + states = "strand_states/" + config["sample"] + ".txt", + info = "counts/" + config["sample"] + "_500000_fixed.info" + output: + "strand_states/" + config["sample"] + ".strandphaser_input.txt" + script: + "utils/helper.convert_strandphaser_input.R" + + +# Dummy rule - this will be replaced by strand-phaser +rule run_strandphaser: + output: + "strand_states/" + config["sample"] + ".strandphaser_output.txt" + +rule convert_strandphaser_output: + input: + phased_states = "strand_states/" + config["sample"] + ".strandphaser_output.txt", + initial_states = "strand_states/" + config["sample"] + ".txt", + info = "counts/" + config["sample"] + "_500000_fixed.info" + output: + "strand_states/" + config["sample"] + ".final.txt" + script: + "utils/helper.convert_strandphaser_output.R" diff --git a/utils/helper.convert_strandphaser_input.R b/utils/helper.convert_strandphaser_input.R new file mode 100755 index 0000000000000000000000000000000000000000..00d09ebd28f18c233bd527b96e26ab8bcfdd4363 --- /dev/null +++ b/utils/helper.convert_strandphaser_input.R @@ -0,0 +1,6 @@ +library(data.table); +d = fread(snakemake@input[["states"]]) +e = fread(snakemake@input[["info"]]) +e$bam = basename(e$bam); +f = merge(d, e, by = c("sample","cell"))[class == "WC", .(chrom,start,end,bam)] +write.table(f, file=snakemake@output[[1]], quote=F, row.names=F, col.names=F, sep="\t") \ No newline at end of file diff --git a/utils/helper.convert_strandphaser_output.R b/utils/helper.convert_strandphaser_output.R new file mode 100755 index 0000000000000000000000000000000000000000..3b3b22c72cc94d7d82826ec9b78b5da95388f116 --- /dev/null +++ b/utils/helper.convert_strandphaser_output.R @@ -0,0 +1,23 @@ +library(data.table); +library(assertthat) +d = fread(snakemake@input[["phased_states"]]) +e = fread(snakemake@input[["info"]]) +g = fread(snakemake@intput[["initial_states"]]) + +d$bam = basename(d$bam); + +e$bam = e$cell +e$cell = NULL +e$sample = NULL +f = merge(d, e, by = "bam")[, .(chrom,start,end,sample,cell,class)] + +# Note that there is still a bug in Venla's strand state detection. +g = merge(g,f, by = c("chrom","start","end","sample","cell"), all.x = T) + +# Overwrite with David's phased strand state if available! +g = g[, class := ifelse(!is.na(class.y), class.y, class.x)][] +g$class.x = NULL +g$class.y = NULL +g = g[, .(chrom, start, end, sample, cell, class)] + +write.table(g, file=snakemake@output[[1]], quote=F, row.names=F, col.names=F, sep="\t") \ No newline at end of file