diff --git a/Snake.config.json b/Snake.config.json new file mode 100644 index 0000000000000000000000000000000000000000..ae35faba0a324cb976873895824765125e99d80e --- /dev/null +++ b/Snake.config.json @@ -0,0 +1,16 @@ +{ + "sample" : "D2Rfb", + + + "mosaicatcher" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/build/mosaicatcher", + "plot_script" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/qc.R", + "sce_script" : "/g/korbel/meiers/tools/mosaicatcher/strandsequtils/sce_utils/SCE.R", + "samtools" : "module load SAMtools && samtools", + + + "exclude_file" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/hg19.exclude", + "variable_bins" : { + "50000" : "utils/variable_bins.GRCh38.50kb.bed", + "100000" : "utils/variable_bins.GRCh38.100kb.bed" + } +} diff --git a/Snakefile b/Snakefile new file mode 100644 index 0000000000000000000000000000000000000000..82845d90c9372ddf6b429eeee8c37d2d2e65e0a1 --- /dev/null +++ b/Snakefile @@ -0,0 +1,101 @@ +configfile: "Snake.config.json" + +BAM, = glob_wildcards("bam/{bam}.bam") + +# Current state of the pipeline: +# ============================== +# * count reads in the BAM files (in fixed and variable-width bins of various sizes) +# * determine strand states of each chromosome in each single cell, including SCEs +# * plot all single cell libraries in different window sizes +# * calculate a segmentation into potential SVs using Mosaicatcher + +rule all: + input: + expand("plots/" + config["sample"] + "_{window}_fixed.pdf", window = [50000, 100000, 200000, 500000]), + expand("plots/" + config["sample"] + "_{window}_variable.pdf", window = [50000, 100000]), + expand("segmentation/" + config["sample"] + "_{window}_fixed.txt", window = [50000, 100000, 200000, 500000]), + expand("segmentation/" + config["sample"] + "_{window}_variable.txt", window = [50000, 100000]), + "strand_states/" + config["sample"] + ".txt" + +rule plot_mosaic_counts: + input: + counts = "counts/" + config["sample"] + "_{file_name}.txt.gz", + info = "counts/" + config["sample"] + "_{file_name}.info" + output: + "plots/" + config["sample"] + "_{file_name}.pdf" + params: + plot_command = "Rscript " + config["plot_script"] + shell: + """ + {params.plot_command} {input.counts} {input.info} {output} + """ + +rule mosaic_count_fixed: + input: + bam = expand("bam/" + config["sample"] + "_{bam}.bam", bam = BAM), + bai = expand("bam/" + config["sample"] + "_{bam}.bam.bai", bam = BAM) + output: + counts = "counts/" + config["sample"] + "_{window}_fixed.txt.gz", + info = "counts/" + config["sample"] + "_{window}_fixed.info" + params: + mc_command = config["mosaicatcher"], + mc_exclfile = config["exclude_file"] + shell: + """ + {params.mc_command} count \ + -o {output.counts} \ + -i {output.info} \ + -x {params.mc_exclfile} \ + -w {wildcards.window} \ + {input.bam} + """ + + +rule mosaic_count_variable: + input: + bam = expand("bam/{bam}.bam", bam = BAM), + bai = expand("bam/{bam}.bam.bai", bam = BAM), + bed = lambda wc: config["variable_bins"][str(wc.window)] + output: + counts = "counts/" + config["sample"] + "_{window}_variable.txt.gz", + info = "counts/" + config["sample"] + "_{window}_variable.info" + params: + mc_command = config["mosaicatcher"] + shell: + """ + {params.mc_command} count \ + -o {output.counts} \ + -i {output.info} \ + -b {input.bed} \ + {input.bam} + """ + + + + +rule determine_strand_states: + input: + "counts/" + config["sample"] + "_500000_fixed.txt.gz" + output: + "strand_states/" + config["sample"] + ".txt" + params: + sce_command = "Rscript " + config["sce_script"] + shell: + """ + {params.sce_command} {input} {output} + """ + + +rule segmentation: + input: + "counts/" + config["sample"] + "_{file_name}.txt.gz" + output: + "segmentation/" + config["sample"] + "_{file_name}.txt" + params: + mc_command = config["mosaicatcher"] + shell: + """ + {params.mc_command} segment \ + -o {output} \ + {input} + """ \ No newline at end of file