diff --git a/Snake.config.json b/Snake.config.json index cb6269f2d23a42e1cd69dcdc2debc0ce24171232..f54db59b144003f86164aa589f014dd4ef6d84f9 100644 --- a/Snake.config.json +++ b/Snake.config.json @@ -22,5 +22,20 @@ "few" : 0.1, "medium" : 0.33, "many" : 0.9 - } + }, + + "simulation_min_vaf" : 0.01, + "simulation_max_vaf" : 1.0, + "simulation_neg_binom_p" : { + "50000" : 0.4, + "100000" : 0.3, + "500000" : 0.1, + }, + "simulation_min_reads_per_library" : 250000, + "simulation_max_reads_per_library" : 1000000, + "simulation_window_sizes" : [50000, 100000], + "simulation_cell_count" : 200, + + "genome_size" : 3e9 + } diff --git a/Snakefile b/Snakefile index 29365ec24a6513f782d89b69cda4f12575ec1895..57e61429a4ef4594de1f00dc42f046be31b6c1cf 100644 --- a/Snakefile +++ b/Snakefile @@ -8,6 +8,7 @@ print("Detected {} samples:".format(len(set(SAMPLE)))) for s in set(SAMPLE): print(" {}:\t{} cells".format(s, len(BAM_PER_SAMPLE[s]))) +import os.path # Current state of the pipeline: # ============================== @@ -38,7 +39,7 @@ rule all: rule simulate_genome: output: - tsv="simulation/genome{seed}.tsv" + tsv="simulation/genome/genome{seed}.tsv" log: "log/simulate_genome/genome{seed}.tsv" params: @@ -49,6 +50,77 @@ rule simulate_genome: shell: "utils/simulate_SVs.R {wildcards.seed} {params.svcount} {params.minsize} {params.maxsize} {params.mindistance} {output.tsv} > {log} 2>&1" +rule add_vafs_to_simulated_genome: + input: + tsv="simulation/genome/genome{seed}.tsv" + output: + tsv="simulation/genome-with-vafs/genome{seed}.tsv" + params: + min_vaf = config["simulation_min_vaf"], + max_vaf = config["simulation_max_vaf"], + shell: + """ + awk -v min_vaf={params.min_vaf} -v max_vaf={params.max_vaf} -v seed={wildcards.seed} \ + 'BEGIN {{srand(seed); OFS="\\t"}} {{vaf=min_vaf+rand()*(max_vaf-min_vaf); print $0, vaf}}' {input.tsv} > {output.tsv} + """ + +def min_coverage(wildcards): + return round(float(config["simulation_min_reads_per_library"]) * int(wildcards.window_size) / float(config["genome_size"])) + +def max_coverage(wildcards): + return round(float(config["simulation_max_reads_per_library"]) * int(wildcards.window_size) / float(config["genome_size"])) + +def neg_binom_p(wildcards): + return float(config["simulation_neg_binom_p"][wildcards.window_size]) + +rule simulate_counts: + input: + config="simulation/genome-with-vafs/genome{seed}.tsv", + output: + counts="simulation/counts/genome{seed}-{window_size}.txt.gz", + segments="simulation/segments/genome{seed}-{window_size}.txt", + phases="simulation/phases/genome{seed}-{window_size}.txt", + info="simulation/info/genome{seed}-{window_size}.txt", + sce="simulation/sce/genome{seed}-{window_size}.txt", + variants="simulation/variants/genome{seed}-{window_size}.txt", + params: + mc_command = config["mosaicatcher"], + neg_binom_p = neg_binom_p, + min_coverage = min_coverage, + max_coverage = max_coverage, + cell_count = config["simulation_cell_count"], + log: + "log/simulate_counts/genome{seed}-{window_size}.txt" + shell: + """ + {params.mc_command} simulate \ + -w {wildcards.window_size} \ + -n {params.cell_count} \ + -p {params.neg_binom_p} \ + -c {params.min_coverage} \ + -C {params.max_coverage} \ + -V {output.variants} \ + -i {output.info} \ + -o {output.counts} \ + -U {output.segments} \ + -P {output.phases} \ + -S {output.sce} \ + {input.config} > {log} 2>&1 + """ + +rule link_to_simulated_counts: + input: + counts="simulation/counts/genome{seed}-{window_size}.txt.gz", + info="simulation/info/genome{seed}-{window_size}.txt", + output: + counts = "counts/simulation{seed}-{window_size}/{window_size}_fixed.txt.gz", + info = "counts/simulation{seed}-{window_size}/{window_size}_fixed.info" + run: + d = os.path.dirname(output.counts) + count_file = os.path.basename(output.counts) + info_file = os.path.basename(output.info) + shell("cd {d} && ln -s ../../{input.counts} {count_file} && ln -s ../../{input.info} {info_file} && cd ../..") + ################################################################################ # Plots # ################################################################################