import math
import os.path
from collections import defaultdict
configfile: "Snake.config.json"
SAMPLE, BAM = glob_wildcards("bam/{sample}/selected/{bam}.bam")
SAMPLES = sorted(set(SAMPLE))

CELL_PER_SAMPLE = defaultdict(list)
BAM_PER_SAMPLE = defaultdict(list)
for sample, bam in zip(SAMPLE, BAM):
    BAM_PER_SAMPLE[sample].append(bam)
    CELL_PER_SAMPLE[sample].append(bam.replace('.sort.mdup', ''))
ALLBAMS_PER_SAMPLE = defaultdict(list)
for sample in SAMPLES:
ALLBAMS_PER_SAMPLE[sample] = glob_wildcards("bam/{}/all/{{bam}}.bam".format(sample)).bam
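# Directory layout assumed by the globs above (inferred from the patterns):
#   bam/{sample}/all/       all cells of a sample (merged for SNV calling)
#   bam/{sample}/selected/  QC-passed cells used for counting and SV calling
# Cell names are derived from BAM basenames by stripping ".sort.mdup", i.e.
# files are expected to be named <cell>.sort.mdup.bam.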
print("Detected {} samples:".format(len(SAMPLES)))
for s in SAMPLES:
print(" {}:\t{} cells\t {} selected cells".format(s, len(ALLBAMS_PER_SAMPLE[s]), len(BAM_PER_SAMPLE[s])))
# Current state of the pipeline:
# ==============================
# * count reads in the BAM files (in fixed and variable-width bins of various sizes)
# * determine strand states of each chromosome in each single cell, including SCEs
# * plot all single cell libraries in different window sizes
# * calculate a segmentation into potential SVs using Mosaicatcher
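# Call-set naming convention, decoded by the wildcards of rule
# mosaiClassifier_make_call and rule filter_calls below: llr{x} is presumably
# the log-likelihood-ratio cutoff, poppriors{TRUE|FALSE} toggles population
# priors, haplotags{TRUE|FALSE} toggles use of haplotype tags, gtcutoff{x} is
# a genotype-probability cutoff, regfactor{x} a regularization factor, and
# filter{TRUE|FALSE} selects the post-processed call set.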
METHODS = [
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0_regfactor6_filterFALSE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.05_regfactor6_filterTRUE",
]
BPDENS = [
"selected_j{}_s{}_scedist{}".format(joint, single, scedist) for joint in [0.1] for single in [0.5] for scedist in [20]
]
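# BPDENS encodes the parameters of rule segmentation_selection:
# j = min_diff_jointseg, s = min_diff_singleseg, and scedist = additional SCE
# cutoff in Mb (the rule appends six zeros to turn it into a bp value).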
# Todo: specify an exact version of the singularity file!
singularity: "docker://smei/mosaicatcher-pipeline:v0.2"
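# One option (illustrative only, digest placeholder not filled in) is to pin
# the image by content digest instead of the mutable tag:
#   singularity: "docker://smei/mosaicatcher-pipeline@sha256:<digest-of-v0.2>"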
localrules:
all,
simul,
simulate_genome,
add_vafs_to_simulated_genome,
link_to_simulated_counts,
link_to_simulated_strand_states,
generate_exclude_file_1,
generate_exclude_file_2,
link_normalized_info_file,
prepare_segments,
split_external_snv_calls,
prepare_strandphaser_config_per_chrom
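# The rules listed above are lightweight (symlinking, small text processing,
# target aggregation); declaring them local makes Snakemake execute them on
# the submitting host rather than dispatching them as cluster jobs.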
rule all:
input:
expand("plots/{sample}/{window}_fixed.pdf", sample = SAMPLES, window = [50000, 100000, 200000, 500000]),
expand("plots/{sample}/{window}_fixed_norm.pdf", sample = SAMPLES, window = [50000, 100000, 200000]),
expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_calls/{method}.{chrom}.pdf",
                sample = SAMPLES,
chrom = config["chromosomes"],
window = [100000],
bpdens = BPDENS,
method = METHODS),
expand("ploidy/{sample}/ploidy.{chrom}.txt", sample = SAMPLES, chrom = config["chromosomes"]),
expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{plottype}.pdf",
sample = SAMPLES,
window = [100000],
bpdens = BPDENS,
method = METHODS,
plottype = ["byaf","bypos"]),
expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_clustering/{method}-{plottype}.pdf",
sample = SAMPLES,
window = [100000],
bpdens = BPDENS,
method = METHODS,
plottype = ["position","chromosome"]),
expand("halo/{sample}/{window}_{suffix}.json.gz",
sample = SAMPLES,
window = [100000],
suffix = ["fixed", "fixed_norm"]),
expand("stats-merged/{sample}/stats.tsv", sample = SAMPLES),
expand("postprocessing/merge/{sample}/{window}_fixed_norm.{bpdens}/{method}.txt",
sample = SAMPLES,
window = [100000],
bpdens = BPDENS,
method = list(set(m.replace('_filterTRUE','').replace('_filterFALSE','') for m in METHODS))),
# expand("cell-mixing-eval/{window}_fixed_norm.{bpdens}/{method}.tsv",
# window = [100000],
# bpdens = BPDENS,
# method = METHODS),
################################################################################
# Simulation of count data #
################################################################################
rule simul:
input:
expand("sv_calls/simulation{seed}-{window}/{window}_fixed.{segments}/{method}.{chrom}.pdf",
seed = list(range(7)),
window = [50000],
segments = ["few","medium"],
method = METHODS,
chrom = config["chromosomes"]),
expand("plots/simulation{seed}-{window}/{window}_fixed.pdf",
seed = list(range(7)),
window = [50000])
rule simulate_genome:
output:
tsv="simulation/genome/genome{seed}.tsv"
log:
"log/simulate_genome/genome{seed}.tsv"
params:
svcount = 200,
minsize = 100000,
maxsize = 5000000,
mindistance = 1000000,
shell:
"utils/simulate_SVs.R {wildcards.seed} {params.svcount} {params.minsize} {params.maxsize} {params.mindistance} {output.tsv} > {log} 2>&1"
rule add_vafs_to_simulated_genome:
input:
tsv="simulation/genome/genome{seed}.tsv"
output:
tsv="simulation/genome-with-vafs/genome{seed}.tsv"
params:
min_vaf = config["simulation_min_vaf"],
max_vaf = config["simulation_max_vaf"],
shell:
"""
# Issue #1022 (https://bitbucket.org/snakemake/snakemake/issues/1022)
awk -v min_vaf={params.min_vaf} \
-v max_vaf={params.max_vaf} \
-v seed={wildcards.seed} \
          -f utils/command4.awk {input.tsv} > {output.tsv}
"""
def min_coverage(wildcards):
return round(float(config["simulation_min_reads_per_library"]) * int(wildcards.window_size) / float(config["genome_size"]))
def max_coverage(wildcards):
return round(float(config["simulation_max_reads_per_library"]) * int(wildcards.window_size) / float(config["genome_size"]))
def neg_binom_p(wildcards):
return float(config["simulation_neg_binom_p"][wildcards.window_size])
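# The three helpers above scale a per-library read count to the expected number
# of reads per bin: reads_per_library * window_size / genome_size. With
# illustrative config values of 500000 reads per library, a 100000 bp window
# and a 3.2e9 bp genome, min_coverage evaluates to
# round(500000 * 100000 / 3.2e9) = 16 expected reads per bin.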
rule simulate_counts:
input:
config="simulation/genome-with-vafs/genome{seed}.tsv",
output:
counts="simulation/counts/genome{seed}-{window_size}.txt.gz",
segments="simulation/segments/genome{seed}-{window_size}.txt",
phases="simulation/phases/genome{seed}-{window_size}.txt",
info="simulation/info/genome{seed}-{window_size}.txt",
sce="simulation/sce/genome{seed}-{window_size}.txt",
variants="simulation/variants/genome{seed}-{window_size}.txt",
params:
mc_command = config["mosaicatcher"],
neg_binom_p = neg_binom_p,
min_coverage = min_coverage,
max_coverage = max_coverage,
cell_count = config["simulation_cell_count"],
alpha = config["simulation_alpha"],
log:
"log/simulate_counts/genome{seed}-{window_size}.log"
shell:
"""
{params.mc_command} simulate \
-w {wildcards.window_size} \
--seed {wildcards.seed} \
-n {params.cell_count} \
-p {params.neg_binom_p} \
-c {params.min_coverage} \
-C {params.max_coverage} \
-a {params.alpha} \
-V {output.variants} \
-i {output.info} \
-o {output.counts} \
-U {output.segments} \
-P {output.phases} \
-S {output.sce} \
--sample-name simulation{wildcards.seed}-{wildcards.window_size} \
{input.config} > {log} 2>&1
"""
rule link_to_simulated_counts:
input:
counts="simulation/counts/genome{seed}-{window_size}.txt.gz",
info="simulation/info/genome{seed}-{window_size}.txt",
output:
counts = "counts/simulation{seed}-{window_size}/{window_size}_fixed.txt.gz",
info = "counts/simulation{seed}-{window_size}/{window_size}_fixed.info"
run:
d = os.path.dirname(output.counts)
count_file = os.path.basename(output.counts)
info_file = os.path.basename(output.info)
shell("cd {d} && ln -s ../../{input.counts} {count_file} && ln -s ../../{input.info} {info_file} && cd ../..")
rule link_to_simulated_strand_states:
input:
sce="simulation/sce/genome{seed}-{window_size}.txt",
output:
states="strand_states/simulation{seed}-{window_size}/final.txt",
run:
d = os.path.dirname(output.states)
f = os.path.basename(output.states)
shell("cd {d} && ln -s ../../{input.sce} {f} && cd ../..")
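# The two linking rules above create relative symlinks: the link files live
# two directory levels below the working directory, which is why the rules cd
# into the target directory and prefix the link targets with ../../.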
ruleorder: link_to_simulated_counts > mosaic_count_fixed
ruleorder: link_to_simulated_strand_states > convert_strandphaser_output
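# The ruleorder declarations make Snakemake prefer the symlinking rules
# whenever a simulated sample's counts or strand states could also be produced
# by the generic rules (mosaic_count_fixed, convert_strandphaser_output).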
################################################################################
# Ploidy estimation #
################################################################################
rule estimate_ploidy:
input:
"counts/{sample}/100000_fixed.txt.gz"
output:
"ploidy/{sample}/ploidy.{chrom}.txt"
log:
"log/estimate_ploidy/{sample}/{chrom}.log"
shell:
"""
PYTHONPATH="" # Issue #1031 (https://bitbucket.org/snakemake/snakemake/issues/1031)
python utils/ploidy-estimator.py --chromosome {wildcards.chrom} {input} > {output} 2> {log}
"""
################################################################################
# Plots #
################################################################################
rule plot_mosaic_counts:
input:
counts = "counts/{sample}/{file_name}.txt.gz",
info = "counts/{sample}/{file_name}.info"
output:
"plots/{sample}/{file_name}.pdf"
log:
"log/plot_mosaic_counts/{sample}/{file_name}.log"
params:
plot_command = "Rscript " + config["plot_script"]
shell:
"""
{params.plot_command} {input.counts} {input.info} {output} > {log} 2>&1
"""
ruleorder: plot_SV_calls_simulated > plot_SV_calls
rule plot_SV_calls:
input:
counts = "counts/{sample}/{windows}.txt.gz",
calls = "sv_calls/{sample}/{windows}.{bpdens}/{method}_filter{filter}.txt",
complex = "sv_calls/{sample}/{windows}.{bpdens}/{method}_filter{filter}.complex.tsv",
strand = "strand_states/{sample}/{windows}.{bpdens}/final.txt",
segments = "segmentation2/{sample}/{windows}.{bpdens}.txt",
scsegments = "segmentation-singlecell/{sample}/{windows}.{bpdens}.txt",
grouptrack = "postprocessing/group-table/{sample}/{windows}.{bpdens}/{method}.tsv",
output:
"sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/plots/sv_calls/{method}_filter{filter,(TRUE|FALSE)}.{chrom}.pdf"
log:
"log/plot_SV_calls/{sample}/{windows}.{bpdens}.{method}_filter{filter}.{chrom}.log"
shell:
"""
Rscript utils/plot-sv-calls.R \
segments={input.segments} \
singlecellsegments={input.scsegments} \
strand={input.strand} \
complex={input.complex} \
groups={input.grouptrack} \
calls={input.calls} \
{input.counts} \
{wildcards.chrom} \
{output} > {log} 2>&1
"""
rule plot_SV_calls_simulated:
input:
counts = "counts/simulation{seed}-{window}/{window}_fixed.txt.gz",
calls = "sv_calls/simulation{seed}-{window}/{window}_fixed.{bpdens}/{method}.txt",
strand = "strand_states/simulation{seed}-{window}/final.txt",
segments = "segmentation2/simulation{seed}-{window}/{window}_fixed.{bpdens}.txt",
truth = "simulation/variants/genome{seed}-{window}.txt"
output:
"sv_calls/simulation{seed}-{window}/{window}_fixed.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/plots/sv_calls/{method}.{chrom}.pdf"
log:
"log/plot_SV_calls_simulated/simulation{seed}-{window}/{window}_fixed.{bpdens}.{method}.{chrom}.log"
shell:
"""
Rscript utils/plot-sv-calls.R \
segments={input.segments} \
strand={input.strand} \
truth={input.truth} \
calls={input.calls} \
{input.counts} \
{wildcards.chrom} \
          {output} > {log} 2>&1
"""
rule plot_SV_consistency_barplot:
input:
sv_calls = "sv_calls/{sample}/{windows}.{bpdens}/{method}.txt",
output:
barplot_bypos = "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/plots/sv_consistency/{method}.consistency-barplot-bypos.pdf",
barplot_byaf = "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/plots/sv_consistency/{method}.consistency-barplot-byaf.pdf",
log:
"log/plot_SV_consistency/{sample}/{windows}.{bpdens}.{method}.log"
script:
"utils/sv_consistency_barplot.snakemake.R"
rule generate_halo_json:
input:
counts = "counts/{sample}/{windows}.txt.gz",
output:
json = "halo/{sample}/{windows}.json.gz",
log:
"log/generate_halo_json/{sample}/{windows}.{windows}.log"
shell:
"""
PYTHONPATH="" # Issue #1031 (https://bitbucket.org/snakemake/snakemake/issues/1031)
(./utils/counts_to_json.py {input.counts} | gzip > {output.json}) 2> {log}
"""
rule plot_clustering:
input:
sv_calls = "sv_calls/{sample}/{windows}.{bpdens}/{method}.txt",
binbed = "utils/bin_200kb_all.bed",
output:
position = "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/plots/sv_clustering/{method}-position.pdf",
chromosome = "sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/plots/sv_clustering/{method}-chromosome.pdf",
log:
"log/plot_clustering/{sample}/{windows}.{bpdens}.{method}.log"
script:
"utils/plot-clustering.snakemake.R"
################################################################################
# Read counting #
################################################################################
rule generate_exclude_file_1:
output:
temp("log/exclude_file.temp")
input:
bam = expand("bam/{sample}/selected/{bam}.bam", sample = SAMPLES[0], bam = BAM_PER_SAMPLE[SAMPLES[0]][0])
log:
"log/generate_exclude_file_1.log"
params:
samtools = config["samtools"]
shell:
"""
{params.samtools} view -H {input.bam} | awk "/^@SQ/" > {output} 2> {log}
"""
rule generate_exclude_file_2:
    output:
        "log/exclude_file"
    input:
        "log/exclude_file.temp"
    params:
        chroms = config["chromosomes"]
    run:
        # Write every @SQ contig that is not a configured chromosome; the
        # resulting list is passed to "mosaic count" via -x to exclude those
        # contigs from counting.
        with open(input[0]) as f:
            with open(output[0], "w") as out:
                for line in f:
                    contig = line.strip().split()[1]
                    contig = contig[3:]  # drop the "SN:" prefix of the @SQ field
                    if contig not in params.chroms:
                        print(contig, file = out)
rule mosaic_count_fixed:
input:
bam = lambda wc: expand("bam/" + wc.sample + "/selected/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]) if wc.sample in BAM_PER_SAMPLE else "FOOBAR",
bai = lambda wc: expand("bam/" + wc.sample + "/selected/{bam}.bam.bai", bam = BAM_PER_SAMPLE[wc.sample]) if wc.sample in BAM_PER_SAMPLE else "FOOBAR",
excl = "log/exclude_file"
output:
counts = "counts/{sample}/{window}_fixed.txt.gz",
info = "counts/{sample}/{window}_fixed.info"
log:
"log/{sample}/mosaic_count_fixed.{window}.log"
params:
mc_command = config["mosaicatcher"]
shell:
"""
{params.mc_command} count \
--verbose \
--do-not-blacklist-hmm \
-o {output.counts} \
-i {output.info} \
-x {input.excl} \
-w {wildcards.window} \
{input.bam} \
> {log} 2>&1
"""
rule mosaic_count_variable:
input:
bam = lambda wc: expand("bam/" + wc.sample + "/selected/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]),
bai = lambda wc: expand("bam/" + wc.sample + "/selected/{bam}.bam.bai", bam = BAM_PER_SAMPLE[wc.sample]),
bed = lambda wc: config["variable_bins"][str(wc.window)],
excl = "log/exclude_file"
output:
counts = "counts/{sample}/{window}_variable.txt.gz",
info = "counts/{sample}/{window}_variable.info"
log:
"log/{sample}/mosaic_count_variable.{window}.log"
params:
mc_command = config["mosaicatcher"]
shell:
"""
echo "NOTE: Exclude file not used in variable-width bins"
{params.mc_command} count \
--verbose \
-o {output.counts} \
-i {output.info} \
-b {input.bed} \
{input.bam} \
> {log} 2>&1
"""
rule extract_single_cell_counts:
input:
"counts/{sample}/{window}_{file_name}.txt.gz"
output:
"counts-per-cell/{sample}/{cell}/{window,[0-9]+}_{file_name}.txt.gz"
shell:
"""
# Issue #1022 (https://bitbucket.org/snakemake/snakemake/issues/1022)
zcat {input} | awk -v name={wildcards.cell} -f utils/command1.awk | gzip > {output}
"""
################################################################################
# Normalize counts #
################################################################################
rule merge_blacklist_bins:
input:
norm = "utils/normalization/HGSVC.{window}.txt",
whitelist = "utils/normalization/inversion-whitelist.tsv",
output:
merged = "normalizations/HGSVC.{window}.merged.tsv"
log:
"log/merge_blacklist_bins/{window}.log"
shell:
"""
PYTHONPATH="" # Issue #1031 (https://bitbucket.org/snakemake/snakemake/issues/1031)
utils/merge-blacklist.py --merge_distance 500000 {input.norm} --whitelist {input.whitelist} --min_whitelist_interval_size 100000 > {output.merged} 2>> {log}
"""
rule normalize_counts:
input:
counts = "counts/{sample}/{window}_fixed.txt.gz",
norm = "normalizations/HGSVC.{window}.merged.tsv",
output:
"counts/{sample}/{window}_fixed_norm.txt.gz"
log:
"log/normalize_counts/{sample}/{window}_fixed.log"
shell:
"""
        Rscript utils/normalize.R {input.counts} {input.norm} {output} > {log} 2>&1
"""
rule link_normalized_info_file:
input:
info = "counts/{sample}/{window}_fixed.info"
output:
info = "counts/{sample}/{window}_fixed_norm.info"
run:
d = os.path.dirname(output.info)
file = os.path.basename(output.info)
shell("cd {d} && ln -s ../../{input.info} {file} && cd ../..")
################################################################################
# Segmentation #
################################################################################
rule segmentation:
input:
"counts/{sample}/{window}_{file_name}.txt.gz"
output:
"segmentation/{sample}/{window,\d+}_{file_name}.txt.fixme"
log:
"log/segmentation/{sample}/{window}_{file_name}.log"
params:
mc_command = config["mosaicatcher"],
min_num_segs = lambda wc: math.ceil(200000 / float(wc.window)) # bins to represent 200 kb
shell:
"""
{params.mc_command} segment \
--remove-none \
--forbid-small-segments {params.min_num_segs} \
-M 50000000 \
-o {output} \
{input} > {log} 2>&1
"""
# TODO: This is a workaround because latest versions of "mosaic segment" don't compute the "bps"
# TODO: column properly. Remove once fixed in the C++ code.
rule fix_segmentation:
input:
"segmentation/{sample}/{window}_{file_name}.txt.fixme"
output:
"segmentation/{sample}/{window,\d+}_{file_name}.txt"
shell:
"""
# Issue #1022 (https://bitbucket.org/snakemake/snakemake/issues/1022)
awk -v name={wildcards.sample} -v window={wildcards.window} -f utils/command2.awk {input} > {output}
"""
# Pick a few segmentations and prepare the input files for SV classification
rule prepare_segments:
input:
"segmentation/{sample}/{windows}.txt"
output:
"segmentation2/{sample}/{windows}.{bpdens,(many|medium|few)}.txt"
log:
"log/prepare_segments/{sample}/{windows}.{bpdens}.log"
params:
quantile = lambda wc: config["bp_density"][wc.bpdens]
script:
"utils/helper.prepare_segments.R"
rule segment_one_cell:
input:
"counts-per-cell/{sample}/{cell}/{window}_{file_name}.txt.gz"
output:
"segmentation-per-cell/{sample}/{cell}/{window,\d+}_{file_name}.txt.fixme"
log:
"log/segmentation-per-cell/{sample}/{cell}/{window}_{file_name}.log"
params:
mc_command = config["mosaicatcher"],
min_num_segs = lambda wc: math.ceil(200000 / float(wc.window)) # bins to represent 200 kb
shell:
"""
{params.mc_command} segment \
--remove-none \
--forbid-small-segments {params.min_num_segs} \
-M 50000000 \
-o {output} \
{input} > {log} 2>&1
"""
# TODO: This is a workaround because latest versions of "mosaic segment" don't compute the "bps"
# TODO: column properly. Remove once fixed in the C++ code.
rule fix_segmentation_one_cell:
input:
"segmentation-per-cell/{sample}/{cell}/{window}_{file_name}.txt.fixme"
output:
"segmentation-per-cell/{sample}/{cell}/{window,\d+}_{file_name}.txt"
shell:
"""
# Issue #1022 (https://bitbucket.org/snakemake/snakemake/issues/1022)
awk -v name={wildcards.sample} -v window={wildcards.window} -f utils/command2.awk {input} > {output}
"""
rule segmentation_selection:
input:
counts="counts/{sample}/{window}_{file_name}.txt.gz",
jointseg="segmentation/{sample}/{window}_{file_name}.txt",
singleseg=lambda wc: ["segmentation-per-cell/{}/{}/{}_{}.txt".format(wc.sample, cell, wc.window, wc.file_name) for cell in CELL_PER_SAMPLE[wc.sample]],
info="counts/{sample}/{window}_{file_name}.info",
output:
jointseg="segmentation2/{sample}/{window,[0-9]+}_{file_name}.selected_j{min_diff_jointseg}_s{min_diff_singleseg}_scedist{additional_sce_cutoff}.txt",
singleseg="segmentation-singlecell/{sample}/{window,[0-9]+}_{file_name}.selected_j{min_diff_jointseg}_s{min_diff_singleseg}_scedist{additional_sce_cutoff}.txt",
strand_states="strand_states/{sample}/{window,[0-9]+}_{file_name}.selected_j{min_diff_jointseg}_s{min_diff_singleseg}_scedist{additional_sce_cutoff}/intitial_strand_state",
log:
"log/segmentation_selection/{sample}/{window}_{file_name}.selected_j{min_diff_jointseg}_s{min_diff_singleseg}_scedist{additional_sce_cutoff}.log"
params:
cellnames = lambda wc: ",".join(cell for cell in CELL_PER_SAMPLE[wc.sample]),
sce_min_distance = 500000,
shell:
"""
PYTHONPATH="" # Issue #1031 (https://bitbucket.org/snakemake/snakemake/issues/1031)
./utils/detect_strand_states.py \
--sce_min_distance {params.sce_min_distance} \
--sce_add_cutoff {wildcards.additional_sce_cutoff}000000 \
--min_diff_jointseg {wildcards.min_diff_jointseg} \
--min_diff_singleseg {wildcards.min_diff_singleseg} \
--output_jointseg {output.jointseg} \
--output_singleseg {output.singleseg} \
--output_strand_states {output.strand_states} \
--samplename {wildcards.sample} \
--cellnames {params.cellnames} \
{input.info} \
{input.counts} \
{input.jointseg} \
{input.singleseg} > {log} 2>&1
"""
################################################################################
# SV classification #
################################################################################
rule plot_heatmap:
input:
maryam = "utils/R-packages2/MaRyam/R/MaRyam",
haplotypeProbs = "sv_probabilities/{sample}/{windows}.{bpdens}/allSegCellProbs.table",
genotypeProbs = "sv_probabilities/{sample}/{windows}.{bpdens}/allSegCellGTprobs.table",
info = "counts/{sample}/{windows}.info",
bamNames = "sv_probabilities/{sample}/{windows}.{bpdens}/bamNames.txt"
output:
"sv_probabilities/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/final_plots/heatmapPlots.pdf"
params:
r_package_path = "utils/R-packages2"
log:
"log/plot_heatmap/{sample}/{windows}.{bpdens}.log"
script:
"utils/plot_heatmap.R"
################################################################################
# New SV classification based on a combination of Sascha's and Maryam's method #
################################################################################
rule mosaiClassifier_make_call:
input:
probs = 'haplotag/table/{sample}/haplotag-likelihoods.{window}_fixed_norm.{bpdens}.Rdata'
output:
"sv_calls/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}_filterFALSE.txt"
params:
minFrac_used_bins = 0.8
log:
"log/mosaiClassifier_make_call/{sample}/{window}_fixed_norm.{bpdens}.llr{llr}.poppriors{pop_priors}.haplotags{use_haplotags}.gtcutoff{gtcutoff}.regfactor{regfactor}.log"
script:
"utils/mosaiClassifier_call.snakemake.R"
rule filter_calls:
input:
inputcalls = "sv_calls/{sample}/{window}_fixed_norm.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors}_haplotags{use_haplotags}_gtcutoff{gtcutoff}_regfactor{regfactor}_filterFALSE.txt",
mergedcalls = "postprocessing/merge/{sample}/{window}_fixed_norm.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors}_haplotags{use_haplotags}_gtcutoff{gtcutoff}_regfactor{regfactor}.txt",
output:
calls = "sv_calls/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}_filterTRUE.txt"
shell:
"""
PYTHONPATH="" # Issue #1031 (https://bitbucket.org/snakemake/snakemake/issues/1031)
utils/apply_filter.py {input.inputcalls} {input.mergedcalls} > {output.calls}
"""
rule mosaiClassifier_calc_probs:
input:
counts = "counts/{sample}/{windows}.txt.gz",
info = "counts/{sample}/{windows}.info",
states = "strand_states/{sample}/{windows}.{bpdens}/final.txt",
bp = "segmentation2/{sample}/{windows}.{bpdens}.txt"
output:
output = "sv_probabilities/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/probabilities.Rdata"
log:
"log/mosaiClassifier_calc_probs/{sample}/{windows}.{bpdens}.log"
script:
"utils/mosaiClassifier.snakemake.R"
rule mosaiClassifier_make_call_biallelic:
input:
probs = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
output:
"sv_calls/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/biAllelic_llr{llr}.txt"
log:
"log/mosaiClassifier_make_call_biallelic/{sample}/{windows}.{bpdens}.{llr}.log"
script:
"utils/mosaiClassifier_call_biallelic.snakemake.R"
rule call_complex_regions:
input:
calls = "sv_calls/{sample}/{windows}.{bpdens}/{method}.txt",
output:
complex = "sv_calls/{sample}/{windows}.{bpdens}/{method}.complex.tsv",
log:
"log/call_complex_regions/{sample}/{windows}.{bpdens}.{method}.log"
shell:
"""
PYTHONPATH="" # Issue #1031 (https://bitbucket.org/snakemake/snakemake/issues/1031)
utils/call-complex-regions.py \
--merge_distance 5000000 \
--ignore_haplotypes \
--min_cell_count 2 {input.calls} > {output.complex} 2>{log}
"""
rule postprocessing_filter:
input:
calls = "sv_calls/{sample}/{window}_fixed_norm.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors}_haplotags{use_haplotags}_gtcutoff{gtcutoff}_regfactor{regfactor}_filterFALSE.txt"
output:
calls = "postprocessing/filter/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}.txt"
shell:
'utils/filter_MosaiCatcher_calls.pl {input.calls} > {output.calls}'
rule postprocessing_merge:
input:
calls = "postprocessing/filter/{sample}/{window}_fixed_norm.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors}_haplotags{use_haplotags}_gtcutoff{gtcutoff}_regfactor{regfactor}.txt"
output:
calls = "postprocessing/merge/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}.txt"
shell:
'utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl {input.calls} > {output.calls}'
rule postprocessing_sv_group_table:
input:
calls = "postprocessing/merge/{sample}/{window}_fixed_norm.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors}_haplotags{use_haplotags}_gtcutoff{gtcutoff}_regfactor{regfactor}.txt"
output:
grouptrack = "postprocessing/group-table/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}.tsv"
shell:
"""
PYTHONPATH="" # Issue #1031 (https://bitbucket.org/snakemake/snakemake/issues/1031)
utils/create-sv-group-track.py {input.calls} > {output.grouptrack}
"""
################################################################################
# Strand states & phasing #
################################################################################
# DEPRECATED rule for calling SCEs / determining the initial strand states using
# the C++ MosaiCatcher code.
#rule determine_initial_strand_states:
#input:
#"counts/{sample}/500000_fixed.txt.gz"
#output:
#"strand_states/{sample}/intitial_strand_state"
#log:
#"log/determine_initial_strand_states/{sample}.log"
#params:
#mc_command = config["mosaicatcher"]
#shell:
#"""
#{params.mc_command} states -o {output} {input} 2>&1 > {log}
#"""
#rule determine_initial_strand_states:
#input:
#"strand_states/{sample}/100000_fixed_norm.selected_j0.5_s1.0.intitial_strand_state"
#output:
#"strand_states/{sample}/intitial_strand_state"
#shell:
#"""
#cd strand_states/{wildcards.sample} && ln -s 100000_fixed_norm.selected_j0.5_s1.0.intitial_strand_state intitial_strand_state && cd ../..
#"""
# Strandphaser needs a different input format which contains the path names to
# the bam files. This rule extracts this information and prepares an input file.
rule convert_strandphaser_input:
input:
states = "strand_states/{sample}/{windows}.{bpdens}/intitial_strand_state",
info = "counts/{sample}/500000_fixed.info"
output:
"strand_states/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/strandphaser_input.txt"
log:
"log/convert_strandphaser_input/{sample}/{windows}.{bpdens}.log"
script:
"utils/helper.convert_strandphaser_input.R"
rule install_StrandPhaseR:
output:
"utils/R-packages/StrandPhaseR/R/StrandPhaseR"
log:
"log/install_StrandPhaseR.log"
shell:
"""
TAR=$(which tar) Rscript utils/install_strandphaser.R > {log} 2>&1
"""
rule prepare_strandphaser_config_per_chrom:
input:
"strand_states/{sample}/{windows}.{bpdens}/intitial_strand_state"
output:
"strand_states/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/StrandPhaseR.{chrom}.config"
run:
with open(output[0], "w") as f:
print("[General]", file = f)
print("numCPU = 1", file = f)
print("chromosomes = '" + wildcards.chrom + "'", file = f)
            if config["paired_end"]:
print("pairedEndReads = TRUE", file = f)
else:
print("pairedEndReads = FALSE", file = f)
print("min.mapq = 10", file = f)
print("", file = f)
print("[StrandPhaseR]", file = f)
print("positions = NULL", file = f)
print("WCregions = NULL", file = f)
print("min.baseq = 20", file = f)
print("num.iterations = 2", file = f)
print("translateBases = TRUE", file = f)
print("fillMissAllele = NULL", file = f)
print("splitPhasedReads = TRUE", file = f)
print("compareSingleCells = TRUE", file = f)
print("callBreaks = FALSE", file = f)
print("exportVCF = '", wildcards.sample, "'", sep = "", file = f)
print("bsGenome = '", config["R_reference"], "'", sep = "", file = f)
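# Pick the SNV source for StrandPhaseR, in this order of precedence:
# 1. external per-sample calls, if config["snv_calls"][sample] is set;
# 2. regenotyping of known sites (config["snv_sites_to_genotype"]) via freebayes;
# 3. de novo calls from rule call_SNVs_bcftools_chrom.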
def locate_snv_vcf(wildcards):
if "snv_calls" not in config or wildcards.sample not in config["snv_calls"] or config["snv_calls"][wildcards.sample] == "":
if "snv_sites_to_genotype" in config and config["snv_sites_to_genotype"] != "":
return "snv_genotyping/{}/{}.vcf".format(wildcards.sample, wildcards.chrom)
else:
return "snv_calls/{}/{}.vcf".format(wildcards.sample, wildcards.chrom)
else:
return "external_snv_calls/{}/{}.vcf".format(wildcards.sample, wildcards.chrom)
rule run_strandphaser_per_chrom:
input:
wcregions = "strand_states/{sample}/{windows}.{bpdens}/strandphaser_input.txt",
snppositions = locate_snv_vcf,
configfile = "strand_states/{sample}/{windows}.{bpdens}/StrandPhaseR.{chrom}.config",
strandphaser = "utils/R-packages/StrandPhaseR/R/StrandPhaseR",
bamfolder = "bam/{sample}/selected"
output:
"strand_states/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/StrandPhaseR_analysis.{chrom}/Phased/phased_haps.txt",
"strand_states/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/StrandPhaseR_analysis.{chrom}/VCFfiles/{chrom}_phased.vcf",
log:
"log/run_strandphaser_per_chrom/{sample}/{windows}.{bpdens}/{chrom}.log"
shell:
"""
Rscript utils/StrandPhaseR_pipeline.R \
{input.bamfolder} \
strand_states/{wildcards.sample}/{wildcards.windows}.{wildcards.bpdens}/StrandPhaseR_analysis.{wildcards.chrom} \
{input.configfile} \
{input.wcregions} \
{input.snppositions} \
$(pwd)/utils/R-packages/ \
> {log} 2>&1
"""
rule compress_vcf:
input:
vcf="{file}.vcf",
output:
vcf="{file}.vcf.gz",
log:
"log/compress_vcf/{file}.log"
shell:
"(cat {input.vcf} | bgzip > {output.vcf}) > {log} 2>&1"
rule index_vcf:
input:
vcf="{file}.vcf.gz",
output:
tbi="{file}.vcf.gz.tbi",
shell:
"bcftools index --tbi {input.vcf}"
rule merge_strandphaser_vcfs:
input:
vcfs=expand("strand_states/{{sample}}/{{windows}}.{{bpdens}}/StrandPhaseR_analysis.{chrom}/VCFfiles/{chrom}_phased.vcf.gz", chrom=config["chromosomes"]),
tbis=expand("strand_states/{{sample}}/{{windows}}.{{bpdens}}/StrandPhaseR_analysis.{chrom}/VCFfiles/{chrom}_phased.vcf.gz.tbi", chrom=config["chromosomes"]),
output:
vcf='phased-snvs/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}.vcf.gz'
log:
"log/merge_strandphaser_vcfs/{sample}/{windows}.{bpdens}.log"
shell:
"(bcftools concat -a {input.vcfs} | bcftools view -o {output.vcf} -O z --genotype het --types snps - ) > {log} 2>&1"
rule combine_strandphaser_output:
input:
expand("strand_states/{{sample}}/{{windows}}.{{bpdens}}/StrandPhaseR_analysis.{chrom}/Phased/phased_haps.txt",
chrom = config["chromosomes"])
output:
"strand_states/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/strandphaser_output.txt"
log:
"log/combine_strandphaser_output/{sample}/{windows}.{bpdens}.log"
shell:
"""
set +o pipefail
cat {input} | head -n1 > {output};
tail -q -n+2 {input} >> {output};
"""
rule convert_strandphaser_output:
input:
phased_states = "strand_states/{sample}/{windows}.{bpdens}/strandphaser_output.txt",
initial_states = "strand_states/{sample}/{windows}.{bpdens}/intitial_strand_state",
info = "counts/{sample}/500000_fixed.info"
output:
"strand_states/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/final.txt"
log:
"log/convert_strandphaser_output/{sample}/{windows}.{bpdens}.log"
script:
"utils/helper.convert_strandphaser_output.R"
################################################################################
# Haplotagging #
################################################################################
rule haplotag_bams:
input:
vcf='phased-snvs/{sample}/{windows}.{bpdens}.vcf.gz',
tbi='phased-snvs/{sample}/{windows}.{bpdens}.vcf.gz.tbi',
bam='bam/{sample}/selected/{bam}.bam',
bai='bam/{sample}/selected/{bam}.bam.bai'
output:
bam='haplotag/bam/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/{bam}.bam',
log:
"log/haplotag_bams/{sample}/{windows}.{bpdens}/{bam}.log"
params:
ref = config["reference"]
shell:
"whatshap haplotag -o {output.bam} -r {params.ref} {input.vcf} {input.bam} > {log} 2>{log}"
rule create_haplotag_segment_bed:
input:
segments="segmentation2/{sample}/{size}{what}.{bpdens}.txt",
output:
bed="haplotag/bed/{sample}/{size,[0-9]+}{what}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}.bed",
shell:
"""
# Issue #1022 (https://bitbucket.org/snakemake/snakemake/issues/1022)
awk -v s={wildcards.size} -f utils/command3.awk {input.segments} > {output.bed}
"""
rule create_haplotag_table:
input:
bam='haplotag/bam/{sample}/{windows}.{bpdens}/{cell}.bam',
bai='haplotag/bam/{sample}/{windows}.{bpdens}/{cell}.bam.bai',
bed = "haplotag/bed/{sample}/{windows}.{bpdens}.bed"
output:
tsv='haplotag/table/{sample}/by-cell/haplotag-counts.{cell}.{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}.tsv'
log:
"log/create_haplotag_table/{sample}.{cell}.{windows}.{bpdens}.log"
script:
"utils/haplotagTable.snakemake.R"
rule merge_haplotag_tables:
input:
tsvs=lambda wc: ['haplotag/table/{}/by-cell/haplotag-counts.{}.{}.{}.tsv'.format(wc.sample,cell,wc.windows,wc.bpdens) for cell in BAM_PER_SAMPLE[wc.sample]],
output:
tsv='haplotag/table/{sample}/full/haplotag-counts.{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}.tsv'
shell:
'(head -n1 {input.tsvs[0]} && tail -q -n +2 {input.tsvs}) > {output.tsv}'
rule create_haplotag_likelihoods:
input:
haplotag_table='haplotag/table/{sample}/full/haplotag-counts.{windows}.{bpdens}.tsv',
sv_probs_table = 'sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata',
output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}.Rdata'
log:
"log/create_haplotag_likelihoods/{sample}.{windows}.{bpdens}.log"
script:
"utils/haplotagProbs.snakemake.R"
################################################################################
# Call SNVs #
################################################################################
rule mergeBams:
input:
lambda wc: expand("bam/" + wc.sample + "/all/{bam}.bam", bam = ALLBAMS_PER_SAMPLE[wc.sample]) if wc.sample in ALLBAMS_PER_SAMPLE else "FOOBAR",
output:
"snv_calls/{sample}/merged.bam"
log:
"log/mergeBams/{sample}.log"
threads:
4
shell:
        config["samtools"] + " merge -@ {threads} {output} {input} > {log} 2>&1"
rule index_bam:
input:
"{file}.bam"
output:
"{file}.bam.bai"
log:
"{file}.bam.log"
shell:
config["samtools"] + " index {input} 2> {log}"
rule call_SNVs_bcftools_chrom:
input:
bam = "snv_calls/{sample}/merged.bam",
bai = "snv_calls/{sample}/merged.bam.bai"
output:
"snv_calls/{sample}/{chrom,chr[0-9A-Z]+}.vcf"
log:
"log/call_SNVs_bcftools_chrom/{sample}/{chrom}.log"
params:
fa = config["reference"],
samtools = config["samtools"],
bcftools = config["bcftools"]
shell:
"""
        ({params.samtools} mpileup -r {wildcards.chrom} -g -f {params.fa} {input.bam} \
            | {params.bcftools} call -mv - | {params.bcftools} view --genotype het --types snps - > {output}) 2> {log}
"""
rule regenotype_SNVs:
input:
bam = "snv_calls/{sample}/merged.bam",
bai = "snv_calls/{sample}/merged.bam.bai",
sites = config["snv_sites_to_genotype"],
output:
vcf = "snv_genotyping/{sample}/{chrom,chr[0-9A-Z]+}.vcf"
log:
"log/snv_genotyping/{sample}/{chrom}.log"
params:
fa = config["reference"],
bcftools = config["bcftools"]
shell:
"""
(freebayes -f {params.fa} -r {wildcards.chrom} -@ {input.sites} --only-use-input-alleles {input.bam} --genotype-qualities | {params.bcftools} view --exclude-uncalled --genotype het --types snps --include "QUAL>=10" - > {output.vcf}) 2> {log}
"""
rule merge_SNV_calls:
input:
expand("snv_calls/{{sample}}/{chrom}.vcf", chrom = config['chromosomes'])
output:
"snv_calls/{sample}/all.vcf"
log:
"log/merge_SNV_calls/{sample}.log"
shell:
        config["bcftools"] + " concat -O v -o {output} {input} > {log} 2>&1"
rule split_external_snv_calls:
input:
vcf = lambda wc: config["snv_calls"][wc.sample],
tbi = lambda wc: config["snv_calls"][wc.sample] + ".tbi"
output:
vcf = "external_snv_calls/{sample}/{chrom}.vcf"
log:
"log/split_external_snv_calls/{sample}/{chrom}.vcf.log"
params:
bcftools = config["bcftools"]
shell:
"""
({params.bcftools} view --samples {wildcards.sample} \
--types snps \
--exclude-uncalled \
--trim-alt-alleles \
-m 2 -M 2 \
{input.vcf} \
{wildcards.chrom} \
| {params.bcftools} view --genotype het - \
> {output.vcf} ) \
> {log} 2>&1
"""
################################################################################
# Summary statistics on sv calls #
################################################################################
rule summary_statistics:
input:
segmentation = 'segmentation2/{sample}/{windows}.{bpdens}.txt',
strandstates = 'strand_states/{sample}/{windows}.{bpdens}/intitial_strand_state',
sv_calls = 'sv_calls/{sample}/{windows}.{bpdens}/{method}_filter{filter}.txt',
complex = "sv_calls/{sample}/{windows}.{bpdens}/{method}_filter{filter}.complex.tsv",
merged = "postprocessing/merge/{sample}/{windows}.{bpdens}/{method}.txt",
output:
tsv = 'stats/{sample}/{windows}.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+_scedist[0-9\\.]+}/{method}_filter{filter,(TRUE|FALSE)}.tsv',
log:
'log/summary_statistics/{sample}/{windows}.{bpdens}/{method}_filter{filter}.log'
run:
p = []
try:
f = config["ground_truth_clonal"][wildcards.sample]
if len(f) > 0:
p.append('--true-events-clonal')
p.append(f)
except KeyError:
pass
try:
f = config["ground_truth_single_cell"][wildcards.sample]
if len(f) > 0:
p.append('--true-events-single-cell')
p.append(f)
except KeyError:
pass
if wildcards.filter == 'TRUE':
p.append('--merged-file')
p.append(input.merged)
additional_params = ' '.join(p)
shell('utils/callset_summary_stats.py --segmentation {input.segmentation} --strandstates {input.strandstates} --complex-regions {input.complex} {additional_params} {input.sv_calls} > {output.tsv} 2> {log}')
rule aggregate_summary_statistics:
input:
tsv=expand("stats/{{sample}}/{window}_fixed_norm.{bpdens}/{method}.tsv", window = [100000], bpdens = BPDENS, method = METHODS),
output:
tsv="stats-merged/{sample}/stats.tsv"
shell:
"(head -n1 {input.tsv[0]} && (tail -n1 -q {input.tsv} | sort -k1) ) > {output}"
#rule evaluate_cell_mixing:
# input:
# sv_calls = expand('sv_calls/{sample}/{{windows}}.{{bpdens}}/{{method}}.txt', sample=SAMPLES),
# truth = config["ground_truth_clonal"][wildcards.sample],
# output:
# tsv = 'cell-mixing-eval/{sample}/{windows}.{bpdens}/{method}.tsv'
# log:
# 'cell-mixing-eval/{sample}/{windows}.{bpdens}/{method}.log'
# run:
# names = ','.join(SAMPLES)
# shell('utils/evaluate_cell_mixing.py --names {names} {input.truth} {input.sv_calls} > {output.tsv} 2> {log}')