Skip to content
Snippets Groups Projects
Commit aff42f7e authored by Sascha Meiers's avatar Sascha Meiers
Browse files

First attempt at parallelizing samples !

parent d3008a55
No related branches found
No related tags found
No related merge requests found
configfile: "Snake.config.json"
BAM, = glob_wildcards("bam/{bam}.bam")
SAMPLE,BAM = glob_wildcards("bam/{sample}/{bam}.bam")
BAM_PER_SAMPLE = dict([(s,[]) for s in SAMPLE])
for i in range(len(SAMPLE)):
BAM_PER_SAMPLE[SAMPLE[i]].append(BAM[i])
print("Detected {} samples:".format(len(set(SAMPLE))))
for s in set(SAMPLE):
print(" {}:\t{} cells".format(s, len(BAM_PER_SAMPLE[s])))
# Current state of the pipeline:
# ==============================
......@@ -11,16 +18,16 @@ BAM, = glob_wildcards("bam/{bam}.bam")
rule all:
input:
expand("plots/" + config["sample"] + ".{window}_fixed.pdf", window = [50000, 100000, 200000, 500000]),
expand("plots/" + config["sample"] + ".{window}_variable.pdf", window = [50000, 100000]),
expand("segmentation2/" + config["sample"] + ".{window}_fixed.{bpdens}.txt",
expand("plots/{sample}/{window}_fixed.pdf", sample = SAMPLE, window = [50000, 100000, 200000, 500000]),
expand("plots/{sample}/{window}_variable.pdf", sample = SAMPLE, window = [50000, 100000]),
expand("segmentation2/{sample}/{window}_fixed.{bpdens}.txt", sample = SAMPLE,
window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
#expand("segmentation2/" + config["sample"] + ".{window}_variable.{bpdens}.chr1.txt",
#expand("segmentation2/{sample}/{window}_variable.{bpdens}.chr1.txt", sample = SAMPLE,
# window = [50000, 100000], bpdens = ["few","medium","many"]),
"strand_states/" + config["sample"] + ".final.txt",
expand("sv_calls/" + config["sample"] + ".{window}_fixed.{bpdens}.SV_probs.chr1.pdf",
expand("strand_states/{sample}/final.txt", sample = SAMPLE),
expand("sv_calls/{sample}/{window}_fixed.{bpdens}.SV_probs.chr1.pdf", sample = SAMPLE,
window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
#expand("sv_calls/" + config["sample"] + ".{window}_variable.{bpdens}.SV_probs.chr1.pdf",
#expand("sv_calls/{sample}/{window}_variable.{bpdens}.SV_probs.chr1.pdf", sample = SAMPLE,
# window = [50000, 100000], bpdens = ["few","medium","many"])
......@@ -31,12 +38,12 @@ rule all:
rule plot_mosaic_counts:
input:
counts = "counts/" + config["sample"] + ".{file_name}.txt.gz",
info = "counts/" + config["sample"] + ".{file_name}.info"
counts = "counts/{sample}/{file_name}.txt.gz",
info = "counts/{sample}/{file_name}.info"
output:
"plots/" + config["sample"] + ".{file_name}.pdf"
"plots/{sample}/{file_name}.pdf"
log:
"log/plot_mosaic_counts.{file_name}.txt"
"log/plot_mosaic_counts/{sample}.{file_name}.txt"
params:
plot_command = "Rscript " + config["plot_script"]
shell:
......@@ -46,12 +53,12 @@ rule plot_mosaic_counts:
rule plot_SV_calls:
input:
counts = "counts/" + config["sample"] + ".{windows}.txt.gz",
probs = "sv_probabilities/" + config["sample"] + ".{windows}.{bpdens}/probabilities.txt"
counts = "counts/{sample}/{windows}.txt.gz",
probs = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.txt"
output:
dynamic("sv_calls/" + config["sample"] + ".{windows}.{bpdens}.SV_probs.{chrom}.pdf")
dynamic("sv_calls/{sample}/{windows}.{bpdens}.SV_probs.{chrom}.pdf")
log:
"log/plot_SV_call.{windows}.{bpdens}.txt"
"log/plot_SV_call/{sample}.{windows}.{bpdens}.txt"
params:
plot_command = "Rscript " + config["sv_plot_script"]
shell:
......@@ -68,7 +75,7 @@ rule generate_exclude_file_1:
output:
temp("log/exclude_file.temp")
input:
bam = expand("bam/{bam}.bam", bam = BAM[0]),
bam = expand("bam/{sample}/{bam}.bam", sample = SAMPLE[0], bam = BAM[0]),
params:
samtools = config["samtools"]
shell:
......@@ -94,14 +101,14 @@ rule generate_exclude_file_2:
rule mosaic_count_fixed:
input:
bam = expand("bam/{bam}.bam", bam = BAM),
bai = expand("bam/{bam}.bam.bai", bam = BAM),
bam = lambda wc: expand("bam/" + wc.sample + "/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]),
bai = lambda wc: expand("bam/" + wc.sample + "/{bam}.bam.bai", bam = BAM_PER_SAMPLE[wc.sample]),
excl = "log/exclude_file"
output:
counts = "counts/" + config["sample"] + ".{window}_fixed.txt.gz",
info = "counts/" + config["sample"] + ".{window}_fixed.info"
counts = "counts/{sample}/{window}_fixed.txt.gz",
info = "counts/{sample}/{window}_fixed.info"
log:
"log/mosaic_count_fixed.{window}.txt"
"log/{sample}/mosaic_count_fixed.{window}.txt"
params:
mc_command = config["mosaicatcher"]
shell:
......@@ -119,15 +126,15 @@ rule mosaic_count_fixed:
rule mosaic_count_variable:
input:
bam = expand("bam/{bam}.bam", bam = BAM),
bai = expand("bam/{bam}.bam.bai", bam = BAM),
bam = lambda wc: expand("bam/" + wc.sample + "/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]),
bai = lambda wc: expand("bam/" + wc.sample + "/{bam}.bam.bai", bam = BAM_PER_SAMPLE[wc.sample]),
bed = lambda wc: config["variable_bins"][str(wc.window)],
excl = "log/exclude_file"
output:
counts = "counts/" + config["sample"] + ".{window}_variable.txt.gz",
info = "counts/" + config["sample"] + ".{window}_variable.info"
counts = "counts/{sample}/{window}_variable.txt.gz",
info = "counts/{sample}/{window}_variable.info"
log:
"log/mosaic_count_variable.{window}.txt"
"log/{sample}/mosaic_count_variable.{window}.txt"
params:
mc_command = config["mosaicatcher"]
shell:
......@@ -153,11 +160,11 @@ rule mosaic_count_variable:
rule segmentation:
input:
"counts/" + config["sample"] + ".{file_name}.txt.gz"
"counts/{sample}/{file_name}.txt.gz"
output:
"segmentation/" + config["sample"] + ".{file_name}.txt"
"segmentation/{sample}/{file_name}.txt"
log:
"log/segmentation.{file_name}.txt"
"log/{sample}/segmentation.{file_name}.txt"
params:
mc_command = config["mosaicatcher"]
shell:
......@@ -170,11 +177,11 @@ rule segmentation:
# Pick a few segmentations and prepare the input files for SV classification
rule prepare_segments:
input:
"segmentation/" + config["sample"] + ".{windows}.txt"
"segmentation/{sample}/{windows}.txt"
output:
"segmentation2/" + config["sample"] + ".{windows}.{bpdens}.txt"
"segmentation2/{sample}/{windows}.{bpdens}.txt"
log:
"log/prepare_segments.{windows}.{bpdens}.txt"
"log/{sample}/prepare_segments.{windows}.{bpdens}.txt"
params:
quantile = lambda wc: config["bp_density"][wc.bpdens]
script:
......@@ -198,15 +205,15 @@ rule install_MaRyam:
rule run_sv_classification:
input:
maryam = "utils/R-packages2/MaRyam/R/MaRyam",
counts = "counts/" + config["sample"] + ".{windows}.txt.gz",
info = "counts/" + config["sample"] + ".{windows}.info",
states = "strand_states/" + config["sample"] + ".final.txt",
bp = "segmentation2/" + config["sample"] + ".{windows}.{bpdens}.txt"
counts = "counts/{sample}/{windows}.txt.gz",
info = "counts/{sample}/{windows}.info",
states = "strand_states/{sample}/final.txt",
bp = "segmentation2/{sample}/{windows}.{bpdens}.txt"
output:
outdir = "sv_probabilities/" + config["sample"] + ".{windows}.{bpdens}/",
out1 = "sv_probabilities/" + config["sample"] + ".{windows}.{bpdens}/allSegCellProbs.table"
outdir = "sv_probabilities/{sample}/{windows}.{bpdens}/",
out1 = "sv_probabilities/{sample}/{windows}.{bpdens}/allSegCellProbs.table"
log:
"log/run_sv_classification.{windows}.{bpdens}.txt"
"log/{sample}/run_sv_classification.{windows}.{bpdens}.txt"
params:
windowsize = lambda wc: wc.windows.split("_")[0]
shell:
......@@ -227,14 +234,14 @@ rule run_sv_classification:
rule convert_SVprob_output:
input:
probs = "sv_probabilities/" + config["sample"] + ".{windows}.{bpdens}/allSegCellProbs.table",
info = "counts/" + config["sample"] + ".{windows}.info"
probs = "sv_probabilities/{sample}/{windows}.{bpdens}/allSegCellProbs.table",
info = "counts/{sample}/{windows}.info"
output:
"sv_probabilities/" + config["sample"] + ".{windows}.{bpdens}/probabilities.txt"
"sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.txt"
params:
sample_name = config["sample"]
sample_name = "{wildcards.sample}"
log:
"log/convert_SVprob_output.{windows}.{bpdens}.txt"
"log/{sample}/convert_SVprob_output.{windows}.{bpdens}.txt"
script:
"utils/helper.convert_svprob_output.R"
......@@ -245,31 +252,28 @@ rule convert_SVprob_output:
rule determine_initial_strand_states:
input:
"counts/" + config["sample"] + ".500000_fixed.txt.gz"
"counts/{sample}/500000_fixed.txt.gz"
output:
"strand_states/" + config["sample"] + ".txt"
"strand_states/{sample}/intitial_strand_state"
log:
"log/determine_initial_strand_states.txt"
"log/{sample}/determine_initial_strand_states.txt"
params:
sce_command = "Rscript " + config["sce_script"]
shell:
"""
echo "[Note] This is a dirty hack to remove chrX and chrY."
tmp=$(mktemp)
{params.sce_command} {input} $tmp > {log} 2>&1
grep -vP '^Y|^chrY' $tmp | grep -vP '^X|^chrX' > {output}
{params.sce_command} {input} $tmp > {output}
"""
# Strandphaser needs a different input format which contains the path names to
# the bam files. This rule extracts this information and prepares an input file.
rule convert_strandphaser_input:
input:
states = "strand_states/" + config["sample"] + ".txt",
info = "counts/" + config["sample"] + ".500000_fixed.info"
states = "strand_states/{sample}/intitial_strand_state",
info = "counts/{sample}/500000_fixed.info"
output:
"strand_states/" + config["sample"] + ".strandphaser_input.txt"
"strand_states/{sample}/strandphaser_input.txt"
log:
"log/convert_strandphaser_input.txt"
"log/{sample}/convert_strandphaser_input.txt"
script:
"utils/helper.convert_strandphaser_input.R"
......@@ -285,23 +289,14 @@ rule install_StrandPhaseR:
rule prepare_strandphaser_config_per_chrom:
input:
"strand_states/" + config["sample"] + ".txt"
"strand_states/{sample}/intitial_strand_state"
output:
dynamic("log/StrandPhaseR.{chrom}.config")
"strand_states/{sample}/StrandPhaseR.{chrom}.config"
run:
# Get chromosomes
chroms = set()
with open(input[0]) as f:
skipfirst = True
for line in f:
if skipfirst:
skipfirst = False
continue
chroms.add(line.split()[0])
with open(output[0], "w") as f:
print("[General]", file = f)
print("numCPU = 1", file = f)
print("chromosomes = '" + wildcards.chrom + "'", file = f)
print("chromosomes = '{wildcards.chrom}'", file = f)
print("pairedEndReads = TRUE", file = f)
print("min.mapq = 10", file = f)
print("", file = f)
......@@ -315,46 +310,47 @@ rule prepare_strandphaser_config_per_chrom:
print("splitPhasedReads = TRUE", file = f)
print("compareSingleCells = TRUE", file = f)
print("callBreaks = FALSE", file = f)
print("exportVCF = '", config["sample"], ".txt'", sep = "", file = f)
print("exportVCF = '{sample}.txt'", sep = "", file = f)
print("bsGenome = '", config["R_reference"], "'", sep = "", file = f)
def locate_snv_vcf(wildcards):
if config["snv_calls"] == "":
return "snv_calls/{}.{}.vcf".format(config["sample"], wildcards.chrom)
if "snv_calls" not in config or wildcards.sample not in config["snv_calls"] or config["snv_calls"][wildcards.sample] == "":
return "snv_calls/{}/{}.vcf".format(wildcards.sample, wildcards.chrom)
else:
return "external_snv_calls/{}.{}.vcf".format(config["sample"], wildcards.chrom)
return "external_snv_calls/{}/{}.vcf".format(wildcards.sample, wildcards.chrom)
rule run_strandphaser_per_chrom:
input:
wcregions = "strand_states/" + config["sample"] + ".strandphaser_input.txt",
wcregions = "strand_states/{sample}/strandphaser_input.txt",
snppositions = locate_snv_vcf,
configfile = "log/StrandPhaseR.{chrom}.config",
configfile = "strand_states/{sample}/StrandPhaseR.{chrom}.config",
strandphaser = "utils/R-packages/StrandPhaseR/R/StrandPhaseR",
bamfolder = "bam"
bamfolder = "bam/{sample}/"
output:
"strand_states/" + config["sample"] + ".strandphaser_output.{chrom}.txt"
"strand_states/{sample}/StrandPhaseR_analysis.{chrom}/phased_haps.txt"
log:
"log/run_strandphaser.{chrom}.txt"
"log/{sample}/run_strandphaser.{chrom}.txt"
shell:
"""
Rscript utils/StrandPhaseR_pipeline.R \
{input.bamfolder} \
log/StrandPhaseR_analysis \
strand_states/{wildcards.sample}/StrandPhaseR_analysis.{wildcards.chrom} \
{input.configfile} \
{input.wcregions} \
{input.snppositions} \
$(pwd)/utils/R-packages/ \
> {log} 2>&1
cp log/StrandPhaseR_analysis/Phased/phased_haps.txt {output}
"""
rule combine_strandphaser_output:
input:
dynamic("strand_states/" + config["sample"] + ".strandphaser_output.{chrom}.txt")
expand("strand_states/{{sample}}/StrandPhaseR_analysis.{chrom}/phased_haps.txt",
chrom = config["chromosomes"])
output:
"strand_states/" + config["sample"] + ".strandphaser_output.txt"
"strand_states/{sample}/strandphaser_output.txt"
shell:
"""
set +o pipefail
......@@ -365,13 +361,13 @@ rule combine_strandphaser_output:
rule convert_strandphaser_output:
input:
phased_states = "strand_states/" + config["sample"] + ".strandphaser_output.txt",
initial_states = "strand_states/" + config["sample"] + ".txt",
info = "counts/" + config["sample"] + ".500000_fixed.info"
phased_states = "strand_states/{sample}/strandphaser_output.txt",
initial_states = "strand_states/{sample}/intitial_strand_state",
info = "counts/{sample}/500000_fixed.info"
output:
"strand_states/" + config["sample"] + ".final.txt"
"strand_states/{sample}/final.txt"
log:
"log/convert_strandphaser_output.txt"
"log/{sample}/convert_strandphaser_output.txt"
script:
"utils/helper.convert_strandphaser_output.R"
......@@ -383,17 +379,17 @@ rule convert_strandphaser_output:
rule mergeBams:
input:
expand("bam/{bam}.bam", bam=BAM)
lambda wc: expand("bam/" + wc.sample + "/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample])
output:
"snv_calls/merged.bam"
"snv_calls/{sample}/merged.bam"
shell:
config["samtools"] + " merge {output} {input}"
rule indexMergedBam:
input:
"snv_calls/merged.bam"
"snv_calls/{sample}/merged.bam"
output:
"snv_calls/merged.bam.bai"
"snv_calls/{sample}/merged.bam.bai"
shell:
config["samtools"] + " index {input}"
......@@ -401,13 +397,12 @@ rule indexMergedBam:
rule call_SNVs_bcftools_chrom:
input:
fa = config["reference"],
chrom = "chroms/{chrom}",
bam = "snv_calls/merged.bam",
bai = "snv_calls/merged.bam.bai"
bam = "snv_calls/{sample}/merged.bam",
bai = "snv_calls/{sample}/merged.bam.bai"
output:
"snv_calls/" + config["sample"] + ".{chrom}.vcf"
"snv_calls/{sample}/{chrom}.vcf"
log:
"log/call_SNVs_bcftools_chrom.{chrom}.txt"
"log/{sample}/call_SNVs_bcftools_chrom.{chrom}.txt"
params:
samtools = config["samtools"],
bcftools = config["bcftools"]
......@@ -417,34 +412,22 @@ rule call_SNVs_bcftools_chrom:
| {params.bcftools} call -mv - | {params.bcftools} view --genotype het --types snps - > {output} 2> {log}
"""
# Write one file per chromosome that should be analysed.
rule prepare_chromosomes:
input:
"strand_states/" + config["sample"] + ".txt"
output:
dynamic("chroms/{chrom}")
shell:
"""
tail -n+2 {input} | cut -f1 | sort | uniq | awk '{{print "chroms/" $1}}' | xargs touch
"""
rule merge_SNV_calls:
input:
dynamic("snv_calls/" + config["sample"] + ".{chrom}.vcf")
expand("snv_calls/{{sample}}/{chrom}.vcf", chrom = config['chromosomes'])
output:
expand("snv_calls/" + config["sample"] + ".vcf")
"snv_calls/{sample}/all.vcf"
shell:
config["bcftools"] + " concat -O v -o {output} {input}"
rule split_external_snv_calls:
input:
vcf = config["snv_calls"]
vcf = lambda wc: config["snv_calls"][wc.sample]
output:
vcf = "external_snv_calls/" + config["sample"] + ".{chrom}.vcf"
log: "external_snv_calls/" + config["sample"] + ".{chrom}.vcf.log"
vcf = "external_snv_calls/{sample}/{chrom}.vcf"
log: "external_snv_calls/{sample}/{chrom}.vcf.log"
params:
bcftools = config["bcftools"],
#sample = config["sample"]
bcftools = config["bcftools"]
shell:
"({params.bcftools} view --samples " + config["sample"] + " --types snps {input.vcf} {wildcards.chrom} | bcftools view --genotype het - > {output.vcf}) > {log} 2>&1"
"({params.bcftools} view --samples {wildcards.sample} --types snps {input.vcf} {wildcards.chrom} | bcftools view --genotype het - > {output.vcf}) > {log} 2>&1"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment