Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Mosaicatcher Update
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Thomas Weber
Mosaicatcher Update
Commits
aff42f7e
Commit
aff42f7e
authored
7 years ago
by
Sascha Meiers
Browse files
Options
Downloads
Patches
Plain Diff
First attempt at parallelizing samples !
parent
d3008a55
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Snakefile
+97
-114
97 additions, 114 deletions
Snakefile
with
97 additions
and
114 deletions
Snakefile
+
97
−
114
View file @
aff42f7e
configfile: "Snake.config.json"
BAM, = glob_wildcards("bam/{bam}.bam")
SAMPLE,BAM = glob_wildcards("bam/{sample}/{bam}.bam")
BAM_PER_SAMPLE = dict([(s,[]) for s in SAMPLE])
for i in range(len(SAMPLE)):
BAM_PER_SAMPLE[SAMPLE[i]].append(BAM[i])
print("Detected {} samples:".format(len(set(SAMPLE))))
for s in set(SAMPLE):
print(" {}:\t{} cells".format(s, len(BAM_PER_SAMPLE[s])))
# Current state of the pipeline:
# ==============================
...
...
@@ -11,16 +18,16 @@ BAM, = glob_wildcards("bam/{bam}.bam")
rule all:
input:
expand("plots/
" + config["sample"] + ".
{window}_fixed.pdf", window = [50000, 100000, 200000, 500000]),
expand("plots/
" + config["sample"] + ".
{window}_variable.pdf", window = [50000, 100000]),
expand("segmentation2/
" + config["sample"] + ".
{window}_fixed.{bpdens}.txt",
expand("plots/
{sample}/
{window}_fixed.pdf",
sample = SAMPLE,
window = [50000, 100000, 200000, 500000]),
expand("plots/
{sample}/
{window}_variable.pdf",
sample = SAMPLE,
window = [50000, 100000]),
expand("segmentation2/
{sample}/
{window}_fixed.{bpdens}.txt",
sample = SAMPLE,
window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
#expand("segmentation2/
" + config["sample"] + ".
{window}_variable.{bpdens}.chr1.txt",
#expand("segmentation2/
{sample}/
{window}_variable.{bpdens}.chr1.txt",
sample = SAMPLE,
# window = [50000, 100000], bpdens = ["few","medium","many"]),
"strand_states/
" + config["sample"] + ".final.txt"
,
expand("sv_calls/
" + config["sample"] + ".
{window}_fixed.{bpdens}.SV_probs.chr1.pdf",
expand(
"strand_states/
{sample}/final.txt", sample = SAMPLE)
,
expand("sv_calls/
{sample}/
{window}_fixed.{bpdens}.SV_probs.chr1.pdf",
sample = SAMPLE,
window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
#expand("sv_calls/
" + config["sample"] + ".
{window}_variable.{bpdens}.SV_probs.chr1.pdf",
#expand("sv_calls/
{sample}/
{window}_variable.{bpdens}.SV_probs.chr1.pdf",
sample = SAMPLE,
# window = [50000, 100000], bpdens = ["few","medium","many"])
...
...
@@ -31,12 +38,12 @@ rule all:
rule plot_mosaic_counts:
input:
counts = "counts/
" + config["sample"] + ".
{file_name}.txt.gz",
info = "counts/
" + config["sample"] + ".
{file_name}.info"
counts = "counts/
{sample}/
{file_name}.txt.gz",
info = "counts/
{sample}/
{file_name}.info"
output:
"plots/
" + config["sample"] + ".
{file_name}.pdf"
"plots/
{sample}/
{file_name}.pdf"
log:
"log/plot_mosaic_counts.{file_name}.txt"
"log/plot_mosaic_counts
/{sample}
.{file_name}.txt"
params:
plot_command = "Rscript " + config["plot_script"]
shell:
...
...
@@ -46,12 +53,12 @@ rule plot_mosaic_counts:
rule plot_SV_calls:
input:
counts = "counts/
" + config["sample"] + ".
{windows}.txt.gz",
probs = "sv_probabilities/
" + config["sample"] + ".
{windows}.{bpdens}/probabilities.txt"
counts = "counts/
{sample}/
{windows}.txt.gz",
probs = "sv_probabilities/
{sample}/
{windows}.{bpdens}/probabilities.txt"
output:
dynamic("sv_calls/
" + config["sample"] + ".
{windows}.{bpdens}.SV_probs.{chrom}.pdf")
dynamic("sv_calls/
{sample}/
{windows}.{bpdens}.SV_probs.{chrom}.pdf")
log:
"log/plot_SV_call.{windows}.{bpdens}.txt"
"log/plot_SV_call
/{sample}
.{windows}.{bpdens}.txt"
params:
plot_command = "Rscript " + config["sv_plot_script"]
shell:
...
...
@@ -68,7 +75,7 @@ rule generate_exclude_file_1:
output:
temp("log/exclude_file.temp")
input:
bam = expand("bam/{
bam}.bam"
, bam = BAM[0]),
bam = expand("bam/{
sample}/{bam}.bam", sample = SAMPLE[0]
, bam = BAM[0]),
params:
samtools = config["samtools"]
shell:
...
...
@@ -94,14 +101,14 @@ rule generate_exclude_file_2:
rule mosaic_count_fixed:
input:
bam = expand("bam/{bam}.bam", bam = BAM),
bai = expand("bam/{bam}.bam.bai", bam = BAM),
bam =
lambda wc:
expand("bam/
" + wc.sample + "/
{bam}.bam", bam = BAM
_PER_SAMPLE[wc.sample]
),
bai =
lambda wc:
expand("bam/
" + wc.sample + "/
{bam}.bam.bai", bam = BAM
_PER_SAMPLE[wc.sample]
),
excl = "log/exclude_file"
output:
counts = "counts/
" + config["sample"] + ".
{window}_fixed.txt.gz",
info = "counts/
" + config["sample"] + ".
{window}_fixed.info"
counts = "counts/
{sample}/
{window}_fixed.txt.gz",
info = "counts/
{sample}/
{window}_fixed.info"
log:
"log/mosaic_count_fixed.{window}.txt"
"log/
{sample}/
mosaic_count_fixed.{window}.txt"
params:
mc_command = config["mosaicatcher"]
shell:
...
...
@@ -119,15 +126,15 @@ rule mosaic_count_fixed:
rule mosaic_count_variable:
input:
bam = expand("bam/{bam}.bam", bam = BAM),
bai = expand("bam/{bam}.bam.bai", bam = BAM),
bam =
lambda wc:
expand("bam/
" + wc.sample + "/
{bam}.bam", bam = BAM
_PER_SAMPLE[wc.sample]
),
bai =
lambda wc:
expand("bam/
" + wc.sample + "/
{bam}.bam.bai", bam = BAM
_PER_SAMPLE[wc.sample]
),
bed = lambda wc: config["variable_bins"][str(wc.window)],
excl = "log/exclude_file"
output:
counts = "counts/
" + config["sample"] + ".
{window}_variable.txt.gz",
info = "counts/
" + config["sample"] + ".
{window}_variable.info"
counts = "counts/
{sample}/
{window}_variable.txt.gz",
info = "counts/
{sample}/
{window}_variable.info"
log:
"log/mosaic_count_variable.{window}.txt"
"log/
{sample}/
mosaic_count_variable.{window}.txt"
params:
mc_command = config["mosaicatcher"]
shell:
...
...
@@ -153,11 +160,11 @@ rule mosaic_count_variable:
rule segmentation:
input:
"counts/
" + config["sample"] + ".
{file_name}.txt.gz"
"counts/
{sample}/
{file_name}.txt.gz"
output:
"segmentation/
" + config["sample"] + ".
{file_name}.txt"
"segmentation/
{sample}/
{file_name}.txt"
log:
"log/segmentation.{file_name}.txt"
"log/
{sample}/
segmentation.{file_name}.txt"
params:
mc_command = config["mosaicatcher"]
shell:
...
...
@@ -170,11 +177,11 @@ rule segmentation:
# Pick a few segmentations and prepare the input files for SV classification
rule prepare_segments:
input:
"segmentation/
" + config["sample"] + ".
{windows}.txt"
"segmentation/
{sample}/
{windows}.txt"
output:
"segmentation2/
" + config["sample"] + ".
{windows}.{bpdens}.txt"
"segmentation2/
{sample}/
{windows}.{bpdens}.txt"
log:
"log/prepare_segments.{windows}.{bpdens}.txt"
"log/
{sample}/
prepare_segments.{windows}.{bpdens}.txt"
params:
quantile = lambda wc: config["bp_density"][wc.bpdens]
script:
...
...
@@ -198,15 +205,15 @@ rule install_MaRyam:
rule run_sv_classification:
input:
maryam = "utils/R-packages2/MaRyam/R/MaRyam",
counts = "counts/
" + config["sample"] + ".
{windows}.txt.gz",
info = "counts/
" + config["sample"] + ".
{windows}.info",
states = "strand_states/
" + config["sample"] + ".
final.txt",
bp = "segmentation2/
" + config["sample"] + ".
{windows}.{bpdens}.txt"
counts = "counts/
{sample}/
{windows}.txt.gz",
info = "counts/
{sample}/
{windows}.info",
states = "strand_states/
{sample}/
final.txt",
bp = "segmentation2/
{sample}/
{windows}.{bpdens}.txt"
output:
outdir = "sv_probabilities/
" + config["sample"] + ".
{windows}.{bpdens}/",
out1 = "sv_probabilities/
" + config["sample"] + ".
{windows}.{bpdens}/allSegCellProbs.table"
outdir = "sv_probabilities/
{sample}/
{windows}.{bpdens}/",
out1 = "sv_probabilities/
{sample}/
{windows}.{bpdens}/allSegCellProbs.table"
log:
"log/run_sv_classification.{windows}.{bpdens}.txt"
"log/
{sample}/
run_sv_classification.{windows}.{bpdens}.txt"
params:
windowsize = lambda wc: wc.windows.split("_")[0]
shell:
...
...
@@ -227,14 +234,14 @@ rule run_sv_classification:
rule convert_SVprob_output:
input:
probs = "sv_probabilities/
" + config["sample"] + ".
{windows}.{bpdens}/allSegCellProbs.table",
info = "counts/
" + config["sample"] + ".
{windows}.info"
probs = "sv_probabilities/
{sample}/
{windows}.{bpdens}/allSegCellProbs.table",
info = "counts/
{sample}/
{windows}.info"
output:
"sv_probabilities/
" + config["sample"] + ".
{windows}.{bpdens}/probabilities.txt"
"sv_probabilities/
{sample}/
{windows}.{bpdens}/probabilities.txt"
params:
sample_name =
config["
sample"
]
sample_name =
"{wildcards.
sample
}
"
log:
"log/convert_SVprob_output.{windows}.{bpdens}.txt"
"log/
{sample}/
convert_SVprob_output.{windows}.{bpdens}.txt"
script:
"utils/helper.convert_svprob_output.R"
...
...
@@ -245,31 +252,28 @@ rule convert_SVprob_output:
rule determine_initial_strand_states:
input:
"counts/
" + config["sample"] + ".
500000_fixed.txt.gz"
"counts/
{sample}/
500000_fixed.txt.gz"
output:
"strand_states/
" + config["sample"] + ".txt
"
"strand_states/
{sample}/intitial_strand_state
"
log:
"log/determine_initial_strand_states.txt"
"log/
{sample}/
determine_initial_strand_states.txt"
params:
sce_command = "Rscript " + config["sce_script"]
shell:
"""
echo "[Note] This is a dirty hack to remove chrX and chrY."
tmp=$(mktemp)
{params.sce_command} {input} $tmp > {log} 2>&1
grep -vP '^Y|^chrY' $tmp | grep -vP '^X|^chrX' > {output}
{params.sce_command} {input} $tmp > {output}
"""
# Strandphaser needs a different input format which contains the path names to
# the bam files. This rule extracts this information and prepares an input file.
rule convert_strandphaser_input:
input:
states = "strand_states/
" + config["sample"] + ".txt
",
info = "counts/
" + config["sample"] + ".
500000_fixed.info"
states = "strand_states/
{sample}/intitial_strand_state
",
info = "counts/
{sample}/
500000_fixed.info"
output:
"strand_states/
" + config["sample"] + ".
strandphaser_input.txt"
"strand_states/
{sample}/
strandphaser_input.txt"
log:
"log/convert_strandphaser_input.txt"
"log/
{sample}/
convert_strandphaser_input.txt"
script:
"utils/helper.convert_strandphaser_input.R"
...
...
@@ -285,23 +289,14 @@ rule install_StrandPhaseR:
rule prepare_strandphaser_config_per_chrom:
input:
"strand_states/
" + config["sample"] + ".txt
"
"strand_states/
{sample}/intitial_strand_state
"
output:
dynamic("log
/StrandPhaseR.{chrom}.config"
)
"strand_states/{sample}
/StrandPhaseR.{chrom}.config"
run:
# Get chromosomes
chroms = set()
with open(input[0]) as f:
skipfirst = True
for line in f:
if skipfirst:
skipfirst = False
continue
chroms.add(line.split()[0])
with open(output[0], "w") as f:
print("[General]", file = f)
print("numCPU = 1", file = f)
print("chromosomes = '
" +
wildcards.chrom
+ "
'", file = f)
print("chromosomes = '
{
wildcards.chrom
}
'", file = f)
print("pairedEndReads = TRUE", file = f)
print("min.mapq = 10", file = f)
print("", file = f)
...
...
@@ -315,46 +310,47 @@ rule prepare_strandphaser_config_per_chrom:
print("splitPhasedReads = TRUE", file = f)
print("compareSingleCells = TRUE", file = f)
print("callBreaks = FALSE", file = f)
print("exportVCF = '
", config["
sample
"], "
.txt'", sep = "", file = f)
print("exportVCF = '
{
sample
}
.txt'", sep = "", file = f)
print("bsGenome = '", config["R_reference"], "'", sep = "", file = f)
def locate_snv_vcf(wildcards):
if
config[
"snv_calls"] == "":
return "snv_calls/{}
.
{}.vcf".format(
config["
sample
"]
, wildcards.chrom)
if "snv_calls"
not in config or wildcards.sample not in config["snv_calls"] or config["snv_calls"][wildcards.sample
] == "":
return "snv_calls/{}
/
{}.vcf".format(
wildcards.
sample, wildcards.chrom)
else:
return "external_snv_calls/{}
.
{}.vcf".format(
config["
sample
"]
, wildcards.chrom)
return "external_snv_calls/{}
/
{}.vcf".format(
wildcards.
sample, wildcards.chrom)
rule run_strandphaser_per_chrom:
input:
wcregions = "strand_states/
" + config["sample"] + ".
strandphaser_input.txt",
wcregions = "strand_states/
{sample}/
strandphaser_input.txt",
snppositions = locate_snv_vcf,
configfile = "
log
/StrandPhaseR.{chrom}.config",
configfile = "
strand_states/{sample}
/StrandPhaseR.{chrom}.config",
strandphaser = "utils/R-packages/StrandPhaseR/R/StrandPhaseR",
bamfolder = "bam"
bamfolder = "bam
/{sample}/
"
output:
"strand_states/
" + config["sample"] + ".s
trand
p
hase
r_output.{chrom}
.txt"
"strand_states/
{sample}/S
trand
P
hase
R_analysis.{chrom}/phased_haps
.txt"
log:
"log/run_strandphaser.{chrom}.txt"
"log/
{sample}/
run_strandphaser.{chrom}.txt"
shell:
"""
Rscript utils/StrandPhaseR_pipeline.R \
{input.bamfolder} \
log
/StrandPhaseR_analysis \
strand_states/{wildcards.sample}
/StrandPhaseR_analysis
.{wildcards.chrom}
\
{input.configfile} \
{input.wcregions} \
{input.snppositions} \
$(pwd)/utils/R-packages/ \
> {log} 2>&1
cp log/StrandPhaseR_analysis/Phased/phased_haps.txt {output}
"""
rule combine_strandphaser_output:
input:
dynamic("strand_states/" + config["sample"] + ".strandphaser_output.{chrom}.txt")
expand("strand_states/{{sample}}/StrandPhaseR_analysis.{chrom}/phased_haps.txt",
chrom = config["chromosomes"])
output:
"strand_states/
" + config["sample"] + ".
strandphaser_output.txt"
"strand_states/
{sample}/
strandphaser_output.txt"
shell:
"""
set +o pipefail
...
...
@@ -365,13 +361,13 @@ rule combine_strandphaser_output:
rule convert_strandphaser_output:
input:
phased_states = "strand_states/
" + config["sample"] + ".
strandphaser_output.txt",
initial_states = "strand_states/
" + config["sample"] + ".txt
",
info = "counts/
" + config["sample"] + ".
500000_fixed.info"
phased_states = "strand_states/
{sample}/
strandphaser_output.txt",
initial_states = "strand_states/
{sample}/intitial_strand_state
",
info = "counts/
{sample}/
500000_fixed.info"
output:
"strand_states/
" + config["sample"] + ".
final.txt"
"strand_states/
{sample}/
final.txt"
log:
"log/convert_strandphaser_output.txt"
"log/
{sample}/
convert_strandphaser_output.txt"
script:
"utils/helper.convert_strandphaser_output.R"
...
...
@@ -383,17 +379,17 @@ rule convert_strandphaser_output:
rule mergeBams:
input:
expand("bam/
{bam}.bam", bam=BAM
)
lambda wc:
expand("bam/
" + wc.sample + "/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]
)
output:
"snv_calls/merged.bam"
"snv_calls/
{sample}/
merged.bam"
shell:
config["samtools"] + " merge {output} {input}"
rule indexMergedBam:
input:
"snv_calls/merged.bam"
"snv_calls/
{sample}/
merged.bam"
output:
"snv_calls/merged.bam.bai"
"snv_calls/
{sample}/
merged.bam.bai"
shell:
config["samtools"] + " index {input}"
...
...
@@ -401,13 +397,12 @@ rule indexMergedBam:
rule call_SNVs_bcftools_chrom:
input:
fa = config["reference"],
chrom = "chroms/{chrom}",
bam = "snv_calls/merged.bam",
bai = "snv_calls/merged.bam.bai"
bam = "snv_calls/{sample}/merged.bam",
bai = "snv_calls/{sample}/merged.bam.bai"
output:
"snv_calls/
" + config["sample"] + ".
{chrom}.vcf"
"snv_calls/
{sample}/
{chrom}.vcf"
log:
"log/call_SNVs_bcftools_chrom.{chrom}.txt"
"log/
{sample}/
call_SNVs_bcftools_chrom.{chrom}.txt"
params:
samtools = config["samtools"],
bcftools = config["bcftools"]
...
...
@@ -417,34 +412,22 @@ rule call_SNVs_bcftools_chrom:
| {params.bcftools} call -mv - | {params.bcftools} view --genotype het --types snps - > {output} 2> {log}
"""
# Write one file per chromosome that should be analysed.
rule prepare_chromosomes:
input:
"strand_states/" + config["sample"] + ".txt"
output:
dynamic("chroms/{chrom}")
shell:
"""
tail -n+2 {input} | cut -f1 | sort | uniq | awk '{{print "chroms/" $1}}' | xargs touch
"""
rule merge_SNV_calls:
input:
dynamic
("snv_calls/
" + config["sample"] + ".{chrom}.vcf"
)
expand
("snv_calls/
{{sample}}/{chrom}.vcf", chrom = config['chromosomes']
)
output:
expand(
"snv_calls/
" + config["sample"] + "
.vcf"
)
"snv_calls/
{sample}/all
.vcf"
shell:
config["bcftools"] + " concat -O v -o {output} {input}"
rule split_external_snv_calls:
input:
vcf = config["snv_calls"]
vcf =
lambda wc:
config["snv_calls"]
[wc.sample]
output:
vcf = "external_snv_calls/
" + config["sample"] + ".
{chrom}.vcf"
log: "external_snv_calls/
" + config["sample"] + ".
{chrom}.vcf.log"
vcf = "external_snv_calls/
{sample}/
{chrom}.vcf"
log: "external_snv_calls/
{sample}/
{chrom}.vcf.log"
params:
bcftools = config["bcftools"],
#sample = config["sample"]
bcftools = config["bcftools"]
shell:
"({params.bcftools} view --samples
" + config["sample"] + "
--types snps {input.vcf} {wildcards.chrom} | bcftools view --genotype het - > {output.vcf}) > {log} 2>&1"
"({params.bcftools} view --samples
{wildcards.sample}
--types snps {input.vcf} {wildcards.chrom} | bcftools view --genotype het - > {output.vcf}) > {log} 2>&1"
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment