Newer
Older
configfile: "Snake.config_embl.yaml"
import os, sys
# print(os.listdir(os.getcwd()))
# print(os.listdir("bam"))
# TODO I/O : Function to define inputs ; simplify list/dict system
# TODO Use remote file system to download example files
SAMPLE,BAM = glob_wildcards(config['input_bam_location'] + "{sample}/selected/{bam}.bam")
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
SAMPLES = sorted(set(SAMPLE))
CELL_PER_SAMPLE= defaultdict(list)
BAM_PER_SAMPLE = defaultdict(list)
for sample,bam in zip(SAMPLE,BAM):
BAM_PER_SAMPLE[sample].append(bam)
CELL_PER_SAMPLE[sample].append(bam.replace('.sort.mdup',''))
ALLBAMS_PER_SAMPLE = defaultdict(list)
for sample in SAMPLES:
ALLBAMS_PER_SAMPLE[sample] = glob_wildcards(config['input_bam_location'] + "{}/all/{{bam}}.bam".format(sample)).bam
print("Detected {} samples:".format(len(SAMPLES)))
for s in SAMPLES:
print(" {}:\t{} cells\t {} selected cells".format(s, len(ALLBAMS_PER_SAMPLE[s]), len(BAM_PER_SAMPLE[s])))
# Current state of the pipeline:
# ==============================
# * count reads in the BAM files (in fixed and variable-width bins of various sizes)
# * determine strand states of each chromosome in each single cell, including SCEs
# * plot all single cell libraries in different window sizes
# * calculate a segmentation into potential SVs using Mosaicatcher
METHODS = [
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0_regfactor6_filterFALSE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.05_regfactor6_filterTRUE",
]
BPDENS = [
"selected_j{}_s{}_scedist{}".format(joint, single, scedist) for joint in [0.1] for single in [0.5] for scedist in [20]
]
# Todo: specify an exact version of the singularity file!
rule all:
input:
# expand(config['output_location'] + "counts/{sample}/{window}.txt.gz", sample=SAMPLES, window=[100000])
# expand(config['output_location'] + "plots/{sample}/{window}.pdf", sample=SAMPLES, window=[100000])
# expand(config['output_location'] + "norm_counts/{sample}/{window}.txt.gz", sample=SAMPLES, window=[100000]),
# expand(config['output_location'] + "norm_counts/{sample}/{window}.info", sample=SAMPLES, window=[100000])
expand(config['output_location'] + "segmentation/{sample}/{window}.txt", sample=SAMPLES, window=[100000])
################################################################################
# Read counting #
################################################################################
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# CHECKME : exclude file rule useful ?
# rule generate_exclude_file_1:
# output:
# temp("log/exclude_file.temp")
# input:
# bam = expand("bam/{sample}/selected/{bam}.bam", sample = SAMPLES[0], bam = BAM_PER_SAMPLE[SAMPLES[0]][0])
# log:
# "log/generate_exclude_file_1.log"
# params:
# samtools = config["samtools"]
# shell:
# """
# {params.samtools} view -H {input.bam} | awk "/^@SQ/" > {output}
# """
# rule generate_exclude_file_2:
# output:
# "log/exclude_file"
# input:
# "log/exclude_file.temp"
# params:
# chroms = config["chromosomes"]
# run:
# with open(input[0]) as f:
# with open(output[0],"w") as out:
# for line in f:
# contig = line.strip().split()[1]
# contig = contig[3:]
# # if contig not in params.chroms:
# # print(contig, file = out)
# CHECKME : same as above for input ???
# TODO : Simplify expand command
rule mosaic_count:
"""
Call mosaic count C++ function to count reads in each BAM file according defined window
"""
bam = lambda wc: expand(config['input_bam_location'] + wc.sample + "/selected/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]) if wc.sample in BAM_PER_SAMPLE else "FOOBAR",
# excl = "log/exclude_file"
output:
counts = config['output_location'] + "counts/{sample}/{window}.txt.gz",
info = config['output_location'] + "counts/{sample}/{window}.info"
config['output_location'] + "log/{sample}/mosaic_count.{window}.log"
mc_command = config["mosaicatcher"]
{params.mc_command} count \
--verbose \
--do-not-blacklist-hmm \
-o {output.counts} \
-i {output.info} \
-w {wildcards.window} \
{input.bam}
> {log} 2>&1
# FIXME : Missing plots in final PDF ; R script + inputs to check
rule plot_mosaic_counts:
"""
Plot function of read counts for each bam file
"""
counts = config['output_location'] + "counts/{sample}/{window}.txt.gz",
info = config['output_location'] + "counts/{sample}/{window}.info"
output:
config['output_location'] + "plots/{sample}/{window}.pdf"
log:
config['output_location'] + "log/plot_mosaic_counts/{sample}/{window}.log"
plot_command = "Rscript " + config["plot_script"]
shell:
"""
{params.plot_command} {input.counts} {input.info} {output} > {log} 2>&1
"""
################################################################################
# Normalize counts #
################################################################################
# TODO : Reference blacklist BED file to retrieve easily on Git/Zenodo/remote system
rule merge_blacklist_bins:
"""
Call Python script to merge HGVSC normalization defined file & inversion whitelist file
"""
norm = "utils/normalization/HGSVC.{window}.txt",
whitelist = "utils/normalization/inversion-whitelist.tsv",
merged = config['output_location'] + "normalizations/HGSVC.{window}.merged.tsv"
config['output_location'] + "log/merge_blacklist_bins/{window}.log"
PYTHONPATH="" # Issue #1031 (https://bitbucket.org/snakemake/snakemake/issues/1031)
utils/merge-blacklist.py --merge_distance 500000 {input.norm} --whitelist {input.whitelist} --min_whitelist_interval_size 100000 > {output.merged} 2>> {log}
# FIXME : snakemake ambiguity with I/O paths
# CHECKME : Check R code for normalization ; @Marco mention on Gitlab
rule normalize_counts:
"""
Normalization of mosaic counts based on normalization file produced above
"""
input:
counts = config['output_location'] + "counts/{sample}/{window}.txt.gz",
norm = config['output_location'] + "normalizations/HGSVC.{window}.merged.tsv",
output:
config['output_location'] + "norm_counts/{sample}/{window}.txt.gz"
log:
config['output_location'] + "log/normalize_counts/{sample}/{window}.log"
shell:
"""
Rscript utils/normalize.R {input.counts} {input.norm} {output} 2>&1 > {log}
"""
# FIXME : cleaner way to symlink info files
rule link_normalized_info_file:
"""
Symlink info file ouput mosaic count to normalization count directory
"""
input:
info = config['output_location'] + "counts/{sample}/{window}.info"
output:
info = config['output_location'] + "norm_counts/{sample}/{window}.info"
run:
d = os.path.dirname(output.info)
file = os.path.basename(output.info)
shell("cd {d} && ln -s {input.info} {file}")
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
################################################################################
# Segmentation #
################################################################################
# CHECKME : @Marco mention on Gitlab
rule segmentation:
"""
rule fct:
input:
output:
"""
input:
config['output_location'] + "counts/{sample}/{window}.txt.gz"
output:
config['output_location'] + "segmentation/{sample}/{window,\d+}.txt"
log:
config['output_location'] + "log/segmentation/{sample}/{window}.log"
params:
mc_command = config["mosaicatcher"],
min_num_segs = lambda wc: math.ceil(200000 / float(wc.window)) # bins to represent 200 kb
shell:
"""
{params.mc_command} segment \
--remove-none \
--forbid-small-segments {params.min_num_segs} \
-M 50000000 \
-o {output} \
{input} > {log} 2>&1
"""