import math from collections import defaultdict configfile: "Snake.config_embl.yaml" import os, sys # print(os.listdir(os.getcwd())) # print(os.listdir("bam")) # TODO I/O : Function to define inputs ; simplify list/dict system # TODO Use remote file system to download example files SAMPLE,BAM = glob_wildcards(config['input_bam_location'] + "{sample}/selected/{bam}.bam") SAMPLES = sorted(set(SAMPLE)) CELL_PER_SAMPLE= defaultdict(list) BAM_PER_SAMPLE = defaultdict(list) for sample,bam in zip(SAMPLE,BAM): BAM_PER_SAMPLE[sample].append(bam) CELL_PER_SAMPLE[sample].append(bam.replace('.sort.mdup','')) ALLBAMS_PER_SAMPLE = defaultdict(list) for sample in SAMPLES: ALLBAMS_PER_SAMPLE[sample] = glob_wildcards(config['input_bam_location'] + "{}/all/{{bam}}.bam".format(sample)).bam print("Detected {} samples:".format(len(SAMPLES))) for s in SAMPLES: print(" {}:\t{} cells\t {} selected cells".format(s, len(ALLBAMS_PER_SAMPLE[s]), len(BAM_PER_SAMPLE[s]))) # Current state of the pipeline: # ============================== # * count reads in the BAM files (in fixed and variable-width bins of various sizes) # * determine strand states of each chromosome in each single cell, including SCEs # * plot all single cell libraries in different window sizes # * calculate a segmentation into potential SVs using Mosaicatcher METHODS = [ "simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0_regfactor6_filterFALSE", "simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.05_regfactor6_filterTRUE", ] BPDENS = [ "selected_j{}_s{}_scedist{}".format(joint, single, scedist) for joint in [0.1] for single in [0.5] for scedist in [20] ] # Todo: specify an exact version of the singularity file! rule all: input: # expand(config['output_location'] + "counts/{sample}/{window}.txt.gz", sample=SAMPLES, window=[100000]) # expand(config['output_location'] + "plots/{sample}/{window}.pdf", sample=SAMPLES, window=[100000]) # expand(config['output_location'] + "norm_counts/{sample}/{window}.txt.gz", sample=SAMPLES, window=[100000]), # expand(config['output_location'] + "norm_counts/{sample}/{window}.info", sample=SAMPLES, window=[100000]) expand(config['output_location'] + "segmentation/{sample}/{window}.txt", sample=SAMPLES, window=[100000]) ################################################################################ # Read counting # ################################################################################ # CHECKME : exclude file rule useful ? # rule generate_exclude_file_1: # output: # temp("log/exclude_file.temp") # input: # bam = expand("bam/{sample}/selected/{bam}.bam", sample = SAMPLES[0], bam = BAM_PER_SAMPLE[SAMPLES[0]][0]) # log: # "log/generate_exclude_file_1.log" # params: # samtools = config["samtools"] # shell: # """ # {params.samtools} view -H {input.bam} | awk "/^@SQ/" > {output} # """ # rule generate_exclude_file_2: # output: # "log/exclude_file" # input: # "log/exclude_file.temp" # params: # chroms = config["chromosomes"] # run: # with open(input[0]) as f: # with open(output[0],"w") as out: # for line in f: # contig = line.strip().split()[1] # contig = contig[3:] # # if contig not in params.chroms: # # print(contig, file = out) # CHECKME : same as above for input ??? # TODO : Simplify expand command rule mosaic_count: """ Call mosaic count C++ function to count reads in each BAM file according defined window """ input: bam = lambda wc: expand(config['input_bam_location'] + wc.sample + "/selected/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]) if wc.sample in BAM_PER_SAMPLE else "FOOBAR", # excl = "log/exclude_file" output: counts = config['output_location'] + "counts/{sample}/{window}.txt.gz", info = config['output_location'] + "counts/{sample}/{window}.info" log: config['output_location'] + "log/{sample}/mosaic_count.{window}.log" params: mc_command = config["mosaicatcher"] shell: """ {params.mc_command} count \ --verbose \ --do-not-blacklist-hmm \ -o {output.counts} \ -i {output.info} \ -w {wildcards.window} \ {input.bam} > {log} 2>&1 """ # FIXME : Missing plots in final PDF ; R script + inputs to check rule plot_mosaic_counts: """ Plot function of read counts for each bam file """ input: counts = config['output_location'] + "counts/{sample}/{window}.txt.gz", info = config['output_location'] + "counts/{sample}/{window}.info" output: config['output_location'] + "plots/{sample}/{window}.pdf" log: config['output_location'] + "log/plot_mosaic_counts/{sample}/{window}.log" params: plot_command = "Rscript " + config["plot_script"] shell: """ {params.plot_command} {input.counts} {input.info} {output} > {log} 2>&1 """ ################################################################################ # Normalize counts # ################################################################################ # TODO : Reference blacklist BED file to retrieve easily on Git/Zenodo/remote system rule merge_blacklist_bins: """ Call Python script to merge HGVSC normalization defined file & inversion whitelist file """ input: norm = "utils/normalization/HGSVC.{window}.txt", whitelist = "utils/normalization/inversion-whitelist.tsv", output: merged = config['output_location'] + "normalizations/HGSVC.{window}.merged.tsv" log: config['output_location'] + "log/merge_blacklist_bins/{window}.log" shell: """ PYTHONPATH="" # Issue #1031 (https://bitbucket.org/snakemake/snakemake/issues/1031) utils/merge-blacklist.py --merge_distance 500000 {input.norm} --whitelist {input.whitelist} --min_whitelist_interval_size 100000 > {output.merged} 2>> {log} """ # FIXME : snakemake ambiguity with I/O paths # CHECKME : Check R code for normalization ; @Marco mention on Gitlab rule normalize_counts: """ Normalization of mosaic counts based on normalization file produced above """ input: counts = config['output_location'] + "counts/{sample}/{window}.txt.gz", norm = config['output_location'] + "normalizations/HGSVC.{window}.merged.tsv", output: config['output_location'] + "norm_counts/{sample}/{window}.txt.gz" log: config['output_location'] + "log/normalize_counts/{sample}/{window}.log" shell: """ Rscript utils/normalize.R {input.counts} {input.norm} {output} 2>&1 > {log} """ # FIXME : cleaner way to symlink info files rule link_normalized_info_file: """ Symlink info file ouput mosaic count to normalization count directory """ input: info = config['output_location'] + "counts/{sample}/{window}.info" output: info = config['output_location'] + "norm_counts/{sample}/{window}.info" run: d = os.path.dirname(output.info) file = os.path.basename(output.info) shell("cd {d} && ln -s {input.info} {file}") ################################################################################ # Segmentation # ################################################################################ # CHECKME : @Marco mention on Gitlab rule segmentation: """ rule fct: input: output: """ input: config['output_location'] + "counts/{sample}/{window}.txt.gz" output: config['output_location'] + "segmentation/{sample}/{window,\d+}.txt" log: config['output_location'] + "log/segmentation/{sample}/{window}.log" params: mc_command = config["mosaicatcher"], min_num_segs = lambda wc: math.ceil(200000 / float(wc.window)) # bins to represent 200 kb shell: """ {params.mc_command} segment \ --remove-none \ --forbid-small-segments {params.min_num_segs} \ -M 50000000 \ -o {output} \ {input} > {log} 2>&1 """