From 3f0dc28ec22d88def269c215ef551800b8b1f7e5 Mon Sep 17 00:00:00 2001
From: Thomas Weber <thomas.weber@embl.de>
Date: Wed, 18 May 2022 12:47:15 +0200
Subject: [PATCH] Initial commit

Handle multiple samples in the same folder now
Change the way the selected cell list is retrieved
---
 README.md                        | 33 ++++++----
 workflow/.conda/environments.txt |  8 +--
 workflow/Snakefile               | 100 ++++++++++++++++---------------
 workflow/rules/count.smk         |  4 +-
 workflow/rules/examples.smk      |  5 +-
 workflow/rules/haplotagging.smk  |  5 +-
 workflow/rules/regenotyping.smk  |  3 +-
 workflow/rules/segmentation.smk  |  3 +-
 workflow/rules/stats.smk         |  2 +-
 9 files changed, 88 insertions(+), 75 deletions(-)

diff --git a/README.md b/README.md
index a651ae3..4f7892e 100644
--- a/README.md
+++ b/README.md
@@ -165,8 +165,7 @@ It is important to follow these rules for single-cell data
 
 ### ⚡️ 4. Run the pipeline
 
-After defining your configuration, you can launch the pipeline the following way:
-
+After defining your configuration, you can launch the pipeline the following way:
@@ -183,10 +182,17 @@ snakemake \
     -p \
     --conda-frontend mamba \
     --use-singularity \
-    --singularity-args "-B /:/" \
-    --dry-run
+    --singularity-args "-B /mounting_point:/mounting_point"
 ```
+
+---
+**ℹ️ Note**
+
+It is recommended to first run the command with the `--dry-run` option to check for any anomalies.
+
+---
+
 ---
 **⚠️ Warning**
@@ -226,17 +232,16 @@ Choose the conda frontend for installing environments. Mamba is much faster and
 
 If defined in the rule, run job within a singularity container. If this flag is not set, the singularity directive is ignored.
 
 ```
---singularity-args "-B /:/"
+--singularity-args "-B /mounting_point:/mounting_point"
 ```
 
 Pass additional args to singularity. `-B` stands for the binding point between the host and the container.
 
 ---
 **ℹ️ Note**
 
-Currently, raise the following WARNING message :
-```
-WARNING: Skipping mount /etc/localtime [binds]: /etc/localtime doesn't exist in container
-```
+Currently, the binding argument needs to correspond to a mount point of your system (e.g., "/tmp:/tmp").
+On seneca (EMBL), for example, use "/g:/g".
+
 ---
 
 Obviously, all other snakemake CLI options can also be used.
@@ -275,13 +280,19 @@ snakemake \
 
 ## 📆 Roadmap
 
-- [x] Zenodo automatic download of external files + indexes
+- [x] Zenodo automatic download of external files + indexes (1.2.1)
 - [ ] Change of reference genome (currently only GRCh38)
 - [ ] Plotting options (enable/disable segmentation back colors)
 - [ ] Full singularity image with preinstalled conda envs
 - [ ] Automatic testing of BAM SM tag compared to sample folder name
 - [ ] On-error/success e-mail
-- [ ] Upstream QC pipeline and FastQ handle
+- [ ] Upstream QC pipeline and FastQ handling
+- [ ] Full singularity image
+
+## 🛑 Troubleshooting & Current limitations
+
+- Do not change the structure of your input folder after running the pipeline: the first execution builds a config dataframe file (`workflow/config/config.tsv`) that contains the list of cells and the associated paths
+- Do not change the list of chromosomes after a first execution (e.g., a first execution using `count` mode on `chr21` followed by a second execution using `segmentation` mode on all chromosomes)
 
 ## 📕 References
 
diff --git a/workflow/.conda/environments.txt b/workflow/.conda/environments.txt
index ee469f0..651baf1 100644
--- a/workflow/.conda/environments.txt
+++ b/workflow/.conda/environments.txt
@@ -1,8 +1,2 @@
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/efce729cd9fa8c17d9069c88e3ea8c4e
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/3788d39fbf1c275d2f59b3c04095a5e9
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/65d65572a30753e49e970fe9607cc4f4
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/b0bb57cd4b77592c6a4933daec0680d9
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/5f46ea0d0a3b05eb9f18a6b48bf24ef7
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/f6e43dfbcbaf393428a25ebde2b7e489
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/b42044cf37b5a86324b0a1bf256c3955
+/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/491ec2a31932d72e514b8ce3f5c8551a
diff --git a/workflow/Snakefile b/workflow/Snakefile
index fc024c8..133d4a0 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -8,8 +8,9 @@ import pandas as pd
 import os, sys
 from pprint import pprint
 # import pysam
-# from tqdm import tqdm
+from tqdm import tqdm
 
+# TODO : check config/config CLI ...
 configfile: "config/config.yaml"
 
 mode_selected = config["mode"].lower()
@@ -43,62 +44,63 @@ if config["input_bam_location"].endswith("/") is False:
 # print("/ was missing at end of input folder, continuing ...")
 
 
-# # this container defines the underlying OS for each job when using the workflow
-# # with --use-conda --use-singularity
+
 report: "report/workflow.rst"
 
 
-# TODO : check config/config CLI ...
+# TODO: commit only in another branch to test +# # this container defines the underlying OS for each job when using the workflow +# # with --use-conda --use-singularity +# container: "docker://condaforge/mambaforge:4.12.0-0" if os.path.isfile(config["output_location"] + "config/config_df.tsv") is False: - print("HELLO") + # print("HELLO") ###################################################################### # TODO : move to another file - # complete_df_list = list() - # input_dir = config["input_bam_location"] - # for sample in os.listdir(input_dir): - # print(sample) - # l_files_all = [f for f in os.listdir(input_dir + sample + "/all/") if f.endswith('.bam')] - # l_files_selected = [f for f in os.listdir(input_dir + sample + "/selected/") if f.endswith('.bam')] - # join = list(set(l_files_all).intersection(set(l_files_selected))) - # df = pd.DataFrame([{"File" : f} for f in l_files_all]) - # df.loc[df["File"].isin(join), "Selected"] = True - # df["File"] = df["File"].str.replace(".bam", "") - # df["Selected"] = df["Selected"].fillna(False) - # df["Folder"] = input_dir - # df["Sample"] = sample - # df["Cell"] = df["File"].apply(lambda r : r.split(".")[0]) - # df["Full_path"] = df["Folder"] + sample + "/all/" + df["File"] + ".bam" - # print(df) - # complete_df_list.append(df) - # complete_df = pd.concat(complete_df_list) - # print(complete_df) + complete_df_list = list() + input_dir = config["input_bam_location"] + for sample in tqdm(os.listdir(input_dir)): + l_files_all = [f for f in os.listdir(input_dir + sample + "/all/") if f.endswith('.bam')] + l_files_selected = [f for f in os.listdir(input_dir + sample + "/selected/") if f.endswith('.bam')] + join = list(set(l_files_all).intersection(set(l_files_selected))) + df = pd.DataFrame([{"File" : f} for f in l_files_all]) + df.loc[df["File"].isin(join), "Selected"] = True + df["File"] = df["File"].str.replace(".bam", "") + df["Selected"] = df["Selected"].fillna(False) + df["Folder"] = input_dir + df["Sample"] = sample + df["Cell"] = df["File"].apply(lambda r : r.split(".")[0]) + df["Full_path"] = df["Folder"] + sample + "/all/" + df["File"] + ".bam" + complete_df_list.append(df) + complete_df = pd.concat(complete_df_list) - # df_config_files = complete_df.copy() + df_config_files = complete_df.copy() + # print(df_config_files) # Defining cols # df["all/selected"] = df["Folder"].apply(lambda r: r.split("/")[-1]) - print(config["input_bam_location"]) - # Parsing folder and retrieve only files with .bam extension - data = [(r, file.replace(".bam", "")) for r, d, f in os.walk(config["input_bam_location"]) for file in f if ".bam" in file and ".bai" not in file] - # Building pandas df based on folder structure - df = pd.DataFrame(data, columns=["Folder", "File"]) + # print(config["input_bam_location"]) + # # Parsing folder and retrieve only files with .bam extension + # data = [(r, file.replace(".bam", "")) for r, d, f in os.walk(config["input_bam_location"]) for file in f if ".bam" in file and ".bai" not in file] + # print(data) + # # Building pandas df based on folder structure + # df = pd.DataFrame(data, columns=["Folder", "File"]) - # Defining cols - df["all/selected"] = df["Folder"].apply(lambda r: r.split("/")[-1]) - df["Sample"] = df["Folder"].apply(lambda r: r.split("/")[-2]) - df["Cell"] = df["File"].apply(lambda r : r.split(".")[0]) - df["Full_path"] = df["Folder"] + "/" + df["File"] + ".bam" + # # Defining cols + # df["all/selected"] = df["Folder"].apply(lambda r: r.split("/")[-1]) + # df["Sample"] = df["Folder"].apply(lambda r: 
r.split("/")[-2]) + # df["Cell"] = df["File"].apply(lambda r : r.split(".")[0]) + # df["Full_path"] = df["Folder"] + "/" + df["File"] + ".bam" - # Filtering based on exclude list defined - df_config_files = df.loc[~df["Cell"].isin(config["exclude_list"])] + # # Filtering based on exclude list defined + # df_config_files = df.loc[~df["Cell"].isin(config["exclude_list"])] def check_bam_header(bam_file_path): """_summary_ @@ -132,22 +134,22 @@ if os.path.isfile(config["output_location"] + "config/config_df.tsv") is False: else: df_config_files = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t") - print(df_config_files) +print(df_config_files) # Samples list samples = sorted(df_config_files.Sample.unique().tolist()) -print(samples) +# print(samples) # samples = ["H2NCTAFX2_GM20509B_20s000579-1-1"] # Output dictionnaries all_dict = df_config_files.groupby("Sample")["Cell"].nunique().to_dict() -selected_dict = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict() -# selected_dict = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict() -dict_cells_nb_per_sample = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict() -# dict_cells_nb_per_sample = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict() -print(all_dict) -print(selected_dict) -print(dict_cells_nb_per_sample) +# selected_dict = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict() +selected_dict = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict() +# dict_cells_nb_per_sample = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict() +dict_cells_nb_per_sample = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict() +# print(all_dict) +# print(selected_dict) +# print(dict_cells_nb_per_sample) print("Detected {} samples:".format(df_config_files.Sample.nunique())) [print(" {}:\t{} cells\t {} selected cells".format(s, all_dict[s], selected_dict[s])) for s in samples] @@ -187,8 +189,10 @@ include: "rules/examples.smk" # IF PLOT OPTION ENABLED, BUILD TMP DICT TO CALL OUTPUT if plot_option_selected == True: - dict_cells_nb_per_sample = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict() - tmp_dict = df_config_files.loc[df_config_files["all/selected"] == "selected", ["Sample", "Cell"]].groupby("Sample")["Cell"].apply(lambda r: sorted(list(r))).to_dict() + # dict_cells_nb_per_sample = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict() + dict_cells_nb_per_sample = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict() + # tmp_dict = df_config_files.loc[df_config_files["all/selected"] == "selected", ["Sample", "Cell"]].groupby("Sample")["Cell"].apply(lambda r: sorted(list(r))).to_dict() + tmp_dict = df_config_files.loc[df_config_files["Selected"] == True, ["Sample", "Cell"]].groupby("Sample")["Cell"].apply(lambda r: sorted(list(r))).to_dict() tmp_dict = {s:{i+1:c for i,c in enumerate(cell_list)} for s,cell_list in tmp_dict.items()} for s in tmp_dict.keys(): tmp_dict[s][0] = "SummaryPage" diff --git a/workflow/rules/count.smk 
b/workflow/rules/count.smk
index 8c5a5bd..602551a 100644
--- a/workflow/rules/count.smk
+++ b/workflow/rules/count.smk
@@ -3,8 +3,8 @@ config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep=
 
 pd.options.display.max_colwidth = 40
 
-# bam_per_sample_local = config_df.loc[config_df["Selected"] == True].groupby("Sample")["File"].apply(list).to_dict()
-bam_per_sample_local = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["File"].apply(list).to_dict()
+bam_per_sample_local = config_df.loc[config_df["Selected"] == True].groupby("Sample")["File"].apply(list).to_dict()
+# bam_per_sample_local = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["File"].apply(list).to_dict()
 print(bam_per_sample_local)
 ################################################################################
 # Read counting                                                                #
diff --git a/workflow/rules/examples.smk b/workflow/rules/examples.smk
index 6e27c81..9a0ffcb 100644
--- a/workflow/rules/examples.smk
+++ b/workflow/rules/examples.smk
@@ -12,11 +12,12 @@ rule dl_example_data:
     input:
         HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/TEST_EXAMPLE_DATA.zip", keep_local=True)
     output:
-        directory(config["input_bam_location"])
+        # directory(config["input_bam_location"])
+        touch(config["output_location"] + "config/dl_example_data.ok")
     shell:
-        "mkdir {output};"
+        "mkdir -p {config[input_bam_location]};"
         "unzip {input} -d .;"
-        "mv TEST_EXAMPLE_DATA/* {output}"
+        "mv TEST_EXAMPLE_DATA/* {config[input_bam_location]}"
 
 # TODO: Adapt according reference
 rule dl_external_data:
diff --git a/workflow/rules/haplotagging.smk b/workflow/rules/haplotagging.smk
index f0f9f04..2920451 100644
--- a/workflow/rules/haplotagging.smk
+++ b/workflow/rules/haplotagging.smk
@@ -1,7 +1,8 @@
 import pandas as pd
 config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t")
 # print(config_df)
-bam_per_sample = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["File"].apply(list).to_dict()
+# bam_per_sample = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["File"].apply(list).to_dict()
+bam_per_sample = config_df.loc[config_df["Selected"] == True].groupby("Sample")["File"].apply(list).to_dict()
 
 
 ################################################################################
@@ -15,7 +16,7 @@ rule haplotag_bams:
         vcf = config["output_location"] + "strandphaser/phased-snvs/{sample}.vcf.gz",
         tbi = config["output_location"] + "strandphaser/phased-snvs/{sample}.vcf.gz.tbi",
         bam = config["input_bam_location"] + "{sample}/selected/{cell}.bam",
-        bai = config["input_bam_location"] + "{sample}/selected/{cell}.bam.bai"
+        # bai = config["input_bam_location"] + "{sample}/selected/{cell}.bam.bai"
     output:
         config["output_location"] + "haplotag/bam/{sample}/{cell}.bam",
     log:
diff --git a/workflow/rules/regenotyping.smk b/workflow/rules/regenotyping.smk
index 3db180c..5478f2c 100644
--- a/workflow/rules/regenotyping.smk
+++ b/workflow/rules/regenotyping.smk
@@ -1,6 +1,7 @@
 import pandas as pd
 config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t")
-allbams_per_sample = df_config_files.loc[df_config_files["all/selected"] == "all"].groupby("Sample")["File"].apply(list).to_dict()
+# allbams_per_sample = df_config_files.loc[df_config_files["all/selected"] == "all"].groupby("Sample")["File"].apply(list).to_dict()
+allbams_per_sample = config_df.groupby("Sample")["File"].apply(list).to_dict()
 
 
 ################################################################################
diff --git
a/workflow/rules/segmentation.smk b/workflow/rules/segmentation.smk index 9122179..7374ed1 100644 --- a/workflow/rules/segmentation.smk +++ b/workflow/rules/segmentation.smk @@ -2,7 +2,8 @@ import math import pandas as pd config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t") # print(config_df) -cell_per_sample = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["Cell"].apply(list).to_dict() +# cell_per_sample = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["Cell"].apply(list).to_dict() +cell_per_sample = config_df.loc[config_df["Selected"] == True].groupby("Sample")["Cell"].apply(list).to_dict() ################################################################################ # Joint Segmentation # diff --git a/workflow/rules/stats.smk b/workflow/rules/stats.smk index 3a10886..ee2b7d6 100644 --- a/workflow/rules/stats.smk +++ b/workflow/rules/stats.smk @@ -1,6 +1,6 @@ import pandas as pd config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t") -samples = sorted(df_config_files.Sample.unique().tolist()) +samples = sorted(config_df.Sample.unique().tolist()) ################################################################################ # Summary statistics on sv calls # -- GitLab
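
The heart of this commit is the new way the selected cell list is derived: a cell is flagged `Selected` when its BAM file is present in both the `all/` and `selected/` subfolders of a sample. The following standalone sketch condenses the logic added to `workflow/Snakefile`; the paths are illustrative, and the BAM header check and exclude-list filtering are omitted:

```python
import os
import pandas as pd

def build_config_df(input_dir: str) -> pd.DataFrame:
    """Scan <input_dir>/<sample>/{all,selected}/*.bam and flag selected cells."""
    dfs = []
    for sample in os.listdir(input_dir):
        all_bams = {f for f in os.listdir(os.path.join(input_dir, sample, "all")) if f.endswith(".bam")}
        sel_bams = {f for f in os.listdir(os.path.join(input_dir, sample, "selected")) if f.endswith(".bam")}
        df = pd.DataFrame({"File": sorted(all_bams)})
        # A cell is selected when its BAM appears in both all/ and selected/
        df["Selected"] = df["File"].isin(all_bams & sel_bams)
        df["File"] = df["File"].str.replace(".bam", "", regex=False)
        df["Folder"] = input_dir  # assumed to end with "/", as the Snakefile enforces
        df["Sample"] = sample
        df["Cell"] = df["File"].str.split(".").str[0]
        df["Full_path"] = df["Folder"] + sample + "/all/" + df["File"] + ".bam"
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# Cache-or-reuse behaviour, as in the Snakefile: the dataframe is built once and
# persisted, which is why the input folder must not change between runs
# (see the Troubleshooting section). Paths below are illustrative.
config_path = "output/config/config_df.tsv"
if not os.path.isfile(config_path):
    config_df = build_config_df("input/")  # stands in for config["input_bam_location"]
    config_df.to_csv(config_path, sep="\t", index=False)  # assumes output/config/ exists
else:
    config_df = pd.read_csv(config_path, sep="\t")

# Per-sample counts of the kind used by count.smk, segmentation.smk, ...
all_dict = config_df.groupby("Sample")["Cell"].nunique().to_dict()
selected_dict = config_df.loc[config_df["Selected"]].groupby("Sample")["Cell"].nunique().to_dict()
```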
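
When the plot option is enabled, the Snakefile turns the same `Selected` column into a per-sample page index: cells are numbered from 1 and index 0 is reserved for a summary page. A minimal sketch, reusing the `config_df` from the sketch above:

```python
# Build {sample: {page_index: cell}} for the plotting rules: cells are
# numbered from 1, and index 0 is reserved for the summary page.
tmp_dict = (
    config_df.loc[config_df["Selected"], ["Sample", "Cell"]]
    .groupby("Sample")["Cell"]
    .apply(lambda cells: sorted(cells))
    .to_dict()
)
tmp_dict = {s: {i + 1: c for i, c in enumerate(cells)} for s, cells in tmp_dict.items()}
for s in tmp_dict:
    tmp_dict[s][0] = "SummaryPage"

# e.g. {"sampleA": {0: "SummaryPage", 1: "cellA1", 2: "cellA2"}}
```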