From 3f0dc28ec22d88def269c215ef551800b8b1f7e5 Mon Sep 17 00:00:00 2001
From: Thomas Weber <thomas.weber@embl.de>
Date: Wed, 18 May 2022 12:47:15 +0200
Subject: [PATCH] Handle multiple samples in the same folder; change the way
 the selected cell list is retrieved

---
 README.md                        |  33 ++++++----
 workflow/.conda/environments.txt |   8 +--
 workflow/Snakefile               | 100 ++++++++++++++++---------------
 workflow/rules/count.smk         |   4 +-
 workflow/rules/examples.smk      |   5 +-
 workflow/rules/haplotagging.smk  |   5 +-
 workflow/rules/regenotyping.smk  |   3 +-
 workflow/rules/segmentation.smk  |   3 +-
 workflow/rules/stats.smk         |   2 +-
 9 files changed, 88 insertions(+), 75 deletions(-)

diff --git a/README.md b/README.md
index a651ae3..4f7892e 100644
--- a/README.md
+++ b/README.md
@@ -165,8 +165,7 @@ It is important to follow these rules for single-cell data
 
 ### ⚡️ 4. Run the pipeline
 
-After defining your configuration, you can launch the pipeline the following way:
-
+After defining your configuration, you can launch the pipeline as follows:
 
 
 
@@ -183,10 +182,17 @@ snakemake \
     -p \
     --conda-frontend mamba \
     --use-singularity \
-    --singularity-args "-B /:/" \
-    --dry-run
+    --singularity-args "-B /mounting_point:/mounting_point" \
 ```
 
+
+---
+**ℹ️ Note**
+
+It is recommended to first run the command with the `--dry-run` option to check for any anomalies (see the sketch below).
+
+---
+
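+For example, a minimal sketch reusing the flags shown above (the mounting point is a placeholder to adapt to your system):
+
+```
+snakemake \
+    -p \
+    --conda-frontend mamba \
+    --use-singularity \
+    --singularity-args "-B /mounting_point:/mounting_point" \
+    --dry-run
+```
+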
 ---
 **⚠️ Warning**
 
@@ -226,17 +232,16 @@ Choose the conda frontend for installing environments. Mamba is much faster and
 If defined in the rule, run job within a singularity container. If this flag is not set, the singularity directive is ignored.
 
 ```
---singularity-args "-B /:/"
+--singularity-args "-B /mounting_point:/mounting_point"
 ```
 Pass additional arguments to singularity. `-B` defines a binding point between the host and the container.
 
 ---
 **ℹ️ Note**
 
-Currently, raise the following WARNING message : 
-```
-WARNING: Skipping mount /etc/localtime [binds]: /etc/localtime doesn't exist in container
-```
+Currently, the binding argument needs to correspond to a mounting point of your system (e.g. "/tmp:/tmp").
+On seneca (EMBL), for example, use "/g:/g".
+
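+For instance (a sketch; the example paths are assumptions to adapt to your system), several binding points can be passed at once:
+
+```
+--singularity-args "-B /g:/g -B /scratch:/scratch"
+```
+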
 ---
 
 Obviously, all other snakemake CLI options can also be used. 
@@ -275,13 +280,19 @@ snakemake \
 
 ## 📆 Roadmap 
 
-- [x] Zenodo automatic download of external files + indexes
+- [x] Zenodo automatic download of external files + indexes (1.2.1)
 - [ ] Change of reference genome (currently only GRCh38)
 - [ ] Plotting options (enable/disable segmentation back colors)
 - [ ] Full singularity image with preinstalled conda envs
 - [ ] Automatic testing of BAM SM tag compared to sample folder name
 - [ ] On-error/success e-mail
-- [ ] Upstream QC pipeline and FastQ handle  
+- [ ] Upstream QC pipeline and FastQ handling
+
+## 🛑 Troubleshooting & Current limitations
+
+- Do not change the structure of your input folder after running the pipeline: the first execution builds a config dataframe file (`config/config_df.tsv` in the output location) that lists the cells and their associated paths (an illustrative layout is sketched below)
+- Do not change the list of chromosomes between executions (e.g. a first execution using `count` mode on `chr21`, then a second execution using `segmentation` mode on all chromosomes)
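+
+As an illustration (a sketch with placeholder values, not real data), the config dataframe uses the columns built by the workflow:
+
+```
+File             Selected  Folder           Sample   Cell   Full_path
+cell1.sort.mdup  True      /path/to/input/  sampleA  cell1  /path/to/input/sampleA/all/cell1.sort.mdup.bam
+```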
 
 ## 📕 References
 
diff --git a/workflow/.conda/environments.txt b/workflow/.conda/environments.txt
index ee469f0..651baf1 100644
--- a/workflow/.conda/environments.txt
+++ b/workflow/.conda/environments.txt
@@ -1,8 +1,2 @@
 
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/efce729cd9fa8c17d9069c88e3ea8c4e
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/3788d39fbf1c275d2f59b3c04095a5e9
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/65d65572a30753e49e970fe9607cc4f4
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/b0bb57cd4b77592c6a4933daec0680d9
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/5f46ea0d0a3b05eb9f18a6b48bf24ef7
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/f6e43dfbcbaf393428a25ebde2b7e489
-/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/b42044cf37b5a86324b0a1bf256c3955
+/g/korbel2/weber/workspace/mosaicatcher-update/workflow/.snakemake/conda/491ec2a31932d72e514b8ce3f5c8551a
diff --git a/workflow/Snakefile b/workflow/Snakefile
index fc024c8..133d4a0 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -8,8 +8,9 @@ import pandas as pd
 import os, sys
 from pprint import pprint
 # import pysam
-# from tqdm import tqdm
+from tqdm import tqdm
 
+# TODO : check config/config CLI ... 
 configfile: "config/config.yaml"
 
 mode_selected = config["mode"].lower()
@@ -43,62 +44,63 @@ if config["input_bam_location"].endswith("/") is False:
     # print("/ was missing at end of input folder, continuing ...")
 
 
-# # this container defines the underlying OS for each job when using the workflow
-# # with --use-conda --use-singularity
+
 
 
 report: "report/workflow.rst"
 
 
-# TODO : check config/config CLI ... 
+# TODO: commit only in another branch to test
+# # this container defines the underlying OS for each job when using the workflow
+# # with --use-conda --use-singularity
+# container: "docker://condaforge/mambaforge:4.12.0-0"
 
 
 
 if os.path.isfile(config["output_location"] + "config/config_df.tsv") is False:
-    print("HELLO")
+    # print("HELLO")
     ######################################################################
     # TODO : move to another file 
 
-    # complete_df_list = list()
-    # input_dir = config["input_bam_location"]
-    # for sample in os.listdir(input_dir):
-    #     print(sample)
-    #     l_files_all = [f for f in os.listdir(input_dir + sample + "/all/") if f.endswith('.bam')]
-    #     l_files_selected = [f for f in os.listdir(input_dir + sample + "/selected/") if f.endswith('.bam')]
-    #     join = list(set(l_files_all).intersection(set(l_files_selected)))
-    #     df = pd.DataFrame([{"File" : f} for f in l_files_all])
-    #     df.loc[df["File"].isin(join), "Selected"] = True
-    #     df["File"] = df["File"].str.replace(".bam", "")
-    #     df["Selected"] = df["Selected"].fillna(False)
-    #     df["Folder"] = input_dir
-    #     df["Sample"] = sample
-    #     df["Cell"] = df["File"].apply(lambda r  : r.split(".")[0])
-    #     df["Full_path"] = df["Folder"] + sample + "/all/" + df["File"] + ".bam"
-    #     print(df)
-    #     complete_df_list.append(df)
-    # complete_df = pd.concat(complete_df_list)
-    # print(complete_df)
+    complete_df_list = list()
+    input_dir = config["input_bam_location"]
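+    # Expected input layout (as assumed by the listing below):
+    #   <input_dir>/<sample>/all/      -> BAM files for all cells
+    #   <input_dir>/<sample>/selected/ -> BAM files for the selected cells only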
+    for sample in tqdm(os.listdir(input_dir)):
+        l_files_all = [f for f in os.listdir(input_dir + sample + "/all/") if f.endswith('.bam')]
+        l_files_selected = [f for f in os.listdir(input_dir + sample + "/selected/") if f.endswith('.bam')]
+        join = list(set(l_files_all).intersection(set(l_files_selected)))
+        df = pd.DataFrame([{"File": f} for f in l_files_all])
+        df.loc[df["File"].isin(join), "Selected"] = True
+        df["File"] = df["File"].str.replace(".bam", "", regex=False)
+        df["Selected"] = df["Selected"].fillna(False)
+        df["Folder"] = input_dir
+        df["Sample"] = sample
+        df["Cell"] = df["File"].apply(lambda r: r.split(".")[0])
+        df["Full_path"] = df["Folder"] + sample + "/all/" + df["File"] + ".bam"
+        complete_df_list.append(df)
+    complete_df = pd.concat(complete_df_list)
     
-    # df_config_files = complete_df.copy()
+    df_config_files = complete_df.copy()
+    # print(df_config_files)
 
     # Defining cols
     # df["all/selected"] = df["Folder"].apply(lambda r: r.split("/")[-1])
     
 
-    print(config["input_bam_location"])
-    # Parsing folder and retrieve only files with .bam extension
-    data = [(r, file.replace(".bam", "")) for r, d, f in os.walk(config["input_bam_location"]) for file in f if ".bam" in file and ".bai" not in file]
-    # Building pandas df based on folder structure
-    df = pd.DataFrame(data, columns=["Folder", "File"])
+    # print(config["input_bam_location"])
+    # # Parsing folder and retrieve only files with .bam extension
+    # data = [(r, file.replace(".bam", "")) for r, d, f in os.walk(config["input_bam_location"]) for file in f if ".bam" in file and ".bai" not in file]
+    # print(data)
+    # # Building pandas df based on folder structure
+    # df = pd.DataFrame(data, columns=["Folder", "File"])
 
-    # Defining cols
-    df["all/selected"] = df["Folder"].apply(lambda r: r.split("/")[-1])
-    df["Sample"] = df["Folder"].apply(lambda r: r.split("/")[-2])
-    df["Cell"] = df["File"].apply(lambda r  : r.split(".")[0])
-    df["Full_path"] = df["Folder"] + "/" + df["File"] + ".bam"
+    # # Defining cols
+    # df["all/selected"] = df["Folder"].apply(lambda r: r.split("/")[-1])
+    # df["Sample"] = df["Folder"].apply(lambda r: r.split("/")[-2])
+    # df["Cell"] = df["File"].apply(lambda r  : r.split(".")[0])
+    # df["Full_path"] = df["Folder"] + "/" + df["File"] + ".bam"
 
-    # Filtering based on exclude list defined
-    df_config_files = df.loc[~df["Cell"].isin(config["exclude_list"])]
+    # # Filtering based on exclude list defined
+    # df_config_files = df.loc[~df["Cell"].isin(config["exclude_list"])]
 
     def check_bam_header(bam_file_path):
         """_summary_
@@ -132,22 +134,22 @@ if os.path.isfile(config["output_location"] + "config/config_df.tsv") is False:
 
 else:
     df_config_files = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t")
-    print(df_config_files)
+print(df_config_files)
 
 # Samples list
 samples = sorted(df_config_files.Sample.unique().tolist())
-print(samples)
+# print(samples)
 # samples = ["H2NCTAFX2_GM20509B_20s000579-1-1"]
 
 # Output dictionaries
 all_dict = df_config_files.groupby("Sample")["Cell"].nunique().to_dict()
-selected_dict = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict()
-# selected_dict = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict()
-dict_cells_nb_per_sample = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict()
-# dict_cells_nb_per_sample = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict()
-print(all_dict)
-print(selected_dict)
-print(dict_cells_nb_per_sample)
+# selected_dict = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict()
+selected_dict = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict()
+# dict_cells_nb_per_sample = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict()
+dict_cells_nb_per_sample = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict()
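+# selected_dict and dict_cells_nb_per_sample both map Sample -> number of selected cells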
+# print(all_dict)
+# print(selected_dict)
+# print(dict_cells_nb_per_sample)
 
 print("Detected {} samples:".format(df_config_files.Sample.nunique()))
 [print("  {}:\t{} cells\t {} selected cells".format(s, all_dict[s], selected_dict[s])) for s in samples]
@@ -187,8 +189,10 @@ include: "rules/examples.smk"
 
 # IF PLOT OPTION ENABLED, BUILD TMP DICT TO CALL OUTPUT 
 if plot_option_selected == True:
-    dict_cells_nb_per_sample = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict()
-    tmp_dict = df_config_files.loc[df_config_files["all/selected"] == "selected", ["Sample", "Cell"]].groupby("Sample")["Cell"].apply(lambda r: sorted(list(r))).to_dict()
+    # dict_cells_nb_per_sample = df_config_files.loc[df_config_files["all/selected"] == "selected"].groupby("Sample")["Cell"].nunique().to_dict()
+    dict_cells_nb_per_sample = df_config_files.loc[df_config_files["Selected"] == True].groupby("Sample")["Cell"].nunique().to_dict()
+    # tmp_dict = df_config_files.loc[df_config_files["all/selected"] == "selected", ["Sample", "Cell"]].groupby("Sample")["Cell"].apply(lambda r: sorted(list(r))).to_dict()
+    tmp_dict = df_config_files.loc[df_config_files["Selected"] == True, ["Sample", "Cell"]].groupby("Sample")["Cell"].apply(lambda r: sorted(list(r))).to_dict()
     tmp_dict = {s:{i+1:c for i,c in enumerate(cell_list)} for s,cell_list in tmp_dict.items()}
     for s in tmp_dict.keys():
         tmp_dict[s][0] = "SummaryPage"
diff --git a/workflow/rules/count.smk b/workflow/rules/count.smk
index 8c5a5bd..602551a 100644
--- a/workflow/rules/count.smk
+++ b/workflow/rules/count.smk
@@ -3,8 +3,8 @@ config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep=
 
 
 pd.options.display.max_colwidth = 40
-# bam_per_sample_local = config_df.loc[config_df["Selected"] == True].groupby("Sample")["File"].apply(list).to_dict()
-bam_per_sample_local = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["File"].apply(list).to_dict()
+bam_per_sample_local = config_df.loc[config_df["Selected"] == True].groupby("Sample")["File"].apply(list).to_dict()
+# bam_per_sample_local = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["File"].apply(list).to_dict()
 print(bam_per_sample_local)
 ################################################################################
 # Read counting                                                                #
diff --git a/workflow/rules/examples.smk b/workflow/rules/examples.smk
index 6e27c81..9a0ffcb 100644
--- a/workflow/rules/examples.smk
+++ b/workflow/rules/examples.smk
@@ -12,11 +12,12 @@ rule dl_example_data:
     input:
         HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/TEST_EXAMPLE_DATA.zip", keep_local=True)
     output:
-        directory(config["input_bam_location"])
+        # directory(config["input_bam_location"])
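+        # Flag (marker) file for a completed download; the data itself is moved into input_bam_location below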
+        touch(config["output_location"] + "config/dl_example_data.ok")
     shell:
         "mkdir  {output};"
         "unzip {input} -d .;"
-        "mv TEST_EXAMPLE_DATA/* {output}"
+        "mv TEST_EXAMPLE_DATA/* {config[input_bam_location]}"
 
 # TODO: Adapt according reference
 rule dl_external_data:
diff --git a/workflow/rules/haplotagging.smk b/workflow/rules/haplotagging.smk
index f0f9f04..2920451 100644
--- a/workflow/rules/haplotagging.smk
+++ b/workflow/rules/haplotagging.smk
@@ -1,7 +1,8 @@
 import pandas as pd
 config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t")
 # print(config_df)
-bam_per_sample = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["File"].apply(list).to_dict()
+# bam_per_sample = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["File"].apply(list).to_dict()
+bam_per_sample = config_df.loc[config_df["Selected"] == True].groupby("Sample")["File"].apply(list).to_dict()
 
 
 ################################################################################
@@ -15,7 +16,7 @@ rule haplotag_bams:
         vcf = config["output_location"] + "strandphaser/phased-snvs/{sample}.vcf.gz",
         tbi = config["output_location"] + "strandphaser/phased-snvs/{sample}.vcf.gz.tbi",
         bam = config["input_bam_location"] + "{sample}/selected/{cell}.bam",
-        bai = config["input_bam_location"] + "{sample}/selected/{cell}.bam.bai"
+        # bai = config["input_bam_location"] + "{sample}/selected/{cell}.bam.bai"
     output:
         config["output_location"] + "haplotag/bam/{sample}/{cell}.bam",
     log:
diff --git a/workflow/rules/regenotyping.smk b/workflow/rules/regenotyping.smk
index 3db180c..5478f2c 100644
--- a/workflow/rules/regenotyping.smk
+++ b/workflow/rules/regenotyping.smk
@@ -1,6 +1,7 @@
 import pandas as pd
 config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t")
-allbams_per_sample = df_config_files.loc[df_config_files["all/selected"] == "all"].groupby("Sample")["File"].apply(list).to_dict()
+# allbams_per_sample = df_config_files.loc[df_config_files["all/selected"] == "all"].groupby("Sample")["File"].apply(list).to_dict()
+allbams_per_sample = config_df.groupby("Sample")["File"].apply(list).to_dict()
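+# Regenotyping uses every cell of each sample, so no filter on the "Selected" column is applied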
 
 
 ################################################################################
diff --git a/workflow/rules/segmentation.smk b/workflow/rules/segmentation.smk
index 9122179..7374ed1 100644
--- a/workflow/rules/segmentation.smk
+++ b/workflow/rules/segmentation.smk
@@ -2,7 +2,8 @@ import math
 import pandas as pd
 config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t")
 # print(config_df)
-cell_per_sample = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["Cell"].apply(list).to_dict()
+# cell_per_sample = config_df.loc[config_df["all/selected"] == "selected"].groupby("Sample")["Cell"].apply(list).to_dict()
+cell_per_sample = config_df.loc[config_df["Selected"] == True].groupby("Sample")["Cell"].apply(list).to_dict()
 
 ################################################################################
 # Joint Segmentation                                                                 #
diff --git a/workflow/rules/stats.smk b/workflow/rules/stats.smk
index 3a10886..ee2b7d6 100644
--- a/workflow/rules/stats.smk
+++ b/workflow/rules/stats.smk
@@ -1,6 +1,6 @@
 import pandas as pd
 config_df = pd.read_csv(config["output_location"] + "config/config_df.tsv", sep="\t")
-samples = sorted(df_config_files.Sample.unique().tolist())
+samples = sorted(config_df.Sample.unique().tolist())
 
 ################################################################################
 # Summary statistics on sv calls                                               #
-- 
GitLab