From a835f79928bf6ec5c5b93678bd89bc54c59e3206 Mon Sep 17 00:00:00 2001
From: Thomas Weber <thomas.weber@embl.de>
Date: Tue, 17 May 2022 17:27:28 +0200
Subject: [PATCH] Download example & external data Implemented rules based on
 snakemake.remote.HTTP function that can be called through config.yaml / CLI
 arguments Update config.yaml file Update rules/examples.smk Update Snakefile
 Update README.md

---
 README.md                   |  2 +-
 workflow/Snakefile          | 48 ++++++++++++++++++++++++-------------
 workflow/config/config.yaml | 12 +++++++---
 workflow/rules/examples.smk | 34 ++++++++++++++------------
 4 files changed, 61 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index d45f1fc..ac8e733 100644
--- a/README.md
+++ b/README.md
@@ -117,7 +117,7 @@ wget https://sandbox.zenodo.org/record/1062182/files/GCA_000001405.15_GRCh38_no_
 #### 3A. Download example data automatically with snakemake [Optional] 
 
 ```
-snakemake -c1 --config mode=download_example_data input_bam_location=/path/to/INPUT
+snakemake -c1 --config mode=download_data dl_bam_example=True dl_external_files=True input_bam_location=/path/to/INPUT
 ```
 **Warning:** Download example data currently requires 35GB of free space disk. 
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index d51b840..fc024c8 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -13,12 +13,18 @@ from pprint import pprint
 configfile: "config/config.yaml"
 
 mode_selected = config["mode"].lower()
-correct_modes = ["setup", "count", "segmentation", "haplotagging", "strandphasing", "mosaiclassifier", "download_example_data"]
+correct_modes = ["count", "segmentation", "mosaiclassifier", "download_data"]
 assert mode_selected in correct_modes, "Wrong mode selected : {}\nFollowing list of modes are available : {}".format(config["mode"], ", ".join(correct_modes))
 
 plot_option_selected = config["plot"]
 assert type(plot_option_selected) is bool, "Wrong plot option selected : {}\nPlease enter a valid value (True / False)".format(config["plot"]) 
 
+dl_bam_example_option_selected = config["dl_bam_example"]  # toggles download of the BAM example data
+assert type(dl_bam_example_option_selected) is bool, "Wrong dl_bam_example option selected : {}\nPlease enter a valid value (True / False)".format(config["dl_bam_example"])
+
+dl_external_files_option_selected = config["dl_external_files"]  # toggles download of the external files
+assert type(dl_external_files_option_selected) is bool, "Wrong dl_external_files option selected : {}\nPlease enter a valid value (True / False)".format(config["dl_external_files"])
+
 print(os.getcwd())
 
 print("Mode selected : {}".format(mode_selected))
@@ -49,7 +55,7 @@ report: "report/workflow.rst"
 
 
 if os.path.isfile(config["output_location"] + "config/config_df.tsv") is False:
-
+
     ######################################################################
     # TODO : move to another file 
 
@@ -79,10 +85,9 @@ if os.path.isfile(config["output_location"] + "config/config_df.tsv") is False:
     # df["all/selected"] = df["Folder"].apply(lambda r: r.split("/")[-1])
     
 
-
+
     # Parsing folder and retrieve only files with .bam extension
     data = [(r, file.replace(".bam", "")) for r, d, f in os.walk(config["input_bam_location"]) for file in f if ".bam" in file and ".bai" not in file]
-
     # Building pandas df based on folder structure
     df = pd.DataFrame(data, columns=["Folder", "File"])
 
@@ -196,14 +201,14 @@ if plot_option_selected == True:
 
 
 # SETUP ENV
-if mode_selected == "setup":
-    rule all:
-        input:
-            # rules.install_rlib_strandphaser.output,
-            rules.compile_mosaic.output,
+# if mode_selected == "setup":
+#     rule all:
+#         input:
+#             # rules.install_rlib_strandphaser.output,
+#             rules.compile_mosaic.output,
 
 # MODE MOSAIC COUNT
-elif mode_selected == "count":
+if mode_selected == "count":
     if plot_option_selected == True:
         rule all:
             input:
@@ -291,9 +296,20 @@ elif mode_selected == "mosaiclassifier":
 
 
 # # TEST MODE
-elif mode_selected == "download_example_data":
-
-    rule all:
-        input:
-            rules.dl_example_data.output,
-            rules.dl_external_data.output
+elif mode_selected == "download_data":
+    if dl_bam_example_option_selected and dl_external_files_option_selected:
+        rule all:
+            input:
+                rules.dl_example_data.output,
+                rules.dl_external_data.output
+    elif dl_bam_example_option_selected:
+        rule all:
+            input:
+                rules.dl_example_data.output,
+    elif dl_external_files_option_selected:
+        rule all:
+            input:
+                rules.dl_external_data.output
+    else:
+        # No `rule all` would be defined here, so fail loudly instead of letting Snakemake pick another default target
+        raise ValueError("Mode download_data selected but nothing to download : set dl_bam_example and/or dl_external_files to True")
diff --git a/workflow/config/config.yaml b/workflow/config/config.yaml
index e41a068..17606d6 100644
--- a/workflow/config/config.yaml
+++ b/workflow/config/config.yaml
@@ -12,18 +12,24 @@ mode: "count"
 plot: False
 ### Enable / Disable comparison for each BAM file between folder name & SM tag
 check_sm_tag: False
+### Enable / Disable download of BAM examples (RPE-BM510)
+dl_bam_example: False
+### Enable / Disable download of external files (1000G SNV & Fasta ref genome)
+dl_external_files: False
 
 ## Input BAM location
-input_bam_location: "TEST_EXAMPLE_DATA/bam/"
+input_bam_location: "TEST_EXAMPLE_DATA/RPE-BM510/"
 ## Output location
 output_location: "TEST_OUTPUT/"
 
 # External files
 ## 1000G SNV sites to genotype : https://sandbox.zenodo.org/record/1060653/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz
-snv_sites_to_genotype: "sandbox.zenodo.org/record/1060653/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz"
+snv_sites_to_genotype: "sandbox.zenodo.org/record/1062182/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz"
 
 # Reference genome : https://sandbox.zenodo.org/record/1060653/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
-reference: "sandbox.zenodo.org/record/1060653/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
+reference: "sandbox.zenodo.org/record/1062182/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
+
+tmp_check: "sandbox.zenodo.org/record/1062186/files/mosaic_logo.png"
 
 # Chromosomes list to process
 chromosomes:
diff --git a/workflow/rules/examples.smk b/workflow/rules/examples.smk
index 8555b07..6e27c81 100644
--- a/workflow/rules/examples.smk
+++ b/workflow/rules/examples.smk
@@ -4,27 +4,31 @@ from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
 HTTP = HTTPRemoteProvider()
 
 rule dl_example_data:
+    """
+    rule fct: Download BAM example data as input for MosaiCatcher pipeline
+    input: zip file stored on Zenodo
+    output: input_bam_location given by the user
+    """
     input:
-        # HTTP.remote("https://git.embl.de/tweber/mosaicatcher-update/-/raw/dev/workflow/bam/RPE-BM510/all/BM510x04_PE20301.sort.mdup.bam", keep_local=True)
-        HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/TEST_EXAMPLE_DATA.zip", keep_local=True)
+        HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/TEST_EXAMPLE_DATA.zip", keep_local=True)
     output:
         directory(config["input_bam_location"])
     shell:
         "mkdir  {output};"
-        "tar -xf {input} -C .;"
-        "mv TEST_EXAMPLE_DATA/ {output}"
+        "unzip {input} -d .;"
+        "mv TEST_EXAMPLE_DATA/* {output}"
 
+# TODO: Adapt according to the reference
 rule dl_external_data:
+    """
+    rule fct: Download External files 
+    input: files stored on Zenodo
+    output: touch file marking that all external files were downloaded
+    """
     input:
-        # HTTP.remote("https://git.embl.de/tweber/mosaicatcher-update/-/raw/dev/workflow/bam/RPE-BM510/all/BM510x04_PE20301.sort.mdup.bam", keep_local=True)
-        HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna", keep_local=True),
-        HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai", keep_local=True),
-        HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz", keep_local=True),
-        HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz.tbi", keep_local=True),
+        HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna", keep_local=True),
+        HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai", keep_local=True),
+        HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz", keep_local=True),
+        HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz.tbi", keep_local=True),
     output:
-        "sandbox.zenodo.org/record/1060987/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna",
-        "sandbox.zenodo.org/record/1060987/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai",
-        "sandbox.zenodo.org/record/1060987/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz"
-        "sandbox.zenodo.org/record/1060987/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz.tbi"
-    shell:
-        "echo Download completed"
+        touch(config["output_location"] + "config/dl_external_data.ok")
-- 
GitLab