From a835f79928bf6ec5c5b93678bd89bc54c59e3206 Mon Sep 17 00:00:00 2001 From: Thomas Weber <thomas.weber@embl.de> Date: Tue, 17 May 2022 17:27:28 +0200 Subject: [PATCH] Download example & external data Implemented rules based on snakemake.remote.HTTP function that can be called through config.yaml / CLI arguments Update config.yaml file Update rules/examples.smk Update Snakefile Update README.md --- README.md | 2 +- workflow/Snakefile | 48 ++++++++++++++++++++++++------------- workflow/config/config.yaml | 12 +++++++--- workflow/rules/examples.smk | 34 ++++++++++++++------------ 4 files changed, 61 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index d45f1fc..ac8e733 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ wget https://sandbox.zenodo.org/record/1062182/files/GCA_000001405.15_GRCh38_no_ #### 3A. Download example data automatically with snakemake [Optional] ``` -snakemake -c1 --config mode=download_example_data input_bam_location=/path/to/INPUT +snakemake -c1 --config mode=download_data dl_bam_example=True dl_external_files=True input_bam_location=/path/to/INPUT ``` **Warning:** Download example data currently requires 35GB of free space disk. 
diff --git a/workflow/Snakefile b/workflow/Snakefile index d51b840..fc024c8 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -13,12 +13,18 @@ from pprint import pprint configfile: "config/config.yaml" mode_selected = config["mode"].lower() -correct_modes = ["setup", "count", "segmentation", "haplotagging", "strandphasing", "mosaiclassifier", "download_example_data"] +correct_modes = ["count", "segmentation", "mosaiclassifier", "download_data"] assert mode_selected in correct_modes, "Wrong mode selected : {}\nFollowing list of modes are available : {}".format(config["mode"], ", ".join(correct_modes)) plot_option_selected = config["plot"] assert type(plot_option_selected) is bool, "Wrong plot option selected : {}\nPlease enter a valid value (True / False)".format(config["plot"]) +dl_bam_example_option_selected = config["dl_bam_example"] +assert type(dl_bam_example_option_selected) is bool, "Wrong dl_bam_example option selected : {}\nPlease enter a valid value (True / False)".format(config["dl_bam_example"]) + +dl_external_files_option_selected = config["dl_external_files"] +assert type(dl_external_files_option_selected) is bool, "Wrong dl_external_files option selected : {}\nPlease enter a valid value (True / False)".format(config["dl_external_files"]) + print(os.getcwd()) print("Mode selected : {}".format(mode_selected)) @@ -49,7 +55,7 @@ report: "report/workflow.rst" if os.path.isfile(config["output_location"] + "config/config_df.tsv") is False: - + print("HELLO") ###################################################################### # TODO : move to another file @@ -79,10 +85,9 @@ if os.path.isfile(config["output_location"] + "config/config_df.tsv") is False: # df["all/selected"] = df["Folder"].apply(lambda r: r.split("/")[-1]) - + print(config["input_bam_location"]) # Parsing folder and retrieve only files with .bam extension data = [(r, file.replace(".bam", "")) for r, d, f in os.walk(config["input_bam_location"]) for file in f if ".bam" in file and ".bai" not in file] - # Building pandas df 
based on folder structure df = pd.DataFrame(data, columns=["Folder", "File"]) @@ -196,14 +201,14 @@ if plot_option_selected == True: # SETUP ENV -if mode_selected == "setup": - rule all: - input: - # rules.install_rlib_strandphaser.output, - rules.compile_mosaic.output, +# if mode_selected == "setup": +# rule all: +# input: +# # rules.install_rlib_strandphaser.output, +# rules.compile_mosaic.output, # MODE MOSAIC COUNT -elif mode_selected == "count": +if mode_selected == "count": if plot_option_selected == True: rule all: input: @@ -291,9 +296,20 @@ elif mode_selected == "mosaiclassifier": # # TEST MODE -elif mode_selected == "download_example_data": - - rule all: - input: - rules.dl_example_data.output, - rules.dl_external_data.output +elif mode_selected == "download_data": + if dl_bam_example_option_selected is True and dl_external_files_option_selected is True: + rule all: + input: + rules.dl_example_data.output, + rules.dl_external_data.output + + if dl_bam_example_option_selected is True and dl_external_files_option_selected is False: + rule all: + input: + rules.dl_example_data.output, + + if dl_bam_example_option_selected is False and dl_external_files_option_selected is True: + rule all: + input: + rules.dl_external_data.output + diff --git a/workflow/config/config.yaml b/workflow/config/config.yaml index e41a068..17606d6 100644 --- a/workflow/config/config.yaml +++ b/workflow/config/config.yaml @@ -12,18 +12,24 @@ mode: "count" plot: False ### Enable / Disable comparison for each BAM file between folder name & SM tag check_sm_tag: False +### Enable / Disable download of BAM examples (RPE-BM510) +dl_bam_example: False +### Enable / Disable download of external files (1000G SNV & Fasta ref genome) +dl_external_files: False ## Input BAM location -input_bam_location: "TEST_EXAMPLE_DATA/bam/" +input_bam_location: "TEST_EXAMPLE_DATA/RPE-BM510/" ## Output location output_location: "TEST_OUTPUT/" # External files ## 1000G SNV sites to genotype : 
https://sandbox.zenodo.org/record/1060653/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz -snv_sites_to_genotype: "sandbox.zenodo.org/record/1060653/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz" +snv_sites_to_genotype: "sandbox.zenodo.org/record/1062182/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz" # Reference genome : https://sandbox.zenodo.org/record/1060653/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna -reference: "sandbox.zenodo.org/record/1060653/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" +reference: "sandbox.zenodo.org/record/1062182/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" + +tmp_check: "sandbox.zenodo.org/record/1062186/files/mosaic_logo.png" # Chromosomes list to process chromosomes: diff --git a/workflow/rules/examples.smk b/workflow/rules/examples.smk index 8555b07..6e27c81 100644 --- a/workflow/rules/examples.smk +++ b/workflow/rules/examples.smk @@ -4,27 +4,31 @@ from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider HTTP = HTTPRemoteProvider() rule dl_example_data: + """ + rule fct: Download BAM example data as input for MosaiCatcher pipeline + input: zip file stored on Zenodo + output: input_bam_location given by the user + """ input: - # HTTP.remote("https://git.embl.de/tweber/mosaicatcher-update/-/raw/dev/workflow/bam/RPE-BM510/all/BM510x04_PE20301.sort.mdup.bam", keep_local=True) - HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/TEST_EXAMPLE_DATA.zip", keep_local=True) + HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/TEST_EXAMPLE_DATA.zip", keep_local=True) output: directory(config["input_bam_location"]) shell: "mkdir {output};" - "tar -xf {input} -C .;" - "mv TEST_EXAMPLE_DATA/ {output}" + "unzip {input} -d .;" + "mv TEST_EXAMPLE_DATA/* {output}" +# TODO: Adapt according reference rule dl_external_data: + """ + rule fct: Download External files + input: files stored on Zenodo + output: touch file to check if everything 
was running correctly + """ input: - # HTTP.remote("https://git.embl.de/tweber/mosaicatcher-update/-/raw/dev/workflow/bam/RPE-BM510/all/BM510x04_PE20301.sort.mdup.bam", keep_local=True) - HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna", keep_local=True), - HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai", keep_local=True), - HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz", keep_local=True), - HTTP.remote("https://sandbox.zenodo.org/record/1060987/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz.tbi", keep_local=True), + HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna", keep_local=True), + HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai", keep_local=True), + HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz", keep_local=True), + HTTP.remote("https://sandbox.zenodo.org/record/1062182/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz.tbi", keep_local=True), output: - "sandbox.zenodo.org/record/1060987/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna", - "sandbox.zenodo.org/record/1060987/files/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai", - "sandbox.zenodo.org/record/1060987/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz" - "sandbox.zenodo.org/record/1060987/files/ALL.chr1-22plusX_GRCh38_sites.20170504.renamedCHR.vcf.gz.tbi" - shell: - "echo Download completed" + touch(config["output_location"] + "config/dl_external_data.ok") -- GitLab