From e44512603852bbfd6bb61dcb0270430e4ea71634 Mon Sep 17 00:00:00 2001 From: Christian Arnold <carnold@embl.de> Date: Thu, 4 Oct 2018 19:32:45 +0200 Subject: [PATCH] cleanup --- src/Snakemake/dev/Snakefile | 1909 ----------------------- src/Snakemake/dev/src/.dropbox.attr | 1 - src/Snakemake/dev/src/Fragment_length.R | 124 -- src/Snakemake/dev/src/PCA.R | 121 -- src/Snakemake/dev/src/aut_stats.R | 447 ------ src/Snakemake/dev/src/filterBAMFile.R | 103 -- src/Snakemake/dev/src/functions.R | 702 --------- src/Snakemake/dev/src/functions_beta.R | 533 ------- src/Snakemake/old/Snakefile | 1860 ---------------------- src/Snakemake/old/Snakefile_hg38 | 1904 ---------------------- src/misc/generateExample.sh | 9 - 11 files changed, 7713 deletions(-) delete mode 100644 src/Snakemake/dev/Snakefile delete mode 100755 src/Snakemake/dev/src/.dropbox.attr delete mode 100755 src/Snakemake/dev/src/Fragment_length.R delete mode 100644 src/Snakemake/dev/src/PCA.R delete mode 100755 src/Snakemake/dev/src/aut_stats.R delete mode 100644 src/Snakemake/dev/src/filterBAMFile.R delete mode 100644 src/Snakemake/dev/src/functions.R delete mode 100644 src/Snakemake/dev/src/functions_beta.R delete mode 100755 src/Snakemake/old/Snakefile delete mode 100755 src/Snakemake/old/Snakefile_hg38 delete mode 100644 src/misc/generateExample.sh diff --git a/src/Snakemake/dev/Snakefile b/src/Snakemake/dev/Snakefile deleted file mode 100644 index 25997de..0000000 --- a/src/Snakemake/dev/Snakefile +++ /dev/null @@ -1,1909 +0,0 @@ -# ATAC-seq analysis pipeline - -# TODO: -# idr -# pca based on R -# 4. Change to scripts and adjust R sripts, particularly the aut_stats script -# 5. script_PCA - - -####################################### -# General stuff to make things easier # -####################################### - -# Make the output nicer and easier to follow -ruleDisplayMessage = "\n########################\n# START EXECUTING RULE #\n########################\n" - -############################################ -# Libraries, versions, authors and license # -############################################ - -from snakemake.utils import min_version -import subprocess -from os import makedirs -import pandas -import numpy - -# Enforce a minimum Snakemake version because of various features -min_version("4.6") - -__author__ = "Christian Arnold" -__license__ = "MIT" - - -############################################ -# Working directory and configuration file # -############################################ - -# Not needed, will be provided via the command line in Snakemake -#DEFAULT_CONFIG_FILE = "/g/scb/zaugg/carnold/Projects/AtacSeq/example/config.json" -#configfile: DEFAULT_CONFIG_FILE - -#ROOT_dir = config["par_general"]["workdir"] -#workdir: ROOT_dir - -########################################### -# Onstart, onsuccess and onerror handlers # -########################################### - -# Sometimes, it is necessary to specify code that shall be executed when the workflow execution is finished (e.g. cleanup, or notification of the user). - -# The onsuccess handler is executed if the workflow finished without error. -onsuccess: - print("\n\n###############################\n# Workflow finished, no error #\n###############################\n\n") - -# Else, the onerror handler is executed. -onerror: - print("\n\n#####################\n# An error occurred #\n#####################\n\n") - -# onstart handler will be executed before the workflow starts. 
Note that dry-runs do not trigger any of the handlers -onstart: - print("Reading samples and metadata....\n") - print ("Running workflow for the following samples:\n " + ' \n '.join(map(str, allSamplesUnique))) - - - -def read_samplesTable(samplesSummaryFile): - """text""" - - data = pandas.read_table(samplesSummaryFile) - - # Expect a particular number of columns, do a sanity check here - - if not {'individual', 'sampleName', 'Flowcell_ID', 'lane_ID', 'Technology', 'Library_ID'}.issubset(data.columns.values): - raise KeyError("The samples file must contain the following named columns (TAB separated!): individual, sampleName, Flowcell_ID, lane_ID, Technology, Library_ID") - - # Make sure the individual column is a string - data['individual'] = data['individual'].astype(str) - - - return data - - -def constructRGFields(samplesData): - """text""" - readGroupFields = {} - - for rowCur in range(0, len(samplesData.index)): - - individual = samplesData.ix[rowCur,"individual"] - field_PL = samplesData.ix[rowCur,"Technology"] - flowcell = samplesData.ix[rowCur,"Flowcell_ID"] - lane = samplesData.ix[rowCur,"lane_ID"] - sample = samplesData.ix[rowCur,"sampleName"] - library = samplesData.ix[rowCur,"Library_ID"] - - field_ID = str(flowcell) + "." + str(lane) - field_SM = str(individual) - field_PU = str(flowcell) + "." + str(lane) + "." + str(sample) - field_LB = str(library) - - readGroupFields[sample] = {"ID": field_ID, - "LB": field_LB, - "PL": field_PL, - "PU": field_PU, - "SM": field_SM} - - - return readGroupFields - - - -############################# -# DIRECTORIES AND VARIABLES # -############################# - -# Maximum number of cores per rule. This value will never be achieved because the minimum of this value and the --cores parameter will define the -# number of CPUs per rule in the end. 
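constructRGFields above builds the read-group dictionary with pandas' long-deprecated .ix indexer. A minimal sketch of the same logic using .iterrows() instead (construct_rg_fields is a hypothetical name, not part of this pipeline; it assumes the columns checked in read_samplesTable):

import pandas as pd

def construct_rg_fields(samples: pd.DataFrame) -> dict:
    # Picard-style read-group fields per sample, mirroring constructRGFields.
    rg = {}
    for _, row in samples.iterrows():
        flowcell, lane, sample = row["Flowcell_ID"], row["lane_ID"], row["sampleName"]
        rg[sample] = {
            "ID": "{}.{}".format(flowcell, lane),
            "LB": str(row["Library_ID"]),
            "PL": str(row["Technology"]),
            "PU": "{}.{}.{}".format(flowcell, lane, sample),
            "SM": str(row["individual"]),
        }
    return rg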
-threadsMax = 12 - -missingParameters = [] - - - -configDict = { - "par_general": - ["outdir", "scriptsDir"], - "samples": - ["summaryFile", "pairedEnd", "fileEnding"], - "additionalInputFiles": - ["trimmomatic_adapters", "blacklistRegions", "refGenome_fasta", "refGenome_2bit", "annotationGTF"], - "executables": - ["GATK_jar", "java", "PICARD_jar"], - "par_trimming": - ["trimmomatic_ILLUMINACLIP", "trimmomatic_trailing", "trimmomatic_minlen", "trimmomatic_phredType"], - "par_align": - ["bowtie2_sensitivity", "bowtie2_maxFragmentLength", "bowtie2_refGenome", "assemblyVersion"], - "par_postalign": - ["minMAPQscore", "ValidationStringencySortSam", "ValidationStringencyMarkDuplicates", "CIGAR", "adjustRSS_forward", "adjustRSS_reverse"], - "par_scripts": - ["STATS_script_withinThr", "STATS_script_outsideThr", "STATS_script_geneTypesToKeep", "FL_distr_script_cutoff"], - "par_peakCalling": - ["modelNonStringent", "modelStringent", "modelStringent_minQValue", "modelNonStringent_minQValue", "modelNonStringent_slocal", "Encode_pValThreshold", "Encode_modelBroadAndGapped", "Encode_modelNarrow"], - "par_deepTools": - ["effectiveGenomeSize", "bamCoverage_normalizationCoverage", "bamCoverage_binSize", "bamCoverage_otherOptions"] - } - -for sectionCur in configDict: - if not sectionCur in config: - raise KeyError("Could not find section \"" + sectionCur + "\" in the config file or no config file has been specified.") - - requiredPar = configDict[sectionCur] - - for parCur in requiredPar: - if not parCur in config[sectionCur]: - missingParameters.append(parCur) - - if len(missingParameters) > 0: - missingParStr = ",".join(missingParameters) - raise KeyError("Could not find parameter(s) \"" + missingParStr + "\" in section \"" + sectionCur + "\" in the config file.") - - - - -samplesSummaryFile = config["samples"]["summaryFile"] -pairedEnd = config["samples"]["pairedEnd"] -refGen = config["additionalInputFiles"]["refGenome_fasta"] -script_FL_distr = config["executables"]["FL_distr_script"] -script_STATS = config["executables"]["STATS_script"] -script_PCA = "PCA.R" # TODO -java_exec = config["executables"]["java"] -par_minMAPQscore = config["par_postalign"]["minMAPQscore"] - -######## -# JAVA # -######## -minMemoryJavaGB = 5 -maxMemoryJavaGB = 50 - -picard_command = java_exec + " -Xms" + str(minMemoryJavaGB) + "g -Xmx" + str(maxMemoryJavaGB) + "g -jar " + config["executables"]["PICARD_jar"] - - -if not pairedEnd: - print ("Error: SE reads not supported yet by pipeline.\n") - sys.exit(1) - -INPUT_ORIG_DIR = os.path.dirname(samplesSummaryFile) - -ROOT_dir = config["par_general"]["outdir"] -INPUT_DIR = ROOT_dir + "/0.Input" -FASTQC_BT_dir = ROOT_dir + "/1.FastQC_beforeTrimming" -TRIM_dir = ROOT_dir + "/2.Trimming" -FASTQC_AT_dir = ROOT_dir + "/3.FastQC_afterTrimming" -ALIGN_dir = ROOT_dir + "/4.Alignment" -POSTALIGN_dir = ROOT_dir + "/5.Postalignment" -CLEAN_dir = POSTALIGN_dir + "/1.Clean" -BASERECAL_dir = POSTALIGN_dir + "/2.BaseRecalibration" -CHRM_dir = POSTALIGN_dir + "/3.Filter_chrM" -RMDUP_DIR = POSTALIGN_dir + "/4.MarkAndRemove_Duplicates" -MAPQsort_dir = POSTALIGN_dir + "/5.Filter_MAPQ" -ADJRSS_dir = POSTALIGN_dir + "/6.Adjust_RSS" -#RMINDEL_dir = POSTALIGN_dir + "/7.Filter_INDELs" -PEAKCALLING_dir = ROOT_dir + "/6.PeakCalling" -DOWNSTREAM_dir = ROOT_dir + "/7.DownstreamAnalyses" -FINAL_OUTPUT_dir = ROOT_dir + "/8.FinalOutput" -REPORTS_dir = ROOT_dir + "/Reports_and_Stats" -REPORTS_dir_baseQual= REPORTS_dir + "/baseQuality" -REPORTS_dir_corr = REPORTS_dir + "/Correlations" -REPORTS_dir_cov = 
REPORTS_dir + "/Coverage" -REPORTS_dir_PCA = REPORTS_dir + "/PCA" -REPORTS_dir_gcbias = REPORTS_dir + "/GCBias" -REPORTS_dir_summary= REPORTS_dir + "/sampleSummary" -REPORTS_dir_multiqc= REPORTS_dir + "/multiqc" -LOG_BENCHMARK_dir = ROOT_dir + "/LOGS_AND_BENCHMARKS" - -PEAKCALLING_STRINGENT_dir = PEAKCALLING_dir + "/stringent" -PEAKCALLING_NONSTRINGENT_dir = PEAKCALLING_dir + "/nonStringent" -PEAKCALLING_ENCODE_dir = PEAKCALLING_dir + "/Encode" - - -# scripts -# Not to be confused with the dir_scripts directory, which is only used to load the correct functions.R file in all R scripts -dir_scripts = "R/" - - -global samplesData -samplesData = read_samplesTable(config["samples"]["summaryFile"]) - -# Make it accessible also within functions -global RGFields -RGFields = constructRGFields(samplesData) - -#print (RGFields) -#print (RGFields["test1_rep1"]["LB"]) -#sys.exit(1) - -# Get all unique sample names -allIndividualsUnique = numpy.unique(samplesData.loc[:,"individual"]) -allIndividualsUniqueStrSpaces = ' '.join(allIndividualsUnique) -allSamplesUnique = numpy.unique(samplesData.loc[:,"sampleName"]) -# TODO: allSamplesPath = numpy.unique(samplesData.loc[:,"path"]) -# Get only two sampels for nicer graphs -#allIndividualsUnique = numpy.unique(samplesData.loc[:,"individual"])[0:2] -#allSamplesUnique = numpy.unique(samplesData.loc[:,"sampleName"])[0:2] -allSamplesUniqueStr = ','.join(allSamplesUnique) -allSamplesUniqueStrSpaces = ' '.join(allSamplesUnique) -# -# print(samplesData.loc[:,"sampleName"]) -# print(samplesData.loc[:,"individual"]) -# print (allSamplesUnique) -# print (allIndividualsUnique) -# print(numpy.concatenate([allIndividualsUnique + ".merged", allSamplesUnique])) -# sys.exit(0) - -# Execuables - - -file_knownSNPs = "" -file_knownINDELS = "" - -if config["par_align"]["assemblyVersion"] in ('hg19', 'hg38'): - gatk_command = java_exec + " -Xms" + str(minMemoryJavaGB) + "g -Xmx" + str(maxMemoryJavaGB) + "g -jar " + config["executables"]["GATK_jar"] - - file_knownSNPs = config["additionalInputFiles"]["knownSNPs"] - file_knownINDELS = config["additionalInputFiles"]["knownIndels"] - if not os.path.isfile(config["executables"]["GATK_jar"]): - raise IOError("File " + config["executables"]["GATK_jar"] + " not found.") - if not os.path.isfile(file_knownSNPs): - raise IOError("File " + file_knownSNPs + " not found.") - if not os.path.isfile(file_knownINDELS): - raise IOError("File " + file_knownINDELS + " not found.") - - dictFile = os.path.splitext(refGen)[0] + ".dict" - - faiFile = refGen + ".fai" - if not (os.path.isfile(faiFile)) or not (os.path.isfile(dictFile)): - raise IOError("Either index file *.fai or *.dict for " + refGen + " not found. See https://software.broadinstitute.org/gatk/guide/article?id=1601") - -filesToCheck = [config["additionalInputFiles"]["trimmomatic_adapters"], - config["executables"]["FL_distr_script"], - config["executables"]["STATS_script"], - config["executables"]["PICARD_jar"], - config["additionalInputFiles"]["blacklistRegions"], - config["additionalInputFiles"]["annotationGTF"], - config["additionalInputFiles"]["refGenome_fasta"], - config["additionalInputFiles"]["refGenome_2bit"] - ] - -for fileCur in filesToCheck: - if not os.path.isfile(fileCur): - raise IOError("File " + fileCur + " not found.") - -if not config["samples"]["fileEnding"].endswith(".gz"): - raise AssertionError("The fastq files must be gzipped and end in \".gz\". 
Please prepare the input files accordingly.") - -# TODO: Ultimately support both compressed and uncompressed data - -#### Parameter for bamCoverage - -bamCoverage_normOption = config["par_deepTools"]["bamCoverage_normalizationCoverage"] -if not (bamCoverage_normOption == "normalizeTo1x") and not (bamCoverage_normOption == "normalizeUsingRPKM") and not (bamCoverage_normOption == "ignoreForNormalization"): - raise AssertionError("The config parameter config[\"par_deepTools\"][\"bamCoverage_normalizationCoverage\"] has to be one of: normalizeTo1x, normalizeUsingRPKM, ignoreForNormalization") - -if bamCoverage_normOption == "normalizeTo1x": - bamCoverage_normOption = bamCoverage_normOption + " " + str(config["par_deepTools"]["effectiveGenomeSize"]) - - -########################################################################### -# Get the versions of the used tools and script to record them rigorously # -########################################################################### - -# Almost obselete due to the conda environments. Only record versions for scripts etc - -# For custom scripts, retrieve the modification date instead -VERSION_FL_distr_script = str(os.path.getmtime(config["executables"]["FL_distr_script"])).replace('\n', ' ') -VERSION_STATS_script = str(os.path.getmtime(config["executables"]["STATS_script"])).replace('\n', ' ') - - -######### -# RULES # -######### - -# For cluster usage: The keyword localrules allows to mark a rule as local, so that it is not submitted to the cluster and instead executed on the host node -localrules: all,link_inputFiles - -################### -################### -## FINAL TARGETS ## -################### -################### - - -allSamplesAndIndividuals = numpy.concatenate([allIndividualsUnique + ".merged", allSamplesUnique + ".final"]) - -rule all: - input: - #idr = expand('{dir}/allSamples.final.{stringency}.peaks.IDR.bed.png', dir = PEAKCALLING_dir, stringency = ("stringent", "nonStringent")), - # First, the individual sample peak files - individualPeaksEncode = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed.gz', dir = PEAKCALLING_ENCODE_dir, sample = allSamplesUnique, - GCBias = ["",".noGCBias"], - analysisType = ["Encode"], - peakType = ["broad", "gapped", "narrow"] - ), - individualPeaksStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed.gz', dir = PEAKCALLING_STRINGENT_dir, sample = allSamplesUnique, - GCBias = ["",".noGCBias"], - analysisType = ["stringent"], - peakType = ["narrow"] - ), - individualPeaksNonStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed.gz', dir = PEAKCALLING_NONSTRINGENT_dir, sample = allSamplesUnique, - GCBias = ["",".noGCBias"], - analysisType = ["nonStringent"], - peakType = ["narrow"] - ), - pooledPeaksEncode = expand('{dir}/{indiv}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered2.bed.gz', dir = PEAKCALLING_ENCODE_dir, indiv = allIndividualsUnique, - GCBias = ["",".noGCBias"], - analysisType = ["Encode"], - peaktype2 = [".pooled", ".replicate"], - peakType = ["broad", "gapped", "narrow"] - ), - pooledPeaksStringent = expand('{dir}/{indiv}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered2.bed.gz', dir = PEAKCALLING_STRINGENT_dir, indiv = allIndividualsUnique, - GCBias = ["",".noGCBias"], - analysisType = ["stringent"], - peaktype2 = [".pooled", ".replicate"], - peakType = ["narrow"] - ), - pooledPeaksNonStringent = 
expand('{dir}/{indiv}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered2.bed.gz', dir = PEAKCALLING_NONSTRINGENT_dir, indiv = allIndividualsUnique, - GCBias = ["",".noGCBias"], - analysisType = ["nonStringent"], - peaktype2 = [".pooled", ".replicate"], - peakType = ["narrow"] - ), - stats = expand('{dir}/multiqc_report.html', dir = REPORTS_dir_multiqc) - message: "{ruleDisplayMessage}One Pipeline to rule them \"all\"..." - - -################## -################## -## PREPARE DATA ## -################## -################## - - -def getSamplePathForSampleName(sampleName): - """text""" - sampleBasenames = numpy.asarray(samplesData.loc[samplesData["sampleName"] == sampleName, "path"]) - return sampleBasenames - -# TODO: SPlit the ending. Make it configurable - -rule link_inputFiles: - input: - forward = expand('{dir}/{{sample}}_1{fileEnding}', dir = INPUT_ORIG_DIR, fileEnding = config["samples"]["fileEnding"]), - reverse = expand('{dir}/{{sample}}_2{fileEnding}', dir = INPUT_ORIG_DIR, fileEnding = config["samples"]["fileEnding"]) - output: - forward = expand('{dir}/data/{{sample}}_1.fastq.gz', dir = INPUT_DIR), - reverse = expand('{dir}/data/{{sample}}_2.fastq.gz', dir = INPUT_DIR) - log: - message: "{ruleDisplayMessage}Create symbolic links for the input files {input:q} in directory {INPUT_DIR:q}..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/link_inputFiles.{sample}.benchmark" - resources: - version:"NA" - params: sampleCSV = config["samples"]["summaryFile"] - shell: - """ - ln -fs {input.forward:q} {output.forward:q} && - ln -fs {input.reverse:q} {output.reverse:q} && - touch -h {output.forward:q} && - touch -h {output.reverse:q} && - cp {params.sampleCSV} {INPUT_DIR} - """ - - # Workaround for the warning "Unable to set utime on symlink {}. Your Python build does not support it" warning related to symbolic links - # https://bitbucket.org/snakemake/snakemake/issues/397/unable-to-set-utime-on-symlink-your-python - - -rule Picard_CreateSequenceDictionary: - input: - #environment = rules.prepareEnvironments_JavaMemoryPicard.output, - fasta = config["additionalInputFiles"]["refGenome_fasta"] - output: - dict = CLEAN_dir + '/ref.dict.fasta' - log: expand('{dir}/Picard_CreateSequenceDictionary.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Creates a sequence dictionary for a reference sequence for input {input.fasta}..." - threads: 1 - priority: 1 - resources:maxMemGB=20 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_CreateSequenceDictionary.benchmark" - params: - shell: - """ - {picard_command} CreateSequenceDictionary \ - R={input.fasta:q} \ - O={output.dict:q} \ - 2> {log:q} - """ - - -################################# -################################# -## FASTQC, TRIMMING, ALIGNMENT ## -################################# -################################# - -rule fastqc_BT: - input: - forward = rules.link_inputFiles.output.forward, - reverse = rules.link_inputFiles.output.reverse - output: - forward = expand('{dir}/{{sample}}_1_fastqc.zip', dir = FASTQC_BT_dir), - reverse = expand('{dir}/{{sample}}_2_fastqc.zip', dir = FASTQC_BT_dir) - log: expand('{dir}/fastqc_BT.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Perform FASTQC on the samples {input:q} before trimming..." 
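The target lists in rule all above lean heavily on Snakemake's expand(), which by default generates the Cartesian product of all keyword values. A small standalone illustration (sample names borrowed from the commented-out debugging lines; the exact list order may differ between Snakemake versions):

from snakemake.io import expand

print(expand('{dir}/{sample}_1{ext}',
             dir='0.Input/data',
             sample=['test1_rep1', 'test1_rep2'],
             ext='.fastq.gz'))
# ['0.Input/data/test1_rep1_1.fastq.gz', '0.Input/data/test1_rep2_1.fastq.gz']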
- threads: threadsMax - priority: 1 - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/fastqc.yaml" - benchmark: LOG_BENCHMARK_dir + "/fastqc_BT.{sample}.benchmark" - shell: - """ - fastqc \ - -o {FASTQC_BT_dir:q} \ - -t {threads} \ - --extract \ - {input:q} \ - 2> {log:q} - """ - -rule trimming_PE: - input: - forward = rules.link_inputFiles.output.forward, - reverse = rules.link_inputFiles.output.reverse, - report = rules.fastqc_BT.output # Not really needed, but force execution here - output: - forward_paired = expand('{dir}/{{sample}}_1.trimmed.fq.gz', dir = TRIM_dir), - forward_unpaired = expand('{dir}/{{sample}}_1.unpaired.fq.gz', dir = TRIM_dir), - reverse_paired = expand('{dir}/{{sample}}_2.trimmed.fq.gz', dir = TRIM_dir), - reverse_unpaired = expand('{dir}/{{sample}}_2.unpaired.fq.gz', dir = TRIM_dir) - log: output = expand('{dir}/trimming_PE_TrimmomaticOutput.{{sample}}.log', dir = LOG_BENCHMARK_dir), - trimlog = expand('{dir}/trimming_PE_trimlog.{{sample}}.log' , dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Trimming of adapters with TRIMMOMATIC in the PE mode for files {input:q} using adapters file {params.adapters:q} ..." - threads: threadsMax - priority: 1 - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/trimmomatic.yaml" - benchmark: LOG_BENCHMARK_dir + "/trimming_PE.{sample}.benchmark" - params: - ILLUMINACLIP = config["par_trimming"]["trimmomatic_ILLUMINACLIP"], - trailing = config["par_trimming"]["trimmomatic_trailing"], - minlen = config["par_trimming"]["trimmomatic_minlen"], - adapters = config["additionalInputFiles"]["trimmomatic_adapters"] - shell: - """ - trimmomatic PE \ - -threads {threads} \ - {input.forward:q} {input.reverse:q} \ - {output.forward_paired:q} {output.forward_unpaired:q} {output.reverse_paired:q} {output.reverse_unpaired:q} \ - ILLUMINACLIP:{params.adapters}:{params.ILLUMINACLIP} \ - TRAILING:{params.trailing} \ - MINLEN:{params.minlen} \ - 2>{log.output:q} - """ - - # removed: -trimlog {log.trimlog:q} \ - -rule fastqc_AT: - input: - forward = rules.trimming_PE.output.forward_paired, - reverse = rules.trimming_PE.output.reverse_paired - output: - forward = expand('{dir}/{{sample}}_1.trimmed_fastqc.zip', dir = FASTQC_AT_dir), - reverse = expand('{dir}/{{sample}}_2.trimmed_fastqc.zip', dir = FASTQC_AT_dir) - log: expand('{dir}/fastqc_AT.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Perform FASTQC on the samples {input:q} after trimming..." - threads: threadsMax - priority: 1 - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/fastqc.yaml" - benchmark: LOG_BENCHMARK_dir + "/fastqc_AT.{sample}.benchmark" - shell: - """fastqc \ - -o {FASTQC_AT_dir:q} \ - -t {threads} \ - --extract \ - {input:q} \ - 2> {log:q} - """ - -rule Bowtie2_alignment: - input: - file1 = rules.trimming_PE.output.forward_paired, - file2 = rules.trimming_PE.output.reverse_paired, - report = rules.fastqc_AT.output - output: - temp(expand('{dir}/{{sample}}.bt2.sam', dir = ALIGN_dir)) - threads: threadsMax - priority: 1 - log: expand('{dir}/Bowtie2_alignment.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Do Bowtie2 alignment for files {input:q}. This may take a while..." 
- benchmark: LOG_BENCHMARK_dir + "/alignment.{sample}.benchmark" - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bowtie2.yaml" - params: - sensitivity = config["par_align"]["bowtie2_sensitivity"], - refGenome = config["par_align"]["bowtie2_refGenome"], - maxFragmentLength = config["par_align"]["bowtie2_maxFragmentLength"] - shell: - """bowtie2 \ - -p {threads} \ - -X {params.maxFragmentLength} \ - {params.sensitivity} \ - -t \ - -x {params.refGenome} \ - -1 {input.file1:q} -2 {input.file2:q} \ - -S {output:q} \ - 2> {log:q} - """ - - - -##################################### -##################################### -## CLEANING AND BASE RECALIBRATION ## -##################################### -##################################### - -rule samtools_SAM_TO_BAM: - input: - #environment = rules.prepareEnvironments_JavaMemoryPicard.output, - sam = rules.Bowtie2_alignment.output - output: - unsortedBam = temp(expand('{dir}/{{sample}}.bam' , dir = ALIGN_dir)), - sortedBam = expand('{dir}/{{sample}}.s.bam' , dir = ALIGN_dir), - index = expand('{dir}/{{sample}}.s.bam.bai', dir = ALIGN_dir) - threads: threadsMax - priority: 1 - log: - message: "{ruleDisplayMessage}Conversion to BAM, sort, index for file {input.sam:q} ..." - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/samtools_SAM_TO_BAM.{sample}.benchmark" - shell: - """ - samtools view -b -S -o {output.unsortedBam:q} {input.sam:q} && - samtools sort -o {output.sortedBam:q} --threads {threads} {output.unsortedBam:q} && - samtools index {output.sortedBam:q} - """ - - - -basenameSuffix = ".cleaned" - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_cleanSAM_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + ".s" - -rule Picard_cleanSAM: - input: - bam = rules.samtools_SAM_TO_BAM.output.sortedBam - output: - bam = temp(Picard_cleanSAM_outputName + ".bam"), - index = temp(Picard_cleanSAM_outputName + ".bai") - log: expand('{dir}/Picard_cleanSAM.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}CleanSam: Clean the provided SAM/BAM, soft-clipping beyond-end-of-reference alignments and setting MAPQ to 0 for unmapped reads for {input.bam}..." - threads: 1 - priority: 1 - resources:maxMemGB=20 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_cleanSAM.{sample}.benchmark" - shell: - """ - {picard_command} CleanSam \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - 2> {log:q} - """ - - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_FixMateInformation_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + "2" + ".s" - -rule Picard_FixMateInformation: - input: - bam = rules.Picard_cleanSAM.output.bam, - bai = rules.Picard_cleanSAM.output.index - output: - bam = temp(Picard_FixMateInformation_outputName + '.bam'), - index = temp(Picard_FixMateInformation_outputName + '.bai'), - stats = Picard_FixMateInformation_outputName + '.bam.stats' - log: expand('{dir}/Picard_FixMateInformation.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Verify mate-pair information between mates and fix if needed for input {input.bam}..." 
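samtools_SAM_TO_BAM above writes an unsorted temporary BAM before sorting. Recent samtools releases auto-detect SAM input, so the intermediate file can in principle be dropped; a hedged sketch of such a rule, under that assumption and not part of the original pipeline:

rule sam_to_sorted_bam:
    input:
        sam = ALIGN_dir + '/{sample}.bt2.sam'
    output:
        bam = ALIGN_dir + '/{sample}.s.bam',
        index = ALIGN_dir + '/{sample}.s.bam.bai'
    threads: threadsMax
    shell:
        """
        samtools sort -o {output.bam:q} --threads {threads} {input.sam:q} &&
        samtools index {output.bam:q}
        """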
- threads: 1 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_FixMateInformation.{sample}.benchmark" - shell: - """ - {picard_command} FixMateInformation \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - 2> {log:q} && - samtools flagstat {output.bam:q} > {output.stats:q} - """ - - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_AddOrReplaceReadGroups_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + "3" + ".s" - -rule Picard_AddOrReplaceReadGroups: - input: - bam = rules.Picard_FixMateInformation.output.bam, - bai = rules.Picard_FixMateInformation.output.index - output: - bam = temp(Picard_AddOrReplaceReadGroups_outputName + '.bam'), - index = temp(Picard_AddOrReplaceReadGroups_outputName + '.bai') - log: expand('{dir}/Picard_AddOrReplaceReadGroups.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Replace read groups in a BAM file for input {input.bam}..." - threads: 1 - priority: 1 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_AddOrReplaceReadGroups.{sample}.benchmark" - params: - RGID = lambda wildcards: RGFields[wildcards.sample]["ID"], # Read Group ID - RGLB = lambda wildcards: RGFields[wildcards.sample]["LB"], # Read Group library - RGPL = lambda wildcards: RGFields[wildcards.sample]["PL"], # Read Group platform - RGPU = lambda wildcards: RGFields[wildcards.sample]["PU"], # Read Group platform unit (eg. run barcode) - RGSM = lambda wildcards: RGFields[wildcards.sample]["SM"] # Read Group sample name - shell: - """ - {picard_command} AddOrReplaceReadGroups \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - RGID={params.RGID} \ - RGLB={params.RGLB} \ - RGPL={params.RGPL} \ - RGPU={params.RGPU} \ - RGSM={params.RGSM} \ - 2> {log:q} - """ - - - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_ReorderSam_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + "4" + ".s" - -rule Picard_ReorderSam: - input: - bam = rules.Picard_AddOrReplaceReadGroups.output.bam, - index = rules.Picard_AddOrReplaceReadGroups.output.index, - ref = config["additionalInputFiles"]["refGenome_fasta"] - output: - bam = temp(Picard_ReorderSam_outputName + '.bam'), - index = temp(Picard_ReorderSam_outputName + '.bai') - log: expand('{dir}/Picard_ReorderSam.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Reorders reads in a SAM/BAM file to match the contig ordering in a provided reference file for input {input.bam}..." 
- threads: 1 - priority: 1 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_ReorderSam.{sample}.benchmark" - params: - shell: - """ - {picard_command} ReorderSam \ - I={input.bam:q} \ - REFERENCE={input.ref:q} \ - O={output.bam:q} \ - ALLOW_INCOMPLETE_DICT_CONCORDANCE=TRUE \ - CREATE_INDEX=true \ - 2> {log:q} - """ - - -rule GATK_baseRecalibration1: - input: - #environment = rules.prepareEnvironments_JavaMemoryGATK.output, - bam = rules.Picard_ReorderSam.output.bam, - index = rules.Picard_ReorderSam.output.index - output: - recalibrationTable1 = expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.recalTable1', dir = BASERECAL_dir) - log: expand('{dir}/GATK_baseRecalibration1.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}BaseRecalibrator(1): Detect systematic errors in base quality scores for file {input.bam:q}..." - threads: threadsMax - priority: 1 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml" - params: - knownSNPs = file_knownSNPs, - knownIndels = file_knownINDELS, - fasta = config["additionalInputFiles"]["refGenome_fasta"] - benchmark: LOG_BENCHMARK_dir + "/GATK_baseRecalibration1.{sample}.benchmark" - shell: - """ - {gatk_command} -T BaseRecalibrator\ - -R {params.fasta:q} \ - -knownSites {params.knownSNPs:q} \ - -knownSites {params.knownIndels:q} \ - --lowMemoryMode \ - -I {input.bam:q} \ - --out {output.recalibrationTable1:q} \ - --log_to_file {log:q} \ - --num_cpu_threads_per_data_thread {threads} - """ - -rule GATK_printReadsBQSR: - input: - #environment = rules.prepareEnvironments_JavaMemoryGATK.output, - bam = rules.Picard_ReorderSam.output.bam, - index = rules.Picard_ReorderSam.output.index, - recalibrationTable = rules.GATK_baseRecalibration1.output.recalibrationTable1 - output: - bam = temp(expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam', dir = BASERECAL_dir)), - bai = temp(expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.bai', dir = BASERECAL_dir)) - log: expand('{dir}/GATK_printReadsBQSR.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}PrintReads: Recalibrate {input.bam:q} using recalibration table {input.recalibrationTable:q} ..." - threads: threadsMax - priority: 1 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml" - benchmark: LOG_BENCHMARK_dir + "/GATK_printReadsBQSR.{sample}.benchmark" - params: - fasta = config["additionalInputFiles"]["refGenome_fasta"] - shell: - """ - {gatk_command} -T PrintReads \ - -R {params.fasta:q} \ - -I {input.bam:q} \ - -BQSR {input.recalibrationTable:q} \ - --out {output.bam:q} \ - --log_to_file {log:q} \ - --num_cpu_threads_per_data_thread {threads} - """ - -rule GATK_baseRecalibration2: - input: - bam = rules.Picard_ReorderSam.output.bam, - index = rules.Picard_ReorderSam.output.index, - recalibrationTable1 = rules.GATK_baseRecalibration1.output.recalibrationTable1 - output: - recalibrationTable2 = expand('{dir}/{{sample}}.BQrecal.s.bam.recalTable2', dir = BASERECAL_dir) - log: expand('{dir}/GATK_baseRecalibration2.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}BaseRecalibrator(2): Detect systematic errors in base quality scores for file {input.bam:q} and first recalibration table..." 
- threads: threadsMax - priority: 1 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml" - benchmark: LOG_BENCHMARK_dir + "/GATK_baseRecalibration2.{sample}.benchmark" - params: - knownSNPs = file_knownSNPs, - knownIndels = file_knownINDELS, - fasta = config["additionalInputFiles"]["refGenome_fasta"] - shell: - """ - {gatk_command} -T BaseRecalibrator \ - -R {params.fasta:q} \ - -knownSites {params.knownSNPs:q} \ - -knownSites {params.knownIndels:q} \ - -I {input.bam:q} \ - -BQSR {input.recalibrationTable1:q} \ - --out {output.recalibrationTable2:q} \ - --log_to_file {log:q} \ - --num_cpu_threads_per_data_thread {threads} - """ - -# Use -L argument with BaseRecalibrator to restrict recalibration to capture targets on WEx -# - BQSR depends on key assumption: every mismatch is an error, except sites in known variants -# - Off-Âtarget sequence likely to have higher error rates with different error modes -# - If off-target sequence is included in recalibration, may skew the model and mess up results - - - -rule GATK_analyzeCovariates: - input: - recalibrationTable1 = rules.GATK_baseRecalibration1.output.recalibrationTable1, - recalibrationTable2 = rules.GATK_baseRecalibration2.output.recalibrationTable2 - output: - plots = expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.plots.pdf', dir = REPORTS_dir_baseQual), - csv = expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.plots.csv', dir = REPORTS_dir_baseQual) - log: - expand('{dir}/GATK_analyzeCovariates.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: - "{ruleDisplayMessage}AnalyzeCovariates: Create plots to visualize base recalibration results for {input}..." - threads: 1 - priority: 1 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml" - benchmark: - LOG_BENCHMARK_dir + "/GATK_analyzeCovariates.{sample}.benchmark" - params: - fasta = config["additionalInputFiles"]["refGenome_fasta"] - shell: - """ - {gatk_command} -T AnalyzeCovariates \ - -R {params.fasta:q} \ - -before {input.recalibrationTable1:q} \ - -after {input.recalibrationTable2:q} \ - -plots {output.plots:q} \ - -csv {output.csv:q} \ - --log_to_file {log:q} - """ - -############### -############### -## FILTERING ## -############### -############### - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -postalign_remove_chrMAndUnassembledChr_outputName = CHRM_dir + '/{sample}.cleaned4.BQrecal.rmChrM.s.bam' - -# Define if GATK should be run or not -def postalign_inputs(assemblyVersion): - if assemblyVersion in ('hg19', 'hg38'): - return rules.GATK_printReadsBQSR.output.bam - else: - return rules.Picard_ReorderSam.output.bam - - -rule postalign_remove_chrMAndUnassembledChr: - input: - bam = postalign_inputs(config["par_align"]["assemblyVersion"]) - output: - #index1 = temp(str(rules.GATK_printReadsBQSR.output.bam) + ".bai"), - bam = temp(postalign_remove_chrMAndUnassembledChr_outputName), - index2 = temp(postalign_remove_chrMAndUnassembledChr_outputName + ".bai"), - stats = postalign_remove_chrMAndUnassembledChr_outputName + ".stats" - log: - message: "{ruleDisplayMessage}Remove mitochondrial reads for file {input.bam:q} ..." 
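The rule continues below with an idxstats | grep filter that keeps only assembled chr* contigs. A minimal pysam cross-check of which reference names survive that pattern (kept_contigs is a hypothetical helper, assuming pysam is available):

import re
import pysam

def kept_contigs(bam_path):
    # Mirror the grep -Pv "chrM|chrUn|random|hap" filter applied to idxstats output.
    drop = re.compile(r"chrM|chrUn|random|hap")
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        return [c for c in bam.references if c.startswith("chr") and not drop.search(c)]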
- threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/postalign_remove_chrMAndUnassembledChr.{sample}.benchmark" - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - shell: - """ samtools index {input.bam:q} && - samtools idxstats {input.bam:q} | cut -f 1 | grep chr | grep -Pv "chrM|chrUn|random|hap" | xargs samtools view -b {input.bam:q} >{output.bam:q} && - samtools index {output.bam:q} && - samtools flagstat {output.bam:q} > {output.stats:q} - """ - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -markDuplicates_Picardtools_outputName = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s' - - -rule markDuplicates_Picardtools: - input: - bam = rules.postalign_remove_chrMAndUnassembledChr.output.bam, - index = rules.postalign_remove_chrMAndUnassembledChr.output.index2 - output: - bam = temp(markDuplicates_Picardtools_outputName + ".bam"), - index = temp(markDuplicates_Picardtools_outputName + ".bai") - log: - log = expand('{dir}/markDuplicates_Picardtools.{{sample}}.log' , dir = LOG_BENCHMARK_dir), - metricsFile = expand('{dir}/markDuplicates_Picardtools.{{sample}}_metrics.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Mark (not remove!) duplicate reads for file {input:q} with Picard tools..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/markDuplicates_Picardtools.{sample}.benchmark" - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - ValidationStringency = config["par_postalign"]["ValidationStringencyMarkDuplicates"], - removeDuplicates = "false", - assumeSorted = "true" - shell: - """{picard_command} MarkDuplicates \ - INPUT={input.bam:q} \ - OUTPUT={output.bam:q} \ - ASSUME_SORTED={params.assumeSorted} \ - METRICS_FILE={log.metricsFile:q} \ - VALIDATION_STRINGENCY={params.ValidationStringency} \ - REMOVE_DUPLICATES={params.removeDuplicates} \ - CREATE_INDEX=TRUE \ - 2> {log.log:q}""" - - - -rule computeLibraryComplexity: - input: - bam = rules.markDuplicates_Picardtools.output.bam - output: - stats = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl' - log: - message:"{ruleDisplayMessage}Compute library complexity for file {input.bam:q} ..." 
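The shell body that follows derives the ENCODE library-complexity metrics (NRF, PBC1, PBC2) with a bedtools/awk one-liner. The same arithmetic in plain Python, as a sketch (library_complexity is hypothetical; counts maps each distinct position/strand combination to its occurrence count):

def library_complexity(counts):
    mt = sum(counts.values())                        # total read pairs
    m0 = len(counts)                                 # distinct read pairs
    m1 = sum(1 for c in counts.values() if c == 1)   # seen exactly once
    m2 = sum(1 for c in counts.values() if c == 2)   # seen exactly twice
    return {
        "NRF":  m0 / mt,                             # non-redundant fraction
        "PBC1": m1 / m0,
        "PBC2": m1 / m2 if m2 > 0 else -1.0,         # -1 sentinel, as in the awk
    }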
- threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/computeLibraryComplexity.{sample}.benchmark" - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bedtools.yaml" - params: - shell: - # Taken from https://github.com/kundajelab/atac_dnase_pipelines/blob/72ed6ba2502cca074c51740b612cbc6ebea07b08/modules/postalign_bam.bds - # Implementing the ENCODE ATAC-Seq library complexity guidelines - # PBC File output - # TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair - """ - bedtools bamtobed -i {input.bam:q} | \ - awk 'BEGIN{{OFS="\\t"}} {{print $1,$2,$3,$6}}' | \ - grep -v 'chrM' | sort | uniq -c | \ - awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; if(m2>0) m1_m2=m1/m2; \ - printf "readsTotal\\treadsDistinct\\treadsOccOne\\treadsOccTwo\\tNRF\\tPBC1\\tPBC2\\n%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",mt,m0,m1,m2,m0/mt,m1/m0,m1_m2}}' > {output.stats}""" - - - -rule removeDuplicates: - input: - bam = rules.markDuplicates_Picardtools.output.bam, - index = rules.markDuplicates_Picardtools.output.index, - stats = rules.computeLibraryComplexity.output.stats - output: - bam = temp(RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam'), - stats = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats' - log: - message: "{ruleDisplayMessage}Remove duplicate reads and QC-failing reads for file {input:q} with samtools..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/removeDuplicates.{sample}.benchmark" - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - removeReadsWithFlags = 1804, # read or mate unmapped, not primary alignment, read fails platform/vendor quality checks, read is PCR or optical duplicate - keepReadsWithFlags = 2 # read mapped in proper pair - shell: - """samtools view -F {params.removeReadsWithFlags} -f {params.keepReadsWithFlags} -b {input.bam:q} > {output.bam:q} && - samtools flagstat {output.bam:q} > {output.stats:q} - """ - # https://github.com/kundajelab/atac_dnase_pipelines/blob/72ed6ba2502cca074c51740b612cbc6ebea07b08/modules/postalign_bam.bds - - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -postalign_MAPQ_outputName = MAPQsort_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.s.bam' - -rule postalign_MAPQ: - input: - bam = rules.removeDuplicates.output.bam - output: - bam = temp(postalign_MAPQ_outputName), - index = temp(postalign_MAPQ_outputName + ".bai"), - stats = postalign_MAPQ_outputName + ".stats" - log: - message: "{ruleDisplayMessage}Remove reads with a MAPQ quality lower than {par_minMAPQscore} for file {input.bam:q} ..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/postalign_MAPQ.{sample}.benchmark" - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - minMAPQ = par_minMAPQscore - shell: - """ - samtools view -b -q {params.minMAPQ} -F 4 {input.bam:q} > {output.bam:q} && - samtools index {output.bam:q} && - samtools flagstat {output.bam:q} > {output.stats:q} - """ - - - -# Define the general output file name above the rule. 
Wildcards are resolved in the resulting string when evaluating the rule
-postalign_RSS_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.adjRSS.s.bam'
-
-
-rule postalign_RSS:
-    input:
-        bam = rules.postalign_MAPQ.output.bam,
-        index = rules.postalign_MAPQ.output.index
-    output:
-        bam = postalign_RSS_outputName,
-        stats = postalign_RSS_outputName + ".stats",
-        csv = postalign_RSS_outputName + ".csv.gz"
-    log:
-    message: "{ruleDisplayMessage}Adjust read start sites for file {input.bam:q} ..."
-    threads: 1
-    priority: 1
-    benchmark: LOG_BENCHMARK_dir + "/postalign_RSS.{sample}.benchmark"
-    resources:
-    conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
-    params:
-        adjustRSS_forward = config["par_postalign"]["adjustRSS_forward"],
-        adjustRSS_reverse = config["par_postalign"]["adjustRSS_reverse"]
-    shell:
-        """ cat <(samtools view -H {input.bam:q}) <(samtools view -F 16 {input.bam:q} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4=$4+{params.adjustRSS_forward}; print $0}}') <(samtools view -f 16 {input.bam:q} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4=$4+{params.adjustRSS_reverse}; print $0}}') | samtools view -S -b -o {output.bam:q} - &&
-        samtools flagstat {output.bam:q} > {output.stats:q} &&
-        samtools view {output.bam:q} | cut -f3,5,9 | gzip -f > {output.csv:q}
-        """
-
-
-Picard_sortFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.adjRSS.cleaned1.s'
-
-# Necessary to re-sort, as Picard seems to require a different sorting order.
-# VALIDATION_STRINGENCY=SILENT is important because any strict check might identify the out-of-reference alignments and abort otherwise.
-rule Picard_sortFinal:
-    input:
-        bam = rules.postalign_RSS.output.bam
-    output:
-        bam = temp(Picard_sortFinal_outputName + '.bam')
-    log: expand('{dir}/Picard_sortFinal.{{sample}}.log', dir = LOG_BENCHMARK_dir)
-    message: "{ruleDisplayMessage}SortSam for {input.bam:q} ..."
-    threads: 1
-    priority: 1
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
-    params:
-    shell:
-        """
-        {picard_command} SortSam \
-            I={input.bam:q} \
-            O={output.bam:q} \
-            SORT_ORDER=coordinate \
-            VALIDATION_STRINGENCY=SILENT \
-            2> {log:q}
-        """
-
-Picard_cleanSamFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.adjRSS.cleaned2.s'
-
-# Necessary to handle potential out-of-reference alignments due to the read start adjustment.
-# Out-of-reference bases will be soft-clipped, which conforms to the BAM specification.
-rule Picard_cleanSamFinal:
-    input:
-        bam = rules.Picard_sortFinal.output.bam
-    output:
-        bam = temp(Picard_cleanSamFinal_outputName + '.bam'),
-        index = temp(Picard_cleanSamFinal_outputName + '.bai')
-    log: expand('{dir}/Picard_cleanSamFinal.{{sample}}.log', dir = LOG_BENCHMARK_dir)
-    message: "{ruleDisplayMessage}CleanSam: Clean the provided SAM/BAM, soft-clipping beyond-end-of-reference alignments and setting MAPQ to 0 for unmapped reads for {input.bam:q} ..."
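On the read-start adjustment in postalign_RSS above: the awk calls shift the leftmost coordinate (SAM column 4) by adjustRSS_forward for forward-strand reads and by adjustRSS_reverse for reverse-strand reads; for ATAC-seq these offsets are conventionally +4/-5 to centre reads on the Tn5 insertion site. A hedged pysam sketch of the same shift (shift_read_starts and its default offsets are illustrative, not read from the config):

import pysam

def shift_read_starts(in_bam, out_bam, fwd=4, rev=-5):
    with pysam.AlignmentFile(in_bam, "rb") as src, \
         pysam.AlignmentFile(out_bam, "wb", template=src) as dst:
        for read in src:
            if not read.is_unmapped:
                offset = rev if read.is_reverse else fwd
                read.reference_start = max(0, read.reference_start + offset)
            dst.write(read)

As the comments around Picard_sortFinal and Picard_cleanSamFinal note, such shifts can push alignments beyond the reference, which is why CleanSam runs afterwards.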
- threads: 1 - priority: 1 - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - shell: - """ - {picard_command} CleanSam \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - 2> {log:q} - """ - - -# Necessary because adjusting the RSS makes the start position column of the next mate invalid -rule Picard_FixMateInformationFinal: - input: - bam = rules.Picard_cleanSamFinal.output.bam, - bai = rules.Picard_cleanSamFinal.output.index - output: - bam = expand('{dir}/{{sample}}.final.bam', dir = FINAL_OUTPUT_dir), - index = expand('{dir}/{{sample}}.final.bai', dir = FINAL_OUTPUT_dir) - log: expand('{dir}/Picard_FixMateInformationFinal.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Verify mate-pair information between mates and fix if needed for input {input.bam}..." - threads: 1 - priority: 1 - resources:maxMemGB=20 - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - shell: - """ - {picard_command} FixMateInformation \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - 2> {log:q} - """ - - -######################## -######################## -## MERGING REPLICATES ## -######################## -######################## - - - -def getSampleBasenamesForIndividual(individual): - """text""" - sampleBasenames = numpy.asarray(samplesData.loc[samplesData["individual"] == individual, "sampleName"]) - return sampleBasenames - - -rule Picardtools_MergeSamFiles: - input: - lambda wildcards: expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = getSampleBasenamesForIndividual(wildcards.individual)) - output: - bam = expand('{dir}/{{individual}}.merged.final.bam', dir = FINAL_OUTPUT_dir), - index = expand('{dir}/{{individual}}.merged.final.bam.bai', dir = FINAL_OUTPUT_dir) - log: - expand('{dir}/Picardtools_MergeSamFiles.{{individual}}.log', dir = LOG_BENCHMARK_dir) - message: - "{ruleDisplayMessage}Merging all replicates for individual {wildcards.individual} (files {input:q} with Picardtools..." - threads: 1 - priority: 1 - - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picardtools_MergeSamFiles.{individual}.benchmark" - params: inputString = lambda wildcards, input: " I=".join(input) - shell: - """ - {picard_command} MergeSamFiles \ - I={params.inputString} \ - O={output.bam:q} \ - 2> {log:q} && - samtools index {output.bam:q} - """ - - - - -############# -############# -## GC BIAS ## -############# -############# - - - -rule deepTools_computeGCBias: - input: - bam = FINAL_OUTPUT_dir + '/{basename}.final.bam' - output: - frequencies = REPORTS_dir_gcbias + '/{basename}.GCBias.frequencies', - biasPlot = REPORTS_dir_gcbias + '/{basename}.GCBias.plot.pdf' - log: LOG_BENCHMARK_dir + "/deepTools_computeGCBias.{basename}.log" - message: "{ruleDisplayMessage}Run deepTools: computeGCBias for file {input.bam}..." 
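The inputString lambda in Picardtools_MergeSamFiles above is easy to misread: because the shell command already supplies the first I=, joining with " I=" yields one I= argument per replicate BAM. A quick illustration (hypothetical file names):

files = ["test1_rep1.final.bam", "test1_rep2.final.bam"]
print("I=" + " I=".join(files))
# I=test1_rep1.final.bam I=test1_rep2.final.bam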
-    threads: threadsMax
-    priority: 1
-    params:
-        genome2Bit = config["additionalInputFiles"]["refGenome_2bit"],
-        blacklistRegions = config["additionalInputFiles"]["blacklistRegions"],
-        effectiveGenomeSize = config["par_deepTools"]["effectiveGenomeSize"],
-        fragmentLength = "200", # ignored for paired-end data, as the fragment length is computed from the BAM file
-        other = ""
-    conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
-    shell:
-        """
-        computeGCBias \
-            --bamfile {input.bam} \
-            --effectiveGenomeSize {params.effectiveGenomeSize} \
-            --genome {params.genome2Bit} \
-            --blackListFileName {params.blacklistRegions} \
-            --fragmentLength {params.fragmentLength} \
-            {params.other} \
-            --plotFileFormat pdf \
-            --biasPlot {output.biasPlot} \
-            --GCbiasFrequenciesFile {output.frequencies} \
-            --numberOfProcessors {threads} \
-            2> {log:q}
-        """
-
-
-rule deepTools_correctGCBias:
-    input:
-        bam = FINAL_OUTPUT_dir + '/{basename}.final.bam',
-        frequencies = rules.deepTools_computeGCBias.output.frequencies
-    output:
-        bam = FINAL_OUTPUT_dir + '/{basename}.noGCBias.final.bam',
-        index = FINAL_OUTPUT_dir + '/{basename}.noGCBias.final.bam.bai'
-    log: LOG_BENCHMARK_dir + "/deepTools_correctGCBias.{basename}.log"
-    message: "{ruleDisplayMessage}Run deepTools: correctGCBias for file {input.bam}..."
-    threads: threadsMax
-    priority: 1
-    params:
-        genome2Bit = config["additionalInputFiles"]["refGenome_2bit"],
-        effectiveGenomeSize = config["par_deepTools"]["effectiveGenomeSize"],
-        other = ""
-    conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
-    shell:
-        """
-        correctGCBias \
-            --bamfile {input.bam} \
-            --GCbiasFrequenciesFile {input.frequencies} \
-            --effectiveGenomeSize {params.effectiveGenomeSize} \
-            --genome {params.genome2Bit} \
-            --correctedFile {output.bam} \
-            --numberOfProcessors {threads} \
-            2> {log:q} &&
-        samtools index {output.bam}
-        """
-
-
-##################
-##################
-## PEAK CALLING ##
-##################
-##################
-
-
-def getGenomeTypeMacs2(assemblyVersion):
-
-    if assemblyVersion == "mm9" or assemblyVersion == "mm10":
-        genomeType = "mm"
-    elif assemblyVersion == "hg19" or assemblyVersion == "hg38":
-        genomeType = "hs"
-    else:
-        raise NotImplementedError("Genome assembly version " + assemblyVersion + " not yet implemented for the -g parameter in MACS2.")
-
-    return genomeType
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-macs2_stringent_outputName = PEAKCALLING_STRINGENT_dir + '/{basename}' + '.stringent'
-
-# Runs for all BAM files in the final output folder, including the merged ones
-rule macs2_stringent:
-    input:
-        bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir)
-    output:
-        peaks_bedT = temp(macs2_stringent_outputName + '_peaks.narrowPeak'),
-        peaks_bed = temp(macs2_stringent_outputName + '.narrowPeak.gz'),
-        summit_bed = macs2_stringent_outputName + '_summits.bed',
-        xls = temp(macs2_stringent_outputName + '_peaks.xls')
-    log: LOG_BENCHMARK_dir + "/macs2_stringent.{basename}.log"
-    message: "{ruleDisplayMessage}Run MACS2 (stringent) for {input.bam:q} ..."
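getGenomeTypeMacs2 above could equally be written as a table lookup, which makes adding assemblies a one-line change; a sketch (MACS2_GENOME and the snake_case name are hypothetical):

MACS2_GENOME = {"mm9": "mm", "mm10": "mm", "hg19": "hs", "hg38": "hs"}

def get_genome_type_macs2(assemblyVersion):
    try:
        return MACS2_GENOME[assemblyVersion]
    except KeyError:
        raise NotImplementedError("Genome assembly version " + assemblyVersion +
                                  " not yet implemented for the -g parameter in MACS2.")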
- threads: 1 - priority: 1 - - benchmark: LOG_BENCHMARK_dir + "/macs2_stringent.{basename}.benchmark" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/macs2.yaml" - params: - qValue = config["par_peakCalling"]["modelStringent_minQValue"], - modelPar = config["par_peakCalling"]["modelStringent"], - genomeType = getGenomeTypeMacs2(config["par_align"]["assemblyVersion"]), - name = lambda wildcards: wildcards.basename + '.stringent', - outputDir = PEAKCALLING_STRINGENT_dir, - keepDuplicates = "--keep-dup all" - shell: - """macs2 callpeak \ - --treatment {input.bam} \ - -q {params.qValue} \ - --outdir {params.outputDir}\ - --name {params.name}\ - -g {params.genomeType} \ - {params.keepDuplicates} \ - {params.modelPar} \ - 2> {log:q} && - sort -k 8gr,8gr {output.peaks_bedT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' | gzip -f > {output.peaks_bed} - """ - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -macs2_nonStringent_outputName = PEAKCALLING_NONSTRINGENT_dir + '/{basename}' + '.nonStringent' - - -rule macs2_nonStringent: - input: - bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir) - output: - peaks_bedT = temp(macs2_nonStringent_outputName + '_peaks.narrowPeak'), - peaks_bed = temp(macs2_nonStringent_outputName + '.narrowPeak.gz'), - summit_bed = macs2_nonStringent_outputName + '_summits.bed', - xls = temp(macs2_nonStringent_outputName + '_peaks.xls') - log: LOG_BENCHMARK_dir + "/macs2_nonStringent.{basename}.log" - message: "{ruleDisplayMessage}Run MACS2 (non-stringent) for {input.bam:q} ..." - threads: 1 - priority: 1 - - benchmark: LOG_BENCHMARK_dir + "/macs2_nonStringent.{basename}.benchmark" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/macs2.yaml" - params: - qValue = config["par_peakCalling"]["modelNonStringent_minQValue"], - slocalVal = config["par_peakCalling"]["modelNonStringent_slocal"], - modelPar = config["par_peakCalling"]["modelNonStringent"], - genomeType = getGenomeTypeMacs2(config["par_align"]["assemblyVersion"]), - name = lambda wildcards: wildcards.basename + '.nonStringent', - outputDir = PEAKCALLING_NONSTRINGENT_dir, - keepDuplicates = "--keep-dup all" - shell: - """macs2 callpeak \ - --treatment {input.bam} \ - -q {params.qValue} \ - --outdir {params.outputDir}\ - --name {params.name}\ - -g {params.genomeType} \ - {params.modelPar} \ - {params.keepDuplicates} \ - --slocal {params.slocalVal} \ - 2> {log:q} && - sort -k 8gr,8gr {output.peaks_bedT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' | gzip -f > {output.peaks_bed} - """ - - -# Define the general output file name above the rule. 
Wildcards are resolved in the resulting string when evaluating the rule -macs2_Encode_outputName = PEAKCALLING_ENCODE_dir + '/{basename}' + '.Encode' - -rule macs2_Encode: - input: - bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir) - output: - broadPeakfileT = temp(macs2_Encode_outputName + '_peaks.broadPeak'), - gappedPeakfileT = temp(macs2_Encode_outputName + '_peaks.gappedPeak'), - narrowPeakfileT = temp(macs2_Encode_outputName + '_peaks.narrowPeak'), - xls = temp(macs2_Encode_outputName + '_peaks.xls'), - broadPeakfile = temp(macs2_Encode_outputName + '.broadPeak.gz'), - gappedPeakfile = temp(macs2_Encode_outputName + '.gappedPeak.gz'), - narrowPeakfile = temp(macs2_Encode_outputName + '.narrowPeak.gz'), - bdg1 = macs2_Encode_outputName + '_control_lambda.bdg.gz', - bdg2 = macs2_Encode_outputName + '_treat_pileup.bdg.gz' - log: - broadAndGapped = LOG_BENCHMARK_dir + "/macs2_Encode_broadAndGapped.{basename}.log", - narrow = LOG_BENCHMARK_dir + "/macs2_Encode_narrow.{basename}.log" - message: "{ruleDisplayMessage}Run MACS2 (Encode version) for {input.bam:q} ..." - threads: 1 - priority: 1 - - benchmark: LOG_BENCHMARK_dir + "/macs2_Encode.{basename}.benchmark" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/macs2.yaml" - params: - pValue = config["par_peakCalling"]["Encode_pValThreshold"], - modelParBroadAndGapped = config["par_peakCalling"]["Encode_modelBroadAndGapped"], - modelParNarrow = config["par_peakCalling"]["Encode_modelNarrow"], - genomeType = getGenomeTypeMacs2(config["par_align"]["assemblyVersion"]), - name = lambda wildcards: wildcards.basename + '.Encode', - outputDir = PEAKCALLING_ENCODE_dir, - keepDuplicates = "--keep-dup all", - bdg1 = macs2_Encode_outputName + '_control_lambda.bdg', - bdg2 = macs2_Encode_outputName + '_treat_pileup.bdg' - shell: - # 1. First produce broad and gapped peaks, then narrow ones - # See https://www.encodeproject.org/atac-seq/ and https://github.com/kundajelab/atac_dnase_pipelines/blob/62e1c544a394d3215d0b2d24743fc1e8bb08123c/modules/callpeak_macs2.bds - # After peak calling, sort by col 8and 14 in descending order and replace long peak names in Column 4 with Peak_<peakRank> - """ - macs2 callpeak \ - --treatment {input.bam} \ - -p {params.pValue} \ - --outdir {params.outputDir}\ - --name {params.name}\ - -g {params.genomeType} \ - {params.keepDuplicates} \ - {params.modelParBroadAndGapped} \ - 2> {log.broadAndGapped:q} && - sort -k 8gr,8gr {output.broadPeakfileT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' | gzip -f > {output.broadPeakfile} && - sort -k 14gr,14gr {output.gappedPeakfileT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' | gzip -f > {output.gappedPeakfile} && - macs2 callpeak \ - --treatment {input.bam} \ - -p {params.pValue} \ - --outdir {params.outputDir} \ - --name {params.name}\ - -g {params.genomeType} \ - {params.keepDuplicates} \ - {params.modelParNarrow} \ - 2> {log.narrow:q} && - sort -k 8gr,8gr {output.narrowPeakfileT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' | gzip -f > {output.narrowPeakfile} && - gzip -f {params.bdg1} > {output.bdg1} && - gzip -f {params.bdg2} > {output.bdg2} - """ - - -rule filterPeaks: - input: - bed = '{dir}/{basename}Peak.gz' - - output: - bed = '{dir}/{basename}Peak' + '.filtered.bed.gz' - log: - message: "{ruleDisplayMessage}Exclude blacklist regions for file {input.bed}..." 
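The sort -k 8gr,8gr | awk '$4="Peak_"NR' idiom used after each MACS2 call above orders peaks by the column-8 statistic (column 14 for gappedPeak) and rewrites the name column to Peak_<rank>. The same transformation in plain Python, as a sketch (rank_peaks is hypothetical):

def rank_peaks(lines, score_col=8):
    # Sort by the 1-based score column, descending, then rename column 4.
    rows = [line.rstrip("\n").split("\t") for line in lines]
    rows.sort(key=lambda r: float(r[score_col - 1]), reverse=True)
    for rank, row in enumerate(rows, start=1):
        row[3] = "Peak_" + str(rank)
    return ["\t".join(row) + "\n" for row in rows]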
-    threads: 1
-    priority: 1
-    params:
-        blacklistRegions = config["additionalInputFiles"]["blacklistRegions"]
-    conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bedtools.yaml"
-    shell:
-        """
-        bedtools subtract \
-            -a {input.bed} \
-            -b {params.blacklistRegions} \
-            | gzip -f > {output.bed}
-        """
-
-def awkStringPeakType (peakType, overlap):
-    """Build the awk filter that keeps peak pairs whose fractional overlap reaches the given threshold."""
-
-    awkStr = "{{s1=$3-$2; "
-
-    if peakType == "narrow":
-        awkStr = awkStr + "s2=$13-$12; if (($21/s1 >= " + str(overlap) + ") || ($21"
-    elif peakType == "broad":
-        awkStr = awkStr + "s2=$12-$11; if (($19/s1 >= " + str(overlap) + ") || ($19"
-    elif peakType == "gapped":
-        awkStr = awkStr + "s2=$18-$17; if (($31/s1 >= " + str(overlap) + ") || ($31"
-
-    awkStr = awkStr + "/s2 >= " + str(overlap) + ")) {{print $0}}}}"
-    return awkStr
-
-
-def cutColPeakType (peakType):
-    """Columns to keep for each peak type after the overlap filter."""
-
-    if peakType == "narrow":
-        return "1-10"
-    elif peakType == "broad":
-        return "1-9"
-    elif peakType == "gapped":
-        return "1-15"
-
-#/scratch/bunina/atac/output/6.PeakCalling/Encode/pOPCsr1.noGCBias.final.Encode.gappedPeak.filtered.bed.gz
-def generateInputFiles (wildcards):
-    if len(getSampleBasenamesForIndividual(wildcards.individual)) == 0:
-        raise AssertionError("Cannot determine sample basenames for wildcard " + wildcards.individual + ": " + str(len(getSampleBasenamesForIndividual(wildcards.individual))))
-    return expand('{dir}/{samples}{GCBiasStr}final.{analysisType}.{peakType}Peak.filtered.bed.gz', dir = wildcards.dir, samples = getSampleBasenamesForIndividual(wildcards.individual), GCBiasStr = wildcards.GCBiasStr, analysisType = wildcards.analysisType, peakType = wildcards.peakType)
-
-rule poolPeaksReplicateSamples:
-    input:
-        peakfiles = generateInputFiles
-    output:
-        pooledPeaks = expand('{{dir}}/{{individual}}.merged{{GCBiasStr}}final.{{analysisType}}.pooled.{{peakType}}Peak.filtered2.bed.gz'),
-        replicatePeaks = expand('{{dir}}/{{individual}}.merged{{GCBiasStr}}final.{{analysisType}}.replicate.{{peakType}}Peak.filtered2.bed.gz')
-    log:
-    message: "{ruleDisplayMessage}Pool peaks for individual {wildcards.individual}, GCBias: {wildcards.GCBiasStr}, analysisType: {wildcards.analysisType}, peakType: {wildcards.peakType} and produce pooled and replicated peaks for input {input.peakfiles:q} ..."
-    threads: 1
-    priority: 1
-    resources: maxMemGB=20
-    #benchmark: LOG_BENCHMARK_dir + "/poolPeaksReplicateSamples.{GCBiasStr}.{individual}.{analysisType}.{peakType}.benchmark"
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bedtools.yaml"
-    params: minOverlap = 0.5
-    run:
-
-        # 1. Pool replicate samples and produce a replicate peak file
-
-        # With compression: zcat {input.peakfiles} | gzip -nc > {output.pooledPeaks}
-
-        # Without compression
-        shell("""zcat {input.peakfiles} | gzip -f > {output.pooledPeaks}""")
-
-        # 2. From this set of peaks on pooled data, we only retain those that have at least 50% overlap with a peak in both replicates.
-
-        # https://github.com/kundajelab/atac_dnase_pipelines/blob/master/modules/callpeak_naive_overlap.bds
-        # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs >= 0.5
-
-        cutStr = cutColPeakType (wildcards.peakType)
-        awkStr = awkStringPeakType (wildcards.peakType, params.minOverlap)
-
-        command = ""
-
-        # Strategy: call intersectBed multiple times: -a is always the result of the previous one, starting with all pooled peaks and keeping only those peaks overlapping at least 50% with the previous run.
-        # The chained intersections therefore act as a logical "and" across all replicates.
-        for i in range(len(input)):
-
-            command = command + "intersectBed -wo -a "
-
-            if ( i == 0 ):
-                command += """{output.pooledPeaks}"""
-            else:
-                command += "stdin "
-
-            command = command + " -b " + input[i] + """ | awk 'BEGIN{{FS="\\t";OFS="\\t"}} {awkStr}' | cut -f {cutStr} | sort | uniq """
-
-            if not (i == (len(input) - 1) ):
-                command = command + "|"
-
-
-
-        command = command + """ | gzip -f >{output.replicatePeaks}"""
-
-        shell(command)
-
-
-
-
-# rule idr:
-#     input:
-#         sample1Peaks = expand('{dir}/{{sample1}}.final.{{analysisType}}.{{peakType}}Peak.Peak' , dir = PEAKCALLING_dir)
-#         sample2Peaks = expand('{dir}/{{sample2}}.final.{{analysisType}}.{{peakType}}Peak.Peak' , dir = PEAKCALLING_dir)
-#         pooledPeaks = expand()
-#     output:
-#         peaks = expand('{dir}/idr_{{sample1}}.{{sample2}}.{{analysisType}}.{{peakType}}Peak' , dir = PEAKCALLING_dir),
-#         plot = expand('{dir}/idr_{{sample1}}.{{sample2}}.{{analysisType}}.{{peakType}}Peak.png', dir = PEAKCALLING_dir)
-#     log:
-#         LOG_BENCHMARK_dir + "/idr.{sample1}.{sample2}.{analysisType}.{peakType}.log"
-#     message:
-#         "{ruleDisplayMessage}Run IDR analysis for files {sample1} and {sample2} using analysisType={analysisType} and peakType={peakType}..."
-#     threads: 1
-#     priority: 1
-#
-#     benchmark:
-#         LOG_BENCHMARK_dir + "/idr.{sample1}.{sample2}.{analysisType}.{peakType}.benchmark"
-#     params:
-#         rank = config["par_peakCalling"]["idr_rank"],
-#         softIDRThreshold = config["par_peakCalling"]["idr_softIDRThreshold"]
-#         other = "--plot --use-best-multisummit-IDR",
-#         inputFileType = "narrowPeak" # File type of --samples and --peak-list
-#     conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/idr.yaml"
-#     shell:
-#         """{idr_exec:q} \
-#         --samples {input.sample1Peaks} {input.sample2Peaks} \
-#         --input-file-type {params.inputFileType} \
-#         --peak-list {input.pooledPeaks} \
-#         --output-file {output.bed:q} \
-#         --rank {params.rank} \
-#         --soft-idr-threshold {params.softIDRThreshold} \
-#         {params.other} \
-#         --log-output-file {log:q}"""
-
-
-    # after IDR:
-    # sys idr_thresh_transformed=$(awk -v p=$idr_thresh 'BEGIN{print -log(p)/log(10)}')
-    #
-    # //# Get peaks passing global IDR threshold and convert file to narrowPeak format (Step 9)
-    # sys awk 'BEGIN{OFS="\\t"} $12>='"${idr_thresh_transformed}"' {if ($2<0) $2=0; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,"0"}' $idr_out \
-    # | sort | uniq | sort -k7n,7n | gzip -nc > $peak_idr_trk_tmp
-    #
-    # sys zcat $peak_idr_trk_tmp | awk 'BEGIN{OFS="\\t"} {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}' | gzip -nc > $peak_idr
-    # sys zcat $peak_idr_trk_tmp | awk 'BEGIN{OFS="\\t"} {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' | gzip -nc > $peak_idr_trk
-    #
-    #
-
-
-##############################
-##############################
-## STATISTICS AND SUMMARIES ##
-##############################
-##############################
-
-rule fragment_length_distr:
-    input:
-        expand('{dir}/{allSamples}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.csv.gz', dir = ADJRSS_dir, MAPQ = par_minMAPQscore, allSamples = allSamplesUnique)
-    output:
-        pdf = expand('{dir}/allSamples_fragmentLengthDistr.pdf', dir = REPORTS_dir_summary),
-        rdata = expand('{dir}/allSamples_fragmentLengthDistr.RData', dir = REPORTS_dir_summary)
-    log: expand('{dir}/fragment_length_distr.log', dir = LOG_BENCHMARK_dir)
-    message: "{ruleDisplayMessage}Create fragment length distribution..."
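-    # NB: for ATAC-seq, this distribution is a key QC readout: expect a large
-    # sub-nucleosomal peak (< 100 bp) plus peaks at mono- and di-nucleosome
-    # spacing (~200 bp and ~400 bp); their absence suggests a problematic library.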
- threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/fragment_length_distr.benchmark" - resources: - version: VERSION_FL_distr_script - params: - FL_distr_cutoff = config["par_scripts"]["FL_distr_script_cutoff"], - inputString = lambda wildcards, input: ','.join(input) - shell: - """ - Rscript {script_FL_distr:q} \ - {params.inputString} \ - {params.FL_distr_cutoff} \ - {output.pdf:q} \ - {output.rdata:q} \ - {log:q} - """ - - -# Enforce the creation of the report here, use a "dummy" file in case of GATK was skipped -def GATKReport(assemblyVersion): - if assemblyVersion in ('hg19', 'hg38'): - return expand('{dir}/{sample}.cleaned4.BQrecal.s.bam.plots.pdf', dir = REPORTS_dir_baseQual, sample = allSamplesUnique) - else: - return expand('{dir}/{sample}{basename}4.s.bam', dir = CLEAN_dir, basename = basenameSuffix, sample = allSamplesUnique) - - -rule stats: - input: - # We have to specify the results from ALL samples here because they are collectively needed as input - GATKReport(config["par_align"]["assemblyVersion"]), - expand('{dir}/allSamples_fragmentLengthDistr.pdf', - dir = REPORTS_dir_summary), - expand('{dir}/{sample}.s.bam', - dir = ALIGN_dir, sample = allSamplesUnique), - expand('{dir}/{sample}.s.bam.bai', - dir = ALIGN_dir, sample = allSamplesUnique), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.s.bam.stats', - dir = CHRM_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats', - dir = RMDUP_DIR, sample = allSamplesUnique, MAPQ = par_minMAPQscore), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats', - dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.stats', - dir = ADJRSS_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl', - dir = RMDUP_DIR, sample = allSamplesUnique), - expand('{dir}/{sample}.final.bam', dir = FINAL_OUTPUT_dir, sample = allSamplesUnique), - # PCA_R = rules.PCA_R.output - output: - pdf = expand('{dir}/allSamples_statSummary.pdf', dir = REPORTS_dir_summary), - rdata = expand('{dir}/allSamples_statSummary.rds', dir = REPORTS_dir_summary), - log: expand('{dir}/stats.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Generate statistics about pipeline and produce file {output:q}..." 
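-    # NB: the R script locates the *.stats files itself by searching rootDir with
-    # statsPattern/libraryStatsPattern; the inputs listed above only enforce that
-    # all of these files exist before the summary is computed.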
- threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/stats.benchmark" - resources: - version: VERSION_STATS_script - params: - pairedEnd = config["samples"]["pairedEnd"], - withinThr = config["par_scripts"]["STATS_script_withinThr"], - outsideThr = config["par_scripts"]["STATS_script_outsideThr"], - geneTypesToKeep = config["par_scripts"]["STATS_script_geneTypesToKeep"], - annotationFile = config["additionalInputFiles"]["annotationGTF"], - statsPattern = "*.s.bam.stats$", - libraryStatsPattern = "*.s.bam.statsLibraryCompl$", - rootDir = ROOT_dir - shell: - """ - Rscript {script_STATS:q} \ - {allSamplesUniqueStr} \ - {params.rootDir:q} \ - {output.pdf:q} \ - {output.rdata:q} \ - {params.annotationFile} \ - {params.pairedEnd} \ - {params.withinThr} \ - {params.outsideThr} \ - {params.geneTypesToKeep} \ - {log:q} \ - {params.statsPattern} \ - {params.libraryStatsPattern} - """ - - -rule bamCoverage: - input: - bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir) - output: - bigwig = expand('{dir}/{{basename}}.bigwig' , dir = FINAL_OUTPUT_dir), - bedgraph = temp(expand('{dir}/{{basename}}.bedgraph', dir = FINAL_OUTPUT_dir)), - bedgraphgz = expand('{dir}/{{basename}}.bedgraph.gz', dir = FINAL_OUTPUT_dir), - bedgraphIndex = expand('{dir}/{{basename}}.bedgraph.gz.tbi', dir = FINAL_OUTPUT_dir) - log: - bigwig = LOG_BENCHMARK_dir + "/bamCoverage.{basename}.bigwig.log", - bedgraph = LOG_BENCHMARK_dir + "/bamCoverage.{basename}.bedgraph.log" - message: "{ruleDisplayMessage}Run bamCoverage for {input.bam:q} ..." - threads: threadsMax - priority: 1 - - benchmark: LOG_BENCHMARK_dir + "/bamCoverage.{basename}.benchmark" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml" - params: - normalization = bamCoverage_normOption, - binSize = config["par_deepTools"]["bamCoverage_binSize"], - ignoreChr = "chrX chrM", - otherOptions = config["par_deepTools"]["bamCoverage_otherOptions"], - duplicates = "--ignoreDuplicates" # Ignored, should NOT be set for GC-corrected data - shell: - # First bigwig, then bedgraph - """ - bamCoverage \ - --bam {input.bam} \ - --binSize {params.binSize} \ - --{params.normalization} \ - {params.otherOptions} \ - --numberOfProcessors {threads} \ - --ignoreForNormalization {params.ignoreChr} \ - --outFileName {output.bigwig} \ - --outFileFormat bigwig \ - 2> {log.bigwig:q} && - bamCoverage \ - --bam {input.bam} \ - --binSize {params.binSize} \ - --{params.normalization} \ - {params.otherOptions} \ - --numberOfProcessors {threads} \ - --ignoreForNormalization {params.ignoreChr} \ - --outFileName {output.bedgraph} \ - --outFileFormat bedgraph \ - 2> {log.bedgraph:q} && - sort -k1,1 -k2,2n {output.bedgraph} | bgzip -f -@ {threads} > {output.bedgraphgz} && - tabix -s 1 -b 2 -e 3 {output.bedgraphgz} - """ - - -# Note the necessary [] for concatenate two arrays -basenameSamplesAndIndArray = numpy.concatenate([allIndividualsUnique + ".merged", allSamplesUnique]) -rule deepTools_plotCoverage: - input: - bamAll = expand('{dir}/{basename}{GCBiasStr}.final.bam', dir = FINAL_OUTPUT_dir, basename = basenameSamplesAndIndArray, GCBiasStr = ["",".noGCBias"]) - output: - plot = REPORTS_dir_cov + '/allSamples_CoveragePlot.pdf', - rawCounts = REPORTS_dir_cov + '/allSamples_CoveragePlot.counts', - log: LOG_BENCHMARK_dir + "/deepTools_plotCoverage.log" - message: "{ruleDisplayMessage}Run deepTools: plotCoverage for file {input.bamAll}..." 
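-    # NB: plotCoverage estimates coverage from --numberOfSamples randomly sampled
-    # positions (3,000,000 in params.other above) rather than the full genome,
-    # which keeps the runtime low even with many BAM files.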
- threads: threadsMax - priority: 1 - - benchmark: LOG_BENCHMARK_dir + "/deepTools_plotCoverage.benchmark" - params: - fragmentLength = "--minFragmentLength", # Currently ignored, might be useful for ATAC-seq data - other = "--centerReads --plotFileFormat pdf --numberOfSamples 3000000", - titlePlot = "Coverage plot for all samples", - duplicates = "--ignoreDuplicates" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml" - shell: - """ - plotCoverage \ - --bamfiles {input.bamAll} \ - --plotTitle "{params.titlePlot}" \ - {params.other} \ - --numberOfProcessors {threads} \ - {params.duplicates} \ - --outRawCounts {output.rawCounts} \ - --plotFile {output.plot:q} \ - 2> {log:q}""" - - -rule deepTools_correlationPlots: - input: - bamAll = expand('{dir}/{allSamples}{GCBiasStr}.final.bam', dir = FINAL_OUTPUT_dir, allSamples = allSamplesUnique, GCBiasStr = ["",".noGCBias"]) - output: - npz = REPORTS_dir_PCA + '/allSamples' + '.bins.npz', - pdf = REPORTS_dir_corr + '/allSamples' + '.correlations.pdf', - rawCounts = REPORTS_dir_PCA + '/allSamples' + '.correlations.rawCounts' - #corMatrix = REPORTS_dir + '/allSamples' + '.correlations.matrix' - log: - multiBamSummary = LOG_BENCHMARK_dir + "/deepTools_correlationPlots.multiBamSummary.log", - plotCorrelation = LOG_BENCHMARK_dir + "/deepTools_correlationPlots.plotCorrelation.log" - message: "{ruleDisplayMessage}Run deepTools: multiBamSummary and correlationPlots for files {input.bamAll}..." - threads: threadsMax - priority: 1 - - benchmark: LOG_BENCHMARK_dir + "/deepTools_correlationPlots.benchmark" - params: - corMethod = "pearson", - whatToPlot = "heatmap", - colorMap = "hsv", - other = "--skipZeros --plotFileFormat pdf --removeOutliers --plotNumbers", - titlePlot = "Correlation plot (" + "Pearson" + ")", - binSize = 10000 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml" - shell: - """ - multiBamSummary \ - bins \ - --bamfiles {input.bamAll} \ - --binSize {params.binSize} \ - --numberOfProcessors {threads} \ - --outRawCounts {output.rawCounts} \ - -out {output.npz:q} \ - 2> {log.multiBamSummary:q} && - plotCorrelation \ - --corData {output.npz:q} \ - --corMethod "{params.corMethod}" \ - --whatToPlot "{params.whatToPlot}" \ - --plotTitle "{params.titlePlot}" \ - --colorMap "{params.colorMap}" \ - {params.other} \ - --plotFile {output.pdf:q} \ - 2> {log.plotCorrelation:q} - """ - -# bug in version 2.4.1: --outFileCorMatrix {output.corMatrix} \ -# --labels {allSamplesUniqueStrSpaces} # No " " here these cause an error! - - - -rule deepTools_correlationPlotsPooledSamples: - input: - bamAll = expand('{dir}/{allIndividuals}.merged{GCBiasStr}.final.bam', dir = FINAL_OUTPUT_dir, allIndividuals = allIndividualsUnique, GCBiasStr = ["",".noGCBias"]) - output: - npz = REPORTS_dir_PCA + '/allSamplesPooled' + '.bins.npz', - pdf = REPORTS_dir_corr + '/allSamplesPooled' + '.correlations.pdf', - rawCounts = REPORTS_dir_PCA + '/allSamplesPooled' + '.correlations.rawCounts' - #corMatrix = DOWNSTREAM_dir + '/allSamples' + '.minMapQ' + str(par_minMAPQscore) + '.rmChrM.adRSS.rmDup.rmINDEL.correlations.corMatrix' - log: - multiBamSummary = LOG_BENCHMARK_dir + "/deepTools_correlationPlotsPooledSamples.multiBamSummary.log", - plotCorrelation = LOG_BENCHMARK_dir + "/deepTools_correlationPlotsPooledSamples.plotCorrelation.log" - message: "{ruleDisplayMessage}Run deepTools: multiBamSummary and correlationPlots for files {input.bamAll}..." 
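-    # NB: unlike the per-sample rule above, no --binSize is passed here, so
-    # multiBamSummary falls back to its default window size (10 kb in deepTools 2.x).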
- threads: threadsMax - priority: 1 - - benchmark: LOG_BENCHMARK_dir + "/deepTools_correlationPlotsPooledSamples.benchmark" - params: - corMethod = "pearson", - whatToPlot = "heatmap", - colorMap = "hsv", - other = "--skipZeros --plotFileFormat pdf --removeOutliers --plotNumbers", - titlePlot = "Correlation plot (" + "Pearson" + ")" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml" - shell: - """ - multiBamSummary \ - bins \ - --bamfiles {input.bamAll} \ - --numberOfProcessors {threads} \ - --outRawCounts {output.rawCounts} \ - -out {output.npz:q} \ - 2> {log.multiBamSummary:q} && - plotCorrelation \ - --corData {output.npz:q} \ - --corMethod "{params.corMethod}" \ - --whatToPlot "{params.whatToPlot}" \ - --plotTitle "{params.titlePlot}" \ - --colorMap "{params.colorMap}" \ - {params.other} \ - --plotFile {output.pdf:q} \ - 2> {log.plotCorrelation:q} - """ -# bug in version 2.4.1: --outFileCorMatrix {output.corMatrix} \ --labels {allIndividualsUniqueStrSpaces} \ - -# TODO: Integrate -# rule PCA_R: -# input: -# metadata = samplesSummaryFile, -# rawCounts = rules.deepTools_correlationPlots.output.rawCounts, -# rawCountsMerged = rules.deepTools_correlationPlotsPooledSamples.output.rawCounts -# output: -# pdf = expand('{dir}/allSamples_PCAPlot_R.pdf', dir = REPORTS_dir_PCA), -# pdfMerged = expand('{dir}/allSamplesPooled_PCAPlot_R.pdf', dir = REPORTS_dir_PCA) -# log: expand('{dir}/PCA_R.log', dir = LOG_BENCHMARK_dir) -# message: "{ruleDisplayMessage}Generate PCA plots in R..." -# threads: 1 -# priority: 1 -# benchmark: LOG_BENCHMARK_dir + "/PCA_R.benchmark" -# resources: -# params: -# binSize = 10000, -# filterChr = true, -# splitByGCCorrection = true, -# minNoCountsRows = 0 -# script: dir_scripts + script_PCA - - - -# -# rule deepTools_plotPCA: -# input: -# coverage = REPORTS_dir_PCA + '/{basename}.bins.npz' -# output: -# plot = REPORTS_dir_PCA + '/{basename}_PCAPlot.pdf', -# data = REPORTS_dir_PCA + '/{basename}_PCAPlot.data' -# log: LOG_BENCHMARK_dir + "/deepTools_plotPCA.{basename}.log" -# message: "{ruleDisplayMessage}Run deepTools: plotPCA for coverage file {input.coverage}..." 
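-# NB: plotPCA would reuse the .npz matrix written by multiBamSummary; the rule
-# apparently stays commented out while the R-based PCA (rule PCA_R above) is
-# being integrated.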
-# threads: 1 -# priority: 1 -# -# benchmark: LOG_BENCHMARK_dir + "/deepTools_plotPCA.benchmark" -# params: -# other = "", -# titlePlot = "PCA plot for all samples" -# conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml" -# shell: -# """ -# plotPCA \ -# --corData {input.coverage} \ -# --plotTitle "{params.titlePlot}" \ -# --plotFileFormat pdf \ -# {params.other} \ -# --outFileNameData {output.data} \ -# --plotFile {output.plot:q}\ -# 2> {log:q} -# """ - - -# Enforce that it runs at the very end - -rule multiqc: - input: - summaryStats = expand('{dir}/allSamples_statSummary.{fileType}', dir = REPORTS_dir_summary, fileType = ["pdf", "RData"]), - fragmentLength = expand('{dir}/allSamples_fragmentLengthDistr.{fileType}', dir = REPORTS_dir_summary, fileType = ["pdf", "RData"]), - corrPlots = REPORTS_dir_corr + '/allSamples' + '.correlations.pdf', - corrPlotsPooled = REPORTS_dir_corr + '/allSamplesPooled' + '.correlations.pdf', - #PCAPlot = expand('{dir}/{basename}_PCAPlot.pdf', dir = REPORTS_dir_PCA, basename = ["allSamples", "allSamplesPooled"]), - GCBiasPlot = expand('{dir}/{sample}{GCBiasStr}.GCBias.plot.pdf', dir = REPORTS_dir_gcbias, sample = allSamplesUnique, GCBiasStr = ["", ".noGCBias"]), - GCBiasPlotPooled= expand('{dir}/{individual}.merged{GCBiasStr}.GCBias.plot.pdf', dir = REPORTS_dir_gcbias, individual = allIndividualsUnique, GCBiasStr = ["", ".noGCBias"]), - coveragePlot = REPORTS_dir_cov + '/allSamples_CoveragePlot.pdf', - coverage = expand('{dir}/{sample}{GCBiasStr}.final.{type}', dir = FINAL_OUTPUT_dir, sample = allSamplesUnique , GCBiasStr = ["", ".noGCBias"], type = ["bigwig", "bedgraph.gz"]), - coveragePooled = expand('{dir}/{individual}.merged{GCBiasStr}.final.{type}', dir = FINAL_OUTPUT_dir, individual = allIndividualsUnique, GCBiasStr = ["", ".noGCBias"], type = ["bigwig", "bedgraph.gz"]) - output: - report = REPORTS_dir_multiqc + '/multiqc_report.html' - log: - message: "{ruleDisplayMessage}Finally, run multiqc for the folder {ROOT_dir}..." 
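-    # NB: MultiQC simply crawls the given directory tree for log files it
-    # recognises (FastQC, samtools stats, MACS2, ...) and aggregates them into a
-    # single HTML report, which is why the whole ROOT_dir is passed below.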
- threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/multiqc.benchmark" - resources: - params: - outputDir = lambda wildcards, output: os.path.dirname(output.report), - basename = lambda wildcards, output: os.path.basename(output.report), - rootDir = ROOT_dir - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/multiqc.yaml" - shell: - """ - multiqc \ - --force \ - --ignore *.out --ignore *.err \ - -o {params.outputDir} \ - --filename {params.basename} \ - {params.rootDir:q} - """ diff --git a/src/Snakemake/dev/src/.dropbox.attr b/src/Snakemake/dev/src/.dropbox.attr deleted file mode 100755 index 9e26dfe..0000000 --- a/src/Snakemake/dev/src/.dropbox.attr +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/src/Snakemake/dev/src/Fragment_length.R b/src/Snakemake/dev/src/Fragment_length.R deleted file mode 100755 index d00b263..0000000 --- a/src/Snakemake/dev/src/Fragment_length.R +++ /dev/null @@ -1,124 +0,0 @@ - -######################### -# LIBRARY AND FUNCTIONS # -######################### - -source("/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/src/functions.R") - -initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE) -checkAndLoadPackages(c("checkmate", "futile.logger", "tidyverse", "tools"), verbose = TRUE) - - -######################### -# ADDITIONAL PARAMETERS # -######################### - -par.l = list() -par.l$log_minlevel = "INFO" -par.l$ggplot_binwidth = 1 -par.l$ggplot_maxBreaks = 50 -par.l$pdf_width = 12 -par.l$pdf_height = 6 -par.l$ggplot_axis.title.size = 16 -par.l$ggplot_axis.text.x = 8 -par.l$ggplot_axis.text.y = 12 -par.l$ggplot_axis.col = "black" -par.l$ggplot_hist.col = "black" - - - -############################################ -# READ AND VALIDATE COMMAND LINE ARGUMENTS # -############################################ - -args <- commandArgs(trailingOnly = TRUE) - -if (length(args) != 5) { - stop("Expecting 5 arguments but found only ", length(args), ". 
Exiting.") -} else { - par.l$files_input = args[1] - par.l$cutoff = as.numeric(args[2]) - par.l$file_outputPDF = args[3] - par.l$file_outputRData = args[4] - par.l$file_log = args[5] -} - - -assertCharacter(par.l$files_input, len = 1) -assertIntegerish(par.l$cutoff, lower = 1) -assertDirectory(dirname(par.l$file_outputPDF), access = "w") -assertDirectory(dirname(par.l$file_log), access = "w") - - -startLogger(par.l$file_log, par.l$log_minlevel, appenderName = "file", removeOldLog = TRUE) -printParametersLog(par.l) - - - -allFiles = strsplit(par.l$files_input, ",", fixed = TRUE)[[1]] - - - -################################## -# Plot distribution of fragments # -################################## - -pdf(par.l$file_outputPDF, width = par.l$pdf_width, height = par.l$pdf_height) - - -for (fileCur in allFiles) { - - assertFileExists(fileCur, access = "r") - - ########################## - # READ AND PROCESS INPUT # - ########################## - - tbl.df <- read_tsv(fileCur, col_names = FALSE) - #tbl.df <- read.table(fileCur, sep = "\t", header = FALSE) - - # Sanity check - stopifnot(ncol(tbl.df) == 3) - - colnames(tbl.df) = c("CHR", "MAPQ", "Read_length") - tbl.df$Read_length = abs(tbl.df$Read_length) - - #Filter all by the length of the cutoff - tbl_filtered.df <- as.data.frame(tbl.df[tbl.df$Read_length < par.l$cutoff, ]) - - # Filter reads of length 0 - tbl_filtered.df = filter(tbl_filtered.df, Read_length > 0) - - title = splitStringInMultipleLines(paste0(file_path_sans_ext(basename(fileCur))), 50) - - cap <- ggplot(tbl_filtered.df, aes(x = Read_length)) + - geom_histogram(aes(y = ..count..), color = par.l$ggplot_hist.col, binwidth = par.l$ggplot_binwidth) + - xlab("Fragment length (in bp)") + ylab("Abundance") + - ggtitle(title) + - scale_x_continuous(breaks = seq(0, par.l$cutoff, par.l$ggplot_maxBreaks), labels = seq(0,par.l$cutoff, par.l$ggplot_maxBreaks), limits = c(0, par.l$cutoff)) + - theme(axis.text.x = element_text(color = par.l$ggplot_axis.col, size = par.l$ggplot_axis.text.x), - axis.text.y = element_text(color = par.l$ggplot_axis.col, size = par.l$ggplot_axis.text.y), - axis.title.x = element_text(colour = par.l$ggplot_axis.col, size = par.l$ggplot_axis.title.size, margin = margin(25,0,0,0)), - axis.title.y = element_text(colour = par.l$ggplot_axis.col, size = par.l$ggplot_axis.title.size, margin = margin(0,25,0,0)), - axis.line.x = element_line(color = par.l$ggplot_axis.col), axis.line.y = element_line(color = par.l$ggplot_axis.col), - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_blank(), - panel.background = element_blank(), - legend.position = c(0.1, 0.9), - legend.justification = "center", - legend.title = element_blank(), - plot.title = element_text(hjust = 0.5) - ) - - print(cap) - - -} - - -dev.off() - -save.image(file = par.l$file_outputRData) - -flog.info(paste0("\nGenerated file", par.l$file_outputPDF, " and ", par.l$file_outputRData, " and finished script successfully.\n")) diff --git a/src/Snakemake/dev/src/PCA.R b/src/Snakemake/dev/src/PCA.R deleted file mode 100644 index fe716fc..0000000 --- a/src/Snakemake/dev/src/PCA.R +++ /dev/null @@ -1,121 +0,0 @@ -start.time <- Sys.time() - -######################### -# LIBRARY AND FUNCTIONS # -######################### - -# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes) -# Replace {outputFolder} correspondingly. 
-# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/2.DESeqPeaks.R.rds") - - -library("checkmate") -assertClass(snakemake, "Snakemake") -assertDirectoryExists(snakemake@config$par_general$scriptsDir) -source(paste0(snakemake@config$par_general$scriptsDir, "/functions.R")) - -######################################################################## -# SAVE SNAKEMAKE S4 OBJECT THAT IS PASSED ALONG FOR DEBUGGING PURPOSES # -######################################################################## - -createDebugFile(snakemake) - -initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE) -checkAndLoadPackages(c("tidyverse", "futile.logger", "checkmate", "tools", "methods", "DESeq2"), verbose = FALSE) - - - -################### -#### PARAMETERS ### -################### - -par.l = list() - -par.l$verbose = TRUE -par.l$log_minlevel = "INFO" - -##################### -# VERIFY PARAMETERS # -##################### - -checkAndLogWarningsAndErrors(snakemake, checkClass(snakemake, "Snakemake")) - -## INPUT ## -checkAndLogWarningsAndErrors(snakemake@input, checkList(snakemake@input, min.len = 3)) -checkAndLogWarningsAndErrors(snakemake@input, checkSubset(names(snakemake@input), c("", "metadata", "rawCounts", "rawCountsMerged"))) - - -par.l$file_metadata = snakemake@input$metadata -par.l$file_rawCounts = snakemake@input$rawCounts -par.l$file_rawCountsMerged = snakemake@input$rawCountsMerged - -checkAndLogWarningsAndErrors(par.l$file_metadata, checkFileExists(par.l$file_metadata, access = "r")) -checkAndLogWarningsAndErrors(par.l$file_rawCounts, checkFileExists(par.l$file_rawCounts, access = "r")) -checkAndLogWarningsAndErrors(par.l$file_rawCountsMerged, checkFileExists(par.l$file_rawCountsMerged, access = "r")) - - -## OUTPUT ## -checkAndLogWarningsAndErrors(snakemake@output, checkList(snakemake@output, min.len = 2)) -checkAndLogWarningsAndErrors(names(snakemake@output), checkSubset(names(snakemake@output), c("", "pdf", "pdfMerged"))) - -par.l$file_output_pdf = snakemake@output$pdf -par.l$file_output_pdfMerged = snakemake@output$pdfMerged - - -## PARAMS ## -checkAndLogWarningsAndErrors(snakemake@params, checkList(snakemake@params, min.len = 4)) -checkAndLogWarningsAndErrors(names(snakemake@params), checkSubset(names(snakemake@params), c("", "binSize", "filterChr", "splitByGCCorrection", "minNoCountsRows"))) - -par.l$binSize = as.integer(snakemake@params$binSize) -checkAndLogWarningsAndErrors(par.l$binSize, checkIntegerish(par.l$binSize)) - -par.l$filterChr = as.logical(snakemake@params$filterChr) -checkAndLogWarningsAndErrors(par.l$filterChr, checkFlag(par.l$filterChr)) - -par.l$splitByGCCorrection = as.logical(snakemake@params$splitByGCCorrection) -checkAndLogWarningsAndErrors(par.l$splitByGCCorrection, checkFlag(par.l$splitByGCCorrection)) - -par.l$minNoCountsRows = as.integer(snakemake@params$minNoCountsRows) -checkAndLogWarningsAndErrors(par.l$minNoCountsRows, checkIntegerish(par.l$minNoCountsRows)) - -## LOG ## -checkAndLogWarningsAndErrors(snakemake@log, checkList(snakemake@log, min.len = 1)) -par.l$file_log = snakemake@log[[1]] - - -allDirs = c(dirname(par.l$file_output_pdfMerged), - dirname(par.l$file_output_pdf), - dirname(par.l$file_log) -) - -testExistanceAndCreateDirectoriesRecursively(allDirs) - - -###################### -# FINAL PREPARATIONS # -###################### -startLogger(par.l$file_log, par.l$log_minlevel, removeOldLog = TRUE) -printParametersLog(par.l) - -readCounts.l = list(all = par.l$file_rawCounts, pooled = 
par.l$file_rawCountsMerged) -output.l = list(all = par.l$file_output_pdf, pooled = par.l$file_output_pdfMerged) - -for (typeCur in c("all", "pooled")) { - - flog.info(paste0("Generate PCA plots for ", typeCur, " samples")) - - readCountsCur = readCounts.l[[typeCur]] - outputCur = output.l[[typeCur]] - - doPCAPlot(file_readCounts = readCountsCur, - file_metadata = par.l$file_metadata, - file_output = outputCur, - file_log = par.l$file_log, - metadataToInclude = c("sampleName", "individual"), - type = typeCur, - binSize = par.l$binSize, - filterChr = par.l$filterChr, - splitByGCCorrection = par.l$splitByGCCorrection, - minNoCountsRows = par.l$minNoCountsRows) -} - diff --git a/src/Snakemake/dev/src/aut_stats.R b/src/Snakemake/dev/src/aut_stats.R deleted file mode 100755 index a3193ac..0000000 --- a/src/Snakemake/dev/src/aut_stats.R +++ /dev/null @@ -1,447 +0,0 @@ - -######################### -# LIBRARY AND FUNCTIONS # -######################### - -#source("/g/scb2/zaugg/zaugg_shared/scripts/Christian/src/R/functionsCollection.R") -source("/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/src/functions.R") - - - -initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE) -checkAndLoadPackages(c("checkmate", "futile.logger", "tidyverse", "reshape2", "tools", "rlist", "scales", "grDevices", "gridExtra", "ggplot2", "Rsamtools", "GenomicRanges"), verbose = TRUE) - - -######################### -# ADDITIONAL PARAMETERS # -######################### - -par.l = list() - -# Currently hard-coded parameters -par.l$verbose = TRUE -par.l$commandSamtools = "samtools" -par.l$ggplot_axis.title.size = 20 -par.l$ggplot_axis.text.x = 8 -par.l$ggplot_axis.text.y = 12 -par.l$ggplot_axis.col = "black" -par.l$ggplot.text.size = 8 -par.l$nRowsPlotPDF = 2 -par.l$log_minlevel = "INFO" - - -############################################ -# READ AND VALIDATE COMMAND LINE ARGUMENTS # -############################################ -args <- commandArgs(trailingOnly = TRUE) - -if (length(args) != 12) { - stop("Expecting 12 arguments but found only ", length(args),". 
Exiting.") -} else { - par.l$sampleNames = strsplit(args[1], ",", fixed = TRUE)[[1]] - par.l$rootDir = args[2] - par.l$file_outputPDF = args[3] - par.l$file_outputRData = args[4] - par.l$annotationFile = args[5] - par.l$pairedEnd = as.logical(args[6]) - par.l$TSSEnrichment_withinThreshold = as.integer(args[7]) - par.l$TSSEnrichment_outsideThreshold = as.integer(args[8]) - par.l$TSSEnrichment_geneTypesToKeep = args[9] - par.l$file_log = args[10] - par.l$fileStatsPattern = args[11] - par.l$fileLibraryStatsPattern = args[12] - - -} -# -# par.l$sampleNames = strsplit("2FPCp15r1,2FPCp15r2,2FPCp25r1,2FPCp25r2,2FPCp5r1,2FPCp5r2,PCsr1,PCsr2,pOPCsr1,pOPCsr2", ",", fixed = TRUE)[[1]] -# par.l$rootDir = "/scratch/bunina/atac/output" -# par.l$file_outputPDF = "/scratch/bunina/atac/output/Reports_and_Stats/sampleSummary/allSamples_statSummary.pdf " -# par.l$file_outputRData = "/scratch/bunina/atac/output/Reports_and_Stats/sampleSummary/allSamples_statSummary.RData" -# par.l$annotationFile = "/g/scb2/zaugg/zaugg_shared/annotations/mm10/Gencode_M16/gencode.vM16.annotation.gtf" -# par.l$pairedEnd = TRUE -# par.l$TSSEnrichment_withinThreshold = 4000 -# par.l$TSSEnrichment_outsideThreshold = 1000 -# par.l$TSSEnrichment_geneTypesToKeep ="protein_coding" -# par.l$file_log = "/scratch/bunina/atac/output/LOGS_AND_BENCHMARKS/stats.log" -# par.l$fileStatsPattern ="*.s.bam.stats$" -# par.l$fileLibraryStatsPattern = "*.s.bam.statsLibraryCompl$" - - -startLogger(par.l$file_log, par.l$log_minlevel, appenderName = "file", removeOldLog = TRUE) - -printParametersLog(par.l) - -assertCharacter(par.l$sampleNames, min.len = 1, min.chars = 1) -assertDirectoryExists(dirname(par.l$file_outputPDF), access = "w") -assertLogical(par.l$pairedEnd) -assertDirectoryExists(dirname(par.l$file_log), access = "w") - -assertIntegerish(par.l$TSSEnrichment_withinThreshold , lower = 0) -assertIntegerish(par.l$TSSEnrichment_outsideThreshold , lower = 0) -assertCharacter(par.l$TSSEnrichment_geneTypesToKeep , len = 1, min.chars = 1) - -assertDirectoryExists(par.l$rootDir, access = "r") - -assertFileExists(par.l$annotationFile, access = "r") -assertCharacter(par.l$fileStatsPattern, len = 1, min.chars = 1) -assertCharacter(par.l$fileLibraryStatsPattern, len = 1, min.chars = 1) -# Check existence of output directories - -dir_beforeTrimming = paste0(par.l$rootDir, "/1.FastQC_beforeTrimming") -assertDirectoryExists(dir_beforeTrimming, access = "r") - -dir_afterTrimming = paste0(par.l$rootDir, "/3.FastQC_afterTrimming") -assertDirectoryExists(dir_afterTrimming, access = "r") - -dir_alignment = paste0(par.l$rootDir, "/4.Alignment") -assertDirectoryExists(dir_alignment, access = "r") - - -dir_rmChrm = paste0(par.l$rootDir, "/5.Postalignment/3.Filter_chrM") -assertDirectoryExists(dir_rmChrm, access = "r") - -dir_rmDUP = paste0(par.l$rootDir, "/5.Postalignment/4.MarkAndRemove_Duplicates") -assertDirectoryExists(dir_rmDUP, access = "r") - -dir_MAPQ = paste0(par.l$rootDir, "/5.Postalignment/5.Filter_MAPQ") -assertDirectoryExists(dir_MAPQ, access = "r") - -dir_FINAL = paste0(par.l$rootDir, "/8.FinalOutput") -assertDirectoryExists(dir_FINAL, access = "r") - -output.l = list() - -###################### -# Search stats files # -###################### - -stats.df = tribble(~sampleName, ~BT, ~AT, ~align, ~alignrate, ~MAPQsort, ~chrM, ~rmDUP, - ~readsTotal , ~readsDistinct, ~readsOccOnce, ~readsOccTwo , ~NRF, ~PBC1, ~PBC2) - - -for (sampleNameCur in par.l$sampleNames) { - - trimmingData = c() - - for (dirCur in c(dir_beforeTrimming, dir_afterTrimming)) { - - # 1. 
FASTQC
-        files = createFileList(dirCur, "fastqc_data.txt", verbose = FALSE, recursive = TRUE)
-        # So far contains all samples, grep for the current one
-        files = files[which(grepl(paste0("/", sampleNameCur, ".*_fastqc"), files))]
-        stopifnot(length(files) == 2)
-        command = paste0("less ", dirCur, "/", sampleNameCur, "*_fastqc/fastqc_data.txt")
-        stat = system(command, intern = TRUE)
-        #stat = system2(command)
-        numCur = as.numeric(unlist(strsplit(stat[7], "\t", fixed = TRUE))[2])
-        if (par.l$pairedEnd) numCur = numCur * 2
-        trimmingData = c(trimmingData, numCur)
-    }
-
-
-    # 3. Alignment
-    # Ivan: bug fix, removed the '*' from the '.s.bam$' pattern
-    file = createFileList(dir_alignment, paste0(sampleNameCur, ".s.bam$"), verbose = FALSE)
-    file = file[which(grepl(paste0("/", sampleNameCur, ".s.bam"), file))]
-    stopifnot(length(file) == 1)
-    command = paste0(par.l$commandSamtools, " flagstat ", file)
-    align_stat = system(command, intern = TRUE)
-    align_num = as.numeric(unlist(strsplit(align_stat[5], "+", fixed = TRUE))[1])
-    align_rate = as.numeric(unlist(strsplit(align_stat[5], "[% | (]", fixed = FALSE))[6])
-
-    # 4. Postalign chrM
-    # Ivan: added the "." to the search pattern
-    file = createFileList(dir_rmChrm, paste0(sampleNameCur, ".", par.l$fileStatsPattern), verbose = FALSE)
-    file = file[which(grepl(paste0("/", sampleNameCur, ".", par.l$fileStatsPattern), file))]
-    stopifnot(length(file) == 1)
-    command = paste0("less ", file)
-    chrM_stat = system(command, intern = TRUE)
-    chrm_num = as.numeric(unlist(strsplit(chrM_stat[1], "+", fixed = TRUE))[1])
-
-    # 5. Postalign rmDUP
-    # the same as above
-    file = createFileList(dir_rmDUP, paste0(sampleNameCur, ".", par.l$fileStatsPattern), verbose = FALSE)
-    file = file[which(grepl(paste0("/", sampleNameCur, ".", par.l$fileStatsPattern), file))]
-    stopifnot(length(file) == 1)
-    command = paste0("less ", file)
-    rmDup_stat = system(command, intern = TRUE)
-    rmDup_num = as.numeric(unlist(strsplit(rmDup_stat[1], "+", fixed = TRUE))[1])
-
-    # 6. Postalign MAPQ
-    # the same as above
-    file = createFileList(dir_MAPQ, paste0(sampleNameCur, ".", par.l$fileStatsPattern), verbose = FALSE)
-    file = file[which(grepl(paste0("/", sampleNameCur, ".", par.l$fileStatsPattern), file))]
-    stopifnot(length(file) == 1)
-    command = paste0("less ", file)
-    MAPQsort_stat = system(command, intern = TRUE)
-    MAPQsort_num = as.numeric(unlist(strsplit(MAPQsort_stat[1], "+", fixed = TRUE))[1])
-
-    # 7. Postalign rmINDEL
-    # the same as above
-    #file = createFileList(dir_INDEL, paste0(sampleNameCur,".", par.l$fileStatsPattern), verbose = FALSE)
-    #stopifnot(length(file) == 1)
-    #command = paste0("less ", file)
-    #rmINDEL_stat = system(command, intern = TRUE)
-    #rmINDEL_num = as.numeric(unlist(strsplit(rmINDEL_stat[1],"+", fixed = TRUE))[1])
-
-    # Library complexity numbers
-    # the same as above
-    file = createFileList(dir_rmDUP, paste0(sampleNameCur, ".", par.l$fileLibraryStatsPattern), verbose = FALSE)
-    file = file[which(grepl(paste0("/", sampleNameCur, ".", par.l$fileLibraryStatsPattern), file))]
-    stopifnot(length(file) == 1)
-    libraryCompl.df = read.table(file, header = TRUE, stringsAsFactors = FALSE, sep = "\t")
-    stopifnot(nrow(libraryCompl.df) == 1)
-
-
-    stats.df = add_row(stats.df,
-                       sampleName = sampleNameCur,
-                       BT = trimmingData[1],
-                       AT = trimmingData[2],
-                       align = align_num,
-                       alignrate = align_rate,
-                       MAPQsort = MAPQsort_num,
-                       chrM = chrm_num,
-                       rmDUP = rmDup_num,
-                       # rmINDEL = rmINDEL_num,
-                       readsTotal = libraryCompl.df$readsTotal,
-                       readsDistinct = libraryCompl.df$readsDistinct,
-                       readsOccOnce = libraryCompl.df$readsOccOne,
-                       readsOccTwo = libraryCompl.df$readsOccTwo,
-                       NRF = libraryCompl.df$NRF,
-                       PBC1 = libraryCompl.df$PBC1,
-                       PBC2 = libraryCompl.df$PBC2
-    )
-}
-
-
-##################
-# TSS enrichment #
-##################
-
-finalOutputFiles = paste0(dir_FINAL, "/", stats.df$sampleName, ".final.bam")
-
-res.l = list()
-
-for (fileCur in finalOutputFiles) {
-
-    assertDirectory(dirname(fileCur), access = "w")
-
-    res.l[[fileCur]] = checkEnrichmentOverTSS(par.l$annotationFile,
-                                              fileCur,
-                                              thresholdWithinTSS = par.l$TSSEnrichment_withinThreshold,
-                                              thresholdOutsideTSS = par.l$TSSEnrichment_outsideThreshold,
-                                              readFlags = generateDefaultReadFlags(pairedEndReads = par.l$pairedEnd),
-                                              geneTypesToKeep = par.l$TSSEnrichment_geneTypesToKeep,
-                                              verbose = par.l$verbose)
-
-}
-
-########
-# PLOT #
-########
-
-enrichmentRaw.l = list.select(res.l, overallEnrichmentRaw)
-
-
-# reshape to plot graphs
-
-qualityMetrics.df <- data.frame(sampleName = stats.df$sampleName,
-                                nReadsMin = stats.df$MAPQsort,
-                                nReadsMax = stats.df$BT,
-                                NRF = stats.df$NRF,
-                                PBC1 = stats.df$PBC1,
-                                PBC2 = stats.df$PBC2,
-                                enrichmentRaw = NA)
-
-qualityMetrics.df = mutate(qualityMetrics.df,
-                           NRF_color = ifelse(NRF < 0.7 , "orange", ifelse(NRF > 0.9 , "forestgreen", "yellow")),
-                           PBC1_color = ifelse(PBC1 < 0.7, "orange", ifelse(PBC1 > 0.9, "forestgreen", "yellow")),
-                           PBC2_color = ifelse(PBC2 < 1 , "orange", ifelse(PBC2 > 3 , "forestgreen", "yellow"))
-                           )
-
-
-qualityMetrics.df$NRF_colorFac = factor(qualityMetrics.df$NRF_color , levels = c("forestgreen","yellow", "orange"))
-qualityMetrics.df$PBC1_colorFac = factor(qualityMetrics.df$PBC1_color, levels = c("forestgreen","yellow", "orange"))
-qualityMetrics.df$PBC2_colorFac = factor(qualityMetrics.df$PBC2_color, levels = c("forestgreen","yellow", "orange"))
-
-# Ivan: this was a super annoying bug; match the sample name against a fixed, specific file ending only
-for (i in seq_len(nrow(qualityMetrics.df))) {
-    cleanName = gsub(".final.bam", "", basename(names(enrichmentRaw.l)[i]), fixed = TRUE)
-    index = which(qualityMetrics.df$sampleName == cleanName)
-    stopifnot(length(index) == 1)
-    qualityMetrics.df$enrichmentRaw[index] = enrichmentRaw.l[[i]]$overallEnrichmentRaw
-}
-
-stopifnot(!any(is.na(qualityMetrics.df$enrichmentRaw)))
-
-# Add percentages
-stats.df = mutate(stats.df,
-                  BT_perc = BT * 100 / BT,
-                  AT_perc = AT * 100 / BT,
-                  align_perc = align * 100 / BT,
-                  chrM_perc =
chrM * 100 / BT, - rmDUP_perc = rmDUP * 100 / BT, - MAPQsort_perc = MAPQsort * 100 / BT - ) - - - -#colnames(stats.df.perc) = c("sampleName","Before\ntrimming","After\ntrimming","Alignment","MAPQ\nfilter","Filter\nchrM"," Filter\nduplicates"," Filter\nINDELs") - -stats.subset.df = dplyr::select(stats.df, one_of("sampleName", "BT_perc", "AT_perc", "align_perc", "chrM_perc", "rmDUP_perc", "MAPQsort_perc")) -stats.subset2.df = dplyr::select(stats.df, one_of("sampleName", "BT", "AT", "align", "chrM", "rmDUP", "MAPQsort")) - -colnames(stats.subset.df) = colnames(stats.subset2.df) = c("sampleName","Before\ntrimming","After\ntrimming","Alignment","Filter\nchrM", "Filter\nduplicates", "MAPQ\nfilter") - - - -stats.red.melted.df = melt(stats.subset.df) -stats.red.melted2.df = melt(stats.subset2.df) - - - -#pdf_width = (nrow(stats.df) / par.l$nRowsPlotPDF) * 7 # 6 width per column -pdf_width = 10 -pdf_height = 10 # fixed because of always two rows - - -pdf(par.l$file_outputPDF, width = pdf_width, height = pdf_height) - - -pos_x = 5.7 - -# stats.plot = ggplot(stats.red.melted.df, aes(x = variable, y = value)) + geom_bar(stat = "identity", fill = "palevioletred4", color = "whitesmoke", size = 1.5) + -# facet_wrap(~sampleName, nrow = par.l$nRowsPlotPDF) + xlab("Filtering steps in pipeline") + ylab("% of the initial number of reads") + -# geom_label(aes(x = variable, y = value, label = paste0(round(value,1),"%") , vjust = 0.5, hjust = 0.5), size = 4, color = "darkblue") + -# theme(axis.text.x = element_text(face = "bold", color = par.l$ggplot_axis.col, size = par.l$ggplot_axis.text.x), -# axis.text.y = element_text(face = "bold", color = par.l$ggplot_axis.col, size = par.l$ggplot_axis.text.y), -# axis.title.x = element_text(face = "bold", colour = par.l$ggplot_axis.col, size = par.l$ggplot_axis.title.size, margin = margin(25,0,0,0)), -# axis.title.y = element_text(face = "bold", colour = par.l$ggplot_axis.col, size = par.l$ggplot_axis.title.size, margin = margin(0,25,0,0)), -# axis.line.x = element_line(color = par.l$ggplot_axis.col), axis.line.y = element_line(color = "black"), -# panel.grid.major = element_blank(), -# panel.grid.minor = element_blank(), -# panel.border = element_blank(), -# panel.background = element_blank(), -# legend.position = c(0.1, 0.9), -# legend.justification = "center", -# legend.title = element_blank()) + -# theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) + -# geom_text(data = qualityMetrics.df, aes(x = pos_x, y = 100, label = paste("Start: ", .prettyNum(nReadsMax))), size = 3, hjust = 0) + -# geom_text(data = qualityMetrics.df, aes(x = pos_x, y = 95, label = paste("Final: ", .prettyNum(nReadsMin))), size = 3, hjust = 0) + -# geom_text(data = qualityMetrics.df, aes(x = pos_x, y = 90, label = paste("TSS:", enrichmentRaw, ", NRF:", round(NRF,2))), size = 3, hjust = 0) + -# geom_text(data = qualityMetrics.df, aes(x = pos_x, y = 85, label = paste("PBC1/2:", round(PBC1,2), "/", round(PBC2,2))), size = 3, hjust = 0) - -# print(stats.plot) -allSamples = unique(stats.df$sampleName) - -statsSamples.l = list() - -for (sampleCur in allSamples) { - local({ - subsetSample.df = filter(stats.subset.df, sampleName %in% sampleCur) - stopifnot(nrow(subsetSample.df) == 1) - subsetSample.melted.df = melt(subsetSample.df) - - qualityMetrics.filt.df = filter(qualityMetrics.df, sampleName %in% sampleCur) - - stats.plot = ggplot(subsetSample.melted.df, aes(x = variable, y = value)) + geom_bar(stat = "identity", fill = "palevioletred4", color = "whitesmoke", size = 1.5) + - 
xlab("Filtering steps in pipeline") + ylab("% of the initial number of reads") + - geom_label(aes(x = variable, y = value, label = paste0(round(value,1),"%") , vjust = 0.5, hjust = 0.5), size = 4, color = "darkblue") + - theme(axis.text.x = element_text(face = "bold", color = par.l$ggplot_axis.col, size = par.l$ggplot_axis.text.x), - axis.text.y = element_text(face = "bold", color = par.l$ggplot_axis.col, size = par.l$ggplot_axis.text.y), - axis.title.x = element_text(face = "bold", colour = par.l$ggplot_axis.col, size = par.l$ggplot_axis.title.size, margin = margin(25,0,0,0)), - axis.title.y = element_text(face = "bold", colour = par.l$ggplot_axis.col, size = par.l$ggplot_axis.title.size, margin = margin(0,25,0,0)), - axis.line.x = element_line(color = par.l$ggplot_axis.col), axis.line.y = element_line(color = "black"), - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_blank(), - panel.background = element_blank(), - legend.position = c(0.1, 0.9), - legend.justification = "center", - legend.title = element_blank()) + - theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) + - ggtitle(basename(sampleCur)) + - geom_text(data = qualityMetrics.filt.df, aes(x = pos_x, y = 100, label = paste("Start: ", .prettyNum(nReadsMax))), size = 3, hjust = 0) + - geom_text(data = qualityMetrics.filt.df, aes(x = pos_x, y = 95, label = paste("Final: ", .prettyNum(nReadsMin))), size = 3, hjust = 0) + - geom_text(data = qualityMetrics.filt.df, aes(x = pos_x, y = 90, label = paste("TSS:", enrichmentRaw, ", NRF:", round(NRF,2))), size = 3, hjust = 0) + - geom_text(data = qualityMetrics.filt.df, aes(x = pos_x, y = 85, label = paste("PBC1/2:", round(PBC1,2), "/", round(PBC2,2))), size = 3, hjust = 0) - - print(stats.plot) - statsSamples.l[[sampleCur]] <<- stats.plot - }) - -} - - -p2 <- ggplot(qualityMetrics.df) -p2 <- p2 + aes(x = sampleName, y = NRF, fill = NRF_colorFac) -p2 <- p2 + geom_bar(stat = 'identity') -p2 <- p2 + ylim(0, 1) -p2 <- p2 + .getThemeForGGPlot() -p2 <- p2 + ggtitle("Non-Redundant Fraction (NRF): \nNo. distinct uniquely mapping reads / No. 
reads total") -p2 <- p2 + geom_hline(yintercept = 0.9, color = "yellow", size = 1, alpha = 0.5) -p2 <- p2 + geom_hline(yintercept = 0.7, color = "orange", size = 1, alpha = 0.5) -p2 <- p2 + theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5)) -p2 <- p2 + scale_fill_manual(name = "ENCODE\nlibrary\ncomplexity", values = c("forestgreen" = "forestgreen", "yellow" = "yellow", "orange" = "orange"), labels = c("forestgreen" = "Ideal", "yellow" = "Acceptable", "orange" = "Concerning")) - - - -p3 <- ggplot(qualityMetrics.df) -p3 <- p3 + aes(x = sampleName, y = PBC1, fill = PBC1_colorFac) -p3 <- p3 + geom_bar(stat = 'identity') -p3 <- p3 + ylim(0, 1) -p3 <- p3 + .getThemeForGGPlot() -p3 <- p3 + ggtitle("Library Complexity:\nPCR Bottlenecking Coefficient 1 (PBC1)") -p3 <- p3 + geom_hline(yintercept = 0.9, color = "yellow", size = 1, alpha = 0.5) -p3 <- p3 + geom_hline(yintercept = 0.7, color = "orange", size = 1, alpha = 0.5) -p3 <- p3 + theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5)) -p3 <- p3 + scale_fill_manual(name = "ENCODE\nbottlenecking\nlevel", values = c("forestgreen" = "forestgreen", "yellow" = "yellow", "orange" = "orange"), labels = c("forestgreen" = "None", "yellow" = "Moderate", "orange" = "Severe")) - -p4 <- ggplot(qualityMetrics.df) -p4 <- p4 + aes(x = sampleName, y = PBC2, fill = PBC2_colorFac) -p4 <- p4 + geom_bar(stat = 'identity') -p4 <- p4 + ylim(0, NA) -p4 <- p4 + .getThemeForGGPlot() -p4 <- p4 + ggtitle("Library Complexity:\nPCR Bottlenecking Coefficient 2 (PBC2)") -p4 <- p4 + geom_hline(yintercept = 3, color = "yellow", size = 1, alpha = 0.5) -p4 <- p4 + geom_hline(yintercept = 1, color = "orange", size = 1, alpha = 0.5) -p4 <- p4 + theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5)) -p4 <- p4 + scale_fill_manual(name = "ENCODE\nbottlenecking\nlevel", values = c("forestgreen" = "forestgreen", "yellow" = "yellow", "orange" = "orange"), labels = c("forestgreen" = "None", "yellow" = "Moderate", "orange" = "Severe")) - - - -p5 <- ggplot(stats.red.melted.df, aes(x = variable, y = value)) -p5 <- p5 + geom_boxplot() + geom_jitter(height = 0, alpha = 0.1, color = "black") -p5 <- p5 + .getThemeForGGPlot() -p5 <- p5 + ggtitle("Summary overview") -p5 <- p5 + ylab("% of the initial number of reads") -p5 <- p5 + xlab("Filtering steps in pipeline") - -p6 <- ggplot(stats.red.melted2.df, aes(x = variable, y = value)) -p6 <- p6 + geom_boxplot(outlier.shape = NA) + geom_jitter(height = 0, alpha = 0.3, color = "black") -p6 <- p6 + .getThemeForGGPlot() -p6 <- p6 + ggtitle("Summary overview") -p6 <- p6 + scale_y_continuous(name = "Number of reads", labels = comma) -p6 <- p6 + xlab("Filtering steps in pipeline") - - - -print(p5) -print(p6) -printMultipleGraphsPerPage(list(p2,p3,p4), nCol = 1, nRow = 3, pdfFile = NULL, height = pdf_height, width = pdf_width) -# print(p2) -# print(p3) -# print(p4) -dev.off() - -# Write all data to an rds file -output.l$summaryStatistics = stats.df -output.l$TSSEnrichment = res.l -output.l$qualityMetrics = qualityMetrics.df -output.l$plots = list(summary = statsSamples.l, NRF = p2, PBC1 = p3, PBC2 = p4, overview1 = p5, overview2 = p6) - - -saveRDS(output.l, par.l$file_outputRData) - -flog.info(paste0("\nGenerated file", par.l$file_outputPDF, " and ", par.l$file_outputRData, " and finished script successfully.\n")) diff --git a/src/Snakemake/dev/src/filterBAMFile.R b/src/Snakemake/dev/src/filterBAMFile.R deleted file mode 100644 index c230915..0000000 --- a/src/Snakemake/dev/src/filterBAMFile.R +++ /dev/null @@ -1,103 
+0,0 @@ - -######################### -# LIBRARY AND FUNCTIONS # -######################### - -#source("/g/scb2/zaugg/zaugg_shared/scripts/Christian/src/R/functionsCollection.R") -source("/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/src/functions.R") - - - - - -start.time <- Sys.time() - - -######################### -# LIBRARY AND FUNCTIONS # -######################### - -library("checkmate") -assertClass(snakemake, "Snakemake") -assertDirectoryExists(snakemake@config$par_general$dir_scripts) -source(paste0(snakemake@config$par_general$dir_scripts, "/functions.R")) - -initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE) -checkAndLoadPackages(c("checkmate", "futile.logger", "tidyverse", "reshape2", "tools", "rlist", "scales", "grDevices", "gridExtra", "ggplot2", "Rsamtools", "GenomicRanges"), verbose = TRUE) - -######################################################################## -# SAVE SNAKEMAKE S4 OBJECT THAT IS PASSED ALONG FOR DEBUGGING PURPOSES # -######################################################################## - -# snakemake = readRDS("/scratch/carnold/CLL/27ac_TF/output/Logs_and_Benchmarks/3.analyzeTF.R_TF=MAFK.S.rds") -createDebugFile(snakemake, "3.analyzeTF.R") - -################### -#### PARAMETERS ### -################### - -par.l = list() - -par.l$verbose = TRUE -par.l$log_minlevel = "INFO" -par.l$maxPairwiseComparisonsDiagnosticPermutations = 2 - -##################### -# VERIFY PARAMETERS # -##################### - -assertClass(snakemake, "Snakemake") - -## INPUT ## -assertList(snakemake@input, min.len = 1) -assertSubset(names(snakemake@input), c("", "overlapFile", "sampleDataR", "peakFile", "peakFile2", "normFacs", "plotsPerm")) - -par.l$file_input_peakTFOverlaps = snakemake@input$overlapFile -assertFileExists(par.l$file_input_peakTFOverlaps, access = "r") - - -## OUTPUT ## -assertList(snakemake@output, min.len = 1) -assertSubset(names(snakemake@output), c("", "outputTSV", "outputRDS", "plot_diagnostic", "plot_diagnosticPerm", "plot_TFSummary", "plot_TFSummaryPerm", "DESeqObj")) - -par.l$file_output_summaryAll = snakemake@output$outputTSV - - -## WILDCARDS ## -assertList(snakemake@wildcards, min.len = 1) -assertSubset(names(snakemake@wildcards), c("", "TF")) - -par.l$TF = snakemake@wildcards$TF -assertCharacter(par.l$TF, len = 1, min.chars = 1) - -## CONFIG ## -assertList(snakemake@config, min.len = 1) - -par.l$designFormula = snakemake@config$par_general$designContrast -assertCharacter(par.l$designFormula, len = 1, min.chars = 3) - -par.l$designFormula = snakemake@config$par_general$designContrast -checkAndLogWarningsAndErrors(par.l$designFormula, checkCharacter(par.l$designFormula, len = 1, min.chars = 3)) - -## PARAMS ## -assertList(snakemake@params, min.len = 1) -assertSubset(names(snakemake@params), c("", "doCyclicLoess", "allBAMS")) - -## LOG ## -assertList(snakemake@log, min.len = 1) -par.l$file_log = snakemake@log[[1]] - - -allDirs = c(dirname(par.l$file_output_summaryAll), - dirname(par.l$file_log) -) - - -testExistanceAndCreateDirectoriesRecursively(allDirs) - - -###################### -# FINAL PREPARATIONS # -###################### -startLogger(par.l$file_log, par.l$log_minlevel, appenderName = "file", removeOldLog = TRUE) -printParametersLog(par.l) \ No newline at end of file diff --git a/src/Snakemake/dev/src/functions.R b/src/Snakemake/dev/src/functions.R deleted file mode 100644 index c8f014c..0000000 --- a/src/Snakemake/dev/src/functions.R +++ /dev/null @@ -1,702 +0,0 @@ -# 
Almost all of these functions are originally described in source("/g/scb2/zaugg/zaugg_shared/scripts/Christian/src/R/functionsCollection.R") -# and have been copied from there - - -.prettyNum <- function(number, verbose = TRUE) { - prettyNum(number, big.mark = ",", scientific = FALSE) -} - - -startLogger <- function(logfile, level, removeOldLog = TRUE, appenderName = "consoleAndFile", verbose = FALSE) { - - checkAndLoadPackages(c("futile.logger"), verbose = verbose) - - assertSubset(level, c("TRACE", "DEBUG", "INFO", "WARN", "ERROR", "FATAL")) - assertFlag(removeOldLog) - assertSubset(appenderName, c("console", "file", "consoleAndFile")) - assertFlag(verbose) - - if (appenderName != "console") { - assertDirectory(dirname(logfile), access = "w") - if (file.exists(logfile)) { - file.remove(logfile) - } - } - - # LEVELS: TRACE, DEBUG, INFO, WARN, ERROR, FATAL - invisible(flog.threshold(level)) - - - if (appenderName == "console") { - invisible(flog.appender(appender.console())) - } else if (appenderName == "file") { - invisible(flog.appender(appender.file(file = logfile))) - } else { - invisible(flog.appender(appender.tee(file = logfile))) - } - - -} - -printParametersLog <- function(par.l, verbose = FALSE) { - - checkAndLoadPackages(c("futile.logger"), verbose = verbose) - assertList(par.l) - flog.info(paste0("PARAMETERS:")) - for (parCur in names(par.l)) { - - flog.info(paste0(" ", parCur, "=", paste0(par.l[[parCur]], collapse = ","))) - - } -} - - -clearOpenDevices <- function() { - - while (length(dev.list()) > 0) { - dev.off() - } -} - - -printMultipleGraphsPerPage <- function(plots.l, nCol = 1, nRow = 1, pdfFile = NULL, height = NULL, width = NULL, verbose = FALSE) { - - checkAndLoadPackages(c("grDevices", "gridExtra"), verbose = verbose) - - assertList(plots.l, min.len = 1) - assertInt(nCol, lower = 1) - assertInt(nRow, lower = 1) - assert(checkNull(pdfFile), checkCharacter(pdfFile)) - if (!testNull(pdfFile)) assertDirectory(dirname(pdfFile), access = "r") - assert(checkNull(height), checkInt(height, lower = 1)) - assert(checkNull(width), checkInt(width, lower = 1)) - - if (testNull(height)) { - height = 7 - } - - if (testNull(width)) { - width = 7 - } - - - for (i in seq_len(length(plots.l))) { - assertClass(plots.l[[i]], classes = c("ggplot", "gg")) - } - - - nPlotsPerPage = nCol * nRow - - plotsNew.l = list() - - if (!testNull(pdfFile)) { - clearOpenDevices() - pdf(pdfFile, height = height, width = width) - } - index = 0 - - for (indexAll in 1:length(plots.l)) { - - index = index + 1 - plotsNew.l[[index]] = plots.l[[indexAll]] - - # Print another page - if (index %% nPlotsPerPage == 0) { ## print 8 plots on a page - suppressMessages(print(do.call(grid.arrange, c(plotsNew.l, list(ncol = nCol, nrow = nRow))))) - plotsNew.l = list() # reset plot - index = 0 # reset index - } - - } - - # Print the remainding plots in case they don't perfectly fit with the layout - if (length(plotsNew.l) != 0) { - suppressMessages(print(do.call(grid.arrange, c(plotsNew.l, list(ncol = nCol, nrow = nRow))))) - } - - if (!testNull(pdfFile)) - dev.off() - - cat("Finished writing plots to file ", pdfFile, "\n") -} - - -initFunctionsScript <- function(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE, verbose = TRUE) { - - checkAndLoadPackages("checkmate", verbose = verbose) - assert(checkNull(packagesReq), checkCharacter(packagesReq, min.len = 1, min.chars = 1)) - assertCharacter(minRVersion, len = 1) - assertInt(warningsLevel, lower = 0, upper = 2) - 
assertFlag(disableScientificNotation) - assertFlag(verbose) - - clearOpenDevices() - - - # No annoying strings as factors by default - options(stringsAsFactors = FALSE) - - # Print warnings as they occur - options(warn = warningsLevel) - - # Just print 50 lines instead of 99999 - options(max.print = 200) - - # Disable scientific notation - if (disableScientificNotation) options(scipen = 999) - - - # We need at least R version 3.1.0 to continue - stopifnot(getRversion() >= minRVersion) - - - .detachAllPackages() - - checkAndLoadPackages(packagesReq, verbose = verbose) - -} - -checkAndLoadPackages <- function(packages, verbose = TRUE) { - - .checkAndInstallMissingPackages(packages, verbose = verbose) - - for (packageCur in packages) { - library(packageCur, character.only = TRUE) - } - -} - - -.checkAndInstallMissingPackages <- function(packages.vec, verbose = TRUE, cranMirror = "http://cran.uni-muenster.de/") { - - if (verbose) cat("Trying to automatically install missing packages. If this fails, install them manually...\n") - - packagesToInstall = setdiff(packages.vec, rownames(installed.packages())) - - - if (length(packagesToInstall) > 0) { - if (verbose) cat("Could not find the following packages: ", paste( packagesToInstall , collapse = ", "), "\n") - install.packages(packagesToInstall, repos = cranMirror) - - source("http://bioconductor.org/biocLite.R") - for (packageCur in packagesToInstall) { - biocLite(packageCur, suppressUpdates = TRUE) - } - } else { - if (verbose) cat("All packages are already installed\n") - } - -} - -.detachAllPackages <- function() { - - basic.packages <- c("package:stats","package:graphics","package:grDevices","package:utils","package:datasets","package:methods","package:base") - - package.list <- search()[ifelse(unlist(gregexpr("package:",search())) == 1,TRUE,FALSE)] - - package.list <- setdiff(package.list,basic.packages) - - if (length(package.list) > 0) for (package in package.list) detach(package, character.only = TRUE) - -} - - -createFileList <- function(directory, pattern, recursive = FALSE, ignoreCase = FALSE, verbose = TRUE) { - - assertCharacter(directory, min.chars = 1, any.missing = FALSE, len = 1) - assertCharacter(pattern, min.chars = 1, any.missing = FALSE, len = 1) - assertFlag(recursive) - assertFlag(ignoreCase) - assertFlag(verbose) - - assertDirectoryExists(directory) - - # Multiple patterns are now supported, integrate over them - patternAll = strsplit(pattern, ",")[[1]] - assertCharacter(patternAll, min.len = 1) - - if (verbose) cat("Found ", length(patternAll), " distinct pattern(s) in pattern string.\n") - - nFilesToProcessTotal = 0 - filesToProcess.vec = c() - - for (patternCur in patternAll) { - - # Replace wildcards by functioning patterns (such as .) 
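-        # e.g. glob2rx("*.s.bam$") yields "^.*\\.s\\.bam\\$$"; stripping the first
-        # and last character below removes these automatic anchors, so that any
-        # "^"/"$" anchors supplied by the user in the pattern keep working.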
- patternMod = glob2rx(patternCur) - - # Remove anchoring at beginning and end - patternMod = substr(patternMod, 2, nchar(patternMod) - 1) - - filesToProcessCur.vec = list.files(path = directory, pattern = patternMod, full.names = TRUE, recursive = recursive, ignore.case = ignoreCase) - filesToProcess.vec = c(filesToProcess.vec, filesToProcessCur.vec) - - if (verbose) cat("Search for files with pattern \"", patternCur, "\" in directory ", directory, " (case insensitive:", ignoreCase, ")\n", sep ="") - - nFilesToProcessTotal = nFilesToProcessTotal + length(filesToProcessCur.vec) - } - - - - if (nFilesToProcessTotal == 0) { - stop(paste0("No files to process in folder ", directory, " that fulfill the desired criteria (", patternCur, ").")) - } else { - - if (verbose) cat("The following", nFilesToProcessTotal, "files were found:\n", paste0(filesToProcess.vec, collapse = "\n ")) - } - - if (verbose) cat("\n") - - filesToProcess.vec - -} - - -.getThemeForGGPlot <- function(verticalLines = FALSE, horizontalLines = FALSE, gridColor = "gray", linetype = "dotted", lineSize = 0.5, centerTitle = TRUE, verbose = FALSE) { - - checkAndLoadPackages(c("ggplot2"), verbose = verbose) - assertFlag(horizontalLines) - assertFlag(verticalLines) - assertFlag(verbose) - assertNumber(lineSize, lower = 0) - assertSubset(linetype, c("blank", "solid", "dashed", "dotted", "dotdash", "longdash", "twodash")) - assertFlag(centerTitle) - - gridX = element_blank() - gridY = element_blank() - - if (verticalLines) { - gridX = element_line(colour = gridColor, linetype = linetype, size = lineSize) - } - if (horizontalLines) { - gridY = element_line(colour = gridColor, linetype = linetype, size = lineSize) - } - - - theme(panel.grid.minor.y = gridY, panel.grid.major.y = gridY, - panel.grid.minor.x = gridX, panel.grid.major.x = gridX) + - theme_bw() + - theme(plot.title = element_text(hjust = ifelse(centerTitle, 0.5, 0))) - -} - - -#' @import checkmate -generateDefaultReadFlags <- function(pairedEndReads = TRUE, verbose = TRUE) { - - assertFlag(pairedEndReads) - assertFlag(verbose) - - par.l = list( - - "readFlag_isPaired" = TRUE, - "readFlag_isProperPair" = TRUE , - "readFlag_isUnmappedQuery" = FALSE, - "readFlag_hasUnmappedMate" = FALSE, - "readFlag_isMinusStrand" = NA, - "readFlag_isMateMinusStrand" = NA, - "readFlag_isFirstMateRead" = NA, - "readFlag_isSecondMateRead" = NA, - "readFlag_isNotPrimaryRead" = FALSE, - "readFlag_isNotPassingQualityControls" = FALSE, - "readFlag_isDuplicate" = FALSE - ) - - if (!pairedEndReads) { - - par.l$readFlag_isPaired = NA - par.l$readFlag_isProperPair = NA - par.l$readFlag_isUnmappedQuery = NA - par.l$readFlag_hasUnmappedMate = NA - par.l$readFlag_isMateMinusStrand = NA - par.l$readFlag_isFirstMateRead = NA - par.l$readFlag_isSecondMateRead = NA - par.l$readFlag_isNotPrimaryRead = NA - - - } - - .constructScanBamFlags(par.l, verbose = verbose) -} - - -.constructScanBamFlags <- function(par.l, verbose = TRUE) { - - checkAndLoadPackages(c("Rsamtools"), verbose = verbose) - - namesReqElems = c("readFlag_isPaired", "readFlag_isProperPair", "readFlag_isUnmappedQuery", "readFlag_hasUnmappedMate", - "readFlag_isMinusStrand", "readFlag_isMateMinusStrand", "readFlag_isFirstMateRead", - "readFlag_isSecondMateRead", "readFlag_isNotPrimaryRead", "readFlag_isNotPassingQualityControls") - - assertList(par.l, min.len = length(namesReqElems)) - - assertSubset(namesReqElems, names(par.l)) - - flags = scanBamFlag(isPaired = par.l$readFlag_isPaired, - isProperPair = par.l$readFlag_isProperPair, - 
isUnmappedQuery = par.l$readFlag_isUnmappedQuery,
-                        hasUnmappedMate = par.l$readFlag_hasUnmappedMate,
-                        isMinusStrand = par.l$readFlag_isMinusStrand,
-                        isMateMinusStrand = par.l$readFlag_isMateMinusStrand,
-                        isFirstMateRead = par.l$readFlag_isFirstMateRead,
-                        isSecondMateRead = par.l$readFlag_isSecondMateRead,
-                        isSecondaryAlignment = par.l$readFlag_isNotPrimaryRead,
-                        isNotPassingQualityControls = par.l$readFlag_isNotPassingQualityControls,
-                        isDuplicate = par.l$readFlag_isDuplicate
-    )
-
-    flags
-
-}
-
-
-
-# TODO: Set 0 counts to 0.1 or 1 to allow enrichment values
-# Should paired-end reads be counted twice? Strange effect with even and odd numbers
-# readFlags = generateDefaultReadFlags(pairedEndReads = TRUE)
-
-# annotationFile = par.l$annotationFile
-# BAMFile = fileCur
-# thresholdWithinTSS = par.l$TSSEnrichment_withinThreshold
-# thresholdOutsideTSS = par.l$TSSEnrichment_outsideThreshold
-# readFlags = generateDefaultReadFlags(pairedEndReads = par.l$pairedEnd)
-# geneTypesToKeep = par.l$TSSEnrichment_geneTypesToKeep
-# verbose = par.l$verbose
-
-checkEnrichmentOverTSS <- function(annotationFile, BAMFile, thresholdWithinTSS = 4000, thresholdOutsideTSS = 1000,
-                                   readFlags, geneTypesToKeep = c("protein_coding"), verbose = TRUE) {
-
-    checkAndLoadPackages(c("tools", "GenomicRanges"), verbose = verbose)
-
-    assertFile(annotationFile, access = "r")
-    assertFile(BAMFile, access = "r")
-    assertSubset(tolower(file_ext(BAMFile)), c("bam"))
-    assertInt(thresholdWithinTSS, lower = 1)
-    assertInt(thresholdOutsideTSS, lower = 1)
-    assert(checkNull(readFlags), checkIntegerish(readFlags, len = 2, lower = 0))
-    assertCharacter(geneTypesToKeep, min.chars = 1, min.len = 1)
-    assertFlag(verbose)
-
-    if (testNull(readFlags)) {
-        readFlags = scanBamFlag()
-    }
-
-
-    flog.info(paste0("Check enrichment over TSS for file: ", BAMFile))
-    flog.info(paste0("thresholdWithinTSS=", thresholdWithinTSS, ", thresholdOutsideTSS=", thresholdOutsideTSS))
-
-    flog.info(paste("Retrieving TSS annotation from file", annotationFile, sep = " "))
-    filename_TSS = paste0(annotationFile, ".TSS_nonOverlapping", (thresholdWithinTSS + thresholdOutsideTSS), ".bed")
-
-    if (!file.exists(filename_TSS)) {
-
-        flog.info(paste0("Could not find file ", filename_TSS, ", calculating TSS anew..."))
-
-        # Read only particular columns (positions and type)
-        colsToRead = c("character", "NULL", "character", "integer", "integer", "NULL", "character", "NULL", "character")
-
-        genomeAnnotation.df = read.table(annotationFile, colClasses = colsToRead, skip = 5, sep = "\t")
-        colnames(genomeAnnotation.df) = c("chr", "type", "start", "end", "strand", "annot")
-
-        # How to define a TSS? Current solution:
-        # The first nucleotide of the first exon is approximately the transcription start site. The translation start site (i.e., the start of the CDS) is located somewhere within the transcript and does not necessarily have to be in the first exon at all.
-        # The first nucleotide of the first exon is therefore a good approximation. The main caveat is that the exact TSS can shift around a bit depending on tissue and condition, so genome annotations are only so accurate. But they are a good start. 
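-        # A minimal strand-aware sketch of this TSS approximation (illustrative toy
-        # values, not part of the original pipeline):
-        #   ex  <- GRanges("chr1", IRanges(c(100, 500), c(200, 600)), strand = c("+", "-"))
-        #   tss <- resize(ex, width = 1, fix = "start")   # 5' end of each range
-        # resize(fix = "start") respects strand, so the "-" range keeps coordinate 600.
-        # Also note a caveat with the -which(!...) filter on the next line: if the
-        # annotation contains only exon entries, which() returns integer(0) and
-        # df[-integer(0), ] drops ALL rows; the equivalent positive filter
-        # df[df$type == "exon", ] avoids that edge case.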
- - genomeAnnotation.df = genomeAnnotation.df[-which(!genomeAnnotation.df$type %in% c("exon")), ] - - annot = strsplit(x = genomeAnnotation.df$annot, ";", fixed = TRUE) - - genomeAnnotation.df$ID = gsub(pattern = "gene_id ", replacement = "", unlist(lapply(annot, "[[", 1))) - - - # TSS for minus strand is NOT identical as if it was the plus strand - # See http://www.researchgate.net/post/Can_someone_orient_me_to_the_coordinates_numbering_between_positive_and_negative_strand2 - - gr.list = list() - - for (strandCur in c("+", "-")) { - - genomeAnnotationCur.df = genomeAnnotation.df[which(genomeAnnotation.df$strand == strandCur), ] - - if (strandCur == "+") { - genomeAnnotationCur.df = genomeAnnotationCur.df[order(genomeAnnotationCur.df$ID, genomeAnnotationCur.df$start), ] - } else { - genomeAnnotationCur.df = genomeAnnotationCur.df[order(genomeAnnotationCur.df$ID, -genomeAnnotationCur.df$end), ] - } - - # Find duplicate rows given the gene ID, quicker solution - genomeAnnotationRed = genomeAnnotationCur.df$ID - isDuplicate = duplicated(genomeAnnotationRed) - genomeAnnotationRed.df = genomeAnnotationCur.df[!isDuplicate, ] - - if (strandCur == "+") { - gr.list[[strandCur]] <- GRanges(seqnames = Rle(genomeAnnotationRed.df$chr), - strand = genomeAnnotationRed.df$strand, - ranges = IRanges(start = genomeAnnotationRed.df$start, - end = genomeAnnotationRed.df$start, - names = genomeAnnotationRed.df$ID), - annot = genomeAnnotationRed.df$annot) - - } else { - gr.list[[strandCur]] <- GRanges(seqnames = Rle(genomeAnnotationRed.df$chr), - strand = genomeAnnotationRed.df$strand, - ranges = IRanges(start = genomeAnnotationRed.df$end, - end = genomeAnnotationRed.df$end, - names = genomeAnnotationRed.df$ID), - annot = genomeAnnotationRed.df$annot) - - } - - - - } - - # https://support.bioconductor.org/p/93347/#93373 - - - gr = c(gr.list[[1]], gr.list[[2]]) - gr <- sort(sortSeqlevels(gr)) - - flog.info(paste0("Found", length(gr), " TSS")) - - # If TSS are too close together, the calculation might be biased. 
Count only reads that uniquely map to one TSS
-        # Exclude TSS that are too close together: delete TSS entries so that all retained TSS are at least (thresholdWithinTSS + thresholdOutsideTSS) bp apart
-
-        # Compute the distance to the neighboring element
-        res2 = distance(gr[-length(gr)], gr[-1])
-        indexes = which(res2 < (thresholdWithinTSS + thresholdOutsideTSS))
-
-        # For each offending index i, delete the i-th and (i+1)-th TSS
-        deleteRows = sort(unique(c(indexes, (indexes + 1))))
-        flog.info(paste0("Deleting ", length(deleteRows), " rows because they are located within ", (thresholdWithinTSS + thresholdOutsideTSS), " bp"))
-
-        gr_red = gr[-deleteRows]
-
-        write.table(paste(as.character(seqnames(gr_red)),
-                          as.character(ranges(gr_red)@start),
-                          as.character(ranges(gr_red)@start),
-                          names(gr_red),
-                          mcols(gr_red)$annot,
-                          as.character(strand(gr_red)),
-                          collapse = "\n", sep = "\t"),
-                    file = filename_TSS, row.names = FALSE, quote = FALSE, col.names = FALSE, sep = "\t")
-
-    }
-
-    TSS.df = read.table(filename_TSS, sep = "\t")
-    colnames(TSS.df) = c("chr", "start", "end", "ID", "annotation", "strand")
-
-
-    if (length(geneTypesToKeep) > 0 & length(which(geneTypesToKeep == "all")) == 0) {
-        flog.info(paste0("Use only the following gene types and discard all others: ", paste(geneTypesToKeep, collapse = ", ")))
-        TSS.df = TSS.df[which(grepl(paste(geneTypesToKeep, collapse = "|"), TSS.df$annotation)), ]
-    }
-
-    TSS.GRanges = GRanges(seqnames = Rle(TSS.df$chr), ranges = IRanges(start = TSS.df$start, end = TSS.df$end), strand = TSS.df$strand, annotation = TSS.df$ID)
-
-    # Compile a list of regions in the vicinity of the TSS and count the reads that fall within them
-    TSS_within = promoters(TSS.GRanges, upstream = thresholdWithinTSS/2, downstream = thresholdWithinTSS/2)
-
-    # Compile a list of regions outside the vicinity of the TSS and count the reads that fall within them
-    TSS_outside1 = flank(TSS_within, width = thresholdOutsideTSS, start = TRUE)
-    #TSS_outside2 = flank(TSS_within, width=thresholdWithinTSS/2, start= FALSE)
-
-    #TSS_outside = c(TSS_outside1, TSS_outside2)
-
-    param <- ScanBamParam(which = TSS_within, flag = readFlags)
-    counts_within = countBam(BAMFile, param = param)
-
-    param <- ScanBamParam(which = TSS_outside1, flag = readFlags)
-    counts_outside1 = countBam(BAMFile, param = param)
-
-    #param <- ScanBamParam(which=TSS_outside2, flag = readFlags)
-    #counts_outside2 = countBam(BAMFile, param=param)
-
-    # How many reads are inside a thresholdWithinTSS/2 bp (to either side) window of the TSS?
-    nReads_within = sum(counts_within$records)
-    nReads_outside = sum(counts_outside1$records) #+ sum(counts_outside2$records)
-
-    enrichment = nReads_within / nReads_outside
-
-    fractionWindowSizes = thresholdWithinTSS / thresholdOutsideTSS
-    enrichmentCorrected = enrichment / fractionWindowSizes
-
-
-    counts_within = scanBam(BAMFile, param = ScanBamParam(what = c("isize")))
-
-
-
-    flog.info(paste("Found ", nReads_within, " reads within [-",
-                    thresholdWithinTSS/2, ":+", thresholdWithinTSS/2, "] bp of annotated TSS and ",
-                    nReads_outside, " reads in (-",
-                    ((thresholdWithinTSS/2) + thresholdOutsideTSS),
-                    ":-", thresholdWithinTSS/2, #" and +",
-                    #thresholdWithinTSS/2, ":+", ((thresholdWithinTSS/2) + thresholdOutsideTSS/2),
-                    "). 
", "Overall TSS enrichment: ", - round(enrichment, 2), "(corrected:", - round(enrichmentCorrected, 2), - ")", sep = " ")) - - return(list("countsAroundTSS" = counts_within, - "countsUpstreamTSS" = counts_outside1, - "overallEnrichmentRaw" = round(enrichment, 2), - "overallEnrichmentNorm" = round(enrichmentCorrected, 2), - #"countsDownstreamTSS" = counts_outside2, - "par" = list( - "gencodeVersion" = annotationFile, - "BAMFile" = BAMFile, - "thresholdWithinTSS" = thresholdWithinTSS, - "thresholdOutsideTSS" = thresholdOutsideTSS, - "readFlags" = readFlags - - ) - - )) - -} - - -splitStringInMultipleLines <- function(input, width, sepChar = "\n", verbose = TRUE) { - - assertCharacter(input, min.len = 1) - assertInt(width, lower = 1) - assertCharacter(sepChar, len = 1) - - - as.character(sapply(input, function(x) { - paste0(strsplit(x, paste0("(?<=.{", width, "})"), perl = TRUE)[[1]], collapse = sepChar) - } - )) - -} - - -doPCAPlot <- function(file_readCounts, file_metadata, file_output, file_log, metadataToInclude, type, - binSize = 10000, filterChr = TRUE, splitByGCCorrection = TRUE, minNoCountsRows = 0) { - - - start.time <- Sys.time() - par.l = list() - par.l$log_minlevel = "INFO" - - checkAndLoadPackages(c("tidyverse", "futile.logger", "checkmate", "tools", "methods", "DESeq2", "readr"), verbose = TRUE) - - - - coldata = read_tsv(file_metadata) - - if (nrow(problems(coldata)) > 0) { - flog.fatal(paste0("Parsing errors: "), problems(coldata), capture = TRUE) - stop("Error when parsing the file ", file_metadata, ", see errors above") - } - - startLogger(file_log, par.l$log_minlevel, removeOldLog = TRUE) - - countsPCA = read_tsv(file_readCounts) - if (nrow(problems(countsPCA)) > 0) { - flog.fatal(paste0("Parsing errors: "), problems(countsPCA), capture = TRUE) - stop("Error when parsing the file ", file_readCounts, ", see errors above") - } - - # Test purposes - #countsPCA = countsPCA[1:1000,] - - colnames(countsPCA)[1] = "chr" - - if (filterChr) { - countsPCA = dplyr::filter(countsPCA, !grepl("random|chrUn|chrM|hap|_gl", chr, perl = TRUE)) - } - - - if (type == "all") { - colnames(countsPCA) = gsub(".final.bam", "", colnames(countsPCA)) - - } else if (type == "pooled") { - colnames(countsPCA) = gsub(".merged", "", colnames(countsPCA)) - colnames(countsPCA) = gsub(".final.bam", "", colnames(countsPCA)) - - # Rewrite coldata - coldata = coldata[duplicated(coldata[,"individual"]),] - coldata$sampleName = coldata$individual - - } else { - flog.fatal(paste0("Unsupported type ", type)) - } - - colnames(countsPCA) = gsub("'", "", colnames(countsPCA)) - - - - allMatrices.l = list() - allColdata.l = list() - - if (splitByGCCorrection) { - - countsPCA_original = dplyr::select(countsPCA, -contains("noGCBias")) - allMatrices.l[["original"]] = as.matrix(countsPCA_original[,-c(1:3)]) - allColdata.l [["original"]] = coldata - - countsPCA_GC = dplyr::select(countsPCA, contains("noGCBias")) - allMatrices.l[["GCCorrected"]] = as.matrix(countsPCA_GC) - coldata_GC = coldata - coldata_GC$sampleName = paste0(coldata_GC$sampleName, ".noGCBias") - coldata_GC$individual = paste0(coldata_GC$individual, ".noGCBias") - allColdata.l [["GCCorrected"]] = coldata_GC - - allMatrices.l[["all"]] = as.matrix(countsPCA[,-c(1:3)]) - coldata_all = rbind(coldata, coldata_GC) - allColdata.l [["all"]] = coldata_all - - } else { - - allMatrices.l[["all"]] = countsPCA - } - - - pdf(file_output) - - for (matrixCur in names(allMatrices.l)) { - - flog.info(paste0("Generate plots for group ", matrixCur)) - - matrixCur.m = 
allMatrices.l[[matrixCur]]
-        colDataCur = allColdata.l[[matrixCur]]
-
-        assertSubset(colnames(matrixCur.m), colDataCur$sampleName)
-        # Sort the matrix columns to ensure the same order as in the coldata table
-        matrixCur.m = matrixCur.m[, colDataCur$sampleName]
-
-        # Use DESeq2 for the transformation
-        dds <- DESeqDataSetFromMatrix(countData = matrixCur.m, colData = colDataCur, design = ~1)
-
-        if (minNoCountsRows > 0) {
-            dds <- dds[ rowSums(counts(dds)) > minNoCountsRows, ]
-        }
-
-        vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
-
-        for (nrowsCur in c(500, 5000, 50000, nrow(vsd))) {
-
-            for (varCur in metadataToInclude) {
-                pcadata = plotPCA(vsd, intgroup = varCur, ntop = nrowsCur, returnData = TRUE)
-
-                percentVar <- round(100 * attr(pcadata, "percentVar"))
-                g = ggplot(pcadata, aes_string("PC1", "PC2", color = varCur)) +
-                    geom_point(size = 2) +
-                    xlab(paste0("PC1: ", percentVar[1], "% variance")) +
-                    ylab(paste0("PC2: ", percentVar[2], "% variance")) +
-                    coord_fixed() + theme_bw() +
-                    ggtitle(paste0("Group ", matrixCur, "\nTop ", .prettyNum(nrowsCur), " ", binSize, " bp bins")) +
-                    theme(plot.title = element_text(hjust = 0.5))
-                plot(g)
-
-            }
-
-        }
-    }
-
-    dev.off()
-
-
-
-}
diff --git a/src/Snakemake/dev/src/functions_beta.R b/src/Snakemake/dev/src/functions_beta.R
deleted file mode 100644
index e3b18a3..0000000
--- a/src/Snakemake/dev/src/functions_beta.R
+++ /dev/null
@@ -1,533 +0,0 @@
-
-# TODO: Set 0 counts to 0.1 or 1 to allow enrichment values
-# Should paired-end reads be counted twice? Strange effect with even and odd numbers
-
-
-# modified from: https://github.com/imbforge/encodeChIPqc/blob/master/R/PBC.R
-#' PCR bottleneck coefficient
-#'
-#' Calculate the PCR bottleneck coefficient as described in the ENCODE
-#' guidelines.
-#'
-#' The PCR bottleneck coefficient (PBC) is a measure of library complexity, i.e. how skewed the
-#' distribution of read counts per location is towards 1 read per location.
-#'
-#' Defined in the ENCODE guidelines (https://genome.ucsc.edu/ENCODE/qualityMetrics.html) as:
-#'
-#' PBC = N1/Nd
-#'
-#' with
-#' \itemize{
-#' \item{\code{N1}: Number of genomic locations to which EXACTLY one unique mapping read maps.}
-#' \item{\code{Nd}: Number of genomic locations to which AT LEAST one unique mapping read maps, i.e.
-#' the number of non-redundant, unique mapping reads.}
-#' }
-#'
-#' PBC is further described on the ENCODE Software Tools page. Provisionally, 0-0.5 is severe
-#' bottlenecking, 0.5-0.8 is moderate bottlenecking, 0.8-0.9 is mild bottlenecking, while 0.9-1.0
-#' is no bottlenecking. Very low values can indicate a technical problem, such as PCR bias, or a
-#' biological finding, such as a very rare genomic feature. Nuclease-based assays (DNase, MNase)
-#' detecting features with base-pair resolution (transcription factor footprints, positioned
-#' nucleosomes) are expected to recover the same read multiple times, resulting in a lower PBC
-#' score for these assays. Note that the most complex library, random DNA, would approach 1.0,
-#' thus the very highest values can indicate technical problems with libraries. It is the practice
-#' for some labs outside of ENCODE to remove redundant reads; after this has been done, the value
-#' for this metric is 1.0, and this metric is not meaningful. 82\% of TF ChIP, 89\% of His ChIP, 77\%
-#' of DNase, 98\% of FAIRE, and 97\% of control ENCODE datasets have no or mild bottlenecking.
-#'
-#' @param BAMFile The path to the \code{.bam} file of a ChIP sample or a \code{GAlignments} object of the ChIP sample.
-#' @return The PBC coefficient. 
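-#'
-#' A toy numeric check (illustrative values, not from the original documentation):
-#' for per-position read counts of c(1, 1, 1, 2, 5), N1 = 3 and Nd = 5, giving
-#' PBC = 3/5 = 0.6, i.e. "moderate" bottlenecking under the ChIP thresholds above.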
-#' @examples
-#' pbc <- calcPBCValues("IP.bam")
-calcPBCValues <- function(BAMFile, verbose = TRUE) {
-
-    require(GenomicAlignments)
-    require(data.table)
-
-    assertFlag(verbose)
-    assert(checkFile(BAMFile, access = "r"), checkClass(BAMFile, "GAlignments"))
-
-    aln <- BAMFile
-
-    if (!is(BAMFile, "GAlignments")) {
-        aln <- readGAlignments(BAMFile)
-    }
-
-
-    # convert the GAlignments object to a data.table for fast aggregation
-    aln <- data.table(
-        strand = as.factor(BiocGenerics::as.vector(strand(aln))),
-        seqnames = as.factor(BiocGenerics::as.vector(seqnames(aln))),
-        pos = ifelse(BiocGenerics::as.vector(strand(aln)) == "+", start(aln), end(aln))
-    )
-
-    # aggregate reads by position and count them
-    readsPerPosition <- aln[, list(count = .N), by = list(strand, seqnames, pos)]$count
-
-    # PBC1 = positions with exactly 1 read / positions with at least 1 read
-    m1 = sum(readsPerPosition == 1)
-    mDistinct = length(readsPerPosition)
-    PBC1 <- m1 / mDistinct
-
-    # PBC2 = positions with exactly 1 read / positions with exactly 2 reads
-    m2 = sum(readsPerPosition == 2)
-    PBC2 <- m1 / m2
-
-
-    res.l = list("PBC1" = list(
-        "PBC1" = PBC1,
-        "M1" = m1,
-        "M_DISTINCT" = mDistinct
-    ),
-    "PBC2" = list(
-        "PBC2" = PBC2,
-        "M1" = m1,
-        "M2" = m2
-    ))
-
-    return(res.l)
-}
-
-
-annotationFile = "/g/scb2/zaugg/zaugg_shared/annotations/hg19/Gencode_v19/gencode.v19.annotation.gtf"
-BAMFile = "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/output/8.FinalOutput/test1_rep1.final.s.bam"
-# checkEnrichmentOverTSS(annotationFile, BAMFile, plotDistrFile = "test.pdf")
-
-
-calcLibraryComplexity <- function(BAMFile, experimentType = "ATAC-Seq", verbose = TRUE) {
-
-    assertFile(BAMFile, access = "r")
-    assertSubset(experimentType, c("ATAC-Seq", "ChIP-Seq"))
-    assertFlag(verbose)
-
-    flog.info(paste0("Calculate the following measures for file ", BAMFile, ":"))
-    flog.info(paste0("1) Non-Redundant Fraction (NRF) - Number of distinct uniquely mapping reads (i.e. after removing duplicates) / Total number of reads. 
")) - flog.info(paste0("2) PCR Bottlenecking Coefficient 1 (PBC1): PBC1= M1 / M_DISTINCT")) - flog.info(paste0("3) PCR Bottlenecking Coefficient 2 (PBC2): PBC2= M1 / M2")) - flog.info(paste0(" M1: number of genomic locations where exactly one read maps uniquely")) - flog.info(paste0(" M_DISTINCT: number of distinct genomic locations to which some read maps uniquely")) - flog.info(paste0(" M2: number of genomic locations where two reads map uniquely")) - flog.info(paste0("The following evaluations are based on the numbers from https://www.encodeproject.org/data-standards/terms/#library, January 2017")) - - flog.info(paste0("Results for ", experimentType, " data:")) - - PBC.l = calcPBCValues(BAMFile, verbose = verbose) - PBC1 = PBC.l$PBC1$PBC1 - PBC2 = PBC.l$PBC2$PBC2 - - - # The following numbers are taken from https://www.encodeproject.org/data-standards/terms/#library, January 2017 - - # Structure of list: two elements correspond to PBC1 (lower and upper limit, respectively) - status_PBC1.l = list( - "ATAC-Seq" = list( - "severe" = c(0, 0.7), - "moderate" = c(0.7, 0.9), - "none" = c(0.9, 1) - ), - "ChIP-Seq" = list( - "severe" = c(0, 0.5), - "moderate" = c(0.5, 0.8), - "mild" = c(0.8, 0.9), - "none" = c(0.9, 1) - ) - - ) - - statusPBC1 = NA - for (statusCur in names(status_PBC1.l[[experimentType]])) { - if (PBC1 >= status_PBC1.l[[experimentType]][[statusCur]][1] && PBC1 <= status_PBC1.l[[experimentType]][[statusCur]][2]) { - statusPBC1 = statusCur - } - } - - - flog.info(paste0(" PBC1=", round(PBC1,2), ", bottlenecking level \"", statusPBC1, - "\" (bottlenecking levels: ", paste0(names(status_PBC1.l[[experimentType]]), collapse = ", "), ")")) - - - # The following numbers are taken from https://www.encodeproject.org/data-standards/terms/#library, January 2017 - - # Structure of list: two elements correspond to PBC2 (lower and upper limit, respectively) - - status_PBC2.l = list( - "ATAC-Seq" = list( - "severe" = c(0, 1), - "moderate" = c(1, 3), - "none" = c(3, 1000000) - ), - "ChIP-Seq" = list( - "severe" = c(0, 1), - "moderate" = c(1, 3), - "mild" = c(3, 10), - "none" = c(10, 1000000) - ) - ) - - - statusPBC2 = NA - for (statusCur in names(status_PBC2.l[[experimentType]])) { - if (PBC2 >= status_PBC2.l[[experimentType]][[statusCur]][1] && PBC2 <= status_PBC2.l[[experimentType]][[statusCur]][2]) { - statusPBC2 = statusCur - } - } - - - flog.info(paste0(" PBC2=", round(PBC2,2), ", bottlenecking level \"", statusPBC2, - "\" (bottlenecking levels: ", paste0(names(status_PBC2.l[[experimentType]]), collapse = ", "), ")")) - - - - ############### - ## NRF status # - ############### - - NRF = 1 - #NRF = nReadsDistinctUnique / nReadsTotal - - # The following numbers are taken from https://www.encodeproject.org/data-standards/terms/#library, January 2017 - - # Structure of list: two elements correspond to NRF (lower and upper limit, respectively) - - status_NRF.l = list( - "ATAC-Seq" = list( - "concerning" = c(0, 0.7), - "acceptable" = c(0.7, 0.9), - "ideal" = c(0.9, 1) - ), - "ChIP-Seq" = list( - "concerning" = c(0, 0.5), - "acceptable" = c(0.5, 0.8), - "compliant" = c(0.8, 0.9), - "ideal" = c(0.9, 1) - ) - - ) - - statusNRF = "NA" - for (statusCur in names(status_NRF.l[[experimentType]])) { - if (NRF >= status_NRF.l[[experimentType]][[statusCur]][1] && NRF <= status_NRF.l[[experimentType]][[statusCur]][2]) { - statusNRF = statusCur - } - } - - - flog.info(paste0(" NRF=", NRF, ", library complexity \"", statusNRF, - "\" (complexity levels: ", paste0(names(status_NRF.l[[experimentType]]), collapse 
= ", "), ")")) - - - -} - - -checkEnrichmentOverTSS <- function(annotationFile, BAMFile, thresholdWithinTSS = 4000, thresholdOutsideTSS = 1000, - readFlags = NULL, geneTypesToKeep = c("protein_coding"), doAlsoEncodeVariant = TRUE, assemblyVersion = NULL, plotDistrFile = NULL, verbose = TRUE) { - - checkAndLoadPackages(c("tools", "GenomicRanges"), verbose = verbose) - - assertFile(annotationFile, access = "r") - assertFile(BAMFile, access = "r") - assertSubset(tolower(file_ext(BAMFile)), c("bam")) - assertInt(thresholdWithinTSS, lower = 1) - assertInt(thresholdOutsideTSS, lower = 1) - assert(checkNull(readFlags), checkIntegerish(readFlags, len = 2, lower = 0)) - assert(checkNull(assemblyVersion), checkSubset(assemblyVersion, c("hg19", "hg38", "mm9", "mm10"))) - assertCharacter(geneTypesToKeep, min.chars = 1, min.len = 1) - assertFlag(verbose) - assert(checkNull(plotDistrFile), checkCharacter(plotDistrFile, len = 1)) - if (!testNull(plotDistrFile)) { - assertDirectoryExists(dirname(plotDistrFile), access = "w") - } - - - if (testNull(readFlags)) { - readFlags = scanBamFlag() - } - - - flog.info(paste0("Check enrichment over TSS for file: ", BAMFile)) - flog.info(paste("thresholdWithinTSS=", thresholdWithinTSS, ", thresholdOutsideTSS=", thresholdOutsideTSS)) - - flog.info(paste("Retrieving TSS annotation from file", annotationFile, sep = " ")) - filename_TSS = paste0(annotationFile, ".TSS_nonOverlapping", (thresholdWithinTSS + thresholdOutsideTSS), ".bed") - - if (!file.exists(filename_TSS)) { - - flog.info(paste0("Could not find file", filename_TSS, ", calculating TSS anew...", sep = " ")) - - # Read only particular columns(positions and type) - colsToRead = c("character", "NULL", "character", "integer", "integer", "NULL", "character", "NULL", "character") - - genomeAnnotation.df = read.table(annotationFile, colClasses = colsToRead, skip = 5, sep = "\t") - colnames( genomeAnnotation.df) = c("chr", "type", "start", "end", "strand", "annot") - - # How to define a TSS? Current solution: - # The first codon of the first exon is approximately the transcription start site. The translation start site(i.e. the start of the CDS) is located somewhere within the transcript, but doesn't necessarily have to be in the first exon at all. - # The first nucleotide of the first exon is a good approximation. The main issue with it is that the exact TSS can shift around a bit depending on tissue and condition, so the genome annotations are only so accurate. But they're a good start. 
- - genomeAnnotation.df = genomeAnnotation.df[-which(!genomeAnnotation.df$type %in% c("exon")), ] - - annot = strsplit(x = genomeAnnotation.df$annot, ";", fixed = TRUE) - - genomeAnnotation.df$ID = gsub(pattern = "gene_id ", replacement = "", unlist(lapply(annot, "[[", 1))) - - - # TSS for minus strand is NOT identical as if it was the plus strand - # See http://www.researchgate.net/post/Can_someone_orient_me_to_the_coordinates_numbering_between_positive_and_negative_strand2 - - gr.list = list() - - for (strandCur in c("+", "-")) { - - genomeAnnotationCur.df = genomeAnnotation.df[which(genomeAnnotation.df$strand == strandCur), ] - - if (strandCur == "+") { - genomeAnnotationCur.df = genomeAnnotationCur.df[order(genomeAnnotationCur.df$ID, genomeAnnotationCur.df$start), ] - } else { - genomeAnnotationCur.df = genomeAnnotationCur.df[order(genomeAnnotationCur.df$ID, -genomeAnnotationCur.df$end), ] - } - - # Find duplicate rows given the gene ID, quicker solution - genomeAnnotationRed = genomeAnnotationCur.df$ID - isDuplicate = duplicated(genomeAnnotationRed) - genomeAnnotationRed.df = genomeAnnotationCur.df[!isDuplicate, ] - - if (strandCur == "+") { - gr.list[[strandCur]] <- GRanges(seqnames = Rle(genomeAnnotationRed.df$chr), - strand = genomeAnnotationRed.df$strand, - ranges = IRanges(start = genomeAnnotationRed.df$start, - end = genomeAnnotationRed.df$start, - names = genomeAnnotationRed.df$ID), - annot = genomeAnnotationRed.df$annot) - - } else { - gr.list[[strandCur]] <- GRanges(seqnames = Rle(genomeAnnotationRed.df$chr), - strand = genomeAnnotationRed.df$strand, - ranges = IRanges(start = genomeAnnotationRed.df$end, - end = genomeAnnotationRed.df$end, - names = genomeAnnotationRed.df$ID), - annot = genomeAnnotationRed.df$annot) - - } - - - - } - - gr = c(gr.list[[1]], gr.list[[2]]) - gr <- sort(sortSeqlevels(gr)) - - flog.info(paste0("Found", length(gr), " TSS")) - - # If TSS are too close together, the calculation might be biased. 
Count only reads that uniquely map to one TSS
-        # Exclude TSS that are too close together: delete TSS entries so that all retained TSS are at least (thresholdWithinTSS + thresholdOutsideTSS) bp apart
-
-        # Compute the distance to the neighboring element
-        res2 = distance(gr[-length(gr)], gr[-1])
-        indexes = which(res2 < (thresholdWithinTSS + thresholdOutsideTSS))
-
-        # For each offending index i, delete the i-th and (i+1)-th TSS
-        deleteRows = sort(unique(c(indexes, (indexes + 1))))
-        flog.info(paste0("Deleting ", length(deleteRows), " rows because they are located within ", (thresholdWithinTSS + thresholdOutsideTSS), " bp"))
-
-        gr_red = gr[-deleteRows]
-
-        write.table(paste(as.character(seqnames(gr_red)),
-                          as.character(ranges(gr_red)@start),
-                          as.character(ranges(gr_red)@start),
-                          names(gr_red),
-                          mcols(gr_red)$annot,
-                          as.character(strand(gr_red)),
-                          collapse = "\n", sep = "\t"),
-                    file = filename_TSS, row.names = FALSE, quote = FALSE, col.names = FALSE, sep = "\t")
-
-    }
-
-    TSS.df = read.table(filename_TSS, sep = "\t")
-    colnames(TSS.df) = c("chr", "start", "end", "ID", "annotation", "strand")
-
-
-    if (length(geneTypesToKeep) > 0 & length(which(geneTypesToKeep == "all")) == 0) {
-        flog.info(paste0("Use only the following gene types and discard all others: ", paste(geneTypesToKeep, collapse = ", ")))
-        TSS.df = TSS.df[which(grepl(paste(geneTypesToKeep, collapse = "|"), TSS.df$annotation)), ]
-    }
-
-    TSS.GRanges = GRanges(seqnames = Rle(TSS.df$chr), ranges = IRanges(start = TSS.df$start, end = TSS.df$end), strand = TSS.df$strand, annotation = TSS.df$ID)
-
-    # Compile a list of regions in the vicinity of the TSS and count the reads that fall within them
-    TSS_within = promoters(TSS.GRanges, upstream = thresholdWithinTSS/2, downstream = thresholdWithinTSS/2)
-
-    # Compile a list of regions outside the vicinity of the TSS and count the reads that fall within them
-    TSS_outside1 = flank(TSS_within, width = thresholdOutsideTSS, start = TRUE)
-    #TSS_outside2 = flank(TSS_within, width=thresholdWithinTSS/2, start= FALSE)
-    #TSS_outside = c(TSS_outside1, TSS_outside2)
-
-
-
-
-    param <- ScanBamParam(which = TSS_within, flag = readFlags)
-    counts_within = countBam(BAMFile, param = param)
-
-    param <- ScanBamParam(which = TSS_within, what = c("rname", "strand", "pos", "qwidth"), flag = readFlags)
-    reads_within = scanBam(BAMFile, param = param)
-
-    bamCoverage <- CoverageBamFile(BAMFile, run_type = "paired")
-
-    binWidth = 50
-    normalizationProcedure = "rpm"
-    normalizationProcedure = NULL
-
-    # Create a bed file
-    filename_TSS = "TSS.bed"
-    export.bed(TSS.GRanges[1:500,], con = filename_TSS)
-
-    cov.m = cov.matrix(bamCoverage, coordfile = filename_TSS, normalization = normalizationProcedure,
-                       extend = 2000, num_cores = 4, bin_width = binWidth)
-
-    # Does not seem to work properly...
-    draw.heatmap(cov.m, outfile = "TSS_profile.png")
-
-    library(GenomicAlignments)
-    galign = readGAlignmentPairs(BAMFile, param = param)
-    reads_coverage = GenomicAlignments::coverage(galign)
-
-    reads_coverage1 = GenomicAlignments::coverage(BamFile(BAMFile, asMates=TRUE), param = param)
-    reads_coverage2 = GenomicAlignments::coverage(BamFile(BAMFile, asMates=TRUE))
-
-    # Delete chromosomes we don't care about, also prevents an error message... 
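-    # (A name-based subset would be a more robust alternative to the positional
-    # [1:24] below, assuming standard human chromosome naming:
-    #   reads_coverage1 <- reads_coverage1[paste0("chr", c(1:22, "X", "Y"))]
-    # positional indexing silently selects the wrong contigs if the BAM header
-    # order ever changes.)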
- reads_coverage1 = reads_coverage1[1:24] - - aggregate(reads_coverage1, TSS.GRanges[1:5,], FUN = mean) - - param <- ScanBamParam(which = TSS_outside1, flag = readFlags) - counts_outside1 = countBam(BAMFile, param = param) - - - oo <- order(as.factor(seqnames(TSS.GRanges[1:500,]))) - myRegions <- TSS.GRanges[1:500,][oo] - - GRangesList(myRegions) - - res = Views(reads_coverage1, as(myRegions,"RangesList")) - - - pparam <- PileupParam(distinguish_nucleotides=FALSE, distinguish_strands=TRUE) - res = pileup(BamFile(BAMFile, asMates=TRUE), scanBamParam=param, pileupParam=pparam) - - - #param <- ScanBamParam(which=TSS_outside2, flag = readFlags) - #counts_outside2 = countBam(BAMFile, param=param) - - if (doAlsoEncodeVariant) { - - - thresholdWithinTSSTotalEncode = 2000 - TSS_withinEncode = promoters(TSS.GRanges, upstream = thresholdWithinTSSTotalEncode/2, downstream = thresholdWithinTSSTotalEncode/2) - - thresholdOutsideTSSTotalEncode = 100 - TSS_outsideEncode1 = flank(TSS_withinEncode, width = thresholdOutsideTSSTotalEncode, start= FALSE, both = FALSE) - TSS_outsideEncode2 = flank(TSS_withinEncode, width = thresholdOutsideTSSTotalEncode, start= TRUE , both = FALSE) - - param <- ScanBamParam(which = TSS_withinEncode, flag = readFlags) - counts_withinEncode = countBam(BAMFile, param = param) - - param <- ScanBamParam(which = TSS_outsideEncode1, flag = readFlags) - counts_outsideEncode1 = countBam(BAMFile, param = param) - - param <- ScanBamParam(which = TSS_outsideEncode2, flag = readFlags) - counts_outsideEncode2 = countBam(BAMFile, param = param) - - # Counts reads outside and calculate fold change for inside - - # Calculate average coverage in the 100bp flanking regions - - # Fold change based on the average read depth from the flanking regions - - # value at the center, the TSS, as signal value - - # - - - # TSS enrichment result evaluation, based on https://www.encodeproject.org/data-standards/atac-seq/ - - status_TSS_Enrichment.l = list( - "hg19" = list( - "concerning" = c(0, 6), - "acceptable" = c(6, 10), - "ideal" = c(10,1000000) - ), - "hg38" = list( - "concerning" = c(0, 5), - "acceptable" = c(5, 7), - "ideal" = c(7,1000000) - ), - "mm9" = list( - "concerning" = c(0, 5), - "acceptable" = c(5, 7), - "ideal" = c(7,1000000) - ), - "mm10" = list( - "concerning" = c(0, 10), - "acceptable" = c(10, 15), - "ideal" = c(15,1000000) - ) - - ) - - assertSubset(assemblyVersion, names(status_TSS_Enrichment.l)) - - - status = "NA" - for (statusCur in names(status_TSS_Enrichment.l[[assemblyVersion]])) { - if (enrichment >= status_TSS_Enrichment.l[[assemblyVersion]][[statusCur]][1] && enrichment <= status_TSS_Enrichment.l[[assemblyVersion]][[statusCur]][2]) { - status = statusCur - } - } - - - - } - - - # How many reads are inside a thresholdWithinTSS/2 bp(to either side) window of the TSS? - nReads_within = sum(counts_within$records) - nReads_outside = sum(counts_outside1$records) #+ sum(counts_outside2$records) - - enrichment = nReads_within / nReads_outside - - fractionWindowSizes = thresholdWithinTSS / thresholdOutsideTSS - enrichmentCorrected = enrichment / fractionWindowSizes - - # TODO: WHy here? 
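-    # Note: `enrichment` is only assigned a few lines above this point, i.e. after
-    # the doAlsoEncodeVariant block has already tried to read it; as written, that
-    # block would fail at run time with an "object 'enrichment' not found" error.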
- #counts_within = scanBam(BAMFile, param = ScanBamParam(what = c("isize"))) - - - - flog.info(paste("Found ", nReads_within, " reads within [-", - thresholdWithinTSS/2, ":+", thresholdWithinTSS/2, "] bp of annotated TSS and ", - nReads_outside, " reads in(-", - ((thresholdWithinTSS/2) + thresholdOutsideTSS), - ":-", thresholdWithinTSS/2, #" and +", - #thresholdWithinTSS/2, ":+", ((thresholdWithinTSS/2) + thresholdOutsideTSS/2), - "). ", "Overall TSS enrichment: ", - round(enrichment, 2), "(corrected:", - round(enrichmentCorrected, 2), - ")", sep = " ")) - - return(list("countsAroundTSS" = counts_within, - "countsUpstreamTSS" = counts_outside1, - "overallEnrichmentRaw" = round(enrichment, 2), - "overallEnrichmentNorm" = round(enrichmentCorrected, 2), - #"countsDownstreamTSS" = counts_outside2, - "par" = list( - "gencodeVersion" = annotationFile, - "BAMFile" = BAMFile, - "thresholdWithinTSS" = thresholdWithinTSS, - "thresholdOutsideTSS" = thresholdOutsideTSS, - "readFlags" = readFlags - - ) - - )) - -} \ No newline at end of file diff --git a/src/Snakemake/old/Snakefile b/src/Snakemake/old/Snakefile deleted file mode 100755 index f7e7727..0000000 --- a/src/Snakemake/old/Snakefile +++ /dev/null @@ -1,1860 +0,0 @@ -# ATAC-seq analysis pipeline - -# TODO: -# 1. Decide for environments and their combinations -# 2. bamCoverage bam index & put --centerReads back when error is fixed -# 3. peak calling encode -# - - -####################################### -# General stuff to make things easier # -####################################### - -# Make the output nicer and easier to follow -ruleDisplayMessage = "\n\n########################\n# START EXECUTING RULE #\n########################\n" - -############################################ -# Libraries, versions, authors and license # -############################################ - -from snakemake.utils import min_version -import subprocess -from os import makedirs -import pandas -import numpy - -# Enforce a minimum Snakemake version because of various features -#min_version("4.6") - -__author__ = "Christian Arnold" -__license__ = "MIT" - - -############################################ -# Working directory and configuration file # -############################################ - -# Not needed, will be provided via the command line in Snakemake -#DEFAULT_CONFIG_FILE = "/g/scb/zaugg/carnold/Projects/AtacSeq/example/config.json" -#configfile: DEFAULT_CONFIG_FILE - -#ROOT_dir = config["par_general"]["workdir"] -#workdir: ROOT_dir - -########################################### -# Onstart, onsuccess and onerror handlers # -########################################### - -# Sometimes, it is necessary to specify code that shall be executed when the workflow execution is finished (e.g. cleanup, or notification of the user). - -# The onsuccess handler is executed if the workflow finished without error. -onsuccess: - print("\n\n###############################\n# Workflow finished, no error #\n###############################\n\n") - -# Else, the onerror handler is executed. -onerror: - print("\n\n#####################\n# An error occurred #\n#####################\n\n") - #shell("mail -s "an error occurred" carnold@embl.de < {log}") - -# onstart handler will be executed before the workflow starts. 
Note that dry-runs do not trigger any of the handlers -onstart: - print("Reading samples and metadata....\n") - print ("Running workflow for the following samples:\n " + ' \n '.join(map(str, allSamplesUnique))) - - - -def read_samplesTable(samplesSummaryFile): - """text""" - - data = pandas.read_table(samplesSummaryFile) - - # Expect a particular number of columns, do a sanity check here - - if not {'individual', 'sampleName', 'Flowcell_ID', 'lane_ID', 'Technology', 'Library_ID'}.issubset(data.columns.values): - raise KeyError("The samples file must contain the following named columns (TAB separated!): individual, sampleName, Flowcell_ID, lane_ID, Technology, Library_ID") - - # Make sure the individual column is a string - data['individual'] = data['individual'].astype(str) - - - return data - - -def constructRGFields(samplesData): - """text""" - readGroupFields = {} - - for rowCur in range(0, len(samplesData.index)): - - individual = samplesData.ix[rowCur,"individual"] - field_PL = samplesData.ix[rowCur,"Technology"] - flowcell = samplesData.ix[rowCur,"Flowcell_ID"] - lane = samplesData.ix[rowCur,"lane_ID"] - sample = samplesData.ix[rowCur,"sampleName"] - library = samplesData.ix[rowCur,"Library_ID"] - - field_ID = str(flowcell) + "." + str(lane) - field_SM = str(individual) - field_PU = str(flowcell) + "." + str(lane) + "." + str(sample) - field_LB = str(library) - - readGroupFields[sample] = {"ID": field_ID, - "LB": field_LB, - "PL": field_PL, - "PU": field_PU, - "SM": field_SM} - - - return readGroupFields - - - -############################# -# DIRECTORIES AND VARIABLES # -############################# - -# Maximum number of cores per rule. This value will never be achieved because the minimum of this value and the --cores parameter will define the -# number of CPUs per rule in the end. 
-threadsMax = 16 - -# Input files -samplesSummaryFile = config["samples"]["summaryFile"] -pairedEnd = config["samples"]["pairedEnd"] - -if not pairedEnd: - print ("Error: SE reads not supported yet by pipeline.\n") - sys.exit(1) - -INPUT_ORIG_DIR= os.path.dirname(samplesSummaryFile) - -ROOT_dir = config["par_general"]["outdir"] -INPUT_DIR = ROOT_dir + "/0.Input" -FASTQC_BT_dir = ROOT_dir + "/1.FastQC_beforeTrimming" -TRIM_dir = ROOT_dir + "/2.Trimming" -FASTQC_AT_dir = ROOT_dir + "/3.FastQC_afterTrimming" -ALIGN_dir = ROOT_dir + "/4.Alignment" -POSTALIGN_dir = ROOT_dir + "/5.Postalignment" -CLEAN_dir = POSTALIGN_dir + "/1.Clean" -BASERECAL_dir = POSTALIGN_dir + "/2.BaseRecalibration" -CHRM_dir = POSTALIGN_dir + "/3.Filter_chrM" -RMDUP_DIR = POSTALIGN_dir + "/4.MarkAndRemove_Duplicates" -MAPQsort_dir = POSTALIGN_dir + "/5.Filter_MAPQ" -ADJRSS_dir = POSTALIGN_dir + "/6.Adjust_RSS" -#RMINDEL_dir = POSTALIGN_dir + "/7.Filter_INDELs" -PEAK_dir = ROOT_dir + "/6.PeakCalling" -DOWNSTREAM_dir = ROOT_dir + "/7.DownstreamAnalyses" -FINAL_OUTPUT_dir = ROOT_dir + "/8.FinalOutput" -REPORTS_dir = ROOT_dir + "/Reports_and_Stats" -LOG_BENCHMARK_dir = ROOT_dir + "/LOGS_AND_BENCHMARKS" - -PEAK_STRINGENT_dir = PEAK_dir + "/stringent" -PEAK_NONSTRINGENT_dir = PEAK_dir + "/nonStringent" -PEAK_ENCODE_dir = PEAK_dir + "/Encode" - -global samplesData -samplesData = read_samplesTable(config["samples"]["summaryFile"]) - -# Make it accessible also within functions -global RGFields -RGFields = constructRGFields(samplesData) - -#print (RGFields) -#print (RGFields["test1_rep1"]["LB"]) -#sys.exit(1) - -# Get all unique sample names -allIndividualsUnique = numpy.unique(samplesData.loc[:,"individual"]) -allIndividualsUniqueStrSpaces = ' '.join(allIndividualsUnique) -allSamplesUnique = numpy.unique(samplesData.loc[:,"sampleName"]) - -# Get only two sampels for nicer graphs -#allIndividualsUnique = numpy.unique(samplesData.loc[:,"individual"])[0:2] -#allSamplesUnique = numpy.unique(samplesData.loc[:,"sampleName"])[0:2] -allSamplesUniqueStr = ','.join(allSamplesUnique) -allSamplesUniqueStrSpaces = ' '.join(allSamplesUnique) -# -# print(samplesData.loc[:,"sampleName"]) -# print(samplesData.loc[:,"individual"]) -# print (allSamplesUnique) -# print (allIndividualsUnique) -# print(numpy.concatenate([allIndividualsUnique + ".merged", allSamplesUnique])) -# sys.exit(0) - -# Execuables - -script_FL_distr = config["executables"]["FL_distr_script"] -script_STATS = config["executables"]["STATS_script"] -java_exec = config["executables"]["java"] -minMemoryJavaGB = 5 -maxMemoryJavaGB = 50 - -picard_command = java_exec + " -Xms" + str(minMemoryJavaGB) + "g -Xmx" + str(maxMemoryJavaGB) + "g -jar " + config["executables"]["PICARD_jar"] - -par_minMAPQscore = config["par_postalign"]["minMAPQscore"] - - -refGen = config["additionalInputFiles"]["refGenome_fasta"] -if not os.path.isfile(refGen): - raise IOError("File " + refGen + " not found.") - -file_knownSNPs = "" -file_knownINDELS = "" - -if config["par_align"]["assemblyVersion"] in ('hg19', 'hg38'): - gatk_command = java_exec + " -Xms" + str(minMemoryJavaGB) + "g -Xmx" + str(maxMemoryJavaGB) + "g -jar " + config["executables"]["GATK_jar"] - - file_knownSNPs = config["additionalInputFiles"]["knownSNPs"] - file_knownINDELS = config["additionalInputFiles"]["knownIndels"] - if not os.path.isfile(config["executables"]["GATK_jar"]): - raise IOError("File " + config["executables"]["GATK_jar"] + " not found.") - if not os.path.isfile(file_knownSNPs): - raise IOError("File " + file_knownSNPs + " not 
found.") - if not os.path.isfile(file_knownINDELS): - raise IOError("File " + file_knownINDELS + " not found.") - - dictFile = os.path.splitext(refGen)[0] + ".dict" - - faiFile = refGen + ".fai" - if not (os.path.isfile(faiFile)) or not (os.path.isfile(dictFile)): - raise IOError("Either index file *.fai or *.dict for " + refGen + " not found. See https://software.broadinstitute.org/gatk/guide/article?id=1601") - -if not os.path.isfile(config["additionalInputFiles"]["trimmomatic_adapters"]): - raise IOError("File " + config["additionalInputFiles"]["trimmomatic_adapters"] + " not found.") - -if not os.path.isfile(config["executables"]["FL_distr_script"]): - raise IOError("File " + config["executables"]["FL_distr_script"] + " not found.") - -if not os.path.isfile(config["executables"]["STATS_script"]): - raise IOError("File " + config["executables"]["STATS_script"] + " not found.") - -if not os.path.isfile(config["executables"]["PICARD_jar"]): - raise IOError("File " + config["executables"]["PICARD_jar"] + " not found.") - -if not os.path.isfile(config["additionalInputFiles"]["blacklistRegions"]): - raise IOError("File " + config["additionalInputFiles"]["blacklistRegions"] + " not found.") - -if not os.path.isfile(config["additionalInputFiles"]["blacklistRegions"]): - raise IOError("File " + config["additionalInputFiles"]["blacklistRegions"] + " not found.") - -if not os.path.isfile(config["additionalInputFiles"]["annotationGTF"]): - raise IOError("File " + config["additionalInputFiles"]["annotationGTF"] + " not found.") - - -if not os.path.isfile(config["additionalInputFiles"]["refGenome_2bit"]): - raise IOError("File " + ref2bit + " not found.") - - - -#### Parameter for bamCoverage - -bamCoverage_normOption = config["par_deepTools"]["bamCoverage_normalizationCoverage"] -if not (bamCoverage_normOption == "normalizeTo1x") and not (bamCoverage_normOption == "normalizeUsingRPKM") and not (bamCoverage_normOption == "ignoreForNormalization"): - raise AssertionError("The config parameter config[\"par_deepTools\"][\"bamCoverage_normalizationCoverage\"] has to be one of: normalizeTo1x, normalizeUsingRPKM, ignoreForNormalization") - - -if bamCoverage_normOption == "normalizeTo1x": - bamCoverage_normOption = bamCoverage_normOption + " " + str(config["par_deepTools"]["effectiveGenomeSize"]) - - -########################################################################### -# Get the versions of the used tools and script to record them rigorously # -########################################################################### - -# Almost obselete due to the conda environments. 
Only record versions for scripts etc - -# For custom scripts, retrieve the modification date instead -VERSION_FL_distr_script = str(os.path.getmtime(config["executables"]["FL_distr_script"])).replace('\n', ' ') -VERSION_STATS_script = str(os.path.getmtime(config["executables"]["STATS_script"])).replace('\n', ' ') - -###################### -# CREATE DIRECTORIES # -###################### - -# Not needed, are created less automatically - -######### -# RULES # -######### - -# For cluster usage: The keyword localrules allows to mark a rule as local, so that it is not submitted to the cluster and instead executed on the host node -localrules: all,link_inputFiles # , prepareEnvironments_JavaMemoryGATK, prepareEnvironments_JavaMemoryPicard - -################### -################### -## FINAL TARGETS ## -################### -################### - - -allSamplesAndIndividuals = numpy.concatenate([allIndividualsUnique + ".merged", allSamplesUnique + ".final"]) - -rule all: - input: - #idr = expand('{dir}/allSamples.final.{stringency}.peaks.IDR.bed.png', dir = PEAK_dir, stringency = ("stringent", "nonStringent")), - # Filtering by blacklist is the last step, all other files have to be produced before - # First, the individual sample peak files - # individualPeaksEncode = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed', dir = PEAK_ENCODE_dir, sample = allSamplesUnique, - # GCBias = ["",".noGCBias"], - # analysisType = ["Encode"], - # peakType = ["broad", "gapped", "narrow"] - # ), - # individualPeaksStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed', dir = PEAK_STRINGENT_dir, sample = allSamplesUnique, - # GCBias = ["",".noGCBias"], - # analysisType = ["stringent"], - # peakType = ["narrow"] - # ), - # individualPeaksNonStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed', dir = PEAK_NONSTRINGENT_dir, sample = allSamplesUnique, - # GCBias = ["",".noGCBias"], - # analysisType = ["nonStringent"], - # peakType = ["narrow"] - # ), - # pooledPeaksEncode = expand('{dir}/{indiv}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered2.bed', dir = PEAK_ENCODE_dir, indiv = allIndividualsUnique, - # GCBias = ["",".noGCBias"], - # analysisType = ["Encode"], - # peaktype2 = [".pooled", ".replicate"], - # peakType = ["broad", "gapped", "narrow"] - # ), - # pooledPeaksStringent = expand('{dir}/{indiv}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered2.bed', dir = PEAK_STRINGENT_dir, indiv = allIndividualsUnique, - # GCBias = ["",".noGCBias"], - # analysisType = ["stringent"], - # peaktype2 = [".pooled", ".replicate"], - # peakType = ["narrow"] - # ), - # pooledPeaksNonStringent = expand('{dir}/{indiv}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered2.bed', dir = PEAK_NONSTRINGENT_dir, indiv = allIndividualsUnique, - # GCBias = ["",".noGCBias"], - # analysisType = ["nonStringent"], - # peaktype2 = [".pooled", ".replicate"], - # peakType = ["narrow"] - # ), - stats = expand('{dir}/multiqc_report.html', dir = REPORTS_dir) - message: "{ruleDisplayMessage}One Pipeline to rule them \"all\"..." - -################## -################## -## PREPARE DATA ## -################## -################## - - -# The following two rules were deactivated because they cause too much trouble all the time. 
Java is now just called explicitly -# - -rule link_inputFiles: - input: - forward = expand('{dir}/{{sample}}_1.fastq.gz', dir = INPUT_ORIG_DIR), - reverse = expand('{dir}/{{sample}}_2.fastq.gz', dir = INPUT_ORIG_DIR) - output: - forward = expand('{dir}/{{sample}}_1.fastq.gz', dir = INPUT_DIR), - reverse = expand('{dir}/{{sample}}_2.fastq.gz', dir = INPUT_DIR) - log: - message: "{ruleDisplayMessage}Create symbolic links for the input files {input:q} in directory {INPUT_DIR:q}..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/link_inputFiles.{sample}.benchmark" - resources: - version:"NA" - params: sampleCSV = config["samples"]["summaryFile"] - shell: - """sh -c ' - ln -fs {input.forward:q} {output.forward:q} && - ln -fs {input.reverse:q} {output.reverse:q} && - touch -h {output.forward:q} && - touch -h {output.reverse:q} && - cp {params.sampleCSV} {INPUT_DIR} - '""" - - # Workaround for the warning "Unable to set utime on symlink {}. Your Python build does not support it" warning related to symbolic links - # https://bitbucket.org/snakemake/snakemake/issues/397/unable-to-set-utime-on-symlink-your-python - - -rule Picard_CreateSequenceDictionary: - input: - #environment = rules.prepareEnvironments_JavaMemoryPicard.output, - fasta = config["additionalInputFiles"]["refGenome_fasta"] - output: - dict = CLEAN_dir + '/ref.dict.fasta' - log: expand('{dir}/Picard_CreateSequenceDictionary.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Creates a sequence dictionary for a reference sequence for input {input.fasta}..." - threads: 1 - priority: 1 - resources:maxMemGB=20 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_CreateSequenceDictionary.benchmark" - params: - shell: - """sh -c ' - {picard_command} CreateSequenceDictionary \ - R={input.fasta:q} \ - O={output.dict:q} \ - 2> {log:q} - '""" - - -################################# -################################# -## FASTQC, TRIMMING, ALIGNMENT ## -################################# -################################# - -rule fastqc_BT: - input: - forward = rules.link_inputFiles.output.forward, - reverse = rules.link_inputFiles.output.reverse - output: - forward = expand('{dir}/{{sample}}_1_fastqc.zip', dir = FASTQC_BT_dir), - reverse = expand('{dir}/{{sample}}_2_fastqc.zip', dir = FASTQC_BT_dir) - log: expand('{dir}/fastqc_BT.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Perform FASTQC on the samples {input:q} before trimming..." 
- threads: threadsMax - priority: 1 - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/fastqc.yaml" - benchmark: LOG_BENCHMARK_dir + "/fastqc_BT.{sample}.benchmark" - shell: - """sh -c ' - fastqc \ - -o {FASTQC_BT_dir:q} \ - -t {threads} \ - --extract \ - {input:q} \ - 2> {log:q} - '""" - -rule trimming_PE: - input: - forward = rules.link_inputFiles.output.forward, - reverse = rules.link_inputFiles.output.reverse, - report = rules.fastqc_BT.output # Not really needed, but force execution here - output: - forward_paired = expand('{dir}/{{sample}}_1.trimmed.fq.gz', dir = TRIM_dir), - forward_unpaired = expand('{dir}/{{sample}}_1.unpaired.fq.gz', dir = TRIM_dir), - reverse_paired = expand('{dir}/{{sample}}_2.trimmed.fq.gz', dir = TRIM_dir), - reverse_unpaired = expand('{dir}/{{sample}}_2.unpaired.fq.gz', dir = TRIM_dir) - log: output = expand('{dir}/trimming_PE_TrimmomaticOutput.{{sample}}.log', dir = LOG_BENCHMARK_dir), - trimlog = expand('{dir}/trimming_PE_trimlog.{{sample}}.log' , dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Trimming of adapters with TRIMMOMATIC in the PE mode for files {input:q} using adapters file {params.adapters:q} ..." - threads: threadsMax - priority: 1 - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/trimmomatic.yaml" - benchmark: LOG_BENCHMARK_dir + "/trimming_PE.{sample}.benchmark" - params: - ILLUMINACLIP = config["par_trimming"]["trimmomatic_ILLUMINACLIP"], - trailing = config["par_trimming"]["trimmomatic_trailing"], - minlen = config["par_trimming"]["trimmomatic_minlen"], - adapters = config["additionalInputFiles"]["trimmomatic_adapters"] - shell: - """sh -c ' - trimmomatic PE \ - -threads {threads} \ - {input.forward:q} {input.reverse:q} \ - {output.forward_paired:q} {output.forward_unpaired:q} {output.reverse_paired:q} {output.reverse_unpaired:q} \ - ILLUMINACLIP:{params.adapters}:{params.ILLUMINACLIP} \ - TRAILING:{params.trailing} \ - MINLEN:{params.minlen} \ - 2>{log.output:q} - '""" - - # removed: -trimlog {log.trimlog:q} \ - -rule fastqc_AT: - input: - forward = rules.trimming_PE.output.forward_paired, - reverse = rules.trimming_PE.output.reverse_paired - output: - forward = expand('{dir}/{{sample}}_1.trimmed_fastqc.zip', dir = FASTQC_AT_dir), - reverse = expand('{dir}/{{sample}}_2.trimmed_fastqc.zip', dir = FASTQC_AT_dir) - log: expand('{dir}/fastqc_AT.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Perform FASTQC on the samples {input:q} after trimming..." - threads: threadsMax - priority: 1 - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/fastqc.yaml" - benchmark: LOG_BENCHMARK_dir + "/fastqc_AT.{sample}.benchmark" - shell: - """sh -c 'fastqc \ - -o {FASTQC_AT_dir:q} \ - -t {threads} \ - --extract \ - {input:q} \ - 2> {log:q} - '""" - -rule Bowtie2_alignment: - input: - file1 = rules.trimming_PE.output.forward_paired, - file2 = rules.trimming_PE.output.reverse_paired, - report = rules.fastqc_AT.output - output: - temp(expand('{dir}/{{sample}}.bt2.sam', dir = ALIGN_dir)) - threads: threadsMax - priority: 1 - log: expand('{dir}/Bowtie2_alignment.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Do Bowtie2 alignment for files {input:q}. This may take a while..." 
- benchmark: LOG_BENCHMARK_dir + "/alignment.{sample}.benchmark" - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bowtie2.yaml" - params: - sensitivity = config["par_align"]["bowtie2_sensitivity"], - refGenome = config["par_align"]["bowtie2_refGenome"], - maxFragmentLength = config["par_align"]["bowtie2_maxFragmentLength"] - shell: - """sh -c 'bowtie2 \ - -p {threads} \ - -X {params.maxFragmentLength} \ - {params.sensitivity} \ - -t \ - -x {params.refGenome} \ - -1 {input.file1:q} -2 {input.file2:q} \ - -S {output:q} \ - 2> {log:q} - '""" - - - -##################################### -##################################### -## CLEANING AND BASE RECALIBRATION ## -##################################### -##################################### - -rule samtools_SAM_TO_BAM: - input: - #environment = rules.prepareEnvironments_JavaMemoryPicard.output, - sam = rules.Bowtie2_alignment.output - output: - unsortedBam = temp(expand('{dir}/{{sample}}.bam' , dir = ALIGN_dir)), - sortedBam = expand('{dir}/{{sample}}.s.bam' , dir = ALIGN_dir), - index = expand('{dir}/{{sample}}.s.bam.bai', dir = ALIGN_dir) - threads: threadsMax - priority: 1 - log: - message: "{ruleDisplayMessage}Conversion to BAM, sort, index for file {input.sam:q} ..." - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/samtools_SAM_TO_BAM.{sample}.benchmark" - shell: - """sh -c ' - samtools view -b -S -o {output.unsortedBam:q} {input.sam:q} && - samtools sort -o {output.sortedBam:q} --threads {threads} {output.unsortedBam:q} && - samtools index {output.sortedBam:q} - '""" - - - -basenameSuffix = ".cleaned" - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_cleanSAM_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + ".s" - -rule Picard_cleanSAM: - input: - bam = rules.samtools_SAM_TO_BAM.output.sortedBam - output: - bam = temp(Picard_cleanSAM_outputName + ".bam"), - index = temp(Picard_cleanSAM_outputName + ".bai") - log: expand('{dir}/Picard_cleanSAM.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}CleanSam: Clean the provided SAM/BAM, soft-clipping beyond-end-of-reference alignments and setting MAPQ to 0 for unmapped reads for {input.bam}..." - threads: 1 - priority: 1 - resources:maxMemGB=20 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_cleanSAM.{sample}.benchmark" - shell: - """sh -c ' - {picard_command} CleanSam \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - 2> {log:q} - '""" - - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_FixMateInformation_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + "2" + ".s" - -rule Picard_FixMateInformation: - input: - bam = rules.Picard_cleanSAM.output.bam, - bai = rules.Picard_cleanSAM.output.index - output: - bam = temp(Picard_FixMateInformation_outputName + '.bam'), - index = temp(Picard_FixMateInformation_outputName + '.bai'), - stats = Picard_FixMateInformation_outputName + '.bam.stats' - log: expand('{dir}/Picard_FixMateInformation.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Verify mate-pair information between mates and fix if needed for input {input.bam}..." 
- threads: 1 - priority: 1 - resources:maxMemGB=20 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_FixMateInformation.{sample}.benchmark" - shell: - """sh -c ' - {picard_command} FixMateInformation \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - 2> {log:q} && - samtools flagstat {output.bam:q} > {output.stats:q} - '""" - - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_AddOrReplaceReadGroups_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + "3" + ".s" - -rule Picard_AddOrReplaceReadGroups: - input: - bam = rules.Picard_FixMateInformation.output.bam, - bai = rules.Picard_FixMateInformation.output.index - output: - bam = temp(Picard_AddOrReplaceReadGroups_outputName + '.bam'), - index = temp(Picard_AddOrReplaceReadGroups_outputName + '.bai') - log: expand('{dir}/Picard_AddOrReplaceReadGroups.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Replace read groups in a BAM file for input {input.bam}..." - threads: 1 - priority: 1 - resources:maxMemGB=20 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_AddOrReplaceReadGroups.{sample}.benchmark" - params: - RGID = lambda wildcards: RGFields[wildcards.sample]["ID"], # Read Group ID - RGLB = lambda wildcards: RGFields[wildcards.sample]["LB"], # Read Group library - RGPL = lambda wildcards: RGFields[wildcards.sample]["PL"], # Read Group platform - RGPU = lambda wildcards: RGFields[wildcards.sample]["PU"], # Read Group platform unit (eg. run barcode) - RGSM = lambda wildcards: RGFields[wildcards.sample]["SM"] # Read Group sample name - shell: - """sh -c ' - {picard_command} AddOrReplaceReadGroups \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - RGID={params.RGID} \ - RGLB={params.RGLB} \ - RGPL={params.RGPL} \ - RGPU={params.RGPU} \ - RGSM={params.RGSM} \ - 2> {log:q} - '""" - - - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_ReorderSam_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + "4" + ".s" - -rule Picard_ReorderSam: - input: - bam = rules.Picard_AddOrReplaceReadGroups.output.bam, - index = rules.Picard_AddOrReplaceReadGroups.output.index, - ref = config["additionalInputFiles"]["refGenome_fasta"] - output: - bam = temp(Picard_ReorderSam_outputName + '.bam'), - index = temp(Picard_ReorderSam_outputName + '.bai') - log: expand('{dir}/Picard_ReorderSam.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Reorders reads in a SAM/BAM file to match the contig ordering in a provided reference file for input {input.bam}..." 
- threads: 1 - priority: 1 - resources:maxMemGB=20 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_ReorderSam.{sample}.benchmark" - params: - shell: - """sh -c ' - {picard_command} ReorderSam \ - I={input.bam:q} \ - REFERENCE={input.ref:q} \ - O={output.bam:q} \ - ALLOW_INCOMPLETE_DICT_CONCORDANCE=TRUE \ - CREATE_INDEX=true \ - 2> {log:q} - '""" - - -rule GATK_baseRecalibration1: - input: - #environment = rules.prepareEnvironments_JavaMemoryGATK.output, - bam = rules.Picard_ReorderSam.output.bam, - index = rules.Picard_ReorderSam.output.index - output: - recalibrationTable1 = expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.recalTable1', dir = BASERECAL_dir) - log: expand('{dir}/GATK_baseRecalibration1.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}BaseRecalibrator(1): Detect systematic errors in base quality scores for file {input.bam:q}..." - threads: threadsMax - priority: 1 - resources: maxMemGB=20 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml" - params: - knownSNPs = file_knownSNPs, - knownIndels = file_knownINDELS, - fasta = config["additionalInputFiles"]["refGenome_fasta"] - benchmark: LOG_BENCHMARK_dir + "/GATK_baseRecalibration1.{sample}.benchmark" - shell: - """sh -c ' - {gatk_command} -T BaseRecalibrator\ - -R {params.fasta:q} \ - -knownSites {params.knownSNPs:q} \ - -knownSites {params.knownIndels:q} \ - --lowMemoryMode \ - -I {input.bam:q} \ - --out {output.recalibrationTable1:q} \ - --log_to_file {log:q} \ - --num_cpu_threads_per_data_thread {threads} - '""" - -rule GATK_printReadsBQSR: - input: - #environment = rules.prepareEnvironments_JavaMemoryGATK.output, - bam = rules.Picard_ReorderSam.output.bam, - index = rules.Picard_ReorderSam.output.index, - recalibrationTable = rules.GATK_baseRecalibration1.output.recalibrationTable1 - output: - bam = temp(expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam', dir = BASERECAL_dir)) - log: expand('{dir}/GATK_printReadsBQSR.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}PrintReads: Recalibrate {input.bam:q} using recalibration table {input.recalibrationTable:q} ..." - threads: threadsMax - priority: 1 - resources: maxMemGB=20 - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml" - benchmark: LOG_BENCHMARK_dir + "/GATK_printReadsBQSR.{sample}.benchmark" - params: - fasta = config["additionalInputFiles"]["refGenome_fasta"] - shell: - """sh -c ' - {gatk_command} -T PrintReads \ - -R {params.fasta:q} \ - -I {input.bam:q} \ - -BQSR {input.recalibrationTable:q} \ - --out {output.bam:q} \ - --log_to_file {log:q} \ - --num_cpu_threads_per_data_thread {threads} - '""" - -rule GATK_baseRecalibration2: - input: - #environment = rules.prepareEnvironments_JavaMemoryGATK.output, - bam = rules.Picard_ReorderSam.output.bam, - index = rules.Picard_ReorderSam.output.index, - recalibrationTable1 = rules.GATK_baseRecalibration1.output.recalibrationTable1 - output: - recalibrationTable2 = expand('{dir}/{{sample}}.BQrecal.s.bam.recalTable2', dir = BASERECAL_dir) - log: expand('{dir}/GATK_baseRecalibration2.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}BaseRecalibrator(2): Detect systematic errors in base quality scores for file {input.bam:q} and first recalibration table..." 
- threads: threadsMax
- priority: 1
- resources: maxMemGB=20
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml"
- benchmark: LOG_BENCHMARK_dir + "/GATK_baseRecalibration2.{sample}.benchmark"
- params:
- knownSNPs = file_knownSNPs,
- knownIndels = file_knownINDELS,
- fasta = config["additionalInputFiles"]["refGenome_fasta"]
- shell:
- """sh -c '
- {gatk_command} -T BaseRecalibrator \
- -R {params.fasta:q} \
- -knownSites {params.knownSNPs:q} \
- -knownSites {params.knownIndels:q} \
- -I {input.bam:q} \
- -BQSR {input.recalibrationTable1:q} \
- --out {output.recalibrationTable2:q} \
- --log_to_file {log:q} \
- --num_cpu_threads_per_data_thread {threads}
- '"""
-
-# Use the -L argument with BaseRecalibrator to restrict recalibration to the capture targets on WEx data
-# - BQSR depends on a key assumption: every mismatch is an error, except sites in known variants
-# - Off-target sequence is likely to have higher error rates with different error modes
-# - If off-target sequence is included in recalibration, it may skew the model and mess up the results
-
-
-rule GATK_analyzeCovariates:
- input:
- recalibrationTable1 = rules.GATK_baseRecalibration1.output.recalibrationTable1,
- recalibrationTable2 = rules.GATK_baseRecalibration2.output.recalibrationTable2
- output:
- plots = expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.plots.pdf', dir = REPORTS_dir),
- csv = expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.plots.csv', dir = REPORTS_dir)
- log:
- expand('{dir}/GATK_analyzeCovariates.{{sample}}.log', dir = LOG_BENCHMARK_dir)
- message:
- "{ruleDisplayMessage}AnalyzeCovariates: Create plots to visualize base recalibration results for {input}..."
- threads: 1
- priority: 1
- resources: maxMemGB=20
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml"
- benchmark: LOG_BENCHMARK_dir + "/GATK_analyzeCovariates.{sample}.benchmark"
- params:
- fasta = config["additionalInputFiles"]["refGenome_fasta"]
- shell:
- """sh -c '
- {gatk_command} -T AnalyzeCovariates \
- -R {params.fasta:q} \
- -before {input.recalibrationTable1:q} \
- -after {input.recalibrationTable2:q} \
- -plots {output.plots:q} \
- -csv {output.csv:q} \
- --log_to_file {log:q}
- '"""
-
-###############
-###############
-## FILTERING ##
-###############
-###############
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_remove_chrMAndUnassembledChr_outputName = CHRM_dir + '/{sample}.cleaned4.BQrecal.rmChrM.s.bam'
-
-# Define if GATK should be run or not
-def postalign_inputs(assemblyVersion):
- if assemblyVersion in ('hg19', 'hg38'):
- return rules.GATK_printReadsBQSR.output.bam
- else:
- return rules.Picard_ReorderSam.output.bam
-
-
-rule postalign_remove_chrMAndUnassembledChr:
- input:
- bam = postalign_inputs(config["par_align"]["assemblyVersion"])
- output:
- bam = temp(postalign_remove_chrMAndUnassembledChr_outputName),
- index2 = temp(postalign_remove_chrMAndUnassembledChr_outputName + ".bai"),
- stats = postalign_remove_chrMAndUnassembledChr_outputName + ".stats",
- csv = postalign_remove_chrMAndUnassembledChr_outputName + ".csv"
- log:
- message: "{ruleDisplayMessage}Remove mitochondrial reads for file {input.bam:q} ..."
- threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/postalign_remove_chrMAndUnassembledChr.{sample}.benchmark" - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - shell: - """sh -c ' - samtools index {input.bam:q} && - samtools idxstats {input.bam:q} | cut -f 1 | grep chr | grep -Pv "chrM|chrUn|random|hap" | xargs samtools view -b {input.bam:q} >{output.bam:q} && - samtools index {output.bam:q} && - samtools flagstat {output.bam:q} > {output.stats:q} && - samtools view {output.bam:q} | cut -f3,5,9 > {output.csv:q} - '""" - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -markDuplicates_Picardtools_outputName = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s' - - -rule markDuplicates_Picardtools: - input: - #environment = rules.prepareEnvironments_JavaMemoryPicard.output, - bam = rules.postalign_remove_chrMAndUnassembledChr.output.bam, - index = rules.postalign_remove_chrMAndUnassembledChr.output.index2 - output: - bam = temp(markDuplicates_Picardtools_outputName + ".bam"), - index = temp(markDuplicates_Picardtools_outputName + ".bai") - log: - log = expand('{dir}/markDuplicates_Picardtools.{{sample}}.log' , dir = LOG_BENCHMARK_dir), - metricsFile = expand('{dir}/markDuplicates_Picardtools.{{sample}}_metrics.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Mark (not remove!) duplicate reads for file {input:q} with Picard tools..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/markDuplicates_Picardtools.{sample}.benchmark" - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - ValidationStringency = config["par_postalign"]["ValidationStringencyMarkDuplicates"], - removeDuplicates = "false", - assumeSorted = "true" - shell: - """sh -c '{picard_command} MarkDuplicates \ - INPUT={input.bam:q} \ - OUTPUT={output.bam:q} \ - ASSUME_SORTED={params.assumeSorted} \ - METRICS_FILE={log.metricsFile:q} \ - VALIDATION_STRINGENCY={params.ValidationStringency} \ - REMOVE_DUPLICATES={params.removeDuplicates} \ - CREATE_INDEX=TRUE \ - 2> {log.log:q}'""" - - - -rule computeLibraryComplexity: - input: - bam = rules.markDuplicates_Picardtools.output.bam - output: - stats = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl' - log: - message:"{ruleDisplayMessage}Compute library complexity for file {input.bam:q} ..." 
- threads: 1
- priority: 1
- benchmark: LOG_BENCHMARK_dir + "/computeLibraryComplexity.{sample}.benchmark"
- resources:
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bedtools.yaml"
- params:
- shell:
- # Taken from https://github.com/kundajelab/atac_dnase_pipelines/blob/72ed6ba2502cca074c51740b612cbc6ebea07b08/modules/postalign_bam.bds
- # Implementing the ENCODE ATAC-Seq library complexity guidelines
- # PBC File output
- # TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair
- """
- bedtools bamtobed -i {input.bam:q} | \
- awk 'BEGIN{{OFS="\\t"}} {{print $1,$2,$3,$6}}' | \
- grep -v 'chrM' | sort | uniq -c | \
- awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; if(m2>0) m1_m2=m1/m2; \
- printf "readsTotal\\treadsDistinct\\treadsOccOne\\treadsOccTwo\\tNRF\\tPBC1\\tPBC2\\n%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",mt,m0,m1,m2,m0/mt,m1/m0,m1_m2}}' > {output.stats}"""
-
-
-
-rule removeDuplicates:
- input:
- bam = rules.markDuplicates_Picardtools.output.bam,
- index = rules.markDuplicates_Picardtools.output.index,
- stats = rules.computeLibraryComplexity.output.stats
- output:
- bam = temp(RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam')
- log:
- message: "{ruleDisplayMessage}Remove duplicate reads and QC-failing reads for file {input:q} with samtools..."
- threads: 1
- priority: 1
- benchmark: LOG_BENCHMARK_dir + "/removeDuplicates.{sample}.benchmark"
- resources:
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
- params:
- removeReadsWithFlags = 1804, # read or mate unmapped, not primary alignment, read fails platform/vendor quality checks, read is PCR or optical duplicate
- keepReadsWithFlags = 2 # read mapped in proper pair
- shell:
- """sh -c 'samtools view -F {params.removeReadsWithFlags} -f {params.keepReadsWithFlags} -b {input.bam:q} > {output.bam:q}'"""
- # https://github.com/kundajelab/atac_dnase_pipelines/blob/72ed6ba2502cca074c51740b612cbc6ebea07b08/modules/postalign_bam.bds
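-
-# Sanity-check sketch for the bitmasks above (plain Python, standard SAM flag
-# semantics; not used by any rule): -F 1804 removes reads with any of these bits
-# set, while -f 2 keeps only reads mapped in a proper pair.
-_FLAG_BITS_REMOVED = {4: "read unmapped", 8: "mate unmapped", 256: "not primary alignment",
- 512: "fails platform/vendor quality checks", 1024: "PCR or optical duplicate"}
-assert sum(_FLAG_BITS_REMOVED) == 1804, "the removed flag bits must add up to -F 1804"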
-
-rule postalign_samtools_flagstat:
- input:
- rules.removeDuplicates.output.bam
- output:
- stats = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats',
- csv = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.csv'
- log:
- message: "{ruleDisplayMessage}Run samtools flagstat on file {input:q} ..."
- threads: 1
- priority: 1
- benchmark: LOG_BENCHMARK_dir + "/postalign_samtools_flagstat.{sample}.benchmark"
- resources:
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
- params:
- shell:
- """sh -c '
- samtools flagstat {input:q} > {output.stats:q} &&
- samtools view {input:q} | cut -f3,5,9 > {output.csv}
- '"""
-
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_MAPQ_outputName = MAPQsort_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.s.bam'
-
-rule postalign_MAPQ:
- input:
- bam = rules.removeDuplicates.output.bam
- output:
- bam = temp(postalign_MAPQ_outputName),
- index = temp(postalign_MAPQ_outputName + ".bai"),
- stats = postalign_MAPQ_outputName + ".stats",
- csv = postalign_MAPQ_outputName + ".csv"
- log:
- message: "{ruleDisplayMessage}Remove reads with a MAPQ quality lower than {par_minMAPQscore} for file {input.bam:q} ..."
- threads: 1
- priority: 1
- benchmark: LOG_BENCHMARK_dir + "/postalign_MAPQ.{sample}.benchmark"
- resources:
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
- params:
- minMAPQ = par_minMAPQscore
- shell:
- """sh -c '
- samtools view -b -q {params.minMAPQ} -F 4 {input.bam:q} > {output.bam:q} &&
- samtools index {output.bam:q} &&
- samtools flagstat {output.bam:q} > {output.stats:q} &&
- samtools view {output.bam:q} | cut -f3,5,9 > {output.csv:q}
- '"""
-
-
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_RSS_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.adjRSS.s.bam'
-
-
-rule postalign_RSS:
- input:
- bam = rules.postalign_MAPQ.output.bam,
- index = rules.postalign_MAPQ.output.index
- output:
- #header = temp(expand('{dir}/header{{sample}}', dir = ADJRSS_dir)),
- #forward = temp(expand('{dir}/temp1forw{{sample}}.sam', dir = ADJRSS_dir)),
- #reverse = temp(expand('{dir}/temp1rev{{sample}}.sam', dir = ADJRSS_dir)),
- bam = postalign_RSS_outputName,
- stats = postalign_RSS_outputName + ".stats",
- csv = postalign_RSS_outputName + ".csv"
- log:
- message: "{ruleDisplayMessage}Adjust read start sites for file {input.bam:q} ..."
- threads: 1
- priority: 1
- benchmark: LOG_BENCHMARK_dir + "/postalign_RSS.{sample}.benchmark"
- resources:
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
- params:
- adjustRSS_forward = config["par_postalign"]["adjustRSS_forward"],
- adjustRSS_reverse = config["par_postalign"]["adjustRSS_reverse"]
- shell:
- """ cat <(samtools view -H {input.bam:q}) <(samtools view -F 16 {input.bam:q} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4=$4+{params.adjustRSS_forward}; print $0}}') <(samtools view -f 16 {input.bam:q} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4=$4+{params.adjustRSS_reverse}; print $0}}') | samtools view -S -b -o {output.bam:q} - &&
- samtools flagstat {output.bam:q} > {output.stats:q} &&
- samtools view {output.bam:q} | cut -f3,5,9 > {output.csv:q}
- """
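-
-# The shift sizes above come from the user config. For the common Tn5 convention
-# (+4 bp on the forward strand, -5 bp on the reverse strand), the corresponding
-# entries would look like this (illustrative config snippet, not a complete file):
-#
-# "par_postalign": {
-#     "adjustRSS_forward": 4,
-#     "adjustRSS_reverse": -5
-# }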
-
-Picard_sortFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.adjRSS.cleaned1.s'
-
-# Necessary to re-sort, as Picard seems to require a different sorting order
-# VALIDATION_STRINGENCY=SILENT is important because any strict check might identify the out-of-reference alignments and abort otherwise.
-rule Picard_sortFinal:
- input:
- bam = rules.postalign_RSS.output.bam
- output:
- bam = temp(Picard_sortFinal_outputName + '.bam')
- log: expand('{dir}/Picard_sortFinal.{{sample}}.log', dir = LOG_BENCHMARK_dir)
- message: "{ruleDisplayMessage}SortSam for {input.bam:q} ..."
- threads: 1
- priority: 1
- #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
- params:
- shell:
- """sh -c '
- {picard_command} SortSam \
- I={input.bam:q} \
- O={output.bam:q} \
- SORT_ORDER=coordinate \
- VALIDATION_STRINGENCY=SILENT \
- 2> {log:q}
- '"""
-
-Picard_cleanSamFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.adjRSS.cleaned2.s'
-
-# Necessary to handle potential out-of-reference alignments due to the read start adjustment.
-# Out-of-reference bases will be soft-clipped, which conforms to the BAM specification
-rule Picard_cleanSamFinal:
- input:
- bam = rules.Picard_sortFinal.output.bam
- output:
- bam = temp(Picard_cleanSamFinal_outputName + '.bam'),
- index = temp(Picard_cleanSamFinal_outputName + '.bai')
- log: expand('{dir}/Picard_cleanSamFinal.{{sample}}.log', dir = LOG_BENCHMARK_dir)
- message: "{ruleDisplayMessage}CleanSam: Clean the provided SAM/BAM, soft-clipping beyond-end-of-reference alignments and setting MAPQ to 0 for unmapped reads for {input.bam:q} ..."
- threads: 1
- priority: 1
- #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
- params:
- shell:
- """sh -c '
- {picard_command} CleanSam \
- I={input.bam:q} \
- O={output.bam:q} \
- CREATE_INDEX=true \
- 2> {log:q}
- '"""
-
-
-# Necessary because adjusting the RSS makes the start position column of the next mate invalid
-rule Picard_FixMateInformationFinal:
- input:
- bam = rules.Picard_cleanSamFinal.output.bam,
- bai = rules.Picard_cleanSamFinal.output.index
- output:
- bam = expand('{dir}/{{sample}}.final.bam', dir = FINAL_OUTPUT_dir),
- index = expand('{dir}/{{sample}}.final.bai', dir = FINAL_OUTPUT_dir)
- log: expand('{dir}/Picard_FixMateInformationFinal.{{sample}}.log', dir = LOG_BENCHMARK_dir)
- message: "{ruleDisplayMessage}Verify mate-pair information between mates and fix if needed for input {input.bam}..."
- threads: 1
- priority: 1
- resources:maxMemGB=20
- #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
- shell:
- """sh -c '
- {picard_command} FixMateInformation \
- I={input.bam:q} \
- O={output.bam:q} \
- CREATE_INDEX=true \
- 2> {log:q}
- '"""
-
-
-########################
-########################
-## MERGING REPLICATES ##
-########################
-########################
-
-
-
-def getSampleBasenamesForIndividual(individual):
- """Return all sample basenames that belong to the given individual."""
- sampleBasenames = numpy.asarray(samplesData.loc[samplesData["individual"] == individual, "sampleName"])
- return sampleBasenames
-
-
-rule Picardtools_MergeSamFiles:
- input:
- lambda wildcards: expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = getSampleBasenamesForIndividual(wildcards.individual))
- output:
- bam = expand('{dir}/{{individual}}.merged.final.bam', dir = FINAL_OUTPUT_dir),
- index = expand('{dir}/{{individual}}.merged.final.bam.bai', dir = FINAL_OUTPUT_dir)
- log:
- expand('{dir}/Picardtools_MergeSamFiles.{{individual}}.log', dir = LOG_BENCHMARK_dir)
- message:
- "{ruleDisplayMessage}Merging all replicates for individual {wildcards.individual} (files {input:q}) with Picard tools..."
- threads: 1
- priority: 1
- resources: maxMemGB=20
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
- benchmark: LOG_BENCHMARK_dir + "/Picardtools_MergeSamFiles.{individual}.benchmark"
- params: inputString = lambda wildcards, input: " I=".join(input)
- shell:
- """sh -c '
- {picard_command} MergeSamFiles \
- I={params.inputString} \
- O={output.bam:q} \
- 2> {log:q} &&
- samtools index {output.bam:q}
- '"""
-
-
-
-
-#############
-#############
-## GC BIAS ##
-#############
-#############
-
-rule deepTools_computeGCBias:
- input:
- bam = FINAL_OUTPUT_dir + '/{basename}.final.bam'
- output:
- frequencies = REPORTS_dir + '/{basename}.GCBias.frequencies',
- biasPlot = REPORTS_dir + '/{basename}.GCBias.plot.pdf'
- log: LOG_BENCHMARK_dir + "/deepTools_computeGCBias.{basename}.log"
- message: "{ruleDisplayMessage}Run deepTools: computeGCBias for file {input.bam}..."
- threads: threadsMax
- priority: 1
- resources: maxMemGB=20
- benchmark: LOG_BENCHMARK_dir + "/deepTools_computeGCBias_{basename}.benchmark"
- params:
- genome2Bit = config["additionalInputFiles"]["refGenome_2bit"],
- blacklistRegions = config["additionalInputFiles"]["blacklistRegions"],
- effectiveGenomeSize = config["par_deepTools"]["effectiveGenomeSize"],
- fragmentLength = "200", # will be ignored for paired-end data, as the fragment length is then computed from the BAM file
- other = ""
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
- shell:
- """sh -c '
- computeGCBias \
- --bamfile {input.bam} \
- --effectiveGenomeSize {params.effectiveGenomeSize} \
- --genome {params.genome2Bit} \
- --blackListFileName {params.blacklistRegions} \
- --fragmentLength {params.fragmentLength}\
- {params.other} \
- --plotFileFormat pdf \
- --biasPlot {output.biasPlot} \
- --GCbiasFrequenciesFile {output.frequencies} \
- --numberOfProcessors {threads} \
- 2> {log:q}
- '"""
-
-
-rule deepTools_correctGCBias:
- input:
- bam = FINAL_OUTPUT_dir + '/{basename}.final.bam',
- frequencies = rules.deepTools_computeGCBias.output.frequencies
- output:
- bam = FINAL_OUTPUT_dir + '/{basename}.noGCBias.final.bam',
- index = FINAL_OUTPUT_dir + '/{basename}.noGCBias.final.bam.bai'
- log: LOG_BENCHMARK_dir + "/deepTools_correctGCBias.{basename}.log"
- message: "{ruleDisplayMessage}Run deepTools: correctGCBias for file {input.bam}..."
- threads: threadsMax
- priority: 1
- resources: maxMemGB=20
- benchmark: LOG_BENCHMARK_dir + "/deepTools_correctGCBias_{basename}.benchmark"
- params:
- genome2Bit = config["additionalInputFiles"]["refGenome_2bit"],
- effectiveGenomeSize = config["par_deepTools"]["effectiveGenomeSize"],
- other = ""
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
- shell:
- """sh -c '
- correctGCBias \
- --bamfile {input.bam} \
- --GCbiasFrequenciesFile {input.frequencies} \
- --effectiveGenomeSize {params.effectiveGenomeSize} \
- --genome {params.genome2Bit} \
- --correctedFile {output.bam} \
- --numberOfProcessors {threads} \
- 2> {log:q} &&
- samtools index {output.bam}
- '"""
-
-
-##################
-##################
-## PEAK CALLING ##
-##################
-##################
-
-
-def getGenomeTypeMacs2(assemblyVersion):
-
- if assemblyVersion == "mm9" or assemblyVersion == "mm10":
- genomeType = "mm"
- elif assemblyVersion == "hg19" or assemblyVersion == "hg38":
- genomeType = "hs"
- else:
- raise NotImplementedError("Genome assembly version " + assemblyVersion + " not yet implemented for the -g parameter in MACS2.")
-
- return genomeType
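-
-# Minimal usage sketch for the helper above (mirrors the mapping as written;
-# evaluated at parse time and side-effect free):
-assert getGenomeTypeMacs2("hg38") == "hs" and getGenomeTypeMacs2("mm10") == "mm"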
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-macs2_stringent_outputName = PEAK_STRINGENT_dir + '/{basename}' + '.stringent'
-
-# Runs for all BAM files in the final output folder, including the merged replicate ones
-rule macs2_stringent:
- input:
- bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir)
- output:
- peaks_bedT = temp(macs2_stringent_outputName + '_peaks.narrowPeak'),
- peaks_bed = temp(macs2_stringent_outputName + '.narrowPeak'),
- summit_bed = macs2_stringent_outputName + '_summits.bed',
- xls = temp(macs2_stringent_outputName + '_peaks.xls')
- log: LOG_BENCHMARK_dir + "/macs2_stringent.{basename}.log"
- message: "{ruleDisplayMessage}Run MACS2 (stringent) for {input.bam:q} ..."
- threads: 1
- priority: 1
- resources: maxMemGB=20
- benchmark: LOG_BENCHMARK_dir + "/macs2_stringent.{basename}.benchmark"
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/macs2.yaml"
- params:
- qValue = config["par_peakCalling"]["modelStringent_minQValue"],
- modelPar = config["par_peakCalling"]["modelStringent"],
- genomeType = getGenomeTypeMacs2(config["par_align"]["assemblyVersion"]),
- name = lambda wildcards: wildcards.basename + '.stringent',
- outputDir = PEAK_STRINGENT_dir,
- keepDuplicates = "--keep-dup all"
- shell:
- """macs2 callpeak \
- --treatment {input.bam} \
- -q {params.qValue} \
- --outdir {params.outputDir}\
- --name {params.name}\
- -g {params.genomeType} \
- {params.keepDuplicates} \
- {params.modelPar} \
- 2> {log:q} &&
- sort -k 8gr,8gr {output.peaks_bedT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.peaks_bed}
- """
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-macs2_nonStringent_outputName = PEAK_NONSTRINGENT_dir + '/{basename}' + '.nonStringent'
-
-
-rule macs2_nonStringent:
- input:
- bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir)
- output:
- peaks_bedT = temp(macs2_nonStringent_outputName + '_peaks.narrowPeak'),
- peaks_bed = temp(macs2_nonStringent_outputName + '.narrowPeak'),
- summit_bed = macs2_nonStringent_outputName + '_summits.bed',
- xls = temp(macs2_nonStringent_outputName + '_peaks.xls')
- log: LOG_BENCHMARK_dir + "/macs2_nonStringent.{basename}.log"
- message: "{ruleDisplayMessage}Run MACS2 (non-stringent) for {input.bam:q} ..."
- threads: 1
- priority: 1
- resources: maxMemGB=20
- benchmark: LOG_BENCHMARK_dir + "/macs2_nonStringent.{basename}.benchmark"
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/macs2.yaml"
- params:
- qValue = config["par_peakCalling"]["modelNonStringent_minQValue"],
- slocalVal = config["par_peakCalling"]["modelNonStringent_slocal"],
- modelPar = config["par_peakCalling"]["modelNonStringent"],
- genomeType = getGenomeTypeMacs2(config["par_align"]["assemblyVersion"]),
- name = lambda wildcards: wildcards.basename + '.nonStringent',
- outputDir = PEAK_NONSTRINGENT_dir,
- keepDuplicates = "--keep-dup all"
- shell:
- """macs2 callpeak \
- --treatment {input.bam} \
- -q {params.qValue} \
- --outdir {params.outputDir}\
- --name {params.name}\
- -g {params.genomeType} \
- {params.modelPar} \
- {params.keepDuplicates} \
- --slocal {params.slocalVal} \
- 2> {log:q} &&
- sort -k 8gr,8gr {output.peaks_bedT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.peaks_bed}
- """
-
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-macs2_Encode_outputName = PEAK_ENCODE_dir + '/{basename}' + '.Encode'
-
-rule macs2_Encode:
- input:
- bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir)
- output:
- broadPeakfileT = temp(macs2_Encode_outputName + '_peaks.broadPeak'),
- gappedPeakfileT = temp(macs2_Encode_outputName + '_peaks.gappedPeak'),
- narrowPeakfileT = temp(macs2_Encode_outputName + '_peaks.narrowPeak'),
- xls = temp(macs2_Encode_outputName + '_peaks.xls'),
- broadPeakfile = temp(macs2_Encode_outputName + '.broadPeak'),
- gappedPeakfile = temp(macs2_Encode_outputName + '.gappedPeak'),
- narrowPeakfile = temp(macs2_Encode_outputName + '.narrowPeak'),
- bdg1 = macs2_Encode_outputName + '_control_lambda.bdg',
- bdg2 = macs2_Encode_outputName + '_treat_pileup.bdg'
- log:
- broadAndGapped = LOG_BENCHMARK_dir + "/macs2_Encode_broadAndGapped.{basename}.log",
- narrow = LOG_BENCHMARK_dir + "/macs2_Encode_narrow.{basename}.log"
- message: "{ruleDisplayMessage}Run MACS2 (Encode version) for {input.bam:q} ..."
- threads: 1
- priority: 1
- resources: maxMemGB=20
- benchmark: LOG_BENCHMARK_dir + "/macs2_Encode.{basename}.benchmark"
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/macs2.yaml"
- params:
- pValue = config["par_peakCalling"]["Encode_pValThreshold"],
- modelParBroadAndGapped = config["par_peakCalling"]["Encode_modelBroadAndGapped"],
- modelParNarrow = config["par_peakCalling"]["Encode_modelNarrow"],
- genomeType = getGenomeTypeMacs2(config["par_align"]["assemblyVersion"]),
- name = lambda wildcards: wildcards.basename + '.Encode',
- outputDir = PEAK_ENCODE_dir,
- keepDuplicates = "--keep-dup all"
- shell:
- # 1. First produce broad and gapped peaks, then narrow ones
- # See https://www.encodeproject.org/atac-seq/ and https://github.com/kundajelab/atac_dnase_pipelines/blob/62e1c544a394d3215d0b2d24743fc1e8bb08123c/modules/callpeak_macs2.bds
- # After peak calling, sort by columns 8 and 14 in descending order and replace long peak names in column 4 with Peak_<peakRank>
- """
- macs2 callpeak \
- --treatment {input.bam} \
- -p {params.pValue} \
- --outdir {params.outputDir}\
- --name {params.name}\
- -g {params.genomeType} \
- {params.keepDuplicates} \
- {params.modelParBroadAndGapped} \
- 2> {log.broadAndGapped:q} &&
- sort -k 8gr,8gr {output.broadPeakfileT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.broadPeakfile} &&
- sort -k 14gr,14gr {output.gappedPeakfileT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.gappedPeakfile} &&
- macs2 callpeak \
- --treatment {input.bam} \
- -p {params.pValue} \
- --outdir {params.outputDir} \
- --name {params.name}\
- -g {params.genomeType} \
- {params.keepDuplicates} \
- {params.modelParNarrow} \
- 2> {log.narrow:q} &&
- sort -k 8gr,8gr {output.narrowPeakfileT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.narrowPeakfile}
- """
-
-
-def awkStringPeakType (peakType, overlap):
- """Build the awk filter that keeps peak pairs with a fractional overlap >= overlap."""
-
- awkStr = "{{s1=$3-$2; "
-
- if peakType == "narrow":
- awkStr = awkStr + "s2=$13-$12; if (($21/s1 >= " + str(overlap) + ") || ($21"
- elif peakType == "broad":
- awkStr = awkStr + "s2=$12-$11; if (($19/s1 >= " + str(overlap) + ") || ($19"
- elif peakType == "gapped":
- awkStr = awkStr + "s2=$18-$17; if (($31/s1 >= " + str(overlap) + ") || ($31"
-
- awkStr = awkStr + "/s2 >= " + str(overlap) + ")) {{print $0}}}}"
- return awkStr
-
-
-def cutColPeakType (peakType):
- """Return the column range to keep for the given peak type."""
-
- if peakType == "narrow":
- return "1-10"
- elif peakType == "broad":
- return "1-9"
- elif peakType == "gapped":
- return "1-15"
-
-
-def generateInputFiles (wildcards):
- if len(getSampleBasenamesForIndividual(wildcards.individual)) == 0:
- raise AssertionError("Cannot determine sample basenames for wildcard " + wildcards.individual + ": " + str(len(getSampleBasenamesForIndividual(wildcards.individual))))
- return expand('{dir}/{samples}{GCBiasStr}final.{analysisType}.{peakType}Peak.filtered.bed', dir = wildcards.dir, samples = getSampleBasenamesForIndividual(wildcards.individual), GCBiasStr = wildcards.GCBiasStr, analysisType = wildcards.analysisType, peakType = wildcards.peakType)
-
-rule poolPeaksReplicateSamples:
- input:
- peakfiles = generateInputFiles
- output:
- pooledPeaks = expand('{{dir}}/{{individual}}.merged{{GCBiasStr}}final.{{analysisType}}.pooled.{{peakType}}Peak.filtered2.bed'),
- replicatePeaks = expand('{{dir}}/{{individual}}.merged{{GCBiasStr}}final.{{analysisType}}.replicate.{{peakType}}Peak.filtered2.bed')
- log:
- message: "{ruleDisplayMessage}Pool peaks for individual {wildcards.individual}, GCBias:{wildcards.GCBiasStr}, analysisType: {wildcards.analysisType}, peaktype: {wildcards.peakType} and produce pooled and replicated peaks for input {input.peakfiles:q} ..."
- threads: 1
- priority: 1
- resources: maxMemGB=20
- params: minOverlap = 0.5
- run:
-
- # 1. Pool replicate samples and produce a replicate peak file
-
- # With compression : zcat {input.peakfiles} | gzip -nc > {output.pooledPeaks}
-
- # Without compression
- shell("""cat {input.peakfiles} > {output.pooledPeaks}""")
-
- # 2. From this set of peaks on pooled data, we only retain those that have at least 50% overlap with a peak in both replicates.
-
- # https://github.com/kundajelab/atac_dnase_pipelines/blob/master/modules/callpeak_naive_overlap.bds
- # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs >= 0.5
-
- cutStr = cutColPeakType (wildcards.peakType)
- awkStr = awkStringPeakType (wildcards.peakType, params.minOverlap)
-
- command = ""
-
- # Strategy: call intersectBed multiple times. -a is always the result of the previous call, starting with all pooled peaks; each round keeps only the peaks that overlap the current replicate by at least 50%, so chaining the calls acts as a logical "and" across replicates
- for i in range(len(input)):
-
- command = command + "intersectBed -wo -a "
-
- if ( i == 0 ):
- command += """{output.pooledPeaks}"""
- else:
- command += "stdin "
-
- command = command + " -b " + input[i] + """ | awk 'BEGIN{{FS="\\t";OFS="\\t"}} {awkStr}' | cut -f {cutStr} | sort | uniq """
-
- if not (i == (len(input) - 1) ):
- command = command + "|"
-
-
-
- command = command + """ >{output.replicatePeaks}"""
-
- shell(command)
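-
- # For two replicate samples, the loop above assembles a chain of this shape
- # (illustrative, narrowPeak case with cut -f 1-10; file names are placeholders):
- #
- # intersectBed -wo -a pooled.narrowPeak -b rep1.narrowPeak \
- #   | awk 'BEGIN{FS="\t";OFS="\t"} <overlap filter>' | cut -f 1-10 | sort | uniq \
- #   | intersectBed -wo -a stdin -b rep2.narrowPeak \
- #   | awk 'BEGIN{FS="\t";OFS="\t"} <overlap filter>' | cut -f 1-10 | sort | uniq > replicate.narrowPeak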
-
-rule filterPeaks:
- input:
- bed = '{dir}/{basename}Peak'
-
- output:
- bed = '{dir}/{basename}Peak' + '.filtered.bed'
- log:
- message: "{ruleDisplayMessage}Exclude blacklist regions for file {input.bed}..."
- threads: 1
- priority: 1
- resources: maxMemGB=20
- #benchmark: LOG_BENCHMARK_dir + "/filterPeaks.{basename}.benchmark"
- params:
- blacklistRegions = config["additionalInputFiles"]["blacklistRegions"]
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bedtools.yaml"
- shell:
- """sh -c '
- bedtools subtract \
- -a {input.bed} \
- -b {params.blacklistRegions} \
- > {output.bed}
- '"""
-
-# rule idr:
-# input:
-# sample1Peaks = expand('{dir}/{{sample1}}.final.{{analysisType}}.{{peakType}}Peak.Peak' , dir = PEAK_dir),
-# sample2Peaks = expand('{dir}/{{sample2}}.final.{{analysisType}}.{{peakType}}Peak.Peak' , dir = PEAK_dir),
-# pooledPeaks = expand()
-# output:
-# peaks = expand('{dir}/idr_{{sample1}}.{{sample2}}.{{analysisType}}.{{peakType}}Peak' , dir = PEAK_dir),
-# plot = expand('{dir}/idr_{{sample1}}.{{sample2}}.{{analysisType}}.{{peakType}}Peak.png', dir = PEAK_dir)
-# log:
-# LOG_BENCHMARK_dir + "/idr.{sample1}.{sample2}.{analysisType}.{peakType}.log"
-# message:
-# "{ruleDisplayMessage}Run IDR analysis for files {sample1} and {sample2} using analysisType={analysisType} and peakType={peakType}..."
-# threads: 1
-# priority: 1
-# resources: maxMemGB=20
-# benchmark:
-# LOG_BENCHMARK_dir + "/idr.{sample1}.{sample2}.{analysisType}.{peakType}.benchmark"
-# params:
-# rank = config["par_peakCalling"]["idr_rank"],
-# softIDRThreshold = config["par_peakCalling"]["idr_softIDRThreshold"],
-# other = "--plot --use-best-multisummit-IDR",
-# inputFileType = "narrowPeak" # File type of --samples and --peak-list
-# conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/idr.yaml"
-# shell:
-# """sh -c '{idr_exec:q} \
-# --samples {input.sample1Peaks} {input.sample2Peaks} \
-# --input-file-type {params.inputFileType} \
-# --peak-list {input.pooledPeaks} \
-# --output-file {output.bed:q} \
-# --rank {params.rank} \
-# --soft-idr-threshold {params.softIDRThreshold} \
-# {params.other} \
-# --log-output-file {log:q}'"""
-
-
- # after IDR:
- # sys idr_thresh_transformed=$(awk -v p=$idr_thresh 'BEGIN{print -log(p)/log(10)}')
- #
- # //# Get peaks passing global IDR threshold and convert file to narrowPeak format (Step 9)
- # sys awk 'BEGIN{OFS="\\t"} $12>='"${idr_thresh_transformed}"' {if ($2<0) $2=0; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,"0"}' $idr_out \
- # | sort | uniq | sort -k7n,7n | gzip -nc > $peak_idr_trk_tmp
- #
- # sys zcat $peak_idr_trk_tmp | awk 'BEGIN{OFS="\\t"} {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}' | gzip -nc > $peak_idr
- # sys zcat $peak_idr_trk_tmp | awk 'BEGIN{OFS="\\t"} {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' | gzip -nc > $peak_idr_trk
- #
- #
-
-
-##############################
-##############################
-## STATISTICS AND SUMMARIES ##
-##############################
-##############################
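-
-# The fragment-length script below is invoked purely positionally; its interface is
-# (illustrative paths): Rscript <FL_distr_script> <csv1,csv2,...> <cutoff> <out.pdf> <out.RData> <log>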
-
-rule fragment_length_distr:
- input:
- expand('{dir}/{allSamples}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.csv', dir = ADJRSS_dir, MAPQ = par_minMAPQscore, allSamples = allSamplesUnique)
- output:
- pdf = expand('{dir}/allSamples_fragmentLengthDistr.pdf', dir = REPORTS_dir),
- rdata = expand('{dir}/allSamples_fragmentLengthDistr.RData', dir = REPORTS_dir)
- log: expand('{dir}/fragment_length_distr.log', dir = LOG_BENCHMARK_dir)
- message: "{ruleDisplayMessage}Create fragment length distribution..."
- threads: 1
- priority: 1
- benchmark: LOG_BENCHMARK_dir + "/fragment_length_distr.benchmark"
- resources:
- version: VERSION_FL_distr_script
- params:
- FL_distr_cutoff = config["par_scripts"]["FL_distr_script_cutoff"],
- inputString = lambda wildcards, input: ','.join(input)
- shell:
- """sh -c '
- Rscript {script_FL_distr:q} \
- {params.inputString} \
- {params.FL_distr_cutoff} \
- {output.pdf:q} \
- {output.rdata:q} \
- {log:q}
- '"""
-
-
-# Enforce the creation of the report here; use a "dummy" file in case GATK was skipped
-def GATKReport(assemblyVersion):
- if assemblyVersion in ('hg19', 'hg38'):
- return expand('{dir}/{sample}.cleaned4.BQrecal.s.bam.plots.pdf', dir = REPORTS_dir, sample = allSamplesUnique)
- else:
- return expand('{dir}/{sample}{basename}4.s.bam', dir = CLEAN_dir, basename = basenameSuffix, sample = allSamplesUnique)
-
-rule stats:
- input:
- # We have to specify the results from ALL samples here because they are collectively needed as input
- GATKReport(config["par_align"]["assemblyVersion"]),
- expand('{dir}/allSamples_fragmentLengthDistr.pdf',
- dir = REPORTS_dir),
- expand('{dir}/{sample}.s.bam',
- dir = ALIGN_dir, sample = allSamplesUnique),
- expand('{dir}/{sample}.s.bam.bai',
- dir = ALIGN_dir, sample = allSamplesUnique),
- expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.s.bam.stats',
- dir = CHRM_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore),
- expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats',
- dir = RMDUP_DIR, sample = allSamplesUnique, MAPQ = par_minMAPQscore),
- expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats',
- dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore),
- expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.stats',
- dir = ADJRSS_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore),
- expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl',
- dir = RMDUP_DIR, sample = allSamplesUnique),
- expand('{dir}/{sample}.final.bam', dir = FINAL_OUTPUT_dir, sample = allSamplesUnique)
- output:
- pdf = expand('{dir}/allSamples_statSummary.pdf', dir = REPORTS_dir),
- rdata = expand('{dir}/allSamples_statSummary.RData', dir = REPORTS_dir)
- log: expand('{dir}/stats.log', dir = LOG_BENCHMARK_dir)
- message: "{ruleDisplayMessage}Generate statistics about the pipeline and produce file {output:q}..."
- threads: 1
- priority: 1
- benchmark: LOG_BENCHMARK_dir + "/stats.benchmark"
- resources:
- version: VERSION_STATS_script
- params:
- pairedEnd = config["samples"]["pairedEnd"],
- withinThr = config["par_scripts"]["STATS_script_withinThr"],
- outsideThr = config["par_scripts"]["STATS_script_outsideThr"],
- geneTypesToKeep = config["par_scripts"]["STATS_script_geneTypesToKeep"],
- annotationFile = config["additionalInputFiles"]["annotationGTF"],
- statsPattern = "*.s.bam.stats$",
- libraryStatsPattern = "*.s.bam.statsLibraryCompl$",
- rootDir = ROOT_dir
- shell:
- """sh -c '
- Rscript {script_STATS:q} \
- {allSamplesUniqueStr} \
- {params.rootDir:q} \
- {output.pdf:q} \
- {output.rdata:q} \
- {params.annotationFile} \
- {params.pairedEnd} \
- {params.withinThr} \
- {params.outsideThr} \
- {params.geneTypesToKeep} \
- {log:q} \
- {params.statsPattern} \
- {params.libraryStatsPattern}
- '"""
-
-
-rule bamCoverage:
- input:
- bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir)
- output:
- bigwig = expand('{dir}/{{basename}}.bigwig' , dir = FINAL_OUTPUT_dir),
- bedgraph = temp(expand('{dir}/{{basename}}.bedgraph', dir = FINAL_OUTPUT_dir)),
- bedgraphgz = expand('{dir}/{{basename}}.bedgraph.gz', dir = FINAL_OUTPUT_dir),
- bedgraphIndex = expand('{dir}/{{basename}}.bedgraph.gz.tbi', dir = FINAL_OUTPUT_dir)
- log:
- bigwig = LOG_BENCHMARK_dir + "/bamCoverage.{basename}.bigwig.log",
- bedgraph = LOG_BENCHMARK_dir + "/bamCoverage.{basename}.bedgraph.log"
- message: "{ruleDisplayMessage}Run bamCoverage for {input.bam:q} ..."
- threads: threadsMax
- priority: 1
- resources: maxMemGB=20
- benchmark: LOG_BENCHMARK_dir + "/bamCoverage.{basename}.benchmark"
- conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
- params:
- normalization = bamCoverage_normOption,
- binSize = config["par_deepTools"]["bamCoverage_binSize"],
- ignoreChr = "chrX chrM",
- otherOptions = config["par_deepTools"]["bamCoverage_otherOptions"],
- duplicates = "--ignoreDuplicates" # Ignored now, should NOT be set for GC-corrected data
- shell:
- # First bigwig, then bedgraph
- """sh -c '
- bamCoverage \
- --bam {input.bam} \
- --binSize {params.binSize} \
- --{params.normalization} \
- {params.otherOptions} \
- --numberOfProcessors {threadsMax} \
- --ignoreForNormalization {params.ignoreChr} \
- --outFileName {output.bigwig} \
- --outFileFormat bigwig \
- 2> {log.bigwig:q} &&
- bamCoverage \
- --bam {input.bam} \
- --binSize {params.binSize} \
- --{params.normalization} \
- {params.otherOptions} \
- --numberOfProcessors {threadsMax} \
- --ignoreForNormalization {params.ignoreChr} \
- --outFileName {output.bedgraph} \
- --outFileFormat bedgraph \
- 2> {log.bedgraph:q} &&
- sort -k1,1 -k2,2n {output.bedgraph} | bgzip > {output.bedgraphgz} &&
- tabix -s 1 -b 2 -e 3 {output.bedgraphgz}
- '"""
-
-# Note the necessary [] for concatenating the two arrays
-basenameSamplesAndIndArray = numpy.concatenate([allIndividualsUnique + ".merged", allSamplesUnique])
-
-rule deepTools_plotCoverage:
- input:
- bamAll = expand('{dir}/{basename}{GCBiasStr}.final.bam', dir = FINAL_OUTPUT_dir, basename = basenameSamplesAndIndArray, GCBiasStr = ["",".noGCBias"])
- output:
- plot = REPORTS_dir + '/allSamples_CoveragePlot.pdf',
- rawCounts = REPORTS_dir + '/allSamples_CoveragePlot.counts'
- log: LOG_BENCHMARK_dir + "/deepTools_plotCoverage.log"
- message: "{ruleDisplayMessage}Run deepTools: plotCoverage for file {input.bamAll}..."
- threads: threadsMax - priority: 1 - resources: maxMemGB=20 - benchmark: LOG_BENCHMARK_dir + "/deepTools_plotCoverage.benchmark" - params: - fragmentLength = "--minFragmentLength", # Currently ignored, might be useful for ATAC-seq data - other = "--centerReads --plotFileFormat pdf --numberOfSamples 3000000", - titlePlot = "Coverage plot for all samples", - duplicates = "--ignoreDuplicates" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml" - shell: - """sh -c ' - plotCoverage \ - --bamfiles {input.bamAll} \ - --plotTitle "{params.titlePlot}" \ - {params.other} \ - --numberOfProcessors {threads} \ - {params.duplicates} \ - --outRawCounts {output.rawCounts} \ - --plotFile {output.plot:q} \ - 2> {log:q} '""" - -rule deepTools_correlationPlots: - input: - bamAll = expand('{dir}/{allSamples}{GCBiasStr}.final.bam', dir = FINAL_OUTPUT_dir, allSamples = allSamplesUnique, GCBiasStr = ["",".noGCBias"]) - output: - npz = REPORTS_dir + '/allSamples' + '.bins.npz', - pdf = REPORTS_dir + '/allSamples' + '.correlations.pdf', - rawCounts = REPORTS_dir + '/allSamples' + '.correlations.rawCounts' - #corMatrix = REPORTS_dir + '/allSamples' + '.correlations.matrix' - log: - multiBamSummary = LOG_BENCHMARK_dir + "/deepTools_correlationPlots.multiBamSummary.log", - plotCorrelation = LOG_BENCHMARK_dir + "/deepTools_correlationPlots.plotCorrelation.log" - message: "{ruleDisplayMessage}Run deepTools: multiBamSummary and correlationPlots for files {input.bamAll}..." - threads: threadsMax - priority: 1 - resources: maxMemGB=20 - benchmark: LOG_BENCHMARK_dir + "/deepTools_correlationPlots.benchmark" - params: - corMethod = "pearson", - whatToPlot = "heatmap", - colorMap = "hsv", - other = "--skipZeros --plotFileFormat pdf --removeOutliers --plotNumbers", - titlePlot = "Correlation plot (" + "Pearson" + ")" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml" - shell: - """sh -c ' - multiBamSummary \ - bins \ - --bamfiles {input.bamAll} \ - --numberOfProcessors {threads} \ - --outRawCounts {output.rawCounts} \ - -out {output.npz:q} \ - 2> {log.multiBamSummary:q} && - plotCorrelation \ - --corData {output.npz:q} \ - --corMethod "{params.corMethod}" \ - --whatToPlot "{params.whatToPlot}" \ - --plotTitle "{params.titlePlot}" \ - --colorMap "{params.colorMap}" \ - {params.other} \ - --plotFile {output.pdf:q} \ - 2> {log.plotCorrelation:q} - '""" - -# bug in version 2.4.1: --outFileCorMatrix {output.corMatrix} \ -# --labels {allSamplesUniqueStrSpaces} # No " " here these cause an error! - - -rule deepTools_correlationPlotsPooledSamples: - input: - bamAll = expand('{dir}/{allIndividuals}.merged{GCBiasStr}.final.bam', dir = FINAL_OUTPUT_dir, allIndividuals = allIndividualsUnique, GCBiasStr = ["",".noGCBias"]) - output: - npz = REPORTS_dir + '/allSamplesPooled' + '.bins.npz', - pdf = REPORTS_dir + '/allSamplesPooled' + '.correlations.pdf', - rawCounts = REPORTS_dir + '/allSamplesPooled' + '.correlations.rawCounts' - #corMatrix = DOWNSTREAM_dir + '/allSamples' + '.minMapQ' + str(par_minMAPQscore) + '.rmChrM.adRSS.rmDup.rmINDEL.correlations.corMatrix' - log: - multiBamSummary = LOG_BENCHMARK_dir + "/deepTools_correlationPlotsPooledSamples.multiBamSummary.log", - plotCorrelation = LOG_BENCHMARK_dir + "/deepTools_correlationPlotsPooledSamples.plotCorrelation.log" - message: "{ruleDisplayMessage}Run deepTools: multiBamSummary and correlationPlots for files {input.bamAll}..." 
- threads: threadsMax - priority: 1 - resources: maxMemGB=20 - benchmark: LOG_BENCHMARK_dir + "/deepTools_correlationPlotsPooledSamples.benchmark" - params: - corMethod = "pearson", - whatToPlot = "heatmap", - colorMap = "hsv", - other = "--skipZeros --plotFileFormat pdf --removeOutliers --plotNumbers", - titlePlot = "Correlation plot (" + "Pearson" + ")" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml" - shell: - """sh -c ' - multiBamSummary \ - bins \ - --bamfiles {input.bamAll} \ - --numberOfProcessors {threads} \ - --outRawCounts {output.rawCounts} \ - -out {output.npz:q} \ - 2> {log.multiBamSummary:q} && - plotCorrelation \ - --corData {output.npz:q} \ - --corMethod "{params.corMethod}" \ - --whatToPlot "{params.whatToPlot}" \ - --plotTitle "{params.titlePlot}" \ - --colorMap "{params.colorMap}" \ - {params.other} \ - --plotFile {output.pdf:q} \ - 2> {log.plotCorrelation:q} - '""" -# bug in version 2.4.1: --outFileCorMatrix {output.corMatrix} \ --labels {allIndividualsUniqueStrSpaces} \ - - - -rule deepTools_plotPCA: - input: - coverage = REPORTS_dir + '/{basename}.bins.npz' - output: - plot = REPORTS_dir + '/{basename}_PCAPlot.pdf', - data = REPORTS_dir + '/{basename}_PCAPlot.data' - log: LOG_BENCHMARK_dir + "/deepTools_plotPCA.{basename}.log" - message: "{ruleDisplayMessage}Run deepTools: plotPCA for coverage file {input.coverage}..." - threads: 1 - priority: 1 - resources: maxMemGB=20 - benchmark: LOG_BENCHMARK_dir + "/deepTools_plotPCA_{basename}.benchmark" - params: - other = "", - titlePlot = "PCA plot for all samples" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml" - shell: - """sh -c ' - plotPCA \ - --corData {input.coverage} \ - --plotTitle "{params.titlePlot}" \ - --plotFileFormat pdf \ - {params.other} \ - --outFileNameData {output.data} \ - --plotFile {output.plot:q}\ - 2> {log:q} - '""" - - -# Enforce that it runs at the very end - -rule multiqc: - input: - summaryStats = expand('{dir}/allSamples_statSummary.{fileType}', dir = REPORTS_dir, fileType = ["pdf", "RData"]), - fragmentLength = expand('{dir}/allSamples_fragmentLengthDistr.{fileType}', dir = REPORTS_dir, fileType = ["pdf", "RData"]), - corrPlots = REPORTS_dir + '/allSamples' + '.correlations.pdf', - corrPlotsPooled = REPORTS_dir + '/allSamplesPooled' + '.correlations.pdf', - PCAPlot = expand('{dir}/{basename}_PCAPlot.pdf', dir = REPORTS_dir, basename = ["allSamples", "allSamplesPooled"]), - GCBiasPlot = expand('{dir}/{sample}{GCBiasStr}.GCBias.plot.pdf', dir = REPORTS_dir, sample = allSamplesUnique, GCBiasStr = ["", ".noGCBias"]), - GCBiasPlotPooled= expand('{dir}/{individual}.merged{GCBiasStr}.GCBias.plot.pdf', dir = REPORTS_dir, individual = allIndividualsUnique, GCBiasStr = ["", ".noGCBias"]), - coveragePlot = REPORTS_dir + '/allSamples_CoveragePlot.pdf', - coverage = expand('{dir}/{sample}{GCBiasStr}.final.{type}', dir = FINAL_OUTPUT_dir, sample = allSamplesUnique , GCBiasStr = ["", ".noGCBias"], type = ["bigwig", "bedgraph.gz"]), - coveragePooled = expand('{dir}/{individual}.merged{GCBiasStr}.final.{type}', dir = FINAL_OUTPUT_dir, individual = allIndividualsUnique, GCBiasStr = ["", ".noGCBias"], type = ["bigwig", "bedgraph.gz"]) - output: - report = REPORTS_dir + '/multiqc_report.html' - log: - message: "{ruleDisplayMessage}Finally, run multiqc for the folder {ROOT_dir}..." 
- threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/multiqc.benchmark" - resources: - params: - outputDir = lambda wildcards, output: os.path.dirname(output.report), - basename = lambda wildcards, output: os.path.basename(output.report), - rootDir = ROOT_dir - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/multiqc.yaml" - shell: - """sh -c ' - multiqc \ - --force \ - --ignore *.out --ignore *.err \ - -o {params.outputDir} \ - --filename {params.basename} \ - {params.rootDir:q} - '""" diff --git a/src/Snakemake/old/Snakefile_hg38 b/src/Snakemake/old/Snakefile_hg38 deleted file mode 100755 index 7484e51..0000000 --- a/src/Snakemake/old/Snakefile_hg38 +++ /dev/null @@ -1,1904 +0,0 @@ -# ATAC-seq analysis pipeline - -# TODO: -# 1. Decide for environments and their combinations -# 2. bamCoverage bam index & put --centerReads back when error is fixed -# 3. peak calling encode -# - -shell.prefix("module purge; module load GCCcore/5.4.0 ncurses BEDTools/2.26.0-foss-2016b SAMtools/1.3.1-foss-2016b R-bundle-Bioconductor/3.5-foss-2016b-R-3.4.0 Autoconf/2.69 Java/1.8.0_112 FastQC/0.11.5-Java-1.8.0_112 Trimmomatic/0.32-Java-1.7.0_80 Bowtie2/2.3.2-foss-2016b Java/1.8.0_112; ") - -####################################### -# General stuff to make things easier # -####################################### - -# Make the output nicer and easier to follow -ruleDisplayMessage = "\n\n########################\n# START EXECUTING RULE #\n########################\n" - -############################################ -# Libraries, versions, authors and license # -############################################ - -from snakemake.utils import min_version -import subprocess -from os import makedirs -import pandas -import numpy - -# Enforce a minimum Snakemake version because of various features -min_version("4.0") - -__author__ = "Christian Arnold" -__license__ = "MIT" - - - -############################################ -# Working directory and configuration file # -############################################ - -# Not needed, will be provided via the command line in Snakemake -#DEFAULT_CONFIG_FILE = "/g/scb/zaugg/carnold/Projects/AtacSeq/example/config.json" -#configfile: DEFAULT_CONFIG_FILE - -#ROOT_dir = config["par_general"]["workdir"] -#workdir: ROOT_dir - -########################################### -# Onstart, onsuccess and onerror handlers # -########################################### - -# Sometimes, it is necessary to specify code that shall be executed when the workflow execution is finished (e.g. cleanup, or notification of the user). - -# The onsuccess handler is executed if the workflow finished without error. -onsuccess: - print("\n\n###############################\n# Workflow finished, no error #\n###############################\n\n") - -# Else, the onerror handler is executed. -onerror: - print("\n\n#####################\n# An error occurred #\n#####################\n\n") - #shell("mail -s "an error occurred" carnold@embl.de < {log}") - -# onstart handler will be executed before the workflow starts. 
Note that dry-runs do not trigger any of the handlers -onstart: - print("Reading samples and metadata....\n") - print ("Running workflow for the following samples:\n " + ' \n '.join(map(str, allSamplesUnique))) - - - -def read_samplesTable(samplesSummaryFile): - """text""" - - data = pandas.read_table(samplesSummaryFile) - - # Expect a particular number of columns, do a sanity check here - - if not {'individual', 'sampleName', 'Flowcell_ID', 'lane_ID', 'Technology', 'Library_ID'}.issubset(data.columns.values): - raise KeyError("The samples file must contain the following named columns (TAB separated!): individual, sampleName, Flowcell_ID, lane_ID, Technology, Library_ID") - - # Make sure the individual column is a string - data['individual'] = data['individual'].astype(str) - - - return data - - -def constructRGFields(samplesData): - """text""" - readGroupFields = {} - - for rowCur in range(0, len(samplesData.index)): - - individual = samplesData.ix[rowCur,"individual"] - field_PL = samplesData.ix[rowCur,"Technology"] - flowcell = samplesData.ix[rowCur,"Flowcell_ID"] - lane = samplesData.ix[rowCur,"lane_ID"] - sample = samplesData.ix[rowCur,"sampleName"] - library = samplesData.ix[rowCur,"Library_ID"] - - field_ID = str(flowcell) + "." + str(lane) - field_SM = str(individual) - field_PU = str(flowcell) + "." + str(lane) + "." + str(sample) - field_LB = str(library) - - readGroupFields[sample] = {"ID": field_ID, - "LB": field_LB, - "PL": field_PL, - "PU": field_PU, - "SM": field_SM} - - - return readGroupFields - - - -############################# -# DIRECTORIES AND VARIABLES # -############################# - -# Maximum number of cores per rule. This value will never be achieved because the minimum of this value and the --cores parameter will define the -# number of CPUs per rule in the end. 
-threadsMax = 16
-
-# Input files
-samplesSummaryFile = config["samples"]["summaryFile"]
-pairedEnd = config["samples"]["pairedEnd"]
-
-if not pairedEnd:
- print ("Error: SE reads are not yet supported by the pipeline.\n")
- sys.exit(1)
-
-INPUT_ORIG_DIR= os.path.dirname(samplesSummaryFile)
-
-ROOT_dir = config["par_general"]["outdir"]
-INPUT_DIR = ROOT_dir + "/0.Input"
-FASTQC_BT_dir = ROOT_dir + "/1.FastQC_beforeTrimming"
-TRIM_dir = ROOT_dir + "/2.Trimming"
-FASTQC_AT_dir = ROOT_dir + "/3.FastQC_afterTrimming"
-ALIGN_dir = ROOT_dir + "/4.Alignment"
-POSTALIGN_dir = ROOT_dir + "/5.Postalignment"
-CLEAN_dir = POSTALIGN_dir + "/1.Clean"
-BASERECAL_dir = POSTALIGN_dir + "/2.BaseRecalibration"
-CHRM_dir = POSTALIGN_dir + "/3.Filter_chrM"
-rmDup_dir = POSTALIGN_dir + "/4.MarkAndRemove_Duplicates"
-MAPQsort_dir = POSTALIGN_dir + "/5.Filter_MAPQ"
-ADJRSS_dir = POSTALIGN_dir + "/6.Adjust_RSS"
-RMINDEL_dir = POSTALIGN_dir + "/7.Filter_INDELs"
-PEAKCALLING_dir = ROOT_dir + "/6.PeakCalling"
-DOWNSTREAM_dir = ROOT_dir + "/7.DownstreamAnalyses"
-FINAL_OUTPUT_dir = ROOT_dir + "/8.FinalOutput"
-REPORTS_dir = ROOT_dir + "/Reports_and_Stats"
-LOG_BENCHMARK_dir = ROOT_dir + "/Logs_and_Benchmarks"
-
-PEAKCALLING_STRINGENT_dir = PEAKCALLING_dir + "/stringent"
-PEAKCALLING_NONSTRINGENT_dir = PEAKCALLING_dir + "/nonStringent"
-PEAKCALLING_ENCODE_dir = PEAKCALLING_dir + "/Encode"
-
-global samplesData
-samplesData = read_samplesTable(config["samples"]["summaryFile"])
-
-# Make it accessible also within functions
-global RGFields
-RGFields = constructRGFields(samplesData)
-
-#print (RGFields)
-#print (RGFields["test1_rep1"]["LB"])
-#sys.exit(1)
-
-# Get all unique sample names
-allIndividualsUnique = numpy.unique(samplesData.loc[:,"individual"])
-allIndividualsUniqueStrSpaces = ' '.join(allIndividualsUnique)
-allSamplesUnique = numpy.unique(samplesData.loc[:,"sampleName"])
-# Get only two samples for nicer graphs
-#allIndividualsUnique = numpy.unique(samplesData.loc[:,"individual"])[0:2]
-#allSamplesUnique = numpy.unique(samplesData.loc[:,"sampleName"])[0:2]
-allSamplesUniqueStr = ','.join(allSamplesUnique)
-allSamplesUniqueStrSpaces = ' '.join(allSamplesUnique)
-#
-# print(samplesData.loc[:,"sampleName"])
-# print(samplesData.loc[:,"individual"])
-# print (allSamplesUnique)
-# print (allIndividualsUnique)
-# print(numpy.concatenate([allIndividualsUnique + ".merged", allSamplesUnique]))
-# sys.exit(0)
-
-# Executables
-
-script_FL_distr = config["executables"]["FL_distr_script"]
-script_STATS = config["executables"]["STATS_script"]
-java_exec = config["executables"]["java"]
-minMemoryJavaGB = 5
-maxMemoryJavaGB = 50
-
-picard_command = java_exec + " -Xms" + str(minMemoryJavaGB) + "g -Xmx" + str(maxMemoryJavaGB) + "g -jar " + config["executables"]["PICARD_jar"]
-gatk_command = java_exec + " -Xms" + str(minMemoryJavaGB) + "g -Xmx" + str(maxMemoryJavaGB) + "g -jar " + config["executables"]["GATK_jar"]
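-
-# With the memory settings above, both wrappers expand to command strings of the shape
-# "<java> -Xms5g -Xmx50g -jar <tool.jar>" (actual paths come from the user config).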
config["executables"]["GATK_jar"] + " not found.") - -if not os.path.isfile(config["executables"]["PICARD_jar"]): - raise IOError("File " + config["executables"]["PICARD_jar"] + " not found.") - -if not os.path.isfile(config["additionalInputFiles"]["blacklistRegions"]): - raise IOError("File " + config["additionalInputFiles"]["blacklistRegions"] + " not found.") - -if not os.path.isfile(config["additionalInputFiles"]["blacklistRegions"]): - raise IOError("File " + config["additionalInputFiles"]["blacklistRegions"] + " not found.") - -if not os.path.isfile(config["additionalInputFiles"]["knownSNPs"]): - raise IOError("File " + config["additionalInputFiles"]["knownSNPs"] + " not found.") - -if not os.path.isfile(config["additionalInputFiles"]["knownIndels"]): - raise IOError("File " + config["additionalInputFiles"]["knownSNPs"] + " not found.") - -if not os.path.isfile(config["additionalInputFiles"]["annotationGTF"]): - raise IOError("File " + config["additionalInputFiles"]["annotationGTF"] + " not found.") - -refGen = config["additionalInputFiles"]["refGenome_fasta"] - - -if not os.path.isfile(refGen): - raise IOError("File " + refGen + " not found.") - -if not os.path.isfile(config["additionalInputFiles"]["refGenome_2bit"]): - raise IOError("File " + ref2bit + " not found.") - - -dictFile = os.path.splitext(refGen)[0] + ".dict" - -faiFile = refGen + ".fai" -if not (os.path.isfile(faiFile)) or not (os.path.isfile(dictFile)): - raise IOError("Either index file *.fai or *.dict for " + refGen + " not found. See https://software.broadinstitute.org/gatk/guide/article?id=1601") - -#### Parameter for bamCoverage - -bamCoverage_normOption = config["par_deepTools"]["bamCoverage_normalizationCoverage"] -if not (bamCoverage_normOption == "normalizeTo1x") and not (bamCoverage_normOption == "normalizeUsingRPKM") and not (bamCoverage_normOption == "ignoreForNormalization"): - raise AssertionError("The config parameter config[\"par_deepTools\"][\"bamCoverage_normalizationCoverage\"] has to be one of: normalizeTo1x, normalizeUsingRPKM, ignoreForNormalization") - -if bamCoverage_normOption == "normalizeTo1x": - bamCoverage_normOption = bamCoverage_normOption + " " + str(config["par_deepTools"]["effectiveGenomeSize"]) - - -########################################################################### -# Get the versions of the used tools and script to record them rigorously # -########################################################################### - -# Almost obselete due to the conda environments. 
-# Only record versions for scripts etc.
-
-# For custom scripts, retrieve the modification date instead
-VERSION_FL_distr_script = str(os.path.getmtime(config["executables"]["FL_distr_script"])).replace('\n', ' ')
-VERSION_STATS_script = str(os.path.getmtime(config["executables"]["STATS_script"])).replace('\n', ' ')
-
-######################
-# CREATE DIRECTORIES #
-######################
-
-# Not needed, they are created automatically
-
-#########
-# RULES #
-#########
-
-# For cluster usage: the keyword localrules marks a rule as local, so that it is not submitted to the cluster but executed on the host node instead
-localrules: all,link_inputFiles,link_outputFiles # , prepareEnvironments_JavaMemoryGATK, prepareEnvironments_JavaMemoryPicard
-
-###################
-###################
-## FINAL TARGETS ##
-###################
-###################
-
-
-allSamplesAndIndividuals = numpy.concatenate([allIndividualsUnique + ".merged", allSamplesUnique + ".final"])
-
-rule all:
-    input:
-        #idr = expand('{dir}/allSamples.final.{stringency}.peaks.IDR.bed.png', dir = PEAKCALLING_dir, stringency = ("stringent", "nonStringent")),
-        # Filtering by blacklist is the last step, all other files have to be produced before
-        # First, the individual sample peak files
-        individualPeaksEncode = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed', dir = PEAKCALLING_ENCODE_dir, sample = allSamplesUnique,
-            GCBias = ["",".noGCBias"],
-            analysisType = ["Encode"],
-            peakType = ["broad", "gapped", "narrow"]
-            ),
-        individualPeaksStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed', dir = PEAKCALLING_STRINGENT_dir, sample = allSamplesUnique,
-            GCBias = ["",".noGCBias"],
-            analysisType = ["stringent"],
-            peakType = ["narrow"]
-            ),
-        individualPeaksNonStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed', dir = PEAKCALLING_NONSTRINGENT_dir, sample = allSamplesUnique,
-            GCBias = ["",".noGCBias"],
-            analysisType = ["nonStringent"],
-            peakType = ["narrow"]
-            ),
-
-        # pooledPeaksEncode = expand('{dir}/{individual}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered.bed', dir = PEAKCALLING_ENCODE_dir, individual = allIndividualsUnique,
-        # #pooledPeaksEncode = expand('{dir}/{individual}.merged.final.{analysisType}{peaktype2}.{peakType}Peak.filtered.bed', dir = PEAKCALLING_ENCODE_dir, individual = allIndividualsUnique,
-        #     GCBias = ["",".noGCBias"],
-        #     analysisType = ["Encode"],
-        #     peaktype2 = ["", ".pooled", ".replicate"],
-        #     peakType = ["broad", "gapped", "narrow"]
-        #     ),
-
-        # pooledPeaksStringent = expand('{dir}/{individual}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered.bed', dir = PEAKCALLING_STRINGENT_dir, individual = allIndividualsUnique,
-        # #pooledPeaksStringent = expand('{dir}/{individual}.merged.final.{analysisType}{peaktype2}.{peakType}Peak.filtered.bed', dir = PEAKCALLING_STRINGENT_dir, individual = allIndividualsUnique,
-        #     GCBias = ["",".noGCBias"],
-        #     analysisType = ["stringent"],
-        #     peaktype2 = ["", ".pooled", ".replicate"],
-        #     peakType = ["narrow"]
-        #     ),
-        #
-        # pooledPeaksNonStringent = expand('{dir}/{individual}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered.bed', dir = PEAKCALLING_NONSTRINGENT_dir, individual = allIndividualsUnique,
-        #pooledPeaksNonStringent = expand('{dir}/{individual}.merged.final.{analysisType}{peaktype2}.{peakType}Peak.filtered.bed', dir = PEAKCALLING_NONSTRINGENT_dir, individual = 
allIndividualsUnique, - # GCBias = ["",".noGCBias"], - # analysisType = ["nonStringent"], - # peaktype2 = ["", ".pooled", ".replicate"], - # peakType = ["narrow"] - # ), - stats = expand('{dir}/multiqc_report.html', dir = REPORTS_dir) - message: "{ruleDisplayMessage}One Pipeline to rule them \"all\"..." - -################## -################## -## PREPARE DATA ## -################## -################## - - -# The following two rules were deactivated because they cause too much trouble all the time. Java is now just called explicitly -# -# rule prepareEnvironments_JavaMemoryGATK: -# output: touch(LOG_BENCHMARK_dir + "/preparedEnvironments_JavaMemoryGATK.done") -# threads: 1 -# message: "{ruleDisplayMessage}Prepare environments that use Java (GATK) and edit Java memory settings..." -# conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml" -# params: -# memMinInGB = 5, -# memMaxInGB = 50, -# gatkJar = config["executables"]["GATK_archive"] -# shell: -# """ -# gatk-register "{params.gatkJar}" -# path_gatk=$(which GenomeAnalysisTK) && -# sed -i 's/-Xms[0-9]\+[mg]/-Xms{params.memMinInGB}g/' $path_gatk && -# sed -i 's/-Xmx[0-9]\+[mg]/-Xmx{params.memMaxInGB}g/' $path_gatk -# """ -# -# rule prepareEnvironments_JavaMemoryPicard: -# output: touch(LOG_BENCHMARK_dir + "/preparedEnvironments_JavaMemoryPicard.done") -# threads: 1 -# message: "{ruleDisplayMessage}Prepare environments that use Java (Picard) and edit Java memory settings..." -# conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" -# params: -# memMinInGB = 5, -# memMaxInGB = 50, -# gatkJar = config["executables"]["GATK_archive"] -# shell: -# """ -# path_picard=$(which picard) && -# sed -i 's/-Xms[0-9]\+[mg]/-Xms{params.memMinInGB}g/' $path_picard && -# sed -i 's/-Xmx[0-9]\+[mg]/-Xmx{params.memMaxInGB}g/' $path_picard -# """ - -rule link_inputFiles: - input: - forward = expand('{dir}/{{sample}}_1.fastq.gz', dir = INPUT_ORIG_DIR), - reverse = expand('{dir}/{{sample}}_2.fastq.gz', dir = INPUT_ORIG_DIR) - output: - forward = expand('{dir}/{{sample}}_1.fastq.gz', dir = INPUT_DIR), - reverse = expand('{dir}/{{sample}}_2.fastq.gz', dir = INPUT_DIR) - log: - message: "{ruleDisplayMessage}Create symbolic links for the input files {input:q} in directory {INPUT_DIR:q}..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/link_inputFiles.{sample}.benchmark" - resources: - version:"NA" - params: sampleCSV = config["samples"]["summaryFile"] - shell: - """sh -c ' - ln -fs {input.forward:q} {output.forward:q} && - ln -fs {input.reverse:q} {output.reverse:q} && - touch -h {output.forward:q} && - touch -h {output.reverse:q} && - cp {params.sampleCSV} {INPUT_DIR} - '""" - - # Workaround for the warning "Unable to set utime on symlink {}. Your Python build does not support it" warning related to symbolic links - # https://bitbucket.org/snakemake/snakemake/issues/397/unable-to-set-utime-on-symlink-your-python - - -rule Picard_CreateSequenceDictionary: - input: - #environment = rules.prepareEnvironments_JavaMemoryPicard.output, - fasta = config["additionalInputFiles"]["refGenome_fasta"] - output: - dict = CLEAN_dir + '/ref.dict.fasta' - log: expand('{dir}/Picard_CreateSequenceDictionary.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Creates a sequence dictionary for a reference sequence for input {input.fasta}..." 
- threads: 1 - priority: 1 - resources:maxMemGB=20 - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_CreateSequenceDictionary.benchmark" - params: - shell: - """sh -c ' - {picard_command} CreateSequenceDictionary \ - R={input.fasta:q} \ - O={output.dict:q} \ - 2> {log:q} - '""" - - -################################# -################################# -## FASTQC, TRIMMING, ALIGNMENT ## -################################# -################################# - -rule fastqc_BT: - input: - forward = rules.link_inputFiles.output.forward, - reverse = rules.link_inputFiles.output.reverse - output: - forward = expand('{dir}/{{sample}}_1_fastqc.zip', dir = FASTQC_BT_dir), - reverse = expand('{dir}/{{sample}}_2_fastqc.zip', dir = FASTQC_BT_dir) - log: expand('{dir}/fastqc_BT.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Perform FASTQC on the samples {input:q} before trimming..." - threads: threadsMax - priority: 1 - resources: - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/fastqc.yaml" - benchmark: LOG_BENCHMARK_dir + "/fastqc_BT.{sample}.benchmark" - shell: - """sh -c ' - fastqc \ - -o {FASTQC_BT_dir:q} \ - -t {threads} \ - --extract \ - {input:q} \ - 2> {log:q} - '""" - -rule trimming_PE: - input: - forward = rules.link_inputFiles.output.forward, - reverse = rules.link_inputFiles.output.reverse, - report = rules.fastqc_BT.output # Not really needed, but force execution here - output: - forward_paired = expand('{dir}/{{sample}}_1.trimmed.fq.gz', dir = TRIM_dir), - forward_unpaired = expand('{dir}/{{sample}}_1.unpaired.fq.gz', dir = TRIM_dir), - reverse_paired = expand('{dir}/{{sample}}_2.trimmed.fq.gz', dir = TRIM_dir), - reverse_unpaired = expand('{dir}/{{sample}}_2.unpaired.fq.gz', dir = TRIM_dir) - log: output = expand('{dir}/trimming_PE_TrimmomaticOutput.{{sample}}.log', dir = LOG_BENCHMARK_dir), - trimlog = expand('{dir}/trimming_PE_trimlog.{{sample}}.log' , dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Trimming of adapters with TRIMMOMATIC in the PE mode for files {input:q} using adapters file {params.adapters:q} ..." - threads: threadsMax - priority: 1 - resources: - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/trimmomatic.yaml" - benchmark: LOG_BENCHMARK_dir + "/trimming_PE.{sample}.benchmark" - params: - ILLUMINACLIP = config["par_trimming"]["trimmomatic_ILLUMINACLIP"], - trailing = config["par_trimming"]["trimmomatic_trailing"], - minlen = config["par_trimming"]["trimmomatic_minlen"], - adapters = config["additionalInputFiles"]["trimmomatic_adapters"] - shell: - """sh -c ' - trimmomatic PE \ - -threads {threads} \ - {input.forward:q} {input.reverse:q} \ - {output.forward_paired:q} {output.forward_unpaired:q} {output.reverse_paired:q} {output.reverse_unpaired:q} \ - ILLUMINACLIP:{params.adapters}:{params.ILLUMINACLIP} \ - TRAILING:{params.trailing} \ - MINLEN:{params.minlen} \ - 2>{log.output:q} - '""" - - # removed: -trimlog {log.trimlog:q} \ - -rule fastqc_AT: - input: - forward = rules.trimming_PE.output.forward_paired, - reverse = rules.trimming_PE.output.reverse_paired - output: - forward = expand('{dir}/{{sample}}_1.trimmed_fastqc.zip', dir = FASTQC_AT_dir), - reverse = expand('{dir}/{{sample}}_2.trimmed_fastqc.zip', dir = FASTQC_AT_dir) - log: expand('{dir}/fastqc_AT.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Perform FASTQC on the samples {input:q} after trimming..." 
- threads: threadsMax - priority: 1 - resources: - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/fastqc.yaml" - benchmark: LOG_BENCHMARK_dir + "/fastqc_AT.{sample}.benchmark" - shell: - """sh -c 'fastqc \ - -o {FASTQC_AT_dir:q} \ - -t {threads} \ - --extract \ - {input:q} \ - 2> {log:q} - '""" - -rule Bowtie2_alignment: - input: - file1 = rules.trimming_PE.output.forward_paired, - file2 = rules.trimming_PE.output.reverse_paired, - report = rules.fastqc_AT.output - output: - temp(expand('{dir}/{{sample}}.bt2.sam', dir = ALIGN_dir)) - threads: threadsMax - priority: 1 - log: expand('{dir}/Bowtie2_alignment.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Do Bowtie2 alignment for files {input:q}. This may take a while..." - benchmark: LOG_BENCHMARK_dir + "/alignment.{sample}.benchmark" - resources: - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bowtie2.yaml" - params: - sensitivity = config["par_align"]["bowtie2_sensitivity"], - refGenome = config["par_align"]["bowtie2_refGenome"], - maxFragmentLength = config["par_align"]["bowtie2_maxFragmentLength"] - shell: - """sh -c 'bowtie2 \ - -p {threads} \ - -X {params.maxFragmentLength} \ - {params.sensitivity} \ - -t \ - -x {params.refGenome} \ - -1 {input.file1:q} -2 {input.file2:q} \ - -S {output:q} \ - 2> {log:q} - '""" - - - -##################################### -##################################### -## CLEANING AND BASE RECALIBRATION ## -##################################### -##################################### - -rule samtools_SAM_TO_BAM: - input: - #environment = rules.prepareEnvironments_JavaMemoryPicard.output, - sam = rules.Bowtie2_alignment.output - output: - unsortedBam = temp(expand('{dir}/{{sample}}.bam' , dir = ALIGN_dir)), - sortedBam = expand('{dir}/{{sample}}.s.bam' , dir = ALIGN_dir), - index = expand('{dir}/{{sample}}.s.bam.bai', dir = ALIGN_dir) - threads: threadsMax - priority: 1 - log: - message: "{ruleDisplayMessage}Conversion to BAM, sort, index for file {input.sam:q} ..." - resources: - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/samtools_SAM_TO_BAM.{sample}.benchmark" - shell: - """sh -c ' - samtools view -b -S -o {output.unsortedBam:q} {input.sam:q} && - samtools sort -o {output.sortedBam:q} --threads {threads} {output.unsortedBam:q} && - samtools index {output.sortedBam:q} - '""" - - - -basenameSuffix = ".cleaned" - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_cleanSAM_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + ".s" - -rule Picard_cleanSAM: - input: - bam = rules.samtools_SAM_TO_BAM.output.sortedBam - output: - bam = temp(Picard_cleanSAM_outputName + ".bam"), - index = temp(Picard_cleanSAM_outputName + ".bai") - log: expand('{dir}/Picard_cleanSAM.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}CleanSam: Clean the provided SAM/BAM, soft-clipping beyond-end-of-reference alignments and setting MAPQ to 0 for unmapped reads for {input.bam}..." 
- threads: 1 - priority: 1 - resources:maxMemGB=20 - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_cleanSAM.{sample}.benchmark" - shell: - """sh -c ' - {picard_command} CleanSam \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - 2> {log:q} - '""" - - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_FixMateInformation_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + "2" + ".s" - -rule Picard_FixMateInformation: - input: - bam = rules.Picard_cleanSAM.output.bam, - bai = rules.Picard_cleanSAM.output.index - output: - bam = temp(Picard_FixMateInformation_outputName + '.bam'), - index = temp(Picard_FixMateInformation_outputName + '.bai'), - stats = Picard_FixMateInformation_outputName + '.bam.stats' - log: expand('{dir}/Picard_FixMateInformation.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Verify mate-pair information between mates and fix if needed for input {input.bam}..." - threads: 1 - priority: 1 - resources:maxMemGB=20 - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_FixMateInformation.{sample}.benchmark" - shell: - """sh -c ' - {picard_command} FixMateInformation \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - 2> {log:q} && - samtools flagstat {output.bam:q} > {output.stats:q} - '""" - - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -Picard_AddOrReplaceReadGroups_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + "3" + ".s" - -rule Picard_AddOrReplaceReadGroups: - input: - bam = rules.Picard_FixMateInformation.output.bam, - bai = rules.Picard_FixMateInformation.output.index - output: - bam = temp(Picard_AddOrReplaceReadGroups_outputName + '.bam'), - index = temp(Picard_AddOrReplaceReadGroups_outputName + '.bai') - log: expand('{dir}/Picard_AddOrReplaceReadGroups.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Replace read groups in a BAM file for input {input.bam}..." - threads: 1 - priority: 1 - resources:maxMemGB=20 - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - benchmark: LOG_BENCHMARK_dir + "/Picard_AddOrReplaceReadGroups.{sample}.benchmark" - params: - RGID = lambda wildcards: RGFields[wildcards.sample]["ID"], # Read Group ID - RGLB = lambda wildcards: RGFields[wildcards.sample]["LB"], # Read Group library - RGPL = lambda wildcards: RGFields[wildcards.sample]["PL"], # Read Group platform - RGPU = lambda wildcards: RGFields[wildcards.sample]["PU"], # Read Group platform unit (eg. run barcode) - RGSM = lambda wildcards: RGFields[wildcards.sample]["SM"] # Read Group sample name - shell: - """sh -c ' - {picard_command} AddOrReplaceReadGroups \ - I={input.bam:q} \ - O={output.bam:q} \ - CREATE_INDEX=true \ - RGID={params.RGID} \ - RGLB={params.RGLB} \ - RGPL={params.RGPL} \ - RGPU={params.RGPU} \ - RGSM={params.RGSM} \ - 2> {log:q} - '""" - - - - -# Define the general output file name above the rule. 
Wildcards are resolved in the resulting string when evaluating the rule -Picard_ReorderSam_outputName = CLEAN_dir + '/{sample}' + basenameSuffix + "4" + ".s" -# -# rule Picard_ReorderSam: -# input: -# bam = rules.Picard_AddOrReplaceReadGroups.output.bam, -# index = rules.Picard_AddOrReplaceReadGroups.output.index, -# ref = config["additionalInputFiles"]["refGenome_fasta"] -# output: -# bam = temp(Picard_ReorderSam_outputName + '.bam'), -# index = temp(Picard_ReorderSam_outputName + '.bai') -# log: expand('{dir}/Picard_ReorderSam.{{sample}}.log', dir = LOG_BENCHMARK_dir) -# message: "{ruleDisplayMessage}Reorders reads in a SAM/BAM file to match the contig ordering in a provided reference file for input {input.bam}..." -# threads: 1 -# priority: 1 -# resources:maxMemGB=20 -# conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" -# benchmark: LOG_BENCHMARK_dir + "/Picard_ReorderSam.{sample}.benchmark" -# params: -# shell: -# """sh -c ' -# {picard_command} ReorderSam \ -# I={input.bam:q} \ -# REFERENCE={input.ref:q} \ -# O={output.bam:q} \ -# ALLOW_INCOMPLETE_DICT_CONCORDANCE=TRUE \ -# CREATE_INDEX=true \ -# 2> {log:q} -# '""" - - -rule GATK_baseRecalibration1: - input: - #environment = rules.prepareEnvironments_JavaMemoryGATK.output, - bam = rules.Picard_AddOrReplaceReadGroups.output.bam, - index = rules.Picard_AddOrReplaceReadGroups.output.index - output: - recalibrationTable1 = expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.recalTable1', dir = BASERECAL_dir) - log: expand('{dir}/GATK_baseRecalibration1.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}BaseRecalibrator(1): Detect systematic errors in base quality scores for file {input.bam:q}..." - threads: threadsMax - priority: 1 - resources: maxMemGB=20 - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml" - params: - knownSNPs = config["additionalInputFiles"]["knownSNPs"], - knownIndels = config["additionalInputFiles"]["knownIndels"], - fasta = config["additionalInputFiles"]["refGenome_fasta"] - benchmark: LOG_BENCHMARK_dir + "/GATK_baseRecalibration1.{sample}.benchmark" - shell: - """sh -c ' - {gatk_command} -T BaseRecalibrator\ - -R {params.fasta:q} \ - -knownSites {params.knownSNPs:q} \ - -knownSites {params.knownIndels:q} \ - --lowMemoryMode \ - -I {input.bam:q} \ - --out {output.recalibrationTable1:q} \ - --log_to_file {log:q} \ - --num_cpu_threads_per_data_thread {threads} - '""" - -rule GATK_printReadsBQSR: - input: - #environment = rules.prepareEnvironments_JavaMemoryGATK.output, - bam = rules.Picard_AddOrReplaceReadGroups.output.bam, - index = rules.Picard_AddOrReplaceReadGroups.output.index, - recalibrationTable = rules.GATK_baseRecalibration1.output.recalibrationTable1 - output: - bam = temp(expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam', dir = BASERECAL_dir)) - log: expand('{dir}/GATK_printReadsBQSR.{{sample}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}PrintReads: Recalibrate {input.bam:q} using recalibration table {input.recalibrationTable:q} ..." 
-    threads: threadsMax
-    priority: 1
-    resources: maxMemGB=20
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml"
-    benchmark: LOG_BENCHMARK_dir + "/GATK_printReadsBQSR.{sample}.benchmark"
-    params:
-        fasta = config["additionalInputFiles"]["refGenome_fasta"]
-    shell:
-        """sh -c '
-        {gatk_command} -T PrintReads \
-        -R {params.fasta:q} \
-        -I {input.bam:q} \
-        -BQSR {input.recalibrationTable:q} \
-        --out {output.bam:q} \
-        --log_to_file {log:q} \
-        --num_cpu_threads_per_data_thread {threads}
-        '"""
-
-rule GATK_baseRecalibration2:
-    input:
-        #environment = rules.prepareEnvironments_JavaMemoryGATK.output,
-        bam = rules.Picard_AddOrReplaceReadGroups.output.bam,
-        index = rules.Picard_AddOrReplaceReadGroups.output.index,
-        recalibrationTable1 = rules.GATK_baseRecalibration1.output.recalibrationTable1
-    output:
-        recalibrationTable2 = expand('{dir}/{{sample}}.BQrecal.s.bam.recalTable2', dir = BASERECAL_dir)
-    log: expand('{dir}/GATK_baseRecalibration2.{{sample}}.log', dir = LOG_BENCHMARK_dir)
-    message: "{ruleDisplayMessage}BaseRecalibrator(2): Detect systematic errors in base quality scores for file {input.bam:q} and first recalibration table..."
-    threads: threadsMax
-    priority: 1
-    resources: maxMemGB=20
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml"
-    benchmark: LOG_BENCHMARK_dir + "/GATK_baseRecalibration2.{sample}.benchmark"
-    params:
-        knownSNPs = config["additionalInputFiles"]["knownSNPs"],
-        knownIndels = config["additionalInputFiles"]["knownIndels"],
-        fasta = config["additionalInputFiles"]["refGenome_fasta"]
-    shell:
-        """sh -c '
-        {gatk_command} -T BaseRecalibrator \
-        -R {params.fasta:q} \
-        -knownSites {params.knownSNPs:q} \
-        -knownSites {params.knownIndels:q} \
-        -I {input.bam:q} \
-        -BQSR {input.recalibrationTable1:q} \
-        --out {output.recalibrationTable2:q} \
-        --log_to_file {log:q} \
-        --num_cpu_threads_per_data_thread {threads}
-        '"""
-
-# Use the -L argument with BaseRecalibrator to restrict recalibration to capture targets on WEx data:
-# - BQSR depends on a key assumption: every mismatch is an error, except at sites of known variants
-# - Off-target sequence is likely to have higher error rates with different error modes
-# - If off-target sequence is included in recalibration, it may skew the model and mess up the results
-
-
-rule GATK_analyzeCovariates:
-    input:
-        recalibrationTable1 = rules.GATK_baseRecalibration1.output.recalibrationTable1,
-        recalibrationTable2 = rules.GATK_baseRecalibration2.output.recalibrationTable2
-    output:
-        plots = expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.plots.pdf', dir = REPORTS_dir),
-        csv = expand('{dir}/{{sample}}.cleaned4.BQrecal.s.bam.plots.csv', dir = REPORTS_dir)
-    log:
-        expand('{dir}/GATK_analyzeCovariates.{{sample}}.log', dir = LOG_BENCHMARK_dir)
-    message:
-        "{ruleDisplayMessage}AnalyzeCovariates: Create plots to visualize base recalibration results for {input}..."
- threads: 1 - priority: 1 - resources: maxMemGB=20 - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/gatk.yaml" - benchmark: - LOG_BENCHMARK_dir + "/GATK_analyzeCovariates.{sample}.benchmark" - params: - fasta = config["additionalInputFiles"]["refGenome_fasta"] - shell: - """sh -c ' - {gatk_command} -T AnalyzeCovariates \ - -R {params.fasta:q} \ - -before {input.recalibrationTable1:q} \ - -after {input.recalibrationTable2:q} \ - -plots {output.plots:q} \ - -csv {output.csv:q} \ - --log_to_file {log:q} - '""" - -############### -############### -## FILTERING ## -############### -############### - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -postalign_remove_chrMAndUnassembledChr_outputName = CHRM_dir + '/{sample}.cleaned4.BQrecal.rmChrM.s.bam' - - -rule postalign_remove_chrMAndUnassembledChr: - input: - bam = rules.GATK_printReadsBQSR.output.bam, - recalReport = rules.GATK_analyzeCovariates.output.plots - output: - index1 = temp(str(rules.GATK_printReadsBQSR.output.bam) + ".bai"), - bam = temp(postalign_remove_chrMAndUnassembledChr_outputName), - index2 = temp(postalign_remove_chrMAndUnassembledChr_outputName + ".bai"), - stats = postalign_remove_chrMAndUnassembledChr_outputName + ".stats", - csv = postalign_remove_chrMAndUnassembledChr_outputName + ".csv" - log: - message: "{ruleDisplayMessage}Remove mitochondrial reads for file {input.bam:q} ..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/postalign_remove_chrMAndUnassembledChr.{sample}.benchmark" - resources: - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - shell: - """sh -c ' - samtools index {input.bam:q} && - samtools idxstats {input.bam:q} | cut -f 1 | grep chr | grep -Pv "chrM|chrUn|random|hap" | xargs samtools view -b {input.bam:q} >{output.bam:q} && - samtools index {output.bam:q} && - samtools flagstat {output.bam:q} > {output.stats:q} && - samtools view {output.bam:q} | cut -f3,5,9 > {output.csv:q} - '""" - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -markDuplicates_Picardtools_outputName = rmDup_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s' - - -rule markDuplicates_Picardtools: - input: - #environment = rules.prepareEnvironments_JavaMemoryPicard.output, - bam = rules.postalign_remove_chrMAndUnassembledChr.output.bam, - index = rules.postalign_remove_chrMAndUnassembledChr.output.index2 - output: - bam = temp(markDuplicates_Picardtools_outputName + ".bam"), - index = temp(markDuplicates_Picardtools_outputName + ".bai") - log: - log = expand('{dir}/markDuplicates_Picardtools.{{sample}}.log' , dir = LOG_BENCHMARK_dir), - metricsFile = expand('{dir}/markDuplicates_Picardtools.{{sample}}_metrics.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Mark (not remove!) duplicate reads for file {input:q} with Picard tools..." 
- threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/markDuplicates_Picardtools.{sample}.benchmark" - resources: - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - ValidationStringency = config["par_postalign"]["ValidationStringencyMarkDuplicates"], - removeDuplicates = "false", - assumeSorted = "true" - shell: - """sh -c '{picard_command} MarkDuplicates \ - INPUT={input.bam:q} \ - OUTPUT={output.bam:q} \ - ASSUME_SORTED={params.assumeSorted} \ - METRICS_FILE={log.metricsFile:q} \ - VALIDATION_STRINGENCY={params.ValidationStringency} \ - REMOVE_DUPLICATES={params.removeDuplicates} \ - CREATE_INDEX=TRUE \ - 2> {log.log:q}'""" - - - -rule computeLibraryComplexity: - input: - bam = rules.markDuplicates_Picardtools.output.bam - output: - stats = rmDup_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl' - log: - message:"{ruleDisplayMessage}Compute library complexity for file {input.bam:q} ..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/computeLibraryComplexity.{sample}.benchmark" - resources: - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bedtools.yaml" - params: - shell: - # Taken from https://github.com/kundajelab/atac_dnase_pipelines/blob/72ed6ba2502cca074c51740b612cbc6ebea07b08/modules/postalign_bam.bds - # Implementing the ENCODE ATAC-Seq library complexity guidelines - # PBC File output - # TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair - """ - bedtools bamtobed -i {input.bam:q} | \ - awk 'BEGIN{{OFS="\\t"}} {{print $1,$2,$3,$6}}' | \ - grep -v 'chrM' | sort | uniq -c | \ - awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; if(m2>0) m1_m2=m1/m2; \ - printf "readsTotal\\treadsDistinct\\treadsOccOne\\treadsOccTwo\\tNRF\\tPBC1\\tPBC2\\n%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",mt,m0,m1,m2,m0/mt,m1/m0,m1_m2}}' > {output.stats}""" - - - -rule removeDuplicates: - input: - bam = rules.markDuplicates_Picardtools.output.bam, - index = rules.markDuplicates_Picardtools.output.index, - stats = rules.computeLibraryComplexity.output.stats - output: - bam = temp(rmDup_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam') - log: - message: "{ruleDisplayMessage}Remove duplicate reads and QC-failing reads for file {input:q} with samtools..." - threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/removeDuplicates.{sample}.benchmark" - resources: - #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml" - params: - removeReadsWithFlags = 1804, # read or mate unmapped, not primary alignment, read fails platform/vendor quality checks, read is PCR or optical duplicate - keepReadsWithFlags = 2 # read mapped in proper pair - shell: - """sh -c 'samtools view -F {params.removeReadsWithFlags} -f {params.keepReadsWithFlags} -b {input.bam:q} > {output.bam:q}'""" - # https://github.com/kundajelab/atac_dnase_pipelines/blob/72ed6ba2502cca074c51740b612cbc6ebea07b08/modules/postalign_bam.bds - - -rule postalign_samtools_flagstat: - input: - rules.removeDuplicates.output.bam - output: - stats = rmDup_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats', - csv = rmDup_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.csv' - log: - message: "{ruleDisplayMessage}Run samtools flagstat on file {input:q} ..." 
-    threads: 1
-    priority: 1
-    benchmark: LOG_BENCHMARK_dir + "/postalign_samtools_flagstat.{sample}.benchmark"
-    resources:
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
-    params:
-    shell:
-        """sh -c '
-        samtools flagstat {input:q} > {output.stats:q} &&
-        samtools view {input:q} | cut -f3,5,9 > {output.csv}
-        '"""
-
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_MAPQ_outputName = MAPQsort_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.s.bam'
-
-rule postalign_MAPQ:
-    input:
-        bam = rules.removeDuplicates.output.bam
-    output:
-        bam = temp(postalign_MAPQ_outputName),
-        index = temp(postalign_MAPQ_outputName + ".bai"),
-        stats = postalign_MAPQ_outputName + ".stats",
-        csv = postalign_MAPQ_outputName + ".csv"
-    log:
-    message: "{ruleDisplayMessage}Remove reads with a MAPQ quality lower than {par_minMAPQscore} for file {input.bam:q} ..."
-    threads: 1
-    priority: 1
-    benchmark: LOG_BENCHMARK_dir + "/postalign_MAPQ.{sample}.benchmark"
-    resources:
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
-    params:
-        minMAPQ = par_minMAPQscore
-    shell:
-        """sh -c '
-        samtools view -b -q {params.minMAPQ} -F 4 {input.bam:q} > {output.bam:q} &&
-        samtools index {output.bam:q} &&
-        samtools flagstat {output.bam:q} > {output.stats:q} &&
-        samtools view {output.bam:q} | cut -f3,5,9 > {output.csv:q}
-        '"""
-
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_RSS_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.adjRSS.s.bam'
-
-
-rule postalign_RSS:
-    input:
-        bam = rules.postalign_MAPQ.output.bam,
-        index = rules.postalign_MAPQ.output.index
-    output:
-        header = temp(expand('{dir}/header{{sample}}', dir = ADJRSS_dir)),
-        forward = temp(expand('{dir}/temp1forw{{sample}}.sam', dir = ADJRSS_dir)),
-        reverse = temp(expand('{dir}/temp1rev{{sample}}.sam', dir = ADJRSS_dir)),
-        bam = postalign_RSS_outputName,
-        stats = postalign_RSS_outputName + ".stats",
-        csv = postalign_RSS_outputName + ".csv"
-    log:
-    message: "{ruleDisplayMessage}Adjust read start sites for file {input.bam:q} ..."
-    threads: 1
-    priority: 1
-    benchmark: LOG_BENCHMARK_dir + "/postalign_RSS.{sample}.benchmark"
-    resources:
-    # conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
-    params:
-        adjustRSS_forward = config["par_postalign"]["adjustRSS_forward"],
-        adjustRSS_reverse = config["par_postalign"]["adjustRSS_reverse"]
-    shell:
-        """ samtools view -H {input.bam:q} > {output.header:q} &&
-        samtools view -F 16 {input.bam:q} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4=$4+{params.adjustRSS_forward}; print $0}}' > {output.forward:q} &&
-        samtools view -f 16 {input.bam:q} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4=$4+{params.adjustRSS_reverse}; print $0}}' > {output.reverse:q} &&
-        cat {output.header:q} {output.forward:q} {output.reverse:q} | samtools view -S -b -o {output.bam:q} - &&
-        samtools flagstat {output.bam:q} > {output.stats:q} &&
-        samtools view {output.bam:q} | cut -f3,5,9 > {output.csv:q}
-        """
-
-    # TODO: Also adjust the position of the mate (PNEXT column)
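
[Editor's note] The TODO above points out that the awk commands shift only the read start (POS, column 4) and leave the mate start (PNEXT) untouched. A minimal pysam sketch of how both fields could be shifted consistently; this is an illustration only, with hypothetical file names and the conventional Tn5 offsets (+4/-5) as default assumptions, and it inherits the same simplification as the awk approach (CIGAR and TLEN are not updated):

    import pysam

    def adjust_read_start_sites(in_bam, out_bam, offset_fwd=4, offset_rev=-5):
        """Shift POS (and PNEXT) by a strand-dependent offset, e.g. the Tn5 offset."""
        with pysam.AlignmentFile(in_bam, "rb") as bam_in, \
             pysam.AlignmentFile(out_bam, "wb", template=bam_in) as bam_out:
            for read in bam_in:
                if not read.is_unmapped:
                    offset = offset_rev if read.is_reverse else offset_fwd
                    read.reference_start = max(0, read.reference_start + offset)
                if read.is_paired and not read.mate_is_unmapped:
                    # The mate's offset depends on the mate's strand
                    mate_offset = offset_rev if read.mate_is_reverse else offset_fwd
                    read.next_reference_start = max(0, read.next_reference_start + mate_offset)
                bam_out.write(read)

    # Hypothetical usage; in the pipeline the offsets come from config["par_postalign"]
    # adjust_read_start_sites("sample.s.bam", "sample.adjRSS.s.bam")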
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_rmINDEL_outputName = RMINDEL_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(par_minMAPQscore) + '.adjRSS.rmINDEL.s.bam'
-
-
-rule postalign_rmINDEL:
-    input:
-        rules.postalign_RSS.output.bam
-    output:
-        final = temp(postalign_rmINDEL_outputName),
-        finalSorted = expand('{dir}/{{sample}}.final.s.bam', dir = RMINDEL_dir),
-        index = expand('{dir}/{{sample}}.final.s.bam.bai', dir = RMINDEL_dir),
-        stats = expand('{dir}/{{sample}}.final.s.bam.stats', dir = RMINDEL_dir),
-        csv = expand('{dir}/{{sample}}.final.s.bam.csv', dir = RMINDEL_dir),
-        header = temp(expand('{dir}/header{{sample}}.sam', dir = RMINDEL_dir))
-    log:
-    message: "{ruleDisplayMessage}Remove reads with INDELs from file {input:q} ..."
-    threads: threadsMax
-    priority: 1
-    benchmark: LOG_BENCHMARK_dir + "/postalign_rmINDEL.{sample}.benchmark"
-    resources:
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
-    params:
-        CIGAR = config["par_postalign"]["CIGAR"]
-    shell:
-        """ samtools view -H {input:q} > {output.header:q} &&
-        samtools view {input:q} | awk 'BEGIN {{OFS = "\\t"}};{{if ($6 !~ /[{params.CIGAR}]/) print $0}}' | cat {output.header:q} - | samtools view -S -b -o {output.final:q} - &&
-        samtools sort -o {output.finalSorted:q} --threads {threads} {output.final:q} &&
-        samtools index {output.finalSorted:q} &&
-        samtools flagstat {output.finalSorted:q} > {output.stats:q} &&
-        samtools view {output.finalSorted:q} | cut -f3,5,9 > {output.csv:q}
-        """
-
-#
-# rule finalCleaning:
-#     input:
-#         rules.postalign_rmINDEL.output.finalSorted
-#     output:
-#         bam = expand('{dir}/{{sample}}.final.bam', dir = FINAL_OUTPUT_dir),
-#         index = expand('{dir}/{{sample}}.final.bam.bai', dir = FINAL_OUTPUT_dir)
-#     log:
-#     message: "{ruleDisplayMessage}Final cleaning for file {input:q} ..."
-#     threads: 1
-#     priority: 1
-#     benchmark: LOG_BENCHMARK_dir + "/finalCleaning.{sample}.benchmark"
-#     resources:
-#         conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
-#     params:
-#     shell:
-#         """
-#         echo "test"
-#         """
-#         # cleansam
-#         # run FixMateInf one more time
-#         # run VerifySam
-
-
-##########################################
-##########################################
-## FINAL LINKING AND MERGING REPLICATES ##
-##########################################
-##########################################
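
[Editor's note] As a worked example for the Picardtools_MergeSamFiles rule below: its params.inputString joins the input BAM paths with " I=", so together with the literal I= in the shell command every file receives its own I= argument. Paths here are hypothetical:

    # input = ["out/8.FinalOutput/test1_rep1.final.bam",
    #          "out/8.FinalOutput/test1_rep2.final.bam"]
    # " I=".join(input)
    #   -> "out/8.FinalOutput/test1_rep1.final.bam I=out/8.FinalOutput/test1_rep2.final.bam"
    # so the shell fragment "I={params.inputString}" expands to
    #   I=out/8.FinalOutput/test1_rep1.final.bam I=out/8.FinalOutput/test1_rep2.final.bam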
-rule link_outputFiles:
-    input:
-        bam = rules.postalign_rmINDEL.output.finalSorted,
-        index = rules.postalign_rmINDEL.output.index
-    output:
-        bam = expand('{dir}/{{sample}}.final.bam', dir = FINAL_OUTPUT_dir),
-        index = expand('{dir}/{{sample}}.final.bam.bai', dir = FINAL_OUTPUT_dir)
-    log:
-    message: "{ruleDisplayMessage}Link final data for final output files {input.bam:q} and {input.index:q} in directory {FINAL_OUTPUT_dir:q} ..."
-    threads: 1
-    priority: 1
-    benchmark: LOG_BENCHMARK_dir + "/link_outputFiles.{sample}.benchmark"
-    resources:
-    version: "NA"
-    params:
-    shell:
-        """ sh -c 'ln -fs {input.bam:q} {output.bam:q} &&
-        ln -fs {input.index:q} {output.index:q} &&
-        touch -h {output.bam:q} &&
-        touch -h {output.index:q}
-        '"""
-
-
-def getSampleBasenamesForIndividual(individual):
-    """Return the sample basenames of all replicates that belong to the given individual."""
-    sampleBasenames = numpy.asarray(samplesData.loc[samplesData["individual"] == individual, "sampleName"])
-    return sampleBasenames
-
-
-rule Picardtools_MergeSamFiles:
-    input:
-        lambda wildcards: expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = getSampleBasenamesForIndividual(wildcards.individual))
-    output:
-        bam = expand('{dir}/{{individual}}.merged.final.bam', dir = FINAL_OUTPUT_dir),
-        index = expand('{dir}/{{individual}}.merged.final.bam.bai', dir = FINAL_OUTPUT_dir)
-    log:
-        expand('{dir}/Picardtools_MergeSamFiles.{{individual}}.log', dir = LOG_BENCHMARK_dir)
-    message:
-        "{ruleDisplayMessage}Merging all replicates for individual {wildcards.individual} (files {input:q}) with Picardtools..."
-    threads: 1
-    priority: 1
-    resources: maxMemGB=20
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/picard_samtools.yaml"
-    benchmark: LOG_BENCHMARK_dir + "/Picardtools_MergeSamFiles.{individual}.benchmark"
-    params: inputString = lambda wildcards, input: " I=".join(input)
-    shell:
-        """sh -c '
-        {picard_command} MergeSamFiles \
-        I={params.inputString} \
-        O={output.bam:q} \
-        2> {log:q} &&
-        samtools index {output.bam:q}
-        '"""
-
-
-#############
-#############
-## GC BIAS ##
-#############
-#############
-
-rule deepTools_computeGCBias:
-    input:
-        bam = FINAL_OUTPUT_dir + '/{basename}.final.bam'
-    output:
-        frequencies = REPORTS_dir + '/{basename}.GCBias.frequencies',
-        biasPlot = REPORTS_dir + '/{basename}.GCBias.plot.pdf'
-    log: LOG_BENCHMARK_dir + "/deepTools_computeGCBias.{basename}.log"
-    message: "{ruleDisplayMessage}Run deepTools: computeGCBias for file {input.bam}..."
-    threads: threadsMax
-    priority: 1
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/deepTools_computeGCBias.{basename}.benchmark"
-    params:
-        genome2Bit = config["additionalInputFiles"]["refGenome_2bit"],
-        blacklistRegions = config["additionalInputFiles"]["blacklistRegions"],
-        effectiveGenomeSize = config["par_deepTools"]["effectiveGenomeSize"],
-        fragmentLength = "200", # will be ignored for paired-end data, as the fragment length is computed from the BAM file
-        other = ""
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
-    shell:
-        """sh -c '
-        computeGCBias \
-        --bamfile {input.bam} \
-        --effectiveGenomeSize {params.effectiveGenomeSize} \
-        --genome {params.genome2Bit} \
-        --blackListFileName {params.blacklistRegions} \
-        --fragmentLength {params.fragmentLength} \
-        {params.other} \
-        --plotFileFormat pdf \
-        --biasPlot {output.biasPlot} \
-        --GCbiasFrequenciesFile {output.frequencies} \
-        --numberOfProcessors {threads} \
-        2> {log:q}
-        '"""
-
-
-rule deepTools_correctGCBias:
-    input:
-        bam = FINAL_OUTPUT_dir + '/{basename}.final.bam',
-        frequencies = rules.deepTools_computeGCBias.output.frequencies
-    output:
-        bam = FINAL_OUTPUT_dir + '/{basename}.noGCBias.final.bam',
-        index = FINAL_OUTPUT_dir + '/{basename}.noGCBias.final.bam.bai'
-    log: LOG_BENCHMARK_dir + "/deepTools_correctGCBias.{basename}.log"
-    message: "{ruleDisplayMessage}Run deepTools: correctGCBias for file {input.bam}..."
-    threads: threadsMax
-    priority: 1
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/deepTools_correctGCBias.{basename}.benchmark"
-    params:
-        genome2Bit = config["additionalInputFiles"]["refGenome_2bit"],
-        effectiveGenomeSize = config["par_deepTools"]["effectiveGenomeSize"],
-        other = ""
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
-    shell:
-        """sh -c '
-        correctGCBias \
-        --bamfile {input.bam} \
-        --GCbiasFrequenciesFile {input.frequencies} \
-        --effectiveGenomeSize {params.effectiveGenomeSize} \
-        --genome {params.genome2Bit} \
-        --correctedFile {output.bam} \
-        --numberOfProcessors {threads} \
-        2> {log:q} &&
-        samtools index {output.bam}
-        '"""
-
-
-##################
-##################
-## PEAK CALLING ##
-##################
-##################
-
-
-def getGenomeTypeMacs2(assemblyVersion):
-    """Map a genome assembly version to the corresponding MACS2 -g genome type."""
-
-    if assemblyVersion == "mm9" or assemblyVersion == "mm10":
-        genomeType = "mm"
-    elif assemblyVersion == "hg19" or assemblyVersion == "hg38":
-        genomeType = "hs"
-    else:
-        raise NotImplementedError("Genome assembly version " + assemblyVersion + " not yet implemented for the -g parameter in MACS2.")
-
-    return genomeType
-
-# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-macs2_stringent_outputName = PEAKCALLING_STRINGENT_dir + '/{basename}' + '.stringent'
-
-# Runs for all BAM files in the final output folder, including the merged ones
-rule macs2_stringent:
-    input:
-        bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir)
-    output:
-        peaks_bedT = temp(macs2_stringent_outputName + '_peaks.narrowPeak'),
-        peaks_bed = temp(macs2_stringent_outputName + '.narrowPeak'),
-        summit_bed = macs2_stringent_outputName + '_summits.bed',
-        xls = temp(macs2_stringent_outputName + '_peaks.xls')
-    log: LOG_BENCHMARK_dir + "/macs2_stringent.{basename}.log"
-    message: "{ruleDisplayMessage}Run MACS2 (stringent) for {input.bam:q} ..."
-    threads: 1
-    priority: 1
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/macs2_stringent.{basename}.benchmark"
-    conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/macs2.yaml"
-    params:
-        qValue = config["par_peakCalling"]["modelStringent_minQValue"],
-        modelPar = config["par_peakCalling"]["modelStringent"],
-        genomeType = getGenomeTypeMacs2(config["par_align"]["assemblyVersion"]),
-        name = lambda wildcards: wildcards.basename + '.stringent',
-        outputDir = PEAKCALLING_STRINGENT_dir,
-        keepDuplicates = "--keep-dup all"
-    shell:
-        """macs2 callpeak \
-        --treatment {input.bam} \
-        -q {params.qValue} \
-        --outdir {params.outputDir} \
-        --name {params.name} \
-        -g {params.genomeType} \
-        {params.keepDuplicates} \
-        {params.modelPar} \
-        2> {log:q} &&
-        sort -k 8gr,8gr {output.peaks_bedT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.peaks_bed}
-        """
-
-# Define the general output file name above the rule.
Wildcards are resolved in the resulting string when evaluating the rule -macs2_nonStringent_outputName = PEAKCALLING_NONSTRINGENT_dir + '/{basename}' + '.nonStringent' - - -rule macs2_nonStringent: - input: - bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir) - output: - peaks_bedT = temp(macs2_nonStringent_outputName + '_peaks.narrowPeak'), - peaks_bed = temp(macs2_nonStringent_outputName + '.narrowPeak'), - summit_bed = macs2_nonStringent_outputName + '_summits.bed', - xls = temp(macs2_nonStringent_outputName + '_peaks.xls') - log: LOG_BENCHMARK_dir + "/macs2_nonStringent.{basename}.log" - message: "{ruleDisplayMessage}Run MACS2 (non-stringent) for {input.bam:q} ..." - threads: 1 - priority: 1 - resources: maxMemGB=20 - benchmark: LOG_BENCHMARK_dir + "/macs2_nonStringent.{basename}.benchmark" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/macs2.yaml" - params: - qValue = config["par_peakCalling"]["modelNonStringent_minQValue"], - slocalVal = config["par_peakCalling"]["modelNonStringent_slocal"], - modelPar = config["par_peakCalling"]["modelNonStringent"], - genomeType = getGenomeTypeMacs2(config["par_align"]["assemblyVersion"]), - name = lambda wildcards: wildcards.basename + '.nonStringent', - outputDir = PEAKCALLING_NONSTRINGENT_dir, - keepDuplicates = "--keep-dup all" - shell: - """macs2 callpeak \ - --treatment {input.bam} \ - -q {params.qValue} \ - --outdir {params.outputDir}\ - --name {params.name}\ - -g {params.genomeType} \ - {params.modelPar} \ - {params.keepDuplicates} \ - --slocal {params.slocalVal} \ - 2> {log:q} && - sort -k 8gr,8gr {output.peaks_bedT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.peaks_bed} - """ - - -# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule -macs2_Encode_outputName = PEAKCALLING_ENCODE_dir + '/{basename}' + '.Encode' - -rule macs2_Encode: - input: - bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir) - output: - broadPeakfileT = temp(macs2_Encode_outputName + '_peaks.broadPeak'), - gappedPeakfileT = temp(macs2_Encode_outputName + '_peaks.gappedPeak'), - narrowPeakfileT = temp(macs2_Encode_outputName + '_peaks.narrowPeak'), - xls = temp(macs2_Encode_outputName + '_peaks.xls'), - broadPeakfile = temp(macs2_Encode_outputName + '.broadPeak'), - gappedPeakfile = temp(macs2_Encode_outputName + '.gappedPeak'), - narrowPeakfile = temp(macs2_Encode_outputName + '.narrowPeak'), - bdg1 = macs2_Encode_outputName + '_control_lambda.bdg', - bdg2 = macs2_Encode_outputName + '_treat_pileup.bdg' - log: - broadAndGapped = LOG_BENCHMARK_dir + "/macs2_Encode_broadAndGapped.{basename}.log", - narrow = LOG_BENCHMARK_dir + "/macs2_Encode_narrow.{basename}.log" - message: "{ruleDisplayMessage}Run MACS2 (Encode version) for {input.bam:q} ..." - threads: 1 - priority: 1 - resources: maxMemGB=20 - benchmark: LOG_BENCHMARK_dir + "/macs2_Encode.{basename}.benchmark" - conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/macs2.yaml" - params: - pValue = config["par_peakCalling"]["Encode_pValThreshold"], - modelParBroadAndGapped = config["par_peakCalling"]["Encode_modelBroadAndGapped"], - modelParNarrow = config["par_peakCalling"]["Encode_modelNarrow"], - genomeType = getGenomeTypeMacs2(config["par_align"]["assemblyVersion"]), - name = lambda wildcards: wildcards.basename + '.Encode', - outputDir = PEAKCALLING_ENCODE_dir, - keepDuplicates = "--keep-dup all" - shell: - # 1. 
First produce broad and gapped peaks, then narrow ones
-        # See https://www.encodeproject.org/atac-seq/ and https://github.com/kundajelab/atac_dnase_pipelines/blob/62e1c544a394d3215d0b2d24743fc1e8bb08123c/modules/callpeak_macs2.bds
-        # After peak calling, sort by column 8 (or 14 for gapped peaks) in descending order and replace the long peak names in column 4 with Peak_<peakRank>
-        """
-        macs2 callpeak \
-        --treatment {input.bam} \
-        -p {params.pValue} \
-        --outdir {params.outputDir} \
-        --name {params.name} \
-        -g {params.genomeType} \
-        {params.keepDuplicates} \
-        {params.modelParBroadAndGapped} \
-        2> {log.broadAndGapped:q} &&
-        sort -k 8gr,8gr {output.broadPeakfileT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.broadPeakfile} &&
-        sort -k 14gr,14gr {output.gappedPeakfileT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.gappedPeakfile} &&
-        macs2 callpeak \
-        --treatment {input.bam} \
-        -p {params.pValue} \
-        --outdir {params.outputDir} \
-        --name {params.name} \
-        -g {params.genomeType} \
-        {params.keepDuplicates} \
-        {params.modelParNarrow} \
-        2> {log.narrow:q} &&
-        sort -k 8gr,8gr {output.narrowPeakfileT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' > {output.narrowPeakfile}
-        """
-
-
-def awkStringPeakType(peakType, overlap):
-    """Build the awk filter program that keeps pooled peaks overlapping a replicate peak sufficiently."""
-
-    awkStr = "{{s1=$3-$2; "
-
-    if peakType == "narrow":
-        awkStr = awkStr + "s2=$13-$12; if (($21/s1 >= " + str(overlap) + ") || ($21"
-    elif peakType == "broad":
-        awkStr = awkStr + "s2=$12-$11; if (($19/s1 >= " + str(overlap) + ") || ($19"
-    elif peakType == "gapped":
-        awkStr = awkStr + "s2=$18-$17; if (($31/s1 >= " + str(overlap) + ") || ($31"
-
-    awkStr = awkStr + "/s2 >= " + str(overlap) + ")) {{print $0}}}}"
-    return awkStr
-
-
-def cutColPeakType(peakType):
-    """Return the column range of the pooled peak for the given peak type (for cut -f)."""
-
-    if peakType == "narrow":
-        return "1-10"
-    elif peakType == "broad":
-        return "1-9"
-    elif peakType == "gapped":
-        return "1-15"
-
-
-# TODO: solve for GC bias files also
-def generateInputFiles(wildcards):
-    if len(getSampleBasenamesForIndividual(wildcards.individual)) == 0:
-        raise AssertionError("Cannot determine sample basenames for wildcard " + wildcards.individual + ": " + str(len(getSampleBasenamesForIndividual(wildcards.individual))))
-    return expand('{dir}/{samples}.final.{analysisType}.{peakType}Peak', dir = wildcards.dir, samples = getSampleBasenamesForIndividual(wildcards.individual), analysisType = wildcards.analysisType, peakType = wildcards.peakType)
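
[Editor's note] To make the helper functions above concrete: for peakType "narrow" and the default minOverlap of 0.5, awkStringPeakType returns a string with doubled braces; since shell() does not re-format substituted values, awk receives the doubled braces, which simply form nested blocks and are equivalent to:

    # {s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}
    #
    # With intersectBed -wo on narrowPeak files, columns 1-10 are the pooled
    # peak (s1 = its length), columns 11-20 the replicate peak (s2), and
    # column 21 the overlap in bp; a pooled peak is kept if the overlap covers
    # at least 50% of either peak. cutColPeakType("narrow") == "1-10" then
    # trims the output back to the pooled peak's own columns.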
-rule poolPeaksReplicateSamples:
-    input:
-        peakfiles = generateInputFiles
-    output:
-        pooledPeaks = temp(expand('{{dir}}/{{individual}}.merged{{GCBiasStr}}.final.{{analysisType}}.pooled.{{peakType}}Peak')),
-        replicatePeaks = temp(expand('{{dir}}/{{individual}}.merged{{GCBiasStr}}.final.{{analysisType}}.replicate.{{peakType}}Peak'))
-        # pooledPeaks = temp(expand('{{dir}}/{{individual}}.merged.final.{{analysisType}}.pooled.{{peakType}}Peak')),
-        # replicatePeaks = temp(expand('{{dir}}/{{individual}}.merged.final.{{analysisType}}.replicate.{{peakType}}Peak'))
-    log:
-    message: "{ruleDisplayMessage}Pool peaks for individual {wildcards.individual} and produce replicated peaks for input {input.peakfiles:q} ..."
-    threads: 1
-    priority: 1
-    # wildcard_constraints:
-    #     individual= "[\w+]",
-    #     GCBiasStr = "[.]noGCBias|\b"
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/poolPeaksReplicateSamples.{individual}.{analysisType}.{peakType}.benchmark"
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bedtools.yaml"
-    params: minOverlap = 0.5
-    run:
-
-        # 1. Pool replicate samples and produce a replicate peak file
-
-        # With compression: zcat {input.peakfiles} | gzip -nc > {output.pooledPeaks}
-
-        # Without compression
-        shell("""cat {input.peakfiles} > {output.pooledPeaks}""")
-
-        # 2. From this set of peaks on pooled data, we only retain those that have at least 50% overlap with a peak in both replicates.
-
-        # https://github.com/kundajelab/atac_dnase_pipelines/blob/master/modules/callpeak_naive_overlap.bds
-        # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs >= 0.5
-
-        cutStr = cutColPeakType(wildcards.peakType)
-        awkStr = awkStringPeakType(wildcards.peakType, params.minOverlap)
-
-        command = ""
-
-        # Strategy: call intersectBed multiple times: -a is always the result of the previous call, starting with all pooled peaks, and only the peaks overlapping at least 50% with the previous run are kept. Therefore, this is a logical "and" connection
-        for i in range(len(input)):
-
-            command = command + "intersectBed -wo -a "
-
-            if (i == 0):
-                command += """{output.pooledPeaks}"""
-            else:
-                command += "stdin "
-
-            command = command + " -b " + input[i] + """ | awk 'BEGIN{{FS="\\t";OFS="\\t"}} {awkStr}' | cut -f {cutStr} | sort | uniq """
-
-            if not (i == (len(input) - 1)):
-                command = command + "|"
-
-        command = command + """ >{output.replicatePeaks}"""
-
-        shell(command)
-
-
-rule filterPeaks:
-    input:
-        bed = '{dir}/{basename}Peak'
-    output:
-        bed = '{dir}/{basename}Peak' + '.filtered.bed'
-    log:
-    message: "{ruleDisplayMessage}Exclude blacklist regions for file {input.bed}..."
-    threads: 1
-    priority: 1
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/filterPeaks.{basename}.benchmark"
-    params:
-        blacklistRegions = config["additionalInputFiles"]["blacklistRegions"]
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/bedtools.yaml"
-    shell:
-        """sh -c '
-        bedtools subtract \
-        -a {input.bed} \
-        -b {params.blacklistRegions} \
-        > {output.bed}
-        '"""
-
-# rule idr:
-#     input:
-#         sample1Peaks = expand('{dir}/{{sample1}}.final.{{analysisType}}.{{peakType}}Peak.Peak', dir = PEAKCALLING_dir),
-#         sample2Peaks = expand('{dir}/{{sample2}}.final.{{analysisType}}.{{peakType}}Peak.Peak', dir = PEAKCALLING_dir),
-#         pooledPeaks = expand()
-#     output:
-#         peaks = expand('{dir}/idr_{{sample1}}.{{sample2}}.{{analysisType}}.{{peakType}}Peak', dir = PEAKCALLING_dir),
-#         plot = expand('{dir}/idr_{{sample1}}.{{sample2}}.{{analysisType}}.{{peakType}}Peak.png', dir = PEAKCALLING_dir)
-#     log:
-#         LOG_BENCHMARK_dir + "/idr.{sample1}.{sample2}.{analysisType}.{peakType}.log"
-#     message:
-#         "{ruleDisplayMessage}Run IDR analysis for files {sample1} and {sample2} using analysisType={analysisType} and peakType={peakType}..."
-#     threads: 1
-#     priority: 1
-#     resources: maxMemGB=20
-#     benchmark:
-#         LOG_BENCHMARK_dir + "/idr.{sample1}.{sample2}.{analysisType}.{peakType}.benchmark"
-#     params:
-#         rank = config["par_peakCalling"]["idr_rank"],
-#         softIDRThreshold = config["par_peakCalling"]["idr_softIDRThreshold"],
-#         other = "--plot --use-best-multisummit-IDR",
-#         inputFileType = "narrowPeak" # File type of --samples and --peak-list
-#     conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/idr.yaml"
-#     shell:
-#         """sh -c '{idr_exec:q} \
-#         --samples {input.sample1Peaks} {input.sample2Peaks} \
-#         --input-file-type {params.inputFileType} \
-#         --peak-list {input.pooledPeaks} \
-#         --output-file {output.bed:q} \
-#         --rank {params.rank} \
-#         --soft-idr-threshold {params.softIDRThreshold} \
-#         {params.other} \
-#         --log-output-file {log:q}'"""
-
-    # after IDR:
-    # sys idr_thresh_transformed=$(awk -v p=$idr_thresh 'BEGIN{print -log(p)/log(10)}')
-    #
-    # //# Get peaks passing the global IDR threshold and convert the file to narrowPeak format (Step 9)
-    # sys awk 'BEGIN{OFS="\\t"} $12>='"${idr_thresh_transformed}"' {if ($2<0) $2=0; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,"0"}' $idr_out \
-    #     | sort | uniq | sort -k7n,7n | gzip -nc > $peak_idr_trk_tmp
-    #
-    # sys zcat $peak_idr_trk_tmp | awk 'BEGIN{OFS="\\t"} {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}' | gzip -nc > $peak_idr
-    # sys zcat $peak_idr_trk_tmp | awk 'BEGIN{OFS="\\t"} {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' | gzip -nc > $peak_idr_trk
-    #
-
-
-##############################
-##############################
-## STATISTICS AND SUMMARIES ##
-##############################
-##############################
-
-rule fragment_length_distr:
-    input:
-        expand('{dir}/{allSamples}.final.s.bam.csv', dir = RMINDEL_dir, MAPQ = par_minMAPQscore, allSamples = allSamplesUnique)
-    output:
-        pdf = expand('{dir}/allSamples_fragmentLengthDistr.pdf', dir = REPORTS_dir),
-        rdata = expand('{dir}/allSamples_fragmentLengthDistr.RData', dir = REPORTS_dir)
-    log: expand('{dir}/fragment_length_distr.log', dir = LOG_BENCHMARK_dir)
-    message: "{ruleDisplayMessage}Create fragment length distribution..."
- threads: 1 - priority: 1 - benchmark: LOG_BENCHMARK_dir + "/fragment_length_distr.benchmark" - resources: - version: VERSION_FL_distr_script - params: - FL_distr_cutoff = config["par_scripts"]["FL_distr_script_cutoff"], - inputString = lambda wildcards, input: ','.join(input) - shell: - """sh -c ' - Rscript {script_FL_distr:q} \ - {params.inputString} \ - {params.FL_distr_cutoff} \ - {output.pdf:q} \ - {output.rdata:q} \ - {log:q} - '""" - - - -rule stats: - input: - # We have to specify the results from ALL samples here because they are collectively needed as input - expand('{dir}/allSamples_fragmentLengthDistr.pdf', - dir = REPORTS_dir), - expand('{dir}/{sample}.s.bam', - dir = ALIGN_dir, sample = allSamplesUnique), - expand('{dir}/{sample}.s.bam.bai', - dir = ALIGN_dir, sample = allSamplesUnique), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.s.bam.stats', - dir = CHRM_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats', - dir = rmDup_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats', - dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.stats', - dir = ADJRSS_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore), - expand('{dir}/{sample}.final.s.bam.stats', - dir = RMINDEL_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore), - expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl', - dir = rmDup_dir, sample = allSamplesUnique) - output: - pdf = expand('{dir}/allSamples_statSummary.pdf', dir = REPORTS_dir), - rdata = expand('{dir}/allSamples_statSummary.RData', dir = REPORTS_dir), - log: expand('{dir}/stats.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage}Generate statistics about pipeline and produce file {output:q}..." 
-
-
-rule stats:
-    input:
-        # We have to specify the results from ALL samples here because they are collectively needed as input
-        expand('{dir}/allSamples_fragmentLengthDistr.pdf',
-               dir = REPORTS_dir),
-        expand('{dir}/{sample}.s.bam',
-               dir = ALIGN_dir, sample = allSamplesUnique),
-        expand('{dir}/{sample}.s.bam.bai',
-               dir = ALIGN_dir, sample = allSamplesUnique),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.s.bam.stats',
-               dir = CHRM_dir, sample = allSamplesUnique),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats',
-               dir = rmDup_dir, sample = allSamplesUnique),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats',
-               dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.stats',
-               dir = ADJRSS_dir, sample = allSamplesUnique, MAPQ = par_minMAPQscore),
-        expand('{dir}/{sample}.final.s.bam.stats',
-               dir = RMINDEL_dir, sample = allSamplesUnique),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl',
-               dir = rmDup_dir, sample = allSamplesUnique)
-    output:
-        pdf = expand('{dir}/allSamples_statSummary.pdf', dir = REPORTS_dir),
-        rdata = expand('{dir}/allSamples_statSummary.RData', dir = REPORTS_dir)
-    log: expand('{dir}/stats.log', dir = LOG_BENCHMARK_dir)
-    message: "{ruleDisplayMessage}Generate statistics about the pipeline and produce file {output:q}..."
-    threads: 1
-    priority: 1
-    benchmark: LOG_BENCHMARK_dir + "/stats.benchmark"
-    version: VERSION_STATS_script
-    params:
-        pairedEnd = config["samples"]["pairedEnd"],
-        withinThr = config["par_scripts"]["STATS_script_withinThr"],
-        outsideThr = config["par_scripts"]["STATS_script_outsideThr"],
-        geneTypesToKeep = config["par_scripts"]["STATS_script_geneTypesToKeep"],
-        annotationFile = config["additionalInputFiles"]["annotationGTF"],
-        statsPattern = "*.s.bam.stats$",
-        libraryStatsPattern = "*.s.bam.statsLibraryCompl$",
-        rootDir = ROOT_dir
-    shell:
-        """sh -c '
-            Rscript {script_STATS:q} \
-                {allSamplesUniqueStr} \
-                {params.rootDir:q} \
-                {output.pdf:q} \
-                {output.rdata:q} \
-                {params.annotationFile:q} \
-                {params.pairedEnd} \
-                {params.withinThr} \
-                {params.outsideThr} \
-                {params.geneTypesToKeep} \
-                {log:q} \
-                {params.statsPattern} \
-                {params.libraryStatsPattern}
-        '"""
-
-
-rule bamCoverage:
-    input:
-        bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir)
-    output:
-        bigwig = expand('{dir}/{{basename}}.bigwig', dir = FINAL_OUTPUT_dir),
-        bedgraph = temp(expand('{dir}/{{basename}}.bedgraph', dir = FINAL_OUTPUT_dir)),
-        bedgraphgz = expand('{dir}/{{basename}}.bedgraph.gz', dir = FINAL_OUTPUT_dir),
-        bedgraphIndex = expand('{dir}/{{basename}}.bedgraph.gz.tbi', dir = FINAL_OUTPUT_dir)
-    log:
-        bigwig = LOG_BENCHMARK_dir + "/bamCoverage.{basename}.bigwig.log",
-        bedgraph = LOG_BENCHMARK_dir + "/bamCoverage.{basename}.bedgraph.log"
-    message: "{ruleDisplayMessage}Run bamCoverage for {input.bam:q} ..."
-    threads: threadsMax
-    priority: 1
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/bamCoverage.{basename}.benchmark"
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
-    params:
-        normalization = bamCoverage_normOption,
-        binSize = config["par_deepTools"]["bamCoverage_binSize"],
-        ignoreChr = "chrX chrM",
-        otherOptions = config["par_deepTools"]["bamCoverage_otherOptions"],
-        duplicates = "--ignoreDuplicates" # Ignored for now; must NOT be set for GC-corrected data
-    shell:
-        # First bigwig, then bedgraph
-        """sh -c '
-            bamCoverage \
-                --bam {input.bam} \
-                --binSize {params.binSize} \
-                --{params.normalization} \
-                {params.otherOptions} \
-                --numberOfProcessors {threads} \
-                --ignoreForNormalization {params.ignoreChr} \
-                --outFileName {output.bigwig} \
-                --outFileFormat bigwig \
-                2> {log.bigwig:q} &&
-            bamCoverage \
-                --bam {input.bam} \
-                --binSize {params.binSize} \
-                --{params.normalization} \
-                {params.otherOptions} \
-                --numberOfProcessors {threads} \
-                --ignoreForNormalization {params.ignoreChr} \
-                --outFileName {output.bedgraph} \
-                --outFileFormat bedgraph \
-                2> {log.bedgraph:q} &&
-            sort -k1,1 -k2,2n {output.bedgraph} | bgzip > {output.bedgraphgz} &&
-            tabix -s 1 -b 2 -e 3 {output.bedgraphgz}
-        '"""
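-
-# The bgzip/tabix step above makes the bedgraph queryable by region. Usage
-# sketch (hypothetical sample name; assumes the .tbi index from the rule above
-# exists next to the file):
-#   tabix test1.final.bedgraph.gz chr1:1000000-1010000
-# prints only the coverage intervals overlapping the requested region.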
-
-# Note: the extra [] is needed because numpy.concatenate expects a single
-# sequence of arrays, not the arrays as separate arguments
-basenameSamplesAndIndArray = numpy.concatenate([allIndividualsUnique + ".merged", allSamplesUnique])
-
-rule deepTools_plotCoverage:
-    input:
-        bamAll = expand('{dir}/{basename}{GCBiasStr}.final.bam', dir = FINAL_OUTPUT_dir, basename = basenameSamplesAndIndArray, GCBiasStr = ["", ".noGCBias"])
-    output:
-        plot = REPORTS_dir + '/allSamples_CoveragePlot.pdf',
-        rawCounts = REPORTS_dir + '/allSamples_CoveragePlot.counts'
-    log: LOG_BENCHMARK_dir + "/deepTools_plotCoverage.log"
-    message: "{ruleDisplayMessage}Run deepTools: plotCoverage for files {input.bamAll}..."
-    threads: threadsMax
-    priority: 1
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/deepTools_plotCoverage.benchmark"
-    params:
-        fragmentLength = "--minFragmentLength", # Currently ignored; might be useful for ATAC-seq data
-        other = "--centerReads --plotFileFormat pdf --numberOfSamples 3000000",
-        titlePlot = "Coverage plot for all samples",
-        duplicates = "--ignoreDuplicates"
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
-    shell:
-        """sh -c '
-            plotCoverage \
-                --bamfiles {input.bamAll} \
-                --plotTitle "{params.titlePlot}" \
-                {params.other} \
-                --numberOfProcessors {threads} \
-                {params.duplicates} \
-                --outRawCounts {output.rawCounts} \
-                --plotFile {output.plot:q} \
-                2> {log:q} '"""
-
-rule deepTools_correlationPlots:
-    input:
-        bamAll = expand('{dir}/{allSamples}{GCBiasStr}.final.bam', dir = FINAL_OUTPUT_dir, allSamples = allSamplesUnique, GCBiasStr = ["", ".noGCBias"])
-    output:
-        npz = REPORTS_dir + '/allSamples' + '.bins.npz',
-        pdf = REPORTS_dir + '/allSamples' + '.correlations.pdf',
-        rawCounts = REPORTS_dir + '/allSamples' + '.correlations.rawCounts'
-        #corMatrix = REPORTS_dir + '/allSamples' + '.correlations.matrix'
-    log:
-        multiBamSummary = LOG_BENCHMARK_dir + "/deepTools_correlationPlots.multiBamSummary.log",
-        plotCorrelation = LOG_BENCHMARK_dir + "/deepTools_correlationPlots.plotCorrelation.log"
-    message: "{ruleDisplayMessage}Run deepTools: multiBamSummary and plotCorrelation for files {input.bamAll}..."
-    threads: threadsMax
-    priority: 1
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/deepTools_correlationPlots.benchmark"
-    params:
-        corMethod = "pearson",
-        whatToPlot = "heatmap",
-        colorMap = "hsv",
-        other = "--skipZeros --plotFileFormat pdf --removeOutliers --plotNumbers",
-        titlePlot = "Correlation plot (Pearson)"
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
-    shell:
-        """sh -c '
-            multiBamSummary \
-                bins \
-                --bamfiles {input.bamAll} \
-                --numberOfProcessors {threads} \
-                --outRawCounts {output.rawCounts} \
-                -out {output.npz:q} \
-                2> {log.multiBamSummary:q} &&
-            plotCorrelation \
-                --corData {output.npz:q} \
-                --corMethod "{params.corMethod}" \
-                --whatToPlot "{params.whatToPlot}" \
-                --plotTitle "{params.titlePlot}" \
-                --colorMap "{params.colorMap}" \
-                {params.other} \
-                --plotFile {output.pdf:q} \
-                2> {log.plotCorrelation:q}
-        '"""
-
-# Bug in deepTools 2.4.1: --outFileCorMatrix {output.corMatrix} fails, and
-# --labels {allSamplesUniqueStrSpaces} must be given without quotes (quotes cause an error)
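-
-# Background note (deepTools behavior, for orientation): "multiBamSummary bins"
-# counts reads in consecutive genome-wide windows (10 kb by default; no
-# --binSize is given above, so the default applies) and stores the resulting
-# coverage matrix in the .npz file, which plotCorrelation here and plotPCA
-# below consume; --outRawCounts writes the same matrix as plain text.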
-
-rule deepTools_correlationPlotsPooledSamples:
-    input:
-        bamAll = expand('{dir}/{allIndividuals}.merged{GCBiasStr}.final.bam', dir = FINAL_OUTPUT_dir, allIndividuals = allIndividualsUnique, GCBiasStr = ["", ".noGCBias"])
-    output:
-        npz = REPORTS_dir + '/allSamplesPooled' + '.bins.npz',
-        pdf = REPORTS_dir + '/allSamplesPooled' + '.correlations.pdf',
-        rawCounts = REPORTS_dir + '/allSamplesPooled' + '.correlations.rawCounts'
-        #corMatrix = DOWNSTREAM_dir + '/allSamples' + '.minMapQ' + str(par_minMAPQscore) + '.rmChrM.adjRSS.rmDup.rmINDEL.correlations.corMatrix'
-    log:
-        multiBamSummary = LOG_BENCHMARK_dir + "/deepTools_correlationPlotsPooledSamples.multiBamSummary.log",
-        plotCorrelation = LOG_BENCHMARK_dir + "/deepTools_correlationPlotsPooledSamples.plotCorrelation.log"
-    message: "{ruleDisplayMessage}Run deepTools: multiBamSummary and plotCorrelation for files {input.bamAll}..."
-    threads: threadsMax
-    priority: 1
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/deepTools_correlationPlotsPooledSamples.benchmark"
-    params:
-        corMethod = "pearson",
-        whatToPlot = "heatmap",
-        colorMap = "hsv",
-        other = "--skipZeros --plotFileFormat pdf --removeOutliers --plotNumbers",
-        titlePlot = "Correlation plot (Pearson)"
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
-    shell:
-        """sh -c '
-            multiBamSummary \
-                bins \
-                --bamfiles {input.bamAll} \
-                --numberOfProcessors {threads} \
-                --outRawCounts {output.rawCounts} \
-                -out {output.npz:q} \
-                2> {log.multiBamSummary:q} &&
-            plotCorrelation \
-                --corData {output.npz:q} \
-                --corMethod "{params.corMethod}" \
-                --whatToPlot "{params.whatToPlot}" \
-                --plotTitle "{params.titlePlot}" \
-                --colorMap "{params.colorMap}" \
-                {params.other} \
-                --plotFile {output.pdf:q} \
-                2> {log.plotCorrelation:q}
-        '"""
-
-# Bug in deepTools 2.4.1: --outFileCorMatrix {output.corMatrix} fails, and
-# --labels {allIndividualsUniqueStrSpaces} must be given without quotes (quotes cause an error)
-
-
-rule deepTools_plotPCA:
-    input:
-        coverage = REPORTS_dir + '/{basename}.bins.npz'
-    output:
-        plot = REPORTS_dir + '/{basename}_PCAPlot.pdf',
-        data = REPORTS_dir + '/{basename}_PCAPlot.data'
-    log: LOG_BENCHMARK_dir + "/deepTools_plotPCA.{basename}.log"
-    message: "{ruleDisplayMessage}Run deepTools: plotPCA for coverage file {input.coverage}..."
-    threads: 1
-    priority: 1
-    resources: maxMemGB=20
-    benchmark: LOG_BENCHMARK_dir + "/deepTools_plotPCA.{basename}.benchmark"
-    params:
-        other = "",
-        titlePlot = "PCA plot for all samples"
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/deepTools.yaml"
-    shell:
-        """sh -c '
-            plotPCA \
-                --corData {input.coverage} \
-                --plotTitle "{params.titlePlot}" \
-                --plotFileFormat pdf \
-                {params.other} \
-                --outFileNameData {output.data} \
-                --plotFile {output.plot:q} \
-                2> {log:q}
-        '"""
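-
-# The {basename} wildcard above lets this single rule produce PCA plots for
-# both coverage matrices: "allSamples" (one entry per replicate) and
-# "allSamplesPooled" (replicates merged per individual), both of which the
-# multiqc rule below requests.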
-
-
-# Enforce that multiqc runs at the very end: its input collects the final
-# outputs of all other summary rules, so Snakemake schedules it last
-rule multiqc:
-    input:
-        summaryStats = expand('{dir}/allSamples_statSummary.{fileType}', dir = REPORTS_dir, fileType = ["pdf", "RData"]),
-        fragmentLength = expand('{dir}/allSamples_fragmentLengthDistr.{fileType}', dir = REPORTS_dir, fileType = ["pdf", "RData"]),
-        corrPlots = REPORTS_dir + '/allSamples' + '.correlations.pdf',
-        corrPlotsPooled = REPORTS_dir + '/allSamplesPooled' + '.correlations.pdf',
-        PCAPlot = expand('{dir}/{basename}_PCAPlot.pdf', dir = REPORTS_dir, basename = ["allSamples", "allSamplesPooled"]),
-        GCBiasPlot = expand('{dir}/{sample}{GCBiasStr}.GCBias.plot.pdf', dir = REPORTS_dir, sample = allSamplesUnique, GCBiasStr = ["", ".noGCBias"]),
-        GCBiasPlotPooled = expand('{dir}/{individual}.merged{GCBiasStr}.GCBias.plot.pdf', dir = REPORTS_dir, individual = allIndividualsUnique, GCBiasStr = ["", ".noGCBias"]),
-        coveragePlot = REPORTS_dir + '/allSamples_CoveragePlot.pdf',
-        coverage = expand('{dir}/{sample}{GCBiasStr}.final.{type}', dir = FINAL_OUTPUT_dir, sample = allSamplesUnique, GCBiasStr = ["", ".noGCBias"], type = ["bigwig", "bedgraph"]),
-        coveragePooled = expand('{dir}/{individual}.merged{GCBiasStr}.final.{type}', dir = FINAL_OUTPUT_dir, individual = allIndividualsUnique, GCBiasStr = ["", ".noGCBias"], type = ["bigwig", "bedgraph"])
-    output:
-        report = REPORTS_dir + '/multiqc_report.html'
-    message: "{ruleDisplayMessage}Finally, run multiqc for the folder {ROOT_dir}..."
-    threads: 1
-    priority: 1
-    benchmark: LOG_BENCHMARK_dir + "/multiqc.benchmark"
-    params:
-        outputDir = lambda wildcards, output: os.path.dirname(output.report),
-        basename = lambda wildcards, output: os.path.basename(output.report),
-        rootDir = ROOT_dir
-    #conda: "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/environmentsYAML/multiqc.yaml"
-    shell:
-        """sh -c '
-            multiqc \
-                --force \
-                --ignore "*.out" --ignore "*.err" \
-                -o {params.outputDir} \
-                --filename {params.basename} \
-                {params.rootDir:q}
-        '"""
diff --git a/src/misc/generateExample.sh b/src/misc/generateExample.sh
deleted file mode 100644
index 75fee33..0000000
--- a/src/misc/generateExample.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-
-nReads=1000000
-
-nLines=$(expr $nReads \* 4) # each FASTQ record spans 4 lines
-
-zcat /g/scb2/zaugg/carnold/Projects/AtacSeq/data/Ximing/Ximing_4thBatch/391_rep1_1.gz | head -$nLines | bgzip -c >test1_rep1_1.gz
-zcat /g/scb2/zaugg/carnold/Projects/AtacSeq/data/Ximing/Ximing_4thBatch/391_rep1_2.gz | head -$nLines | bgzip -c >test1_rep1_2.gz
-zcat /g/scb2/zaugg/carnold/Projects/AtacSeq/data/Ximing/Ximing_4thBatch/419_rep1_1.gz | head -$nLines | bgzip -c >test2_rep1_1.gz
-zcat /g/scb2/zaugg/carnold/Projects/AtacSeq/data/Ximing/Ximing_4thBatch/419_rep1_2.gz | head -$nLines | bgzip -c >test2_rep1_2.gz
-- 
GitLab