diff --git a/src/Snakemake/dev/R/PCA.R b/src/Snakemake/dev/R/PCA.R
index b481077b681b35896f8ea11a25cc394be78f68a9..95ce1f1bf1f808fc91f187d11aa5d67d7fa119b8 100755
--- a/src/Snakemake/dev/R/PCA.R
+++ b/src/Snakemake/dev/R/PCA.R
@@ -41,33 +41,32 @@ par.l$log_minlevel = "INFO"
 checkAndLogWarningsAndErrors(snakemake, checkClass(snakemake, "Snakemake"))
 
 ## INPUT ##
-checkAndLogWarningsAndErrors(snakemake@input, checkList(snakemake@input, min.len = 3))
-checkAndLogWarningsAndErrors(snakemake@input, checkSubset(names(snakemake@input), c("", "metadata", "rawCounts", "rawCountsMerged")))
+checkAndLogWarningsAndErrors(snakemake@input, checkList(snakemake@input, min.len = 2))
+checkAndLogWarningsAndErrors(snakemake@input, checkSubset(names(snakemake@input), c("", "metadata", "rawCounts")))
 
 par.l$file_metadata = snakemake@input$metadata
 par.l$file_rawCounts = snakemake@input$rawCounts
-par.l$file_rawCountsMerged = snakemake@input$rawCountsMerged
+
 checkAndLogWarningsAndErrors(par.l$file_metadata, checkFileExists(par.l$file_metadata, access = "r"))
 checkAndLogWarningsAndErrors(par.l$file_rawCounts, checkFileExists(par.l$file_rawCounts, access = "r"))
-checkAndLogWarningsAndErrors(par.l$file_rawCountsMerged, checkFileExists(par.l$file_rawCountsMerged, access = "r"))
 
 ## OUTPUT ##
 checkAndLogWarningsAndErrors(snakemake@output, checkList(snakemake@output, min.len = 2))
-checkAndLogWarningsAndErrors(names(snakemake@output), checkSubset(names(snakemake@output), c("", "pdf", "pdfMerged")))
+checkAndLogWarningsAndErrors(names(snakemake@output), checkSubset(names(snakemake@output), c("", "pdf", "data")))
 
 par.l$file_output_pdf = snakemake@output$pdf
-par.l$file_output_pdfMerged = snakemake@output$pdfMerged
-
+par.l$file_output_data = snakemake@output$data
 
 ## PARAMS ##
-checkAndLogWarningsAndErrors(snakemake@params, checkList(snakemake@params, min.len = 4))
-checkAndLogWarningsAndErrors(names(snakemake@params), checkSubset(names(snakemake@params), c("", "binSize", "filterChr", "splitByGCCorrection", "minNoCountsRows")))
+checkAndLogWarningsAndErrors(snakemake@params, checkList(snakemake@params, min.len = 3))
+checkAndLogWarningsAndErrors(names(snakemake@params), checkSubset(names(snakemake@params), c("", "filterChr", "splitByGCCorrection", "minNoCountsRows")))
 
-par.l$binSize = as.integer(snakemake@params$binSize)
-checkAndLogWarningsAndErrors(par.l$binSize, checkIntegerish(par.l$binSize))
+# binSize is no longer needed: the PCA now runs on consensus peak counts, not genome-wide bins
 
 par.l$filterChr = as.logical(snakemake@params$filterChr)
 checkAndLogWarningsAndErrors(par.l$filterChr, checkFlag(par.l$filterChr))
@@ -83,8 +82,7 @@ checkAndLogWarningsAndErrors(snakemake@log, checkList(snakemake@log, min.len = 1
 
 par.l$file_log = snakemake@log[[1]]
 
-allDirs = c(dirname(par.l$file_output_pdfMerged),
-            dirname(par.l$file_output_pdf),
+allDirs = c(dirname(par.l$file_output_pdf),
             dirname(par.l$file_log)
            )
 
@@ -97,25 +95,62 @@ testExistanceAndCreateDirectoriesRecursively(allDirs)
 startLogger(par.l$file_log, par.l$log_minlevel, removeOldLog = TRUE)
 printParametersLog(par.l)
 
-readCounts.l = list(all = par.l$file_rawCounts, pooled = par.l$file_rawCountsMerged)
-output.l = list(all = par.l$file_output_pdf, pooled = par.l$file_output_pdfMerged)
-
-for (typeCur in c("all", "pooled")) {
-
-    flog.info(paste0("Generate PCA plots for ", typeCur, " samples"))
-
-    readCountsCur = readCounts.l[[typeCur]]
-    outputCur = output.l[[typeCur]]
-
-    doPCAPlot(file_readCounts = readCountsCur,
-              file_metadata = par.l$file_metadata,
-              file_output = outputCur,
-              file_log = par.l$file_log,
-              metadataToInclude = c("sampleName", "individual"),
-              type = typeCur,
-              descriptionString = "10,000 bp bins",
-              filterChr = par.l$filterChr,
-              splitByGCCorrection = par.l$splitByGCCorrection,
-              minNoCountsRows = par.l$minNoCountsRows)
-}
+
+flog.info("Generate PCA plots")
+
+# Prepare the metadata and count tables here, before calling doPCAPlot
+metadata = read_tsv(par.l$file_metadata)
+
+if (nrow(problems(metadata)) > 0) {
+    flog.fatal(paste0("Parsing errors: "), problems(metadata), capture = TRUE)
+    stop("Error when parsing the file ", par.l$file_metadata, ", see errors above")
+}
+
+# Annotate the PCA with all metadata columns except the file paths and columns that contain only NAs
+metadataToInclude = colnames(metadata)
+removeCols = c("path_inputForward", "path_inputReverse")
+for (colCur in metadataToInclude) {
+    if (all(is.na(metadata[, colCur]))) {
+        removeCols = c(removeCols, colCur)
+    }
+}
+metadataToInclude = setdiff(metadataToInclude, removeCols)
+
+# metadata = dplyr::mutate_if(metadata, is.character, as.factor)
+metadata = dplyr::mutate(metadata, replicate_No = as.factor(replicate_No))
+
+# skip = 1 skips the featureCounts comment line at the top of the file
+countsPCA = read_tsv(par.l$file_rawCounts, skip = 1)
+if (nrow(problems(countsPCA)) > 0) {
+    flog.fatal(paste0("Parsing errors: "), problems(countsPCA), capture = TRUE)
+    stop("Error when parsing the file ", par.l$file_rawCounts, ", see errors above")
+}
+
+# Clean up the column names that featureCounts derives from the BAM file paths
+colnames(countsPCA) = gsub(".final.bam", "", colnames(countsPCA))
+colnames(countsPCA) = gsub("'", "", colnames(countsPCA))
+colnames(countsPCA) = basename(colnames(countsPCA))
+
+if (par.l$filterChr) {
+    countsPCA = dplyr::filter(countsPCA, !grepl("random|chrUn|chrM|hap|_gl", Chr, perl = TRUE))
+}
+
+# Chromosome filtering is done, so drop the annotation columns; only the sample count columns remain
+countsPCA = dplyr::select(countsPCA, -Geneid, -Chr, -Start, -End, -Strand, -Length)
+
+# TODO: Loop over all input files or run the rule separately for each?
+
+doPCAPlot(countsPCA = countsPCA,
+          metadata = metadata,
+          file_output_PDF = par.l$file_output_pdf,
+          file_output_data = par.l$file_output_data,
+          file_log = par.l$file_log,
+          metadataToInclude = metadataToInclude,
+          descriptionString = "variable peaks",
+          splitByGCCorrection = par.l$splitByGCCorrection,
+          minNoCountsRows = par.l$minNoCountsRows)
diff --git a/src/Snakemake/dev/R/functions.R b/src/Snakemake/dev/R/functions.R
index 42601e3f639532f086e434cf63da0f7a1f1c25f0..a5d500a1b9795fedb03f9ec07a3ea9c951f26152 100755
--- a/src/Snakemake/dev/R/functions.R
+++ b/src/Snakemake/dev/R/functions.R
@@ -624,16 +624,12 @@ splitStringInMultipleLines <- function(input, width, sepChar = "\n", verbose = T
 }
 
-doPCAPlot <- function(file_readCounts, file_metadata, file_output, file_log, metadataToInclude, type,
-                      descriptionString = "10,000 bp bins", filterChr = TRUE, splitByGCCorrection = TRUE, minNoCountsRows = 0) {
+doPCAPlot <- function(countsPCA, metadata, file_output_PDF, file_output_data, file_log, metadataToInclude,
+                      descriptionString = "variable peaks", splitByGCCorrection = TRUE, minNoCountsRows = 0) {
 
     checkAndLoadPackages(c("tidyverse", "futile.logger", "checkmate", "tools", "methods", "DESeq2", "readr"), verbose = TRUE)
-
-    assertFileExists(file_readCounts)
-    assertFileExists(file_metadata)
-    assertLogical(filterChr)
     assertLogical(splitByGCCorrection)
     assertIntegerish(minNoCountsRows, lower = 0)
 
     start.time <- Sys.time()
@@ -642,49 +638,9 @@ doPCAPlot <- function(file_readCounts, file_metadata, file_output, file_log, met
 
     startLogger(file_log, par.l$log_minlevel, removeOldLog = TRUE)
 
-
-    coldata = read_tsv(file_metadata)
-
-    if (nrow(problems(coldata)) > 0) {
-        flog.fatal(paste0("Parsing errors: "), problems(coldata), capture = TRUE)
-        stop("Error when parsing the file ", file_metadata, ", see errors above")
-    }
-
-
-    countsPCA = read_tsv(file_readCounts)
-    if (nrow(problems(countsPCA)) > 0) {
-        flog.fatal(paste0("Parsing errors: "), problems(countsPCA), capture = TRUE)
-        stop("Error when parsing the file ", file_readCounts, ", see errors above")
-    }
-    # Test purposes
-    #countsPCA = countsPCA[1:1000,]
-
-    colnames(countsPCA)[1] = "chr"
-
-    if (filterChr) {
-        countsPCA = dplyr::filter(countsPCA, !grepl("random|chrUn|chrM|hap|_gl", chr, perl = TRUE))
-    }
-
-
-    if (type == "all") {
-        colnames(countsPCA) = gsub(".final.bam", "", colnames(countsPCA))
-
-    } else if (type == "pooled") {
-        colnames(countsPCA) = gsub(".merged", "", colnames(countsPCA))
-        colnames(countsPCA) = gsub(".final.bam", "", colnames(countsPCA))
-
-        # Rewrite coldata
-        coldata = coldata[duplicated(coldata[,"individual"]),]
-        coldata$sampleName = coldata$individual
-
-    } else {
-        flog.fatal(paste0("Unsupported type ", type))
-    }
+
-    colnames(countsPCA) = gsub("'", "", colnames(countsPCA))
-
-
 
     allMatrices.l = list()
     allColdata.l = list()
 
@@ -692,28 +648,29 @@ doPCAPlot <- function(file_readCounts, file_metadata, file_output, file_log, met
 
     if (splitByGCCorrection) {
 
         countsPCA_original = dplyr::select(countsPCA, -contains("noGCBias"))
-        allMatrices.l[["original"]] = as.matrix(countsPCA_original[,-c(1:3)])
-        allColdata.l [["original"]] = coldata
+        allMatrices.l[["original"]] = as.matrix(countsPCA_original)
+        allColdata.l [["original"]] = metadata
 
         countsPCA_GC = dplyr::select(countsPCA, contains("noGCBias"))
         allMatrices.l[["GCCorrected"]] = as.matrix(countsPCA_GC)
 
-        coldata_GC = coldata
+        coldata_GC = metadata
         coldata_GC$sampleName = paste0(coldata_GC$sampleName, ".noGCBias")
         coldata_GC$individual = paste0(coldata_GC$individual, ".noGCBias")
".noGCBias") allColdata.l [["GCCorrected"]] = coldata_GC - allMatrices.l[["all"]] = as.matrix(countsPCA[,-c(1:3)]) - coldata_all = rbind(coldata, coldata_GC) + allMatrices.l[["all"]] = as.matrix(countsPCA) + coldata_all = rbind(metadata, coldata_GC) allColdata.l [["all"]] = coldata_all } else { - allMatrices.l[["all"]] = as.matrix(countsPCA[,-c(1:3)]) - allColdata.l [["all"]] = coldata + allMatrices.l[["all"]] = as.matrix(countsPCA) + allColdata.l [["all"]] = metadata } + nSamples = ncol(countsPCA) - pdf(file_output) + pdf(file_output_PDF, width = max(10, nSamples * 0.1), height = max(10, nSamples * 0.1)) for (matrixCur in names(allMatrices.l)) { @@ -721,6 +678,8 @@ doPCAPlot <- function(file_readCounts, file_metadata, file_output, file_log, met matrixCur.m = allMatrices.l[[matrixCur]] colDataCur = allColdata.l[[matrixCur]] + + if (!testSubset(colnames(matrixCur.m), colDataCur$sampleName)) { @@ -743,11 +702,27 @@ doPCAPlot <- function(file_readCounts, file_metadata, file_output, file_log, met vsd <- varianceStabilizingTransformation(dds, blind = TRUE) - for (nrowsCur in c(500,5000,50000, nrow(vsd))) { + output_data = tribble(~topn, ~PC1, ~PC2, ~group, ~nameOtherColumn, ~valueOtherColumn) + + nrowsMax = 10^seq(2, log10(nrow(vsd)) + 1, 1) + nrowsMax[length(nrowsMax)] = nrow(vsd) + if (nrow(vsd) > 500) {nrowsMax = sort(c(nrowsMax, 500))} + + + for (nrowsCur in nrowsMax) { + + flog.info(paste0(" Top ", nrowsCur)) for (varCur in metadataToInclude) { + flog.info(paste0(" Variable ", varCur)) pcadata = plotPCA(vsd, intgroup = varCur, ntop = nrowsCur, returnData = TRUE) + output_data = add_row(output_data, + topn = nrowsCur, + PC1 = pcadata$PC1, PC2 = pcadata$PC2, + group = pcadata$group, + nameOtherColumn = varCur, valueOtherColumn = pcadata[, varCur]) + percentVar <- round(100 * attr(pcadata, "percentVar")) g = ggplot(pcadata, aes_string("PC1", "PC2", color = varCur)) + geom_point(size = 2) + @@ -755,16 +730,22 @@ doPCAPlot <- function(file_readCounts, file_metadata, file_output, file_log, met ylab(paste0("PC2: ",percentVar[2],"% variance")) + coord_fixed() + theme_bw() + ggtitle(paste0("Group ", matrixCur, "\nTop ", .prettyNum(nrowsCur), " ", descriptionString)) + - theme(plot.title = element_text(hjust = 0.5)) + theme(plot.title = element_text(hjust = 0.5), + legend.title = element_text(size = rel(0.5)), + legend.text = element_text(size = rel(0.5))) + plot(g) } } + } dev.off() - flog.info(paste0("Successfulyl wrote PCA plot to file ", file_output)) + write_tsv(output_data, file_output_data) + + flog.info(paste0("Successfully wrote PCA plot to file ", file_output_PDF)) } diff --git a/src/Snakemake/dev/Snakefile b/src/Snakemake/dev/Snakefile index 575d3fe7221d6864d7fd279965735fec0159f2c8..5dbdee2824552a9f1cd679276f186577f2ab9c64 100755 --- a/src/Snakemake/dev/Snakefile +++ b/src/Snakemake/dev/Snakefile @@ -5,7 +5,7 @@ ############################## # TODO: -# peak PCA only: check why they are identical +# peak PCA only: check why they are identical, script not ready # merged files, some rules are not optiized for this # idr @@ -173,8 +173,8 @@ genomeSizeDict = { 'mm10': { "":2652783500, "50":2308125349, "75":2407883318, "100":2467481108, "150":2494787188,"200":2520869189 } } - -# TODO: Check whether config["par_deepTools"]["readLength"] is among the supported options +if not str(config["par_deepTools"]["readLength"]) in ["", "50", "100", "150", "200"]: + raise KeyError("Parameter \"readLength\" in section \"par_deepTools\" must be one of the following: \"\", 50, 100, 150 or 200. 
If your read length is not one of them, approximate by rounding to the closest of the available options") effectiveGenomeSize = genomeSizeDict[config["par_align"]["assemblyVersion"]][config["par_deepTools"]["readLength"]] @@ -205,10 +205,13 @@ if pairedEnd: macs2_pairedEndMode = "--format BAMPE" duplicates_keepReadsFlag = 2 # read mapped in proper pair + featureCounts_pairedEndOptions = "-p -B -P -d 0 -D 2000 -C" + else: macs2_pairedEndMode = "" # TODO: When single-end reads, this is not used at all duplicates_keepReadsFlag = 16 # reverse strand + featureCounts_pairedEndOptions = "" INPUT_ORIG_DIR = os.path.dirname(samplesSummaryFile) @@ -520,20 +523,32 @@ if doPeakCalling: allResultFiles.append(consensusPeaks_stringent) allResultFiles.append(consensusPeaks_nonStringent) - - consensusPeaks_stringentNonStringent_frip = expand('{dir}/consensusPeaks/{peakType}.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = ["stringent", "nonStringent"], minOverlap = rangeOverlap) + consensusPeaks_stringentNonStringent_frip = expand('{dir}/consensusPeaks/{peakType}.minOverlap{overlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = ["stringent", "nonStringent"], overlap = rangeOverlap) allResultFiles.append(consensusPeaks_stringentNonStringent_frip) + # featureCount output files + consensusPeaks_stringentNonStringent_featureCounts = expand('{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.allBams.overlaps.bed.gz', dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlap) + allResultFiles.append(consensusPeaks_stringentNonStringent_featureCounts) + + # PCA R script output files + PCA_R_consensusPeaks = expand("{dir}/{peakType}/{consensusPeaks{GCBias}.minOverlap{overlap}.PCA_summary.pdf", + dir = REPORTS_dir_PCA, peakType = ["stringent", "nonStringent"], GCBias = GCBiasStr, overlap = rangeOverlap) + allResultFiles.append(PCA_R_consensusPeaks) if encodePeaks: consensusPeaks_ENCODE = expand("{dir}/consensusPeaks{GCBias}.{peakType}Peak.minOverlap{overlap}.bed.gz", dir = PEAK_ENCODE_dir, GCBias = GCBiasStr, peakType = peakTypeEncode, overlap = rangeOverlap) - consensusPeaks_encode_frip = expand('{dir}/consensusPeaks/{peakType}Peak.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = peakTypeEncode, minOverlap = rangeOverlap) + consensusPeaks_encode_frip = expand('{dir}/consensusPeaks/{peakType}Peak.minOverlap{overlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = peakTypeEncode, overlap = rangeOverlap) + + # TODO: + PCA_R_consensusPeaks = expand("{dir}/Encode/{consensusPeaks{GCBias}{peakType}Peak.minOverlap{overlap}.PCA_summary.pdf", + dir = REPORTS_dir_PCA, peakType = peakTypeEncode, GCBias = GCBiasStr, overlap = rangeOverlap) peakTypesCur = peakTypesAll allResultFiles.append(consensusPeaks_ENCODE) allResultFiles.append(consensusPeaks_encode_frip) + allResultFiles.append(PCA_R_consensusPeaks) else: consensusPeaks_ENCODE = [] @@ -541,14 +556,6 @@ if doPeakCalling: peakTypesCur = ["stringent", "nonStringent"] - - # TODO move - PCA_peaks = expand("{dir}/{basename}_PCAPlot_peaks{GCBias}_{peakType}_minOverlap{overlap}.pdf", - dir = REPORTS_dir_PCA,basename = ["allSamples"], GCBias = GCBiasStr, overlap = rangeOverlap, - peakType = peakTypesCur) - - allResultFiles.append(PCA_peaks) - if annotatePeaks: annotation_consensusPeaks_stringent = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.{ext}", dir = REPORTS_dir_anno_STRINGENT, GCBias = GCBiasStr, overlap = rangeOverlap, ext = ["pdf", "csv"]) annotation_consensusPeaks_nonStringent = 
expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.{ext}", dir = REPORTS_dir_anno_NONSTRINGENT, GCBias = GCBiasStr, overlap = rangeOverlap, ext = ["pdf", "csv"]) @@ -560,8 +567,8 @@ if doPeakCalling: allResultFiles.append(annotation_consensusPeaks_ENCODE) # PCA files for peaks - # TODO - # allResultFiles.append( REPORTS_dir_PCA + '/peaks_PCA_summary.pdf') + # Not finished, finish script + # allResultFiles.append(REPORTS_dir_PCA + '/peaks_PCA_summary.pdf') # # TODO @@ -1430,7 +1437,7 @@ if dataType == "ATACseq": bam = rules.postalign_MAPQ.output.bam, index = rules.postalign_MAPQ.output.index output: - bam = postalign_RSS_outputName, + bam = temp(postalign_RSS_outputName), stats = postalign_RSS_outputName + ".stats", csv = postalign_RSS_outputName + ".csv.gz" log: @@ -1964,21 +1971,66 @@ if nSamplesUnique > 1 and doPeakCalling: minOverlapValues = ",".join(map(str, rangeOverlap)) script: script_consPeaks + + output_prefix = PEAKCALLING_dir + "/{peakType}/consensusPeaks" + GCBiasStr + ".{optionalEncodeStr}minOverlap{overlap}" + + rule countPeakReads: + input: + consensusPeaks = output_prefix + ".bed.gz", + allBAMs = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique), + output: + peaksBamOverlapRaw = temp(output_prefix + ".allBams.overlaps.bed"), + peaksBamOverlap = output_prefix + ".allBams.overlaps.bed.gz", + consensusPeaksSAF = temp(output_prefix + ".saf") + log: + wildcard_constraints: + optionalEncodeStr="|Encode.broadPeak|Encode.gappedPeak|Encode.narrowPeak", + peakType="stringent|nonStringent|Encode" + message: "{ruleDisplayMessage} Intersect for file {input.consensusPeaks} with all BAM files..." + threads: threadsMax + envmodules: "Subread" + # singularity: "shub://chrarnold/Singularity_images:atac_seq_conda" # not yet in the container, TODO + params: + pairedEnd = featureCounts_pairedEndOptions, + readFiltering = "-Q 10", + multiOverlap = "" # set to -O for incorporating multi overlaps + shell: + """ zcat {input.consensusPeaks} | awk 'BEGIN {{ OFS = "\\t" }} {{print $4,$1,$2,$3,"+"}}' >{output.consensusPeaksSAF} && + featureCounts \ + -F SAF \ + -T {threads} \ + {params.readFiltering} \ + {params.pairedEnd} \ + -s 0 \ + {params.multiOverlap} \ + -a {output.consensusPeaksSAF} \ + -o {output.peaksBamOverlapRaw} \ + {input.allBAMs} && + gzip -f < {output.peaksBamOverlapRaw} > {output.peaksBamOverlap} + """ + + # TODO: Adjust and finish: Create all necessary files at beginning + prefix_rule = "{{peakType}}/consensusPeaks{GCBias}{{optionalEncodeStr}}.minOverlap{{overlap}}" + rule peaksPCA: input: - encode = consensusPeaks_ENCODE, - stringent = consensusPeaks_stringent, - nonStringent = consensusPeaks_nonStringent + bed = expand("{dir}/" + prefix_rule + ".bed.gz", dir = PEAKCALLING_dir, GCBias = GCBiasStr) output: - PCA_plot = REPORTS_dir_PCA + '/peaks_PCA_summary.pdf' + pdf = expand("{dir}/" + prefix_rule + ".PCA_summary.pdf", dir = REPORTS_dir_PCA, GCBias = GCBiasStr), + data = expand("{dir}/" + prefix_rule + ".PCA_summary.data", dir = REPORTS_dir_PCA, GCBias = GCBiasStr) log: expand('{dir}/peaksPCA.R.log', dir = LOG_BENCHMARK_dir) + wildcard_constraints: + optionalEncodeStr="|Encode.broadPeak|Encode.gappedPeak|Encode.narrowPeak", + peakType="stringent|nonStringent|Encode" # benchmark: LOG_BENCHMARK_dir + "/peaksPCA.benchmark" singularity: "shub://chrarnold/Singularity_images:atac_seq_r" message: "{ruleDisplayMessage}Calculate PCA for consensus peaks for all peak files with the script {script_PCA}..." 
@@ -2094,14 +2146,14 @@ rule fragment_length_distr:
 
 # FRIP score
 # For each sample, plot against the sample-specific peaks file and all consensus peak files
 # May fail for empty consensus peak files, therefore we add a "|| touch" here in the end, even though this hides ANY error
-rule deeptools_plotEnrichment:
+rule deepTools_plotEnrichment:
     input:
         bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique),
         beds = expand('{dir}/{{peakType}}/{{basename}}Peak' + '.filtered.bed.gz', dir = PEAKCALLING_dir)
     output:
         fig = expand('{dir}/{{peakType}}/{{basename}}_frip.pdf', dir = REPORTS_dir_frip),
         rawCounts = expand('{dir}/{{peakType}}/{{basename}}_frip.rawCounts', dir = REPORTS_dir_frip),
-    log: expand('{dir}/deeptools_plotEnrichment_{{peakType}}.{{basename}}.log', dir = LOG_BENCHMARK_dir)
+    log: expand('{dir}/deepTools_plotEnrichment_{{peakType}}.{{basename}}.log', dir = LOG_BENCHMARK_dir)
     message: "{ruleDisplayMessage} Computing FRiP scores for sample {wildcards.basename} and peak type {wildcards.peakType}"
     threads: threadsMax
    conda: "condaEnvironments/deepTools.yaml"
@@ -2124,10 +2176,10 @@ rule deeptools_plotEnrichment:
         """
 
-use rule deeptools_plotEnrichment as deeptools_plotEnrichment_consensusPeaks with:
+use rule deepTools_plotEnrichment as deepTools_plotEnrichment_consensusPeaks with:
     input:
         bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique),
-        beds = expand("{dir}/{{peakType}}/consensusPeaks{GCBias}.{{optionalEncodeStr}}minOverlap{{overlap}}.bed.gz", dir = PEAKCALLING_dir, GCBias = GCBiasStr)
+        beds = expand("{dir}/{{peakType}}/consensusPeaks{GCBias}{{optionalEncodeStr}}.minOverlap{{overlap}}.bed.gz", dir = PEAKCALLING_dir, GCBias = GCBiasStr)
     output:
         fig = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.pdf', dir = REPORTS_dir_frip),
         rawCounts = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.rawCounts', dir = REPORTS_dir_frip)
@@ -2137,7 +2189,7 @@ use rule deeptools_plotEnrichment as deeptools_plotEnrichment_consensusPeaks wit
     message: "{ruleDisplayMessage} Computing FRiP scores for consensus peaks with overlap {wildcards.overlap} and peak type {wildcards.peakType} + {wildcards.optionalEncodeStr}"
     log:
-        expand('{dir}/deeptools_plotEnrichment_consensusPeaks.{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}.log', dir = LOG_BENCHMARK_dir)
+        expand('{dir}/deepTools_plotEnrichment_consensusPeaks.{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}.log', dir = LOG_BENCHMARK_dir)
     singularity: "shub://chrarnold/Singularity_images:atac_seq_conda"
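Note: `use rule A as B with:` (Snakemake >= 6.0) creates a derived rule that inherits every directive it does not override — here the shell body, threads and conda environment of deepTools_plotEnrichment are reused, and only input, output, message, log and singularity are redefined. A minimal self-contained sketch of the mechanism, with hypothetical rules:

    rule compress:
        input: "{name}.txt"
        output: "{name}.txt.gz"
        shell: "gzip -c {input} > {output}"

    # compress_logs inherits the shell directive from compress unchanged
    use rule compress as compress_logs with:
        input: "logs/{name}.log"
        output: "logs/{name}.log.gz"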
@@ -2145,7 +2197,7 @@ use rule deeptools_plotEnrichment_consensusPeaks wit
 # AdjRSS files are not needed here and are ignored in the stats file
 rule stats:
     input:
-        # rules.deeptools_plotEnrichment.output, # test?
+        # rules.deepTools_plotEnrichment.output, # test?
         expand('{dir}/allSamples_fragmentLengthDistr.pdf', dir = REPORTS_dir_summary),
         expand('{dir}/{sample}.s.bam',
@@ -2406,12 +2458,8 @@ def correlationPlotsPeaks_inputs(stringency):
         return PEAK_STRINGENT_dir + "/consensusPeaks.minOverlap{overlap}.bed.gz"
     elif (stringency == "nonStringent"):
         return PEAK_NONSTRINGENT_dir + "/consensusPeaks.minOverlap{overlap}.bed.gz"
-    elif (stringency == "Encode.gapped"):
-        return PEAK_ENCODE_dir + "/consensusPeaks.Encode.gappedPeak.minOverlap{overlap}.bed.gz"
-    elif (stringency == "Encode.broad"):
-        return PEAK_ENCODE_dir + "/consensusPeaks.Encode.broadPeak.minOverlap{overlap}.bed.gz"
-    elif (stringency == "Encode.narrow"):
-        return PEAK_ENCODE_dir + "/consensusPeaks.Encode.narrowPeak.minOverlap{overlap}.bed.gz"
+    elif stringency in ("Encode.gapped", "Encode.broad", "Encode.narrow"):
+        return PEAK_ENCODE_dir + "/consensusPeaks." + stringency + "Peak.minOverlap{overlap}.bed.gz"
     else:
         raise KeyError("Stringency must be one of: \"stringent\", \"nonStringent\", \"Encode.gapped\", \"Encode.broad\", \"Encode.narrow\", but not " + stringency)
 
@@ -2465,6 +2513,7 @@ rule deepTools_correlationPlotsPeaks:
 #########
 #########
 
+# TODO delete, potentially create a fake bed file with all tiled peaks to eliminate calling plotPCA in the first place
 rule deepTools_plotPCA_genomeWide:
     input:
         coverage = REPORTS_dir_PCA + '/{basename}_genomeWide.bins.npz'
@@ -2508,7 +2557,7 @@ rule deepTools_plotPCA_genomeWide:
         """
 
-
+# TODO delete
 use rule deepTools_plotPCA_genomeWide as deepTools_plotPCA_peaksOnly with:
     input:
        coverage = rules.deepTools_correlationPlotsPeaks.output.npz
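Note: the new .PCA_summary.data file written by doPCAPlot() is a plain TSV of the plotPCA() coordinates (columns matrixType, topn, PC1, PC2, group, nameOtherColumn, valueOtherColumn), so any stored combination can be re-plotted without rerunning DESeq2. A minimal sketch of a downstream consumer — the file name is hypothetical, and note that percentVar is not stored, so the "% variance" axis labels cannot be reconstructed from this file alone:

    library(tidyverse)

    pca = read_tsv("consensusPeaks.minOverlap2.PCA_summary.data")

    # Re-plot e.g. the uncorrected matrix, top 500 peaks, colored by sampleName
    pca_sub = dplyr::filter(pca, matrixType == "original", topn == 500, nameOtherColumn == "sampleName")

    ggplot(pca_sub, aes(PC1, PC2, color = group)) +
        geom_point(size = 2) +
        coord_fixed() +
        theme_bw()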