diff --git a/example/input/params.sh b/example/input/params.sh
index 31c5e23a0f46008a54e98783108e7c90c1baabbc..dadc0d88b93316512053f350796433cae5c4d1e7 100644
--- a/example/input/params.sh
+++ b/example/input/params.sh
@@ -24,7 +24,7 @@ snakefile="/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakefile"
 #######################
 
 # Use a dry run for testing purposes?
-dryRun=false
+dryRun=true
 
 # Number of cores to use. For local execution, the minimum of this the real number of cores and this number is taken.
 # When in cluster mode, the maximum number of CPUs per rule. See the separate cluster specification
diff --git a/src/Snakefile b/src/Snakefile
index dc82b5d2aa12634c676f711978021b21e3635c9a..8565c10794f3be0d0c5d81c9851d097159c47537 100755
--- a/src/Snakefile
+++ b/src/Snakefile
@@ -179,11 +179,11 @@ genomeSizeDict = {
     'mm10': { "":2652783500, "50":2308125349, "75":2407883318, "100":2467481108, "150":2494787188,"200":2520869189 }
 }
 
-if config["deepTools"]["readLength"] not in ["", "50", "75", "100", "150", "200"]:
-    raise AssertionError("Unsupported value for parameter readLength. Must be one of \"\", 50, 75, 100, 150, or 200\n")
+if str(config["deepTools"]["readLength"]) not in ["", "50", "75", "100", "150", "200"]:
+    raise AssertionError("Unsupported value for parameter readLength. Must be one of \"\", 50, 75, 100, 150, or 200, but not \"" + str(config["deepTools"]["readLength"]) + "\" \n")
     sys.exit(1)
 
-effectiveGenomeSize = genomeSizeDict[config["align"]["assemblyVersion"]][config["deepTools"]["readLength"]]
+effectiveGenomeSize = genomeSizeDict[config["align"]["assemblyVersion"]][str(config["deepTools"]["readLength"])]
 
 samplesSummaryFile = config["samples"]["summaryFile"]
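Note on the readLength hunk above: an unquoted value like `readLength: 100` comes out of the YAML config as an int, while the keys of `genomeSizeDict` are strings, so the lookup only worked when the user happened to quote the value. Coercing with `str()` accepts both spellings, and the extended error message now echoes the offending value. (The `sys.exit(1)` after the `raise` is unreachable either way, since the exception already aborts.) A minimal sketch of the failure mode, reusing the mm10 sizes from the hunk:

    genomeSizeDict = {'mm10': {"": 2652783500, "100": 2467481108}}  # string keys, as in the Snakefile
    readLength = 100                                   # YAML parses an unquoted 100 as an int
    # genomeSizeDict['mm10'][readLength]               # KeyError: 100
    print(genomeSizeDict['mm10'][str(readLength)])     # 2467481108 -- works for int and str alike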
@@ -436,6 +436,9 @@ if encodePeaks:
 
 allResultFiles = []
 
+if dataType == "ATACseq":
+    allResultFiles.append(REPORTS_dir_summary + "/allSamples_fragmentLengthDistr.pdf")
+
 if correctGCBias:
     GCBiasStr = ["",".noGCBias"]
 
@@ -462,11 +465,13 @@ if coverageFiles:
         allResultFiles.append(coveragePlot)
 
     coverage = expand('{dir}/{sample}{GCBiasStr}.final.{type}', dir = FINAL_OUTPUT_dir, sample = allSamplesAndIndividualsUnique, GCBiasStr = GCBiasStr, type = ["bigwig", "bedgraph.gz"])
-    allResultFiles.append(coverage)
+    allResultFiles.extend(coverage)
 
     if correctGCBias:
         GCBiasPlot = expand('{dir}/{sample}{GCBiasStr}.GCBias.plot.pdf', dir = REPORTS_dir_gcbias, sample = allSamplesAndIndividualsUnique, GCBiasStr = GCBiasStr)
-        allResultFiles.append(GCBiasPlot)
+        allResultFiles.extend(GCBiasPlot)
+
+
 
 if doPeakCalling:
@@ -479,12 +484,11 @@ if doPeakCalling:
                                        GCBias = GCBiasStr,
                                        peakType = peakTypeEncode)
 
-        allResultFiles.append(individualPeaksEncode)
     else:
         individualPeaksEncode = []
 
-    allResultFiles.append(individualPeaksEncode)
+    allResultFiles.extend(individualPeaksEncode)
 
     individualPeaksStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed.gz',
@@ -499,8 +503,8 @@
                                          analysisType = ["nonStringent"],
                                          peakType = ["narrow"])
 
-    allResultFiles.append(individualPeaksStringent)
-    allResultFiles.append(individualPeaksNonStringent)
+    allResultFiles.extend(individualPeaksStringent)
+    allResultFiles.extend(individualPeaksNonStringent)
 
     ##############
     # FRIP score #
@@ -508,17 +512,17 @@
     # narrow as peakType here serves to unify all peak flavours.
     frip_nonStringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["nonStringent"] , sample = allFilesUnique, GCBias = GCBiasStr, analysisType = ["nonStringent"], peakType = ["narrow"])
-    allResultFiles.append(frip_nonStringent)
+    allResultFiles.extend(frip_nonStringent)
 
     frip_stringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["stringent"] , sample = allFilesUnique, GCBias = GCBiasStr, analysisType = ["stringent"], peakType = ["narrow"])
-    allResultFiles.append(frip_stringent)
+    allResultFiles.extend(frip_stringent)
 
     if encodePeaks:
         frip_encode = expand('{dir}/{dir2}/{sample}{GCBias}.final.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["Encode"] , sample = allFilesUnique, GCBias = GCBiasStr, peakType = peakTypeEncode)
     else:
         frip_encode = []
-    allResultFiles.append(frip_encode)
+    allResultFiles.extend(frip_encode)
 
     ###################
     # Peak annotation #
@@ -530,14 +534,14 @@ if doPeakCalling:
                                                          peakType = ["narrow"],
                                                          ext = ["pdf", "csv"])
 
-    allResultFiles.append(annotation_individualPeaksStringent)
+    allResultFiles.extend(annotation_individualPeaksStringent)
 
     annotation_individualPeaksNonStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered_annotation.{ext}',
                                                     dir = REPORTS_dir_anno_NONSTRINGENT, sample = allFilesUnique,
                                                     GCBias = GCBiasStr,
                                                     analysisType = ["nonStringent"],
                                                     peakType = ["narrow"],
                                                     ext = ["pdf", "csv"])
-    allResultFiles.append(annotation_individualPeaksNonStringent)
+    allResultFiles.extend(annotation_individualPeaksNonStringent)
 
     if encodePeaks:
         annotation_individualPeaksEncode = expand('{dir}/{sample}{GCBias}.final.{peakType}Peak.filtered_annotation.{ext}',
                                                   dir = REPORTS_dir_anno_ENCODE, sample = allFilesUnique,
@@ -549,7 +553,7 @@ if doPeakCalling:
     else:
         annotation_individualPeaksEncode = []
 
-    allResultFiles.append(annotation_individualPeaksEncode)
+    allResultFiles.extend(annotation_individualPeaksEncode)
 
     ###################
     # Consensus peaks #
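The recurring append-to-extend swaps are a type fix, not cosmetics: `expand()` returns a list of paths, so `append()` nested that whole list as a single element of `allResultFiles`, while `extend()` splices the paths in flat. Note that `append()` remains the right call for the single fragment-length PDF added near the top. A minimal sketch:

    allResultFiles = []
    coverage = ["s1.final.bigwig", "s2.final.bigwig"]   # stand-in for an expand() result

    allResultFiles.append(coverage)    # [['s1.final.bigwig', 's2.final.bigwig']] -- one nested element
    allResultFiles.clear()
    allResultFiles.extend(coverage)    # ['s1.final.bigwig', 's2.final.bigwig']   -- flat, one entry per path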
@@ -559,35 +563,37 @@ if doPeakCalling:
         # TODO: Encode missing
         PCA_plot = expand(REPORTS_dir_PCA + '/consensusPeaks_PCA_{type}_minOverlap{minOverlap}.pdf', type = ["stringent", "nonStringent"], minOverlap = rangeOverlapCur)
         cor_plot = expand(REPORTS_dir_corr + '/consensusPeaks_sampleCorrelation_{type}_minOverlap{minOverlap}.pdf', type = ["stringent", "nonStringent"], minOverlap = rangeOverlapCur)
-        allResultFiles.append(PCA_plot)
-        allResultFiles.append(cor_plot)
+        allResultFiles.extend(PCA_plot)
+        allResultFiles.extend(cor_plot)
 
         # They already contain both samples and individuals in one file
         if config["output"]["countReadsPeak"]:
             readCountFiles = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.countsBAM.bed.gz",
-                                    dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlapCur),
-            allResultFiles.append(readCountFiles)
+                                    dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlapCur)
+            allResultFiles.extend(readCountFiles)
 
         # Cannot be merged due to rule specifications
         consensusPeaks_stringent = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.bed.gz", dir = PEAK_STRINGENT_dir, GCBias = GCBiasStr, overlap = rangeOverlapCur)
         consensusPeaks_nonStringent = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.bed.gz", dir = PEAK_NONSTRINGENT_dir, GCBias = GCBiasStr, overlap = rangeOverlapCur)
-        allResultFiles.append(consensusPeaks_stringent)
-        allResultFiles.append(consensusPeaks_nonStringent)
+        allResultFiles.extend(consensusPeaks_stringent)
+        allResultFiles.extend(consensusPeaks_nonStringent)
 
         consensusPeaks_stringentNonStringent_frip = expand('{dir}/consensusPeaks/{peakType}.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = ["stringent", "nonStringent"], minOverlap = rangeOverlapCur)
-        allResultFiles.append(consensusPeaks_stringentNonStringent_frip)
+        allResultFiles.extend(consensusPeaks_stringentNonStringent_frip)
 
         # featureCount output files
-        consensusPeaks_stringentNonStringent_featureCounts = expand('{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.allBams.overlaps.bed.gz', dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlap)
-        allResultFiles.append(consensusPeaks_stringentNonStringent_featureCounts)
+        # TODO check causes error but redundant, see above?
+        # consensusPeaks_stringentNonStringent_featureCounts = expand('{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.allBams.overlaps.bed.gz', dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlap)
+        # allResultFiles.extend(consensusPeaks_stringentNonStringent_featureCounts)
 
         # PCA R script output files
-        PCA_R_consensusPeaks = expand("{dir}/{peakType}/{consensusPeaks{GCBias}.minOverlap{overlap}.PCA_summary.pdf",
-                                      dir = REPORTS_dir_PCA, peakType = ["stringent", "nonStringent"], GCBias = GCBiasStr, overlap = rangeOverlap)
-        allResultFiles.append(PCA_R_consensusPeaks)
+        # TODO check not needed
+        # PCA_R_consensusPeaks = expand("{dir}/{peakType}/consensusPeaks{GCBias}.minOverlap{overlap}.PCA_summary.pdf",
+        #                               dir = REPORTS_dir_PCA, peakType = ["stringent", "nonStringent"], GCBias = GCBiasStr, overlap = rangeOverlap)
+        # allResultFiles.extend(PCA_R_consensusPeaks)
 
         if encodePeaks:
             consensusPeaks_ENCODE = expand("{dir}/consensusPeaks{GCBias}.{peakType}Peak.minOverlap{overlap}.bed.gz", dir = PEAK_ENCODE_dir, GCBias = GCBiasStr, peakType = peakTypeEncode, overlap = rangeOverlapCur)
@@ -596,17 +602,17 @@ if doPeakCalling:
 
             peakTypesCur = peakTypesAll
 
-            allResultFiles.append(consensusPeaks_ENCODE)
-            allResultFiles.append(consensusPeaks_encode_frip)
-            allResultFiles.append(PCA_R_consensusPeaks)
+            allResultFiles.extend(consensusPeaks_ENCODE)
+            allResultFiles.extend(consensusPeaks_encode_frip)
+            allResultFiles.extend(PCA_R_consensusPeaks)
 
         else:
             consensusPeaks_ENCODE = []
            consensusPeaks_encode_frip = []
            peakTypesCur = ["stringent", "nonStringent"]
 
-            allResultFiles.append(consensusPeaks_ENCODE)
-            allResultFiles.append(consensusPeaks_encode_frip)
+            allResultFiles.extend(consensusPeaks_ENCODE)
+            allResultFiles.extend(consensusPeaks_encode_frip)
 
 
         if annotatePeaks:
@@ -618,9 +624,9 @@ if doPeakCalling:
             else:
                 annotation_consensusPeaks_ENCODE = []
 
-            allResultFiles.append(annotation_consensusPeaks_stringent)
-            allResultFiles.append(annotation_consensusPeaks_nonStringent)
-            allResultFiles.append(annotation_consensusPeaks_ENCODE)
+            allResultFiles.extend(annotation_consensusPeaks_stringent)
+            allResultFiles.extend(annotation_consensusPeaks_nonStringent)
+            allResultFiles.extend(annotation_consensusPeaks_ENCODE)
 
     else:
         consensusPeaks_stringent_nonStringent = []
@@ -630,8 +636,6 @@ if doPeakCalling:
 
     # end if consensus peaks
 
-
-
 # if mergeReplicates and nSamplesUnique > 1:
 #
 #     if doPeakCalling:
@@ -662,8 +666,9 @@ if doPeakCalling:
 #                                                   peakType = ["narrow"])
 #         allResultFiles.append(pooledPeaksNonStringent)
 
-
+#print(allResultFiles)
 #print('\n'.join(map(str, allResultFiles)))
+
 #Sys.exit(0)
 
 ##########################
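A small but real fix hidden in the consensus-peaks hunk above: the old `readCountFiles = expand(...),` ended in a trailing comma, which in Python wraps the right-hand side in a one-element tuple, so a tuple rather than a list of paths ended up in the target list. A sketch with a stand-in path:

    paths = ["consensusPeaks.minOverlap2.countsBAM.bed.gz"]   # stand-in for the expand() result
    readCountFiles = paths,         # old code: the trailing comma builds a 1-tuple around the list
    print(type(readCountFiles))     # <class 'tuple'>
    readCountFiles = paths          # fixed: the list itself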
@@ -1317,8 +1322,14 @@ if config["output"]["doBaseRecalibration"]:
 ###############
 ###############
 
+if config["output"]["doBaseRecalibration"]:
+    baseRecalibrationStr = ".BQrecal"
+else:
+    baseRecalibrationStr = ""
+
+# TODO: Remove the BQrecal string here as not done usually
 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_remove_chrMAndUnassembledChr_outputName = CHRM_dir + '/{sample}.cleaned4.BQrecal.rmChrM.s.bam'
+postalign_remove_chrMAndUnassembledChr_outputName = CHRM_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.s.bam'
 
 # Define if GATK should be run or not
 def postalign_inputs(assemblyVersion):
@@ -1352,7 +1363,7 @@ rule postalign_remove_chrMAndUnassembledChr:
         """
 
 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-markDuplicates_Picardtools_outputName = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s'
+markDuplicates_Picardtools_outputName = RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.s'
 
 
 rule markDuplicates_Picardtools:
@@ -1392,7 +1403,7 @@ rule computeLibraryComplexity:
     input:
         bam = rules.markDuplicates_Picardtools.output.bam
     output:
-        stats = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl'
+        stats = RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.s.bam.statsLibraryCompl'
     log:
     message:"{ruleDisplayMessage}Compute library complexity for file {input.bam:q} ..."
     threads: 1
@@ -1421,8 +1432,8 @@ rule removeDuplicates:
         index = rules.markDuplicates_Picardtools.output.index,
         stats = rules.computeLibraryComplexity.output.stats
     output:
-        bam = temp(RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam'),
-        stats = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats'
+        bam = temp(RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.s.bam'),
+        stats = RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.s.bam.stats'
     log:
     message: "{ruleDisplayMessage}Remove duplicate reads and QC-failing reads for file {input:q} with samtools..."
     threads: 1
@@ -1442,7 +1453,7 @@ rule removeDuplicates:
 
 
 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_MAPQ_outputName = MAPQsort_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.s.bam'
+postalign_MAPQ_outputName = MAPQsort_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.s.bam'
 postalign_MAPQ_outputNameFinal = FINAL_OUTPUT_dir + '/{sample}.final.bam'
 
 rule postalign_MAPQ:
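The new `baseRecalibrationStr` makes the `.BQrecal` filename infix conditional on the `doBaseRecalibration` switch instead of hard-coding it into every output name; because each rule's output is assembled from the same fragments, intermediate file names stay consistent whichever branch is taken. A sketch of the resulting names under both settings (the directory value is made up for illustration):

    CHRM_dir = "2.chrM"   # hypothetical directory name, for illustration only
    for doBaseRecalibration in (True, False):
        baseRecalibrationStr = ".BQrecal" if doBaseRecalibration else ""
        print(CHRM_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.s.bam')
    # 2.chrM/{sample}.cleaned4.BQrecal.rmChrM.s.bam
    # 2.chrM/{sample}.cleaned4.rmChrM.s.bam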
@@ -1473,7 +1484,7 @@
 
 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_RSS_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.s.bam'
+postalign_RSS_outputName = ADJRSS_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.s.bam'
 
 # TODO: Read https://github.com/TheJacksonLaboratory/ATAC-seq/blob/master/make_atac_seq_shifted_bam_4_shift_sam.sh and check if we actually do it correctly
 # https://informatics.fas.harvard.edu/atac-seq-guidelines.html#alignments
@@ -1540,7 +1551,7 @@ if doRSSAdjustment:
 #            --shift {params.forwardShift} -{params.reverseShift} {params.reverseShift} -{params.forwardShift} \
 
 
-    Picard_sortFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned1.s'
+    Picard_sortFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned1.s'
 
     # Necessary to resport, as Picard seems to require a different sorting order
     # VALIDATION_STRINGENCY=SILENT is important because any strict check might identify the out-of-reference alignments and abort otherwise.
@@ -1567,7 +1578,7 @@ if doRSSAdjustment:
             2> {log:q}
         """
 
-    Picard_cleanSamFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned2.s'
+    Picard_cleanSamFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned2.s'
 
     # Necessary to handle potential out-of-reference alignments due to the read start adjustment.
     # Out-of-references bases will be soft-clipped, which is conform with the BAM specifications
@@ -2223,9 +2234,9 @@ if annotatePeaks and doPeakCalling:
 
 if dataType == "ATACseq":
     if doRSSAdjustment:
-        inputCur = expand('{dir}/{allSamples}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.csv.gz', dir = ADJRSS_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique)
+        inputCur = expand('{dir}/{allSamples}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.csv.gz', dir = ADJRSS_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique, bqRecalStr = baseRecalibrationStr)
     else:
-        inputCur = expand('{dir}/{allSamples}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.csv.gz', dir = MAPQsort_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique)
+        inputCur = expand('{dir}/{allSamples}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.csv.gz', dir = MAPQsort_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique, bqRecalStr = baseRecalibrationStr)
 
 rule fragment_length_distr:
     input:
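In the `inputCur` change above, `{bqRecalStr}` is simply one more named placeholder for `expand()`: every `{name}` in the pattern is substituted from the keyword arguments (taking the product over list-valued ones), so passing `bqRecalStr = baseRecalibrationStr` splices the conditional infix into the generated per-sample paths. A small sketch with made-up directory and sample names:

    from snakemake.io import expand

    inputCur = expand('{dir}/{allSamples}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.csv.gz',
                      dir = "5.MAPQ", MAPQ = 10, allSamples = ["s1", "s2"],   # hypothetical values
                      bqRecalStr = "")
    # ['5.MAPQ/s1.cleaned4.rmChrM.markedDup.rmDup.minMapQ10.s.bam.csv.gz',
    #  '5.MAPQ/s2.cleaned4.rmChrM.markedDup.rmDup.minMapQ10.s.bam.csv.gz']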
@@ -2298,22 +2309,20 @@ use rule deeptools_plotEnrichment_frip_individual as deeptools_plotEnrichment_fr
 
 # AdjRSS files are not needed here and are ignored in the stats file
 rule stats:
     input:
-        expand('{dir}/allSamples_fragmentLengthDistr.pdf',
-               dir = REPORTS_dir_summary),
         expand('{dir}/{sample}.s.bam',
                dir = ALIGN_dir, sample = allSamplesUnique),
         expand('{dir}/{sample}.s.bam.bai',
                dir = ALIGN_dir, sample = allSamplesUnique),
         expand('{dir}/{sample}.s.bam.stats',
                dir = ALIGN_dir, sample = allSamplesUnique),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.s.bam.stats',
-               dir = CHRM_dir, sample = allSamplesUnique, MAPQ = minMAPQscore),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats',
-               dir = RMDUP_DIR, sample = allSamplesUnique, MAPQ = minMAPQscore),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats',
-               dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = minMAPQscore),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl',
-               dir = RMDUP_DIR, sample = allSamplesUnique),
+        expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.s.bam.stats',
+               dir = CHRM_dir, sample = allSamplesUnique, MAPQ = minMAPQscore, bqRecalStr = baseRecalibrationStr),
+        expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.s.bam.stats',
+               dir = RMDUP_DIR, sample = allSamplesUnique, MAPQ = minMAPQscore, bqRecalStr = baseRecalibrationStr),
+        expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats',
+               dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = minMAPQscore, bqRecalStr = baseRecalibrationStr),
+        expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.markedDup.s.bam.statsLibraryCompl',
+               dir = RMDUP_DIR, sample = allSamplesUnique, bqRecalStr = baseRecalibrationStr),
         expand('{dir}/{sample}.final.bam',
                dir = FINAL_OUTPUT_dir, sample = allSamplesUnique)
     output:
         pdf = expand('{dir}/allSamples_statSummary.pdf', dir = REPORTS_dir_summary),
@@ -2450,7 +2459,6 @@ use rule deepTools_plotCoverage as deepTools_plotCoverageMerged with:
 rule multiqc:
     input:
         summaryStats = expand('{dir}/allSamples_statSummary.{fileType}', dir = REPORTS_dir_summary, fileType = ["pdf", "rds"]),
-        fragmentLength = expand('{dir}/allSamples_fragmentLengthDistr.{fileType}', dir = REPORTS_dir_summary, fileType = ["pdf"]),
         other = allResultFiles # Enforce that it runs at the very end
     output:
         report = REPORTS_dir_multiqc + '/multiqc_report.html'
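With the two hunks above, the fragment-length distribution PDF is no longer demanded directly by `rule stats` or `rule multiqc`; the only remaining reference is the `dataType == "ATACseq"` block near the top of the target-list construction, so the ATAC-only plot is requested exactly once and non-ATAC runs no longer depend on an output they cannot produce. MultiQC still sees it for ATAC-seq runs through `other = allResultFiles`. This assumes `allResultFiles` ultimately feeds the pipeline's target rule, roughly as follows (a sketch; the actual rule lies outside this diff):

    rule all:
        input:
            allResultFiles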