Bugfixes

9a0221cb · Christian Arnold · ec6fcff5 · 9a0221cb · 9a0221cb
Commit 9a0221cb authored 2 years ago by Christian Arnold
--- a/example/input/params.sh
+++ b/example/input/params.sh
@@ -24,7 +24,7 @@ snakefile="/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakefile"
 #######################

 # Use a dry run for testing purposes?
-dryRun=false
+dryRun=true

 # Number of cores to use. For local execution, the minimum of this the real number of cores and this number is taken.
 # When in cluster mode, the maximum number of CPUs per rule. See the separate cluster specification

--- a/src/Snakefile
+++ b/src/Snakefile
@@ -179,11 +179,11 @@ genomeSizeDict = {
    'mm10': { "":2652783500, "50":2308125349, "75":2407883318, "100":2467481108, "150":2494787188,"200":2520869189 }
 }

-if config["deepTools"]["readLength"] not in ["", "50", "75", "100", "150", "200"]:
-    raise AssertionError("Unsupported value for parameter readLength. Must be one of \"\", 50, 75, 100, 150, or 200\n")
+if str(config["deepTools"]["readLength"]) not in ["", "50", "75", "100", "150", "200"]:
+    raise AssertionError("Unsupported value for parameter readLength. Must be one of \"\", 50, 75, 100, 150, or 200, but not \"" + str(config["deepTools"]["readLength"]) + "\" \n")
    sys.exit(1)

-effectiveGenomeSize = genomeSizeDict[config["align"]["assemblyVersion"]][config["deepTools"]["readLength"]]
+effectiveGenomeSize = genomeSizeDict[config["align"]["assemblyVersion"]][str(config["deepTools"]["readLength"])]


 samplesSummaryFile = config["samples"]["summaryFile"]
@@ -436,6 +436,9 @@ if encodePeaks:

 allResultFiles = []

+if dataType == "ATACseq":
+    allResultFiles.append(REPORTS_dir_summary + "/allSamples_fragmentLengthDistr.pdf")
+

 if correctGCBias:
    GCBiasStr = ["",".noGCBias"]
@@ -462,11 +465,13 @@ if coverageFiles:
            allResultFiles.append(coveragePlot)

    coverage = expand('{dir}/{sample}{GCBiasStr}.final.{type}', dir = FINAL_OUTPUT_dir, sample = allSamplesAndIndividualsUnique, GCBiasStr = GCBiasStr, type = ["bigwig", "bedgraph.gz"])
-    allResultFiles.append(coverage)
+    allResultFiles.extend(coverage)

 if correctGCBias:
    GCBiasPlot = expand('{dir}/{sample}{GCBiasStr}.GCBias.plot.pdf', dir = REPORTS_dir_gcbias, sample = allSamplesAndIndividualsUnique, GCBiasStr = GCBiasStr)
-    allResultFiles.append(GCBiasPlot)
+    allResultFiles.extend(GCBiasPlot)
+
+


 if doPeakCalling:
@@ -479,12 +484,11 @@ if doPeakCalling:
                                GCBias = GCBiasStr,
                                peakType = peakTypeEncode)

-        allResultFiles.append(individualPeaksEncode)

    else:
        individualPeaksEncode = []

-    allResultFiles.append(individualPeaksEncode)
+    allResultFiles.extend(individualPeaksEncode)


    individualPeaksStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed.gz',
@@ -499,8 +503,8 @@ if doPeakCalling:
                            analysisType = ["nonStringent"],
                            peakType = ["narrow"])

-    allResultFiles.append(individualPeaksStringent)
-    allResultFiles.append(individualPeaksNonStringent)
+    allResultFiles.extend(individualPeaksStringent)
+    allResultFiles.extend(individualPeaksNonStringent)

    ##############
    # FRIP score #
@@ -508,17 +512,17 @@ if doPeakCalling:

    # narrow as peakType here serves to unify all peak flavours.
    frip_nonStringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["nonStringent"] , sample = allFilesUnique, GCBias = GCBiasStr, analysisType = ["nonStringent"], peakType = ["narrow"])
-    allResultFiles.append(frip_nonStringent)
+    allResultFiles.extend(frip_nonStringent)

    frip_stringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["stringent"] , sample = allFilesUnique, GCBias = GCBiasStr, analysisType = ["stringent"], peakType = ["narrow"])
-    allResultFiles.append(frip_stringent)
+    allResultFiles.extend(frip_stringent)

    if encodePeaks:
        frip_encode = expand('{dir}/{dir2}/{sample}{GCBias}.final.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["Encode"] , sample = allFilesUnique, GCBias = GCBiasStr, peakType = peakTypeEncode)
    else:
        frip_encode = []

-    allResultFiles.append(frip_encode)
+    allResultFiles.extend(frip_encode)

    ###################
    # Peak annotation #
@@ -530,14 +534,14 @@ if doPeakCalling:
                                peakType = ["narrow"],
                                ext = ["pdf", "csv"])

-        allResultFiles.append(annotation_individualPeaksStringent)
+        allResultFiles.extend(annotation_individualPeaksStringent)

        annotation_individualPeaksNonStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered_annotation.{ext}', dir = REPORTS_dir_anno_NONSTRINGENT, sample = allFilesUnique,
                                GCBias = GCBiasStr,
                                analysisType = ["nonStringent"],
                                peakType = ["narrow"],
                                ext = ["pdf", "csv"])
-        allResultFiles.append(annotation_individualPeaksNonStringent)
+        allResultFiles.extend(annotation_individualPeaksNonStringent)

        if encodePeaks:
            annotation_individualPeaksEncode = expand('{dir}/{sample}{GCBias}.final.{peakType}Peak.filtered_annotation.{ext}', dir = REPORTS_dir_anno_ENCODE, sample = allFilesUnique,
@@ -549,7 +553,7 @@ if doPeakCalling:
        else:
            annotation_individualPeaksEncode = []

-        allResultFiles.append(annotation_individualPeaksEncode)
+        allResultFiles.extend(annotation_individualPeaksEncode)

    ###################
    # Consensus peaks #
@@ -559,35 +563,37 @@ if doPeakCalling:
        # TODO: Encode missing
        PCA_plot = expand(REPORTS_dir_PCA + '/consensusPeaks_PCA_{type}_minOverlap{minOverlap}.pdf', type = ["stringent", "nonStringent"], minOverlap = rangeOverlapCur)
        cor_plot = expand(REPORTS_dir_corr + '/consensusPeaks_sampleCorrelation_{type}_minOverlap{minOverlap}.pdf', type = ["stringent", "nonStringent"], minOverlap = rangeOverlapCur)
-        allResultFiles.append(PCA_plot)
-        allResultFiles.append(cor_plot)
+        allResultFiles.extend(PCA_plot)
+        allResultFiles.extend(cor_plot)

        # They already contain both samples and individuals in one file
        if config["output"]["countReadsPeak"]:

            readCountFiles = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.countsBAM.bed.gz",
-                             dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlapCur),
-            allResultFiles.append(readCountFiles)
+                             dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlapCur)
+            allResultFiles.extend(readCountFiles)


        # Cannot be merged due to rule specifications
        consensusPeaks_stringent  = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.bed.gz",dir = PEAK_STRINGENT_dir, GCBias = GCBiasStr, overlap = rangeOverlapCur)
        consensusPeaks_nonStringent  = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.bed.gz", dir = PEAK_NONSTRINGENT_dir, GCBias = GCBiasStr, overlap = rangeOverlapCur)

-        allResultFiles.append(consensusPeaks_stringent)
-        allResultFiles.append(consensusPeaks_nonStringent)
+        allResultFiles.extend(consensusPeaks_stringent)
+        allResultFiles.extend(consensusPeaks_nonStringent)

        consensusPeaks_stringentNonStringent_frip = expand('{dir}/consensusPeaks/{peakType}.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = ["stringent", "nonStringent"], minOverlap = rangeOverlapCur)
-        allResultFiles.append(consensusPeaks_stringentNonStringent_frip)
+        allResultFiles.extend(consensusPeaks_stringentNonStringent_frip)

        # featureCount output files
-        consensusPeaks_stringentNonStringent_featureCounts = expand('{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.allBams.overlaps.bed.gz', dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlap)
-        allResultFiles.append(consensusPeaks_stringentNonStringent_featureCounts)
+        # TODO check causes error but redundant, see above?
+        # consensusPeaks_stringentNonStringent_featureCounts = expand('{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.allBams.overlaps.bed.gz', dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlap)
+        # allResultFiles.extend(consensusPeaks_stringentNonStringent_featureCounts)

        # PCA R script output files
-        PCA_R_consensusPeaks = expand("{dir}/{peakType}/{consensusPeaks{GCBias}.minOverlap{overlap}.PCA_summary.pdf",
-            dir = REPORTS_dir_PCA, peakType = ["stringent", "nonStringent"], GCBias = GCBiasStr, overlap = rangeOverlap)
-        allResultFiles.append(PCA_R_consensusPeaks)
+        # TODO check not needed
+        # PCA_R_consensusPeaks = expand("{dir}/{peakType}/consensusPeaks{GCBias}.minOverlap{overlap}.PCA_summary.pdf",
+        #     dir = REPORTS_dir_PCA, peakType = ["stringent", "nonStringent"], GCBias = GCBiasStr, overlap = rangeOverlap)
+        # allResultFiles.extend(PCA_R_consensusPeaks)

        if encodePeaks:
            consensusPeaks_ENCODE = expand("{dir}/consensusPeaks{GCBias}.{peakType}Peak.minOverlap{overlap}.bed.gz", dir = PEAK_ENCODE_dir, GCBias = GCBiasStr, peakType = peakTypeEncode, overlap = rangeOverlapCur)
@@ -596,17 +602,17 @@ if doPeakCalling:

            peakTypesCur = peakTypesAll

-            allResultFiles.append(consensusPeaks_ENCODE)
-            allResultFiles.append(consensusPeaks_encode_frip)
-            allResultFiles.append(PCA_R_consensusPeaks)
+            allResultFiles.extend(consensusPeaks_ENCODE)
+            allResultFiles.extend(consensusPeaks_encode_frip)
+            allResultFiles.extend(PCA_R_consensusPeaks)

        else:
            consensusPeaks_ENCODE = []
            consensusPeaks_encode_frip = []
            peakTypesCur = ["stringent", "nonStringent"]

-        allResultFiles.append(consensusPeaks_ENCODE)
-        allResultFiles.append(consensusPeaks_encode_frip)
+        allResultFiles.extend(consensusPeaks_ENCODE)
+        allResultFiles.extend(consensusPeaks_encode_frip)

        if annotatePeaks:

@@ -618,9 +624,9 @@ if doPeakCalling:
            else:
                annotation_consensusPeaks_ENCODE = []

-            allResultFiles.append(annotation_consensusPeaks_stringent)
-            allResultFiles.append(annotation_consensusPeaks_nonStringent)
-            allResultFiles.append(annotation_consensusPeaks_ENCODE)
+            allResultFiles.extend(annotation_consensusPeaks_stringent)
+            allResultFiles.extend(annotation_consensusPeaks_nonStringent)
+            allResultFiles.extend(annotation_consensusPeaks_ENCODE)

    else:
        consensusPeaks_stringent_nonStringent = []
@@ -630,8 +636,6 @@ if doPeakCalling:
    # end if consensus peaks


-
-
 # if mergeReplicates and nSamplesUnique > 1:
 #
 #     if doPeakCalling:
@@ -662,8 +666,9 @@ if doPeakCalling:
        #                         peakType = ["narrow"])
        # allResultFiles.append(pooledPeaksNonStringent)

-
+#print(allResultFiles)
 #print('\n'.join(map(str, allResultFiles)))
+
 #Sys.exit(0)

 ##########################
@@ -1317,8 +1322,14 @@ if config["output"]["doBaseRecalibration"]:
 ###############
 ###############

+if config["output"]["doBaseRecalibration"]:
+    baseRecalibrationStr = ".BQrecal"
+else:
+    baseRecalibrationStr = ""
+
+# TODO: Remove the BQrecal string here as not done usually
 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_remove_chrMAndUnassembledChr_outputName = CHRM_dir + '/{sample}.cleaned4.BQrecal.rmChrM.s.bam'
+postalign_remove_chrMAndUnassembledChr_outputName = CHRM_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.s.bam'

 # Define if GATK should be run or not
 def postalign_inputs(assemblyVersion):
@@ -1352,7 +1363,7 @@ rule postalign_remove_chrMAndUnassembledChr:
        """

 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-markDuplicates_Picardtools_outputName = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s'
+markDuplicates_Picardtools_outputName = RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.s'


 rule markDuplicates_Picardtools:
@@ -1392,7 +1403,7 @@ rule computeLibraryComplexity:
    input:
        bam    = rules.markDuplicates_Picardtools.output.bam
    output:
-        stats  = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl'
+        stats  = RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.s.bam.statsLibraryCompl'
    log:
    message:"{ruleDisplayMessage}Compute library complexity for file {input.bam:q} ..."
    threads: 1
@@ -1421,8 +1432,8 @@ rule removeDuplicates:
        index = rules.markDuplicates_Picardtools.output.index,
        stats = rules.computeLibraryComplexity.output.stats
    output:
-        bam = temp(RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam'),
-        stats = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats'
+        bam = temp(RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.s.bam'),
+        stats = RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.s.bam.stats'
    log:
    message: "{ruleDisplayMessage}Remove duplicate reads and QC-failing reads for file {input:q} with samtools..."
    threads: 1
@@ -1442,7 +1453,7 @@ rule removeDuplicates:


 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_MAPQ_outputName = MAPQsort_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.s.bam'
+postalign_MAPQ_outputName = MAPQsort_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.s.bam'
 postalign_MAPQ_outputNameFinal = FINAL_OUTPUT_dir + '/{sample}.final.bam'

 rule postalign_MAPQ:
@@ -1473,7 +1484,7 @@ rule postalign_MAPQ:


 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
-postalign_RSS_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.s.bam'
+postalign_RSS_outputName = ADJRSS_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.s.bam'

 # TODO: Read https://github.com/TheJacksonLaboratory/ATAC-seq/blob/master/make_atac_seq_shifted_bam_4_shift_sam.sh and check if we actually do it correctly
 # https://informatics.fas.harvard.edu/atac-seq-guidelines.html#alignments
@@ -1540,7 +1551,7 @@ if doRSSAdjustment:

 # --shift {params.forwardShift} -{params.reverseShift} {params.reverseShift} -{params.forwardShift} \

-    Picard_sortFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned1.s'
+    Picard_sortFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned1.s'

    # Necessary to resport, as Picard seems to require a different sorting order
    # VALIDATION_STRINGENCY=SILENT is important because any strict check might identify the out-of-reference alignments and abort otherwise.
@@ -1567,7 +1578,7 @@ if doRSSAdjustment:
                    2> {log:q}
            """

-    Picard_cleanSamFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned2.s'
+    Picard_cleanSamFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned2.s'

    # Necessary to handle potential out-of-reference alignments due to the read start adjustment.
    # Out-of-references bases will be soft-clipped, which is conform with the BAM specifications
@@ -2223,9 +2234,9 @@ if annotatePeaks and doPeakCalling:
 if dataType == "ATACseq":

    if doRSSAdjustment:
-        inputCur = expand('{dir}/{allSamples}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.csv.gz', dir = ADJRSS_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique)
+        inputCur = expand('{dir}/{allSamples}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.csv.gz', dir = ADJRSS_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique, bqRecalStr = baseRecalibrationStr)
    else:
-        inputCur = expand('{dir}/{allSamples}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.csv.gz', dir = MAPQsort_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique)
+        inputCur = expand('{dir}/{allSamples}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.csv.gz', dir = MAPQsort_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique, bqRecalStr = baseRecalibrationStr)

    rule fragment_length_distr:
        input:
@@ -2298,22 +2309,20 @@ use rule deeptools_plotEnrichment_frip_individual as deeptools_plotEnrichment_fr
 # AdjRSS files are not needed here and are ignored in the stats file
 rule stats:
    input:
-        expand('{dir}/allSamples_fragmentLengthDistr.pdf',
-            dir = REPORTS_dir_summary),
        expand('{dir}/{sample}.s.bam',
            dir = ALIGN_dir, sample = allSamplesUnique),
        expand('{dir}/{sample}.s.bam.bai',
            dir = ALIGN_dir, sample = allSamplesUnique),
        expand('{dir}/{sample}.s.bam.stats',
            dir = ALIGN_dir, sample = allSamplesUnique),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.s.bam.stats',
-            dir = CHRM_dir, sample = allSamplesUnique, MAPQ = minMAPQscore),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats',
-            dir = RMDUP_DIR, sample = allSamplesUnique, MAPQ = minMAPQscore),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats',
-            dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = minMAPQscore),
-        expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl',
-            dir = RMDUP_DIR, sample = allSamplesUnique),
+        expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.s.bam.stats',
+            dir = CHRM_dir, sample = allSamplesUnique, MAPQ = minMAPQscore, bqRecalStr = baseRecalibrationStr),
+        expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.s.bam.stats',
+            dir = RMDUP_DIR, sample = allSamplesUnique, MAPQ = minMAPQscore, bqRecalStr = baseRecalibrationStr),
+        expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats',
+            dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = minMAPQscore, bqRecalStr = baseRecalibrationStr),
+        expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.markedDup.s.bam.statsLibraryCompl',
+            dir = RMDUP_DIR, sample = allSamplesUnique, bqRecalStr = baseRecalibrationStr),
        expand('{dir}/{sample}.final.bam', dir = FINAL_OUTPUT_dir, sample = allSamplesUnique)
    output:
        pdf   = expand('{dir}/allSamples_statSummary.pdf', dir = REPORTS_dir_summary),
@@ -2450,7 +2459,6 @@ use rule deepTools_plotCoverage as deepTools_plotCoverageMerged with:
 rule multiqc:
    input:
        summaryStats    = expand('{dir}/allSamples_statSummary.{fileType}', dir = REPORTS_dir_summary, fileType = ["pdf", "rds"]),
-        fragmentLength  = expand('{dir}/allSamples_fragmentLengthDistr.{fileType}', dir = REPORTS_dir_summary, fileType = ["pdf"]),
        other = allResultFiles  # Enforce that it runs at the very end
    output:
        report = REPORTS_dir_multiqc + '/multiqc_report.html'