Skip to content
Snippets Groups Projects
Commit 9a0221cb authored by Christian Arnold's avatar Christian Arnold
Browse files

Bugfixes

parent ec6fcff5
No related branches found
No related tags found
No related merge requests found
......@@ -24,7 +24,7 @@ snakefile="/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakefile"
#######################
# Use a dry run for testing purposes?
dryRun=false
dryRun=true
# Number of cores to use. For local execution, the minimum of this the real number of cores and this number is taken.
# When in cluster mode, the maximum number of CPUs per rule. See the separate cluster specification
......
......@@ -179,11 +179,11 @@ genomeSizeDict = {
'mm10': { "":2652783500, "50":2308125349, "75":2407883318, "100":2467481108, "150":2494787188,"200":2520869189 }
}
if config["deepTools"]["readLength"] not in ["", "50", "75", "100", "150", "200"]:
raise AssertionError("Unsupported value for parameter readLength. Must be one of \"\", 50, 75, 100, 150, or 200\n")
if str(config["deepTools"]["readLength"]) not in ["", "50", "75", "100", "150", "200"]:
raise AssertionError("Unsupported value for parameter readLength. Must be one of \"\", 50, 75, 100, 150, or 200, but not \"" + str(config["deepTools"]["readLength"]) + "\" \n")
sys.exit(1)
effectiveGenomeSize = genomeSizeDict[config["align"]["assemblyVersion"]][config["deepTools"]["readLength"]]
effectiveGenomeSize = genomeSizeDict[config["align"]["assemblyVersion"]][str(config["deepTools"]["readLength"])]
samplesSummaryFile = config["samples"]["summaryFile"]
......@@ -436,6 +436,9 @@ if encodePeaks:
allResultFiles = []
if dataType == "ATACseq":
allResultFiles.append(REPORTS_dir_summary + "/allSamples_fragmentLengthDistr.pdf")
if correctGCBias:
GCBiasStr = ["",".noGCBias"]
......@@ -462,11 +465,13 @@ if coverageFiles:
allResultFiles.append(coveragePlot)
coverage = expand('{dir}/{sample}{GCBiasStr}.final.{type}', dir = FINAL_OUTPUT_dir, sample = allSamplesAndIndividualsUnique, GCBiasStr = GCBiasStr, type = ["bigwig", "bedgraph.gz"])
allResultFiles.append(coverage)
allResultFiles.extend(coverage)
if correctGCBias:
GCBiasPlot = expand('{dir}/{sample}{GCBiasStr}.GCBias.plot.pdf', dir = REPORTS_dir_gcbias, sample = allSamplesAndIndividualsUnique, GCBiasStr = GCBiasStr)
allResultFiles.append(GCBiasPlot)
allResultFiles.extend(GCBiasPlot)
if doPeakCalling:
......@@ -479,12 +484,11 @@ if doPeakCalling:
GCBias = GCBiasStr,
peakType = peakTypeEncode)
allResultFiles.append(individualPeaksEncode)
else:
individualPeaksEncode = []
allResultFiles.append(individualPeaksEncode)
allResultFiles.extend(individualPeaksEncode)
individualPeaksStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed.gz',
......@@ -499,8 +503,8 @@ if doPeakCalling:
analysisType = ["nonStringent"],
peakType = ["narrow"])
allResultFiles.append(individualPeaksStringent)
allResultFiles.append(individualPeaksNonStringent)
allResultFiles.extend(individualPeaksStringent)
allResultFiles.extend(individualPeaksNonStringent)
##############
# FRIP score #
......@@ -508,17 +512,17 @@ if doPeakCalling:
# narrow as peakType here serves to unify all peak flavours.
frip_nonStringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["nonStringent"] , sample = allFilesUnique, GCBias = GCBiasStr, analysisType = ["nonStringent"], peakType = ["narrow"])
allResultFiles.append(frip_nonStringent)
allResultFiles.extend(frip_nonStringent)
frip_stringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["stringent"] , sample = allFilesUnique, GCBias = GCBiasStr, analysisType = ["stringent"], peakType = ["narrow"])
allResultFiles.append(frip_stringent)
allResultFiles.extend(frip_stringent)
if encodePeaks:
frip_encode = expand('{dir}/{dir2}/{sample}{GCBias}.final.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["Encode"] , sample = allFilesUnique, GCBias = GCBiasStr, peakType = peakTypeEncode)
else:
frip_encode = []
allResultFiles.append(frip_encode)
allResultFiles.extend(frip_encode)
###################
# Peak annotation #
......@@ -530,14 +534,14 @@ if doPeakCalling:
peakType = ["narrow"],
ext = ["pdf", "csv"])
allResultFiles.append(annotation_individualPeaksStringent)
allResultFiles.extend(annotation_individualPeaksStringent)
annotation_individualPeaksNonStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered_annotation.{ext}', dir = REPORTS_dir_anno_NONSTRINGENT, sample = allFilesUnique,
GCBias = GCBiasStr,
analysisType = ["nonStringent"],
peakType = ["narrow"],
ext = ["pdf", "csv"])
allResultFiles.append(annotation_individualPeaksNonStringent)
allResultFiles.extend(annotation_individualPeaksNonStringent)
if encodePeaks:
annotation_individualPeaksEncode = expand('{dir}/{sample}{GCBias}.final.{peakType}Peak.filtered_annotation.{ext}', dir = REPORTS_dir_anno_ENCODE, sample = allFilesUnique,
......@@ -549,7 +553,7 @@ if doPeakCalling:
else:
annotation_individualPeaksEncode = []
allResultFiles.append(annotation_individualPeaksEncode)
allResultFiles.extend(annotation_individualPeaksEncode)
###################
# Consensus peaks #
......@@ -559,35 +563,37 @@ if doPeakCalling:
# TODO: Encode missing
PCA_plot = expand(REPORTS_dir_PCA + '/consensusPeaks_PCA_{type}_minOverlap{minOverlap}.pdf', type = ["stringent", "nonStringent"], minOverlap = rangeOverlapCur)
cor_plot = expand(REPORTS_dir_corr + '/consensusPeaks_sampleCorrelation_{type}_minOverlap{minOverlap}.pdf', type = ["stringent", "nonStringent"], minOverlap = rangeOverlapCur)
allResultFiles.append(PCA_plot)
allResultFiles.append(cor_plot)
allResultFiles.extend(PCA_plot)
allResultFiles.extend(cor_plot)
# They already contain both samples and individuals in one file
if config["output"]["countReadsPeak"]:
readCountFiles = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.countsBAM.bed.gz",
dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlapCur),
allResultFiles.append(readCountFiles)
dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlapCur)
allResultFiles.extend(readCountFiles)
# Cannot be merged due to rule specifications
consensusPeaks_stringent = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.bed.gz",dir = PEAK_STRINGENT_dir, GCBias = GCBiasStr, overlap = rangeOverlapCur)
consensusPeaks_nonStringent = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.bed.gz", dir = PEAK_NONSTRINGENT_dir, GCBias = GCBiasStr, overlap = rangeOverlapCur)
allResultFiles.append(consensusPeaks_stringent)
allResultFiles.append(consensusPeaks_nonStringent)
allResultFiles.extend(consensusPeaks_stringent)
allResultFiles.extend(consensusPeaks_nonStringent)
consensusPeaks_stringentNonStringent_frip = expand('{dir}/consensusPeaks/{peakType}.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = ["stringent", "nonStringent"], minOverlap = rangeOverlapCur)
allResultFiles.append(consensusPeaks_stringentNonStringent_frip)
allResultFiles.extend(consensusPeaks_stringentNonStringent_frip)
# featureCount output files
consensusPeaks_stringentNonStringent_featureCounts = expand('{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.allBams.overlaps.bed.gz', dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlap)
allResultFiles.append(consensusPeaks_stringentNonStringent_featureCounts)
# TODO check causes error but redundant, see above?
# consensusPeaks_stringentNonStringent_featureCounts = expand('{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.allBams.overlaps.bed.gz', dir = [PEAK_STRINGENT_dir, PEAK_NONSTRINGENT_dir], GCBias = GCBiasStr, overlap = rangeOverlap)
# allResultFiles.extend(consensusPeaks_stringentNonStringent_featureCounts)
# PCA R script output files
PCA_R_consensusPeaks = expand("{dir}/{peakType}/{consensusPeaks{GCBias}.minOverlap{overlap}.PCA_summary.pdf",
dir = REPORTS_dir_PCA, peakType = ["stringent", "nonStringent"], GCBias = GCBiasStr, overlap = rangeOverlap)
allResultFiles.append(PCA_R_consensusPeaks)
# TODO check not needed
# PCA_R_consensusPeaks = expand("{dir}/{peakType}/consensusPeaks{GCBias}.minOverlap{overlap}.PCA_summary.pdf",
# dir = REPORTS_dir_PCA, peakType = ["stringent", "nonStringent"], GCBias = GCBiasStr, overlap = rangeOverlap)
# allResultFiles.extend(PCA_R_consensusPeaks)
if encodePeaks:
consensusPeaks_ENCODE = expand("{dir}/consensusPeaks{GCBias}.{peakType}Peak.minOverlap{overlap}.bed.gz", dir = PEAK_ENCODE_dir, GCBias = GCBiasStr, peakType = peakTypeEncode, overlap = rangeOverlapCur)
......@@ -596,17 +602,17 @@ if doPeakCalling:
peakTypesCur = peakTypesAll
allResultFiles.append(consensusPeaks_ENCODE)
allResultFiles.append(consensusPeaks_encode_frip)
allResultFiles.append(PCA_R_consensusPeaks)
allResultFiles.extend(consensusPeaks_ENCODE)
allResultFiles.extend(consensusPeaks_encode_frip)
allResultFiles.extend(PCA_R_consensusPeaks)
else:
consensusPeaks_ENCODE = []
consensusPeaks_encode_frip = []
peakTypesCur = ["stringent", "nonStringent"]
allResultFiles.append(consensusPeaks_ENCODE)
allResultFiles.append(consensusPeaks_encode_frip)
allResultFiles.extend(consensusPeaks_ENCODE)
allResultFiles.extend(consensusPeaks_encode_frip)
if annotatePeaks:
......@@ -618,9 +624,9 @@ if doPeakCalling:
else:
annotation_consensusPeaks_ENCODE = []
allResultFiles.append(annotation_consensusPeaks_stringent)
allResultFiles.append(annotation_consensusPeaks_nonStringent)
allResultFiles.append(annotation_consensusPeaks_ENCODE)
allResultFiles.extend(annotation_consensusPeaks_stringent)
allResultFiles.extend(annotation_consensusPeaks_nonStringent)
allResultFiles.extend(annotation_consensusPeaks_ENCODE)
else:
consensusPeaks_stringent_nonStringent = []
......@@ -630,8 +636,6 @@ if doPeakCalling:
# end if consensus peaks
# if mergeReplicates and nSamplesUnique > 1:
#
# if doPeakCalling:
......@@ -662,8 +666,9 @@ if doPeakCalling:
# peakType = ["narrow"])
# allResultFiles.append(pooledPeaksNonStringent)
#print(allResultFiles)
#print('\n'.join(map(str, allResultFiles)))
#Sys.exit(0)
##########################
......@@ -1317,8 +1322,14 @@ if config["output"]["doBaseRecalibration"]:
###############
###############
if config["output"]["doBaseRecalibration"]:
baseRecalibrationStr = ".BQrecal"
else:
baseRecalibrationStr = ""
# TODO: Remove the BQrecal string here as not done usually
# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
postalign_remove_chrMAndUnassembledChr_outputName = CHRM_dir + '/{sample}.cleaned4.BQrecal.rmChrM.s.bam'
postalign_remove_chrMAndUnassembledChr_outputName = CHRM_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.s.bam'
# Define if GATK should be run or not
def postalign_inputs(assemblyVersion):
......@@ -1352,7 +1363,7 @@ rule postalign_remove_chrMAndUnassembledChr:
"""
# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
markDuplicates_Picardtools_outputName = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s'
markDuplicates_Picardtools_outputName = RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.s'
rule markDuplicates_Picardtools:
......@@ -1392,7 +1403,7 @@ rule computeLibraryComplexity:
input:
bam = rules.markDuplicates_Picardtools.output.bam
output:
stats = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl'
stats = RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.s.bam.statsLibraryCompl'
log:
message:"{ruleDisplayMessage}Compute library complexity for file {input.bam:q} ..."
threads: 1
......@@ -1421,8 +1432,8 @@ rule removeDuplicates:
index = rules.markDuplicates_Picardtools.output.index,
stats = rules.computeLibraryComplexity.output.stats
output:
bam = temp(RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam'),
stats = RMDUP_DIR + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats'
bam = temp(RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.s.bam'),
stats = RMDUP_DIR + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.s.bam.stats'
log:
message: "{ruleDisplayMessage}Remove duplicate reads and QC-failing reads for file {input:q} with samtools..."
threads: 1
......@@ -1442,7 +1453,7 @@ rule removeDuplicates:
# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
postalign_MAPQ_outputName = MAPQsort_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.s.bam'
postalign_MAPQ_outputName = MAPQsort_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.s.bam'
postalign_MAPQ_outputNameFinal = FINAL_OUTPUT_dir + '/{sample}.final.bam'
rule postalign_MAPQ:
......@@ -1473,7 +1484,7 @@ rule postalign_MAPQ:
# Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
postalign_RSS_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.s.bam'
postalign_RSS_outputName = ADJRSS_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.s.bam'
# TODO: Read https://github.com/TheJacksonLaboratory/ATAC-seq/blob/master/make_atac_seq_shifted_bam_4_shift_sam.sh and check if we actually do it correctly
# https://informatics.fas.harvard.edu/atac-seq-guidelines.html#alignments
......@@ -1540,7 +1551,7 @@ if doRSSAdjustment:
# --shift {params.forwardShift} -{params.reverseShift} {params.reverseShift} -{params.forwardShift} \
Picard_sortFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned1.s'
Picard_sortFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned1.s'
# Necessary to resport, as Picard seems to require a different sorting order
# VALIDATION_STRINGENCY=SILENT is important because any strict check might identify the out-of-reference alignments and abort otherwise.
......@@ -1567,7 +1578,7 @@ if doRSSAdjustment:
2> {log:q}
"""
Picard_cleanSamFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned2.s'
Picard_cleanSamFinal_outputName = ADJRSS_dir + '/{sample}.cleaned4' + baseRecalibrationStr + '.rmChrM.markedDup.rmDup.minMapQ' + str(minMAPQscore) + '.adjRSS.cleaned2.s'
# Necessary to handle potential out-of-reference alignments due to the read start adjustment.
# Out-of-references bases will be soft-clipped, which is conform with the BAM specifications
......@@ -2223,9 +2234,9 @@ if annotatePeaks and doPeakCalling:
if dataType == "ATACseq":
if doRSSAdjustment:
inputCur = expand('{dir}/{allSamples}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.csv.gz', dir = ADJRSS_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique)
inputCur = expand('{dir}/{allSamples}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.adjRSS.s.bam.csv.gz', dir = ADJRSS_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique, bqRecalStr = baseRecalibrationStr)
else:
inputCur = expand('{dir}/{allSamples}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.csv.gz', dir = MAPQsort_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique)
inputCur = expand('{dir}/{allSamples}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.csv.gz', dir = MAPQsort_dir, MAPQ = minMAPQscore, allSamples = allSamplesUnique, bqRecalStr = baseRecalibrationStr)
rule fragment_length_distr:
input:
......@@ -2298,22 +2309,20 @@ use rule deeptools_plotEnrichment_frip_individual as deeptools_plotEnrichment_fr
# AdjRSS files are not needed here and are ignored in the stats file
rule stats:
input:
expand('{dir}/allSamples_fragmentLengthDistr.pdf',
dir = REPORTS_dir_summary),
expand('{dir}/{sample}.s.bam',
dir = ALIGN_dir, sample = allSamplesUnique),
expand('{dir}/{sample}.s.bam.bai',
dir = ALIGN_dir, sample = allSamplesUnique),
expand('{dir}/{sample}.s.bam.stats',
dir = ALIGN_dir, sample = allSamplesUnique),
expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.s.bam.stats',
dir = CHRM_dir, sample = allSamplesUnique, MAPQ = minMAPQscore),
expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.s.bam.stats',
dir = RMDUP_DIR, sample = allSamplesUnique, MAPQ = minMAPQscore),
expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats',
dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = minMAPQscore),
expand('{dir}/{sample}.cleaned4.BQrecal.rmChrM.markedDup.s.bam.statsLibraryCompl',
dir = RMDUP_DIR, sample = allSamplesUnique),
expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.s.bam.stats',
dir = CHRM_dir, sample = allSamplesUnique, MAPQ = minMAPQscore, bqRecalStr = baseRecalibrationStr),
expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.s.bam.stats',
dir = RMDUP_DIR, sample = allSamplesUnique, MAPQ = minMAPQscore, bqRecalStr = baseRecalibrationStr),
expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.markedDup.rmDup.minMapQ{MAPQ}.s.bam.stats',
dir = MAPQsort_dir, sample = allSamplesUnique, MAPQ = minMAPQscore, bqRecalStr = baseRecalibrationStr),
expand('{dir}/{sample}.cleaned4{bqRecalStr}.rmChrM.markedDup.s.bam.statsLibraryCompl',
dir = RMDUP_DIR, sample = allSamplesUnique, bqRecalStr = baseRecalibrationStr),
expand('{dir}/{sample}.final.bam', dir = FINAL_OUTPUT_dir, sample = allSamplesUnique)
output:
pdf = expand('{dir}/allSamples_statSummary.pdf', dir = REPORTS_dir_summary),
......@@ -2450,7 +2459,6 @@ use rule deepTools_plotCoverage as deepTools_plotCoverageMerged with:
rule multiqc:
input:
summaryStats = expand('{dir}/allSamples_statSummary.{fileType}', dir = REPORTS_dir_summary, fileType = ["pdf", "rds"]),
fragmentLength = expand('{dir}/allSamples_fragmentLengthDistr.{fileType}', dir = REPORTS_dir_summary, fileType = ["pdf"]),
other = allResultFiles # Enforce that it runs at the very end
output:
report = REPORTS_dir_multiqc + '/multiqc_report.html'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment