From 1e28aa2dfd69fa079a15edadd4246dd2da83819f Mon Sep 17 00:00:00 2001 From: Rim Moussa <rmoussa@spinoza.embl.de> Date: Mon, 10 Jan 2022 04:51:47 +0100 Subject: [PATCH] chipseq changes --- example/dev/input/._config.yaml | Bin 0 -> 4096 bytes example/dev/input/._samples.csv | Bin 0 -> 4096 bytes example/dev/input/config.yaml | 1 - example/dev/input/samples.csv | 8 +- src/Snakemake/dev/._Snakefile | Bin 4096 -> 4096 bytes src/Snakemake/dev/Snakefile | 228 ++++++++++++++++++++------------ 6 files changed, 147 insertions(+), 90 deletions(-) create mode 100755 example/dev/input/._config.yaml create mode 100755 example/dev/input/._samples.csv mode change 100644 => 100755 example/dev/input/samples.csv diff --git a/example/dev/input/._config.yaml b/example/dev/input/._config.yaml new file mode 100755 index 0000000000000000000000000000000000000000..ae14cb815afa051c5c66bbdb67dbc9b350335505 GIT binary patch literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103vVs!ar z2+_f?0H|C5O$#HC4;7b6&d=3LEGWoH)yqjNE-5WeO-V^CNmULA2I+af=5`{8PI#Kc z3!+ECXb6mkz-S1JhQMeDjE2By2#kinXb6mkz-S1JhQMeD;0ggyXA^|MKrSRBvsj@h zwK%`DC^=OjEx#yRAv3QeHLoNyKQA#Sr&1v&HLXM;DJL;68`u|y>Kf7%s{i3$kztVg G{~rLwUn}we literal 0 HcmV?d00001 diff --git a/example/dev/input/._samples.csv b/example/dev/input/._samples.csv new file mode 100755 index 0000000000000000000000000000000000000000..8b8885a06799930cbe485aee8503a3033fc8aab4 GIT binary patch literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103vVs!ar z0M)?(R9=Cmg&AlPNSvR6K|DD>S1+-kASYEXB(<W%H7_|oB{MG_tbtJ+NC_}7NFmhZ zBo>#H7N@49B$lKq2LwaRWmE^!kqivx2z9xsC5b>aiB{RZE<TA#sX6f(6<LYqS@}g} zxkcH<#>J(krRC|x1=$(Jg+R@vC26`A)`rF=Muz5=7ACva+)e};{VsK;9*F*AuYa7a z{zIj^{<N)v<@;xS)!B5q!L)8)Zh!w4yCco**Cb;;xHyR2nSa&vN%QlxfBcVDo!lj( zxURuNd*jx978A;Seir*Ln%urxH2+@ioYn&~cTB9w-1(z>KgjS=Fd71*Aut*OqaiRF z0;3@?8UmvsFd71*Aut*Oqai?@5MTrv1Hxb+7m|@#tWcC%oL^d$oT`wPUzDwonOBmU pSCW~Zmza}NsgRSJR-%xUlbDwc><dG64QUG1|8TF!Fv$J?4**eqeQy8& literal 0 HcmV?d00001 diff --git a/example/dev/input/config.yaml b/example/dev/input/config.yaml index 7f35028..b036792 100644 --- a/example/dev/input/config.yaml +++ b/example/dev/input/config.yaml @@ -63,7 +63,6 @@ samples: # STRING. "ATACseq" or "ChIPseq". Only data type specific steps are executed. Currently, if set to ChIP-Seq, ATAC-seq specific steps like RSS adjustment are not executed. dataType: "ATACseq" - dataType: "ChIPseq" ########################### # SECTION additionalInput # diff --git a/example/dev/input/samples.csv b/example/dev/input/samples.csv old mode 100644 new mode 100755 index 525e4dd..dd6a42b --- a/example/dev/input/samples.csv +++ b/example/dev/input/samples.csv @@ -1,4 +1,4 @@ -individual sampleName path_inputForward path_inputReverse Flowcell_ID lane_ID Technology Library_ID replicate_No -"test1" "test1_rep1" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_1.fastq.gz" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_2.fastq.gz" "NA" "NA" "ILLUMINA" "default" "1" -"test1" "test1_rep2" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_1.fastq.gz" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_2.fastq.gz" "NA" "NA" "ILLUMINA" "default" "1" -"test2" "test2_rep1" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_1.fastq.gz" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_2.fastq.gz" "NA" "NA" "ILLUMINA" "default" "1" +individual sampleName path_inputForward path_inputReverse Flowcell_ID lane_ID Technology Library_ID replicate_No control_sampleName +"test1" "test1_rep1" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_1.fastq.gz" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_2.fastq.gz" "NA" "NA" "ILLUMINA" "default" "1" "NA" +"test1" "test1_rep2" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_1.fastq.gz" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_2.fastq.gz" "NA" "NA" "ILLUMINA" "default" "1" "NA" +"test2" "test2_rep1" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_1.fastq.gz" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_2.fastq.gz" "NA" "NA" "ILLUMINA" "default" "1" "NA" diff --git a/src/Snakemake/dev/._Snakefile b/src/Snakemake/dev/._Snakefile index cdc6518cd0c4459c384e7c7c85903b632d35b975..7e99ec425dc91ea9de2aec6e47f433fcae8b205e 100755 GIT binary patch delta 75 zcmZorXi%6CsqFGem4Sip5d#Cm0w5LuVr0Mw<SdxDMv)&#|D1C@5hT}Cnx?k7kujNf O@&v}Ko1gH>FaiKDq!9)H delta 271 zcmZorXi%6C8Enbm%D}+)h=GCO0uZkNVr0Ms<bcFyFfb@3=jZAr78K;9>iHy=<|StY zrxulECZ`tb`4^<-g=dyzKr}LDOsr9?&rC^8D#*z!E-^5;%*e#d!pat2mReMtnV%O@ zkXVutFCdUqnwOH33RIU@l9`s7S|q>_FQBO(S`1XBUzC}fn_pU7oT%@TTAW>yU!Wgc znv|27o2n3!T2TUWMZAC@$UVgn;S!Lj2?xKpjJ{zk$Qc?842+Br_b{+SX&6-)a#$!4 T#5w<zfpO!;0^ZF}_+%IX@Q6n8 diff --git a/src/Snakemake/dev/Snakefile b/src/Snakemake/dev/Snakefile index 575d3fe..a6d968c 100755 --- a/src/Snakemake/dev/Snakefile +++ b/src/Snakemake/dev/Snakefile @@ -20,6 +20,7 @@ from os import makedirs import pandas import numpy import warnings +import re # Enforce a minimum Snakemake version because of various features #min_version("5.9.1") @@ -68,8 +69,8 @@ def read_samplesTable(samplesSummaryFile): # Expect a particular number of columns, do a sanity check here - if not {'individual', 'sampleName', 'path_inputForward', 'path_inputReverse', 'Flowcell_ID', 'lane_ID', 'Technology', 'Library_ID'}.issubset(data.columns.values): - raise KeyError("The samples file must contain the following named columns (TAB separated!): individual, sampleName, path_inputForward, path_inputReverse, Flowcell_ID, lane_ID, Technology, Library_ID") + if not {'individual', 'sampleName', 'path_inputForward', 'path_inputReverse', 'Flowcell_ID', 'lane_ID', 'Technology', 'Library_ID','replicate_No', 'control_sampleName'}.issubset(data.columns.values): + raise KeyError("The samples file must contain the following named columns (TAB separated!): individual, sampleName, path_inputForward, path_inputReverse, Flowcell_ID, lane_ID, Technology, Library_ID, control_sampleName") # Make sure the individual column is a string data['individual'] = data['individual'].astype(str) @@ -147,7 +148,7 @@ configDict = { "par_scripts": ["STATS_script_withinThr", "STATS_script_outsideThr", "STATS_script_geneTypesToKeep", "FL_distr_script_cutoff"], "par_peakCalling": - ["modelNonStringent", "modelStringent", "modelStringent_minQValue", "modelNonStringent_minQValue", "modelNonStringent_slocal", "Encode_pValThreshold", "Encode_modelBroadAndGapped", "Encode_modelNarrow"], + ["modelNonStringent", "modelStringent", "modelStringent_minQValue", "modelNonStringent_minQValue", "modelNonStringent_slocal", "Encode_pValThreshold", "Encode_modelBroadAndGapped", "Encode_modelNarrow", "controlSample"], "par_deepTools": ["readLength", "bamCoverage_normalizationCoverage", "bamCoverage_binSize", "bamCoverage_otherOptions"] } @@ -273,6 +274,7 @@ allIndividualsUnique = numpy.unique(samplesData.loc[:,"individual"]) nIndividualsUnique = len(allIndividualsUnique) allIndividualsUniqueStrSpaces = ' '.join(allIndividualsUnique) allSamplesUnique = numpy.unique(samplesData.loc[:,"sampleName"]) +nControl = len(samplesData['control_sampleName'].dropna().unique()) nSamplesUnique = len(allSamplesUnique) # Get only two sampels for nicer graphs @@ -281,14 +283,15 @@ nSamplesUnique = len(allSamplesUnique) allSamplesUniqueStr = ','.join(allSamplesUnique) allSamplesUniqueStrSpaces = ' '.join(allSamplesUnique) + if nSamplesUnique <=10: - rangeOverlap = list(range(1, nSamplesUnique + 1, 1)) + rangeOverlap = list(range(1, nSamplesUnique - nControl + 1, 1)) elif nSamplesUnique <=50: - rangeOverlap = list(range(1, 5 + 1, 1)) + list(range(10, nSamplesUnique + 1, 5)) + rangeOverlap = list(range(1, 5 + 1, 1)) + list(range(10, nSamplesUnique - nControl + 1, 5)) elif nSamplesUnique <=100: - rangeOverlap = list(range(1, 5 + 1, 1)) + list(range(10, nSamplesUnique + 1, 10)) + rangeOverlap = list(range(1, 5 + 1, 1)) + list(range(10, nSamplesUnique - nControl + 1, 10)) else: - rangeOverlap = list(range(1, 10 + 1, 1)) + list(range(20, nSamplesUnique + 1, 20)) + rangeOverlap = list(range(1, 10 + 1, 1)) + list(range(20, nSamplesUnique - nControl + 1, 20)) if nSamplesUnique % 5 != 0 and nSamplesUnique not in rangeOverlap: rangeOverlap = rangeOverlap + list([nSamplesUnique]) @@ -346,7 +349,6 @@ for fileCur in filesToCheck: # Check input files inputFiles = numpy.asarray(samplesData.loc[:,"path_inputForward"]) - if pairedEnd: inputFiles = numpy.append(inputFiles, numpy.asarray(samplesData.loc[:, "path_inputReverse"])) @@ -447,11 +449,12 @@ if doPeakCalling: GCBias = GCBiasStr, peakType = peakTypeEncode) - allResultFiles.append(individualPeaksEncode) + #allResultFiles.append(individualPeaksEncode) else: individualPeaksEncode = [] + allResultFiles.append(individualPeaksEncode) individualPeaksStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed.gz', dir = PEAK_STRINGENT_dir, sample = allSamplesUnique, GCBias = GCBiasStr, @@ -470,15 +473,18 @@ if doPeakCalling: ############## # FRIP score # ############## - frip_nonStringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["nonStringent"] , sample = allSamplesUnique, GCBias = GCBiasStr, analysisType = ["nonStringent"], peakType = ["narrow"]) - allResultFiles.append(frip_nonStringent) + # frip_nonStringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["nonStringent"] , sample = allSamplesUnique, GCBias = GCBiasStr, analysisType = ["nonStringent"], peakType = ["narrow"]) + # allResultFiles.append(frip_nonStringent) - frip_stringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["stringent"] , sample = allSamplesUnique, GCBias = GCBiasStr, analysisType = ["stringent"], peakType = ["narrow"]) - allResultFiles.append(frip_stringent) + # frip_stringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["stringent"] , sample = allSamplesUnique, GCBias = GCBiasStr, analysisType = ["stringent"], peakType = ["narrow"]) + # allResultFiles.append(frip_stringent) - if encodePeaks: - frip_encode = expand('{dir}/{dir2}/{sample}{GCBias}.final.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["Encode"] , sample = allSamplesUnique, GCBias = GCBiasStr, peakType = peakTypeEncode) - allResultFiles.append(frip_encode) + # if encodePeaks: + # frip_encode = expand('{dir}/{dir2}/{sample}{GCBias}.final.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["Encode"] , sample = allSamplesUnique, GCBias = GCBiasStr, peakType = peakTypeEncode) + # else: + # frip_encode = [] + + # allResultFiles.append(frip_encode) ################### # Peak annotation # @@ -504,10 +510,12 @@ if doPeakCalling: GCBias = GCBiasStr, peakType = peakTypeEncode, ext = ["pdf", "csv"]) - allResultFiles.append(annotation_individualPeaksEncode) + # allResultFiles.append(annotation_individualPeaksEncode) else: annotation_individualPeaksEncode = [] + allResultFiles.append(annotation_individualPeaksEncode) + ################### # Consensus peaks # @@ -521,26 +529,27 @@ if doPeakCalling: allResultFiles.append(consensusPeaks_nonStringent) - consensusPeaks_stringentNonStringent_frip = expand('{dir}/consensusPeaks/{peakType}.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = ["stringent", "nonStringent"], minOverlap = rangeOverlap) - allResultFiles.append(consensusPeaks_stringentNonStringent_frip) + # consensusPeaks_stringentNonStringent_frip = expand('{dir}/consensusPeaks/{peakType}.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = ["stringent", "nonStringent"], minOverlap = rangeOverlap) + # allResultFiles.append(consensusPeaks_stringentNonStringent_frip) if encodePeaks: consensusPeaks_ENCODE = expand("{dir}/consensusPeaks{GCBias}.{peakType}Peak.minOverlap{overlap}.bed.gz", dir = PEAK_ENCODE_dir, GCBias = GCBiasStr, peakType = peakTypeEncode, overlap = rangeOverlap) - consensusPeaks_encode_frip = expand('{dir}/consensusPeaks/{peakType}Peak.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = peakTypeEncode, minOverlap = rangeOverlap) + # consensusPeaks_encode_frip = expand('{dir}/consensusPeaks/{peakType}Peak.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = peakTypeEncode, minOverlap = rangeOverlap) peakTypesCur = peakTypesAll allResultFiles.append(consensusPeaks_ENCODE) - allResultFiles.append(consensusPeaks_encode_frip) + # allResultFiles.append(consensusPeaks_encode_frip) else: consensusPeaks_ENCODE = [] - consensusPeaks_encode_frip = [] + # consensusPeaks_encode_frip = [] peakTypesCur = ["stringent", "nonStringent"] - + allResultFiles.append(consensusPeaks_ENCODE) + #allResultFiles.append(consensusPeaks_encode_frip) # TODO move PCA_peaks = expand("{dir}/{basename}_PCAPlot_peaks{GCBias}_{peakType}_minOverlap{overlap}.pdf", @@ -553,11 +562,14 @@ if doPeakCalling: annotation_consensusPeaks_stringent = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.{ext}", dir = REPORTS_dir_anno_STRINGENT, GCBias = GCBiasStr, overlap = rangeOverlap, ext = ["pdf", "csv"]) annotation_consensusPeaks_nonStringent = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.{ext}", dir = REPORTS_dir_anno_NONSTRINGENT, GCBias = GCBiasStr, overlap = rangeOverlap, ext = ["pdf", "csv"]) - allResultFiles.append(annotation_consensusPeaks_stringent) - allResultFiles.append(annotation_consensusPeaks_nonStringent) if encodePeaks: annotation_consensusPeaks_ENCODE = expand("{dir}/consensusPeaks{GCBias}.{peakType}Peak.minOverlap{overlap}_annotation.{ext}", dir = REPORTS_dir_anno_ENCODE, GCBias = GCBiasStr, peakType = peakTypeEncode, overlap = rangeOverlap, ext = ["pdf", "csv"]) - allResultFiles.append(annotation_consensusPeaks_ENCODE) + else: + annotation_consensusPeaks_ENCODE = [] + + allResultFiles.append(annotation_consensusPeaks_stringent) + allResultFiles.append(annotation_consensusPeaks_nonStringent) + allResultFiles.append(annotation_consensusPeaks_ENCODE) # PCA files for peaks # TODO @@ -593,6 +605,7 @@ if mergeReplicates and nSamplesUnique > 1: #allResultFiles.append(pooledPeaksEncode) else: pooledPeaksEncode = [] + allResultFiles.append(pooledPeaksEncode) pooledPeaksStringent = expand('{dir}/{indiv}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered2.bed.gz', dir = PEAK_STRINGENT_dir, indiv = allIndividualsUnique, GCBias = GCBiasStr, @@ -1685,13 +1698,36 @@ def getGenomeTypeMacs2(assemblyVersion): return genomeType + +# if dataType == "ChIPseq" and config["peak"]["controlSample"] != "": +# controlSamplePath = + + # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule macs2_stringent_outputName = PEAK_STRINGENT_dir + '/{basename}' + '.stringent' +#macs2_stringent_outputName = expand("{dir}/{sample}.{{specifics}}.stringent", sample = [allSamplesUnique, allIndividualsUnique] if mergeReplicates else allSamplesUnique, dir = PEAK_STRINGENT_dir) + +def getControlSample(bam): + + samples_regex = "|".join(allSamplesUnique) + file_base = os.path.basename(bam) + sample_cur = re.search(samples_regex, file_base).group(0) + controlSample = samplesData["control_sampleName"][samplesData["sampleName"]==sample_cur].iloc[0] + + if pandas.isnull(controlSample): + ctrl_param = "" + else: + ctrl_param = "-c " + FINAL_OUTPUT_dir + "/" + controlSample + ".final.bam" + return ctrl_param + + +#def getControlSample2(bam): + # Runs for all bam fiels in the final output folder, also the replicated one rule macs2_stringent: input: - bam = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir) + bam = FINAL_OUTPUT_dir + "/{basename}.bam" output: peaks_bedT = temp(macs2_stringent_outputName + '_peaks.narrowPeak'), peaks_bed = temp(macs2_stringent_outputName + '.narrowPeak.gz'), @@ -1699,6 +1735,7 @@ rule macs2_stringent: summit_bedgz = macs2_stringent_outputName + '_summits.bed.gz', xls = temp(macs2_stringent_outputName + '_peaks.xls') log: LOG_BENCHMARK_dir + "/macs2_stringent.{basename}.log" + #log: LOG_BENCHMARK_dir + expand("/macs2_stringent.{sample}{{specifics}}.log", sample = [allSamplesUnique, allIndividualsUnique] if mergeReplicates else allSamplesUnique) message: "{ruleDisplayMessage}Run MACS2 (stringent) for {input.bam:q} ..." threads: 1 # benchmark: LOG_BENCHMARK_dir + "/macs2_stringent.{basename}.benchmark" @@ -1712,11 +1749,13 @@ rule macs2_stringent: name = lambda wildcards: wildcards.basename + '.stringent', outputDir = PEAK_STRINGENT_dir, pairedEndString = macs2_pairedEndMode, - keepDuplicates = "--keep-dup all" + keepDuplicates = "--keep-dup all", + controlFile = lambda wildcards: getControlSample(wildcards.basename) shell: """PYTHONPATH="" && macs2 callpeak \ --treatment {input.bam} \ + {params.controlFile} \ -q {params.qValue} \ --outdir {params.outputDir}\ --name {params.name}\ @@ -1728,6 +1767,7 @@ rule macs2_stringent: 2> {log:q} && sort -k 8gr,8gr {output.peaks_bedT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' | gzip -f > {output.peaks_bed} && gzip -f < {output.summit_bed} > {output.summit_bedgz} + """ @@ -1756,7 +1796,8 @@ use rule macs2_stringent as macs2_nonStringent with: name = lambda wildcards: wildcards.basename + '.nonStringent', outputDir = PEAK_NONSTRINGENT_dir, keepDuplicates = "--keep-dup all", - pairedEndString = macs2_pairedEndMode + pairedEndString = macs2_pairedEndMode, + controlFile = lambda wildcards: getControlSample(wildcards.basename) # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule @@ -1797,7 +1838,8 @@ rule macs2_Encode: outputDir = PEAK_ENCODE_dir, keepDuplicates = "--keep-dup all", bdg1 = macs2_Encode_outputName + '_control_lambda.bdg', - bdg2 = macs2_Encode_outputName + '_treat_pileup.bdg' + bdg2 = macs2_Encode_outputName + '_treat_pileup.bdg', + controlFile = lambda wildcards: getControlSample(wildcards.basename) shell: # 1. First produce broad and gapped peaks, then narrow ones @@ -1961,7 +2003,8 @@ if nSamplesUnique > 1 and doPeakCalling: threads: 1 params: GCString = GCBiasStr, - minOverlapValues = ",".join(map(str, rangeOverlap)) + minOverlapValues = ",".join(map(str, rangeOverlap)), + controlFiles = samplesData['control_sampleName'].dropna().unique() script: script_consPeaks rule peaksPCA: @@ -1983,33 +2026,48 @@ if nSamplesUnique > 1 and doPeakCalling: if annotatePeaks and doPeakCalling: - rule annotatePeaks: + rule annotatePeaks_individual: input: individual_encode = individualPeaksEncode, individual_stringent = individualPeaksStringent, individual_nonStringent = individualPeaksNonStringent, - consensus_encode = consensusPeaks_ENCODE, - consensus_stringent = consensusPeaks_stringent, - consensus_nonStringent = consensusPeaks_nonStringent #pooled = rule.poolPeaksReplicateSamples.output.pooledPeaks, #replicates = rule.poolPeaksReplicateSamples.output.replicatePeaks output: annotation_individual_encode = annotation_individualPeaksEncode, annotation_individual_stringent = annotation_individualPeaksStringent, annotation_individual_nonStringent = annotation_individualPeaksNonStringent, - annotation_consensus_encode = annotation_consensusPeaks_ENCODE, - annotation_consensus_stringent = annotation_consensusPeaks_stringent, - annotation_consensus_nonStringent = annotation_consensusPeaks_nonStringent - log: expand('{dir}/annotatePeaks.R.log', dir = LOG_BENCHMARK_dir) + log: expand('{dir}/annotatePeaks_individual.R.log', dir = LOG_BENCHMARK_dir) singularity: "shub://chrarnold/Singularity_images:atac_seq_r" - message: "{ruleDisplayMessage} Annotating peaks for all peak files with the script {script_annoPeaks}..." + message: "{ruleDisplayMessage} Annotating individual peaks for all peak files with the script {script_annoPeaks}..." threads: 1 params: tabulateOutput = "true", assemblyVersion = config["par_align"]["assemblyVersion"], - tssRegion = "-5000,5000" + tssRegion = "-5000,5000", + mode = "individual" script: script_annoPeaks + use rule annotatePeaks_individual as annotatePeaks_consensus with: + input: + consensus_encode = consensusPeaks_ENCODE, + consensus_stringent = consensusPeaks_stringent, + consensus_nonStringent = consensusPeaks_nonStringent + output: + annotation_consensus_encode = annotation_consensusPeaks_ENCODE, + annotation_consensus_stringent = annotation_consensusPeaks_stringent, + annotation_consensus_nonStringent = annotation_consensusPeaks_nonStringent + log: + expand('{dir}/annotatePeaks_consensus.R.log', dir = LOG_BENCHMARK_dir) + singularity: + "shub://chrarnold/Singularity_images:atac_seq_r" + message: + "{ruleDisplayMessage} Annotating consensus peaks for all peak files with the script {script_annoPeaks}..." + params: + tabulateOutput = "true", + assemblyVersion = config["par_align"]["assemblyVersion"], + tssRegion = "-5000,5000", + mode = "consensus" @@ -2094,52 +2152,52 @@ rule fragment_length_distr: # FRIP score # For each sample, plot against the sample-specific peaks file and all consensus peak files # May fail for empty consensus peak files, therefore we add a "|| touch" here in the end, even though this hides ANY error -rule deeptools_plotEnrichment: - input: - bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique), - beds = expand('{dir}/{{peakType}}/{{basename}}Peak' + '.filtered.bed.gz', dir = PEAKCALLING_dir) - output: - fig = expand('{dir}/{{peakType}}/{{basename}}_frip.pdf', dir = REPORTS_dir_frip), - rawCounts = expand('{dir}/{{peakType}}/{{basename}}_frip.rawCounts', dir = REPORTS_dir_frip), - log: expand('{dir}/deeptools_plotEnrichment_{{peakType}}.{{basename}}.log', dir = LOG_BENCHMARK_dir) - message: "{ruleDisplayMessage} Computing FRiP scores for sample {wildcards.basename} and peak type {wildcards.peakType}" - threads: threadsMax - conda: "condaEnvironments/deepTools.yaml" - singularity: "shub://chrarnold/Singularity_images:atac_seq_conda" - params: - plotTitle = "FRiP score", - heightPDF = 15, # can be fixed, as long as sample names are not too long. Default is 10 - widthPDF = max(10, nSamplesUnique * 1 * 0.4) # has to be flexible due to no. of samples on X - shell: - """ - plotEnrichment \ - -b {input.bams} \ - --BED {input.beds} \ - --plotFile {output.fig} \ - -p {threads} \ - --plotHeight {params.heightPDF} --plotWidth {params.widthPDF} \ - --outRawCounts {output.rawCounts} \ - --smartLabels \ - --plotTitle "{params.plotTitle}" 2> {log:q} || touch {output.fig:q} {output.rawCounts:q} - """ +# rule deeptools_plotEnrichment: +# input: +# bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique), +# beds = expand('{dir}/{{peakType}}/{{basename}}Peak' + '.filtered.bed.gz', dir = PEAKCALLING_dir) +# output: +# fig = expand('{dir}/{{peakType}}/{{basename}}_frip.pdf', dir = REPORTS_dir_frip), +# rawCounts = expand('{dir}/{{peakType}}/{{basename}}_frip.rawCounts', dir = REPORTS_dir_frip), +# log: expand('{dir}/deeptools_plotEnrichment_{{peakType}}.{{basename}}.log', dir = LOG_BENCHMARK_dir) +# message: "{ruleDisplayMessage} Computing FRiP scores for sample {wildcards.basename} and peak type {wildcards.peakType}" +# threads: threadsMax +# conda: "condaEnvironments/deepTools.yaml" +# singularity: "shub://chrarnold/Singularity_images:atac_seq_conda" +# params: +# plotTitle = "FRiP score", +# heightPDF = 15, # can be fixed, as long as sample names are not too long. Default is 10 +# widthPDF = max(10, nSamplesUnique * 1 * 0.4) # has to be flexible due to no. of samples on X +# shell: +# """ +# plotEnrichment \ +# -b {input.bams} \ +# --BED {input.beds} \ +# --plotFile {output.fig} \ +# -p {threads} \ +# --plotHeight {params.heightPDF} --plotWidth {params.widthPDF} \ +# --outRawCounts {output.rawCounts} \ +# --smartLabels \ +# --plotTitle "{params.plotTitle}" 2> {log:q} || touch {output.fig:q} {output.rawCounts:q} +# """ -use rule deeptools_plotEnrichment as deeptools_plotEnrichment_consensusPeaks with: - input: - bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique), - beds = expand("{dir}/{{peakType}}/consensusPeaks{GCBias}.{{optionalEncodeStr}}minOverlap{{overlap}}.bed.gz", dir = PEAKCALLING_dir, GCBias = GCBiasStr) - output: - fig = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.pdf', dir = REPORTS_dir_frip), - rawCounts = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.rawCounts', dir = REPORTS_dir_frip) - wildcard_constraints: - optionalEncodeStr="|Encode.broadPeak|Encode.gappedPeak|Encode.narrowPeak", - peakType="stringent|nonStringent|Encode" - message: - "{ruleDisplayMessage} Computing FRiP scores for consensus peaks with overlap {wildcards.overlap} and peak type {wildcards.peakType} + {wildcards.optionalEncodeStr}" - log: - expand('{dir}/deeptools_plotEnrichment_consensusPeaks.{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}.log', dir = LOG_BENCHMARK_dir) - singularity: - "shub://chrarnold/Singularity_images:atac_seq_conda" +# use rule deeptools_plotEnrichment as deeptools_plotEnrichment_consensusPeaks with: +# input: +# bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique), +# beds = expand("{dir}/{{peakType}}/consensusPeaks{GCBias}.{{optionalEncodeStr}}minOverlap{{overlap}}.bed.gz", dir = PEAKCALLING_dir, GCBias = GCBiasStr) +# output: +# fig = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.pdf', dir = REPORTS_dir_frip), +# rawCounts = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.rawCounts', dir = REPORTS_dir_frip) +# wildcard_constraints: +# optionalEncodeStr="|Encode.broadPeak|Encode.gappedPeak|Encode.narrowPeak", +# peakType="stringent|nonStringent|Encode" +# message: +# "{ruleDisplayMessage} Computing FRiP scores for consensus peaks with overlap {wildcards.overlap} and peak type {wildcards.peakType} + {wildcards.optionalEncodeStr}" +# log: +# expand('{dir}/deeptools_plotEnrichment_consensusPeaks.{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}.log', dir = LOG_BENCHMARK_dir) +# singularity: +# "shub://chrarnold/Singularity_images:atac_seq_conda" # AdjRSS files are not needed here and are ignored in the stats file -- GitLab