From 1e28aa2dfd69fa079a15edadd4246dd2da83819f Mon Sep 17 00:00:00 2001
From: Rim Moussa <rmoussa@spinoza.embl.de>
Date: Mon, 10 Jan 2022 04:51:47 +0100
Subject: [PATCH] Add ChIP-seq control-sample support to peak calling

samples.csv gains a control_sampleName column ("NA" = no control), and the
sample-table check now also requires replicate_No and control_sampleName.
The MACS2 rules (stringent, nonStringent, Encode) pass the matching control
BAM to macs2 callpeak via -c when a control is defined (getControlSample()).
Encode, consensus and annotation targets are collected even when empty,
rangeOverlap excludes control samples, annotatePeaks is split into
individual and consensus rules, the FRiP/plotEnrichment rules are commented
out, and the duplicate dataType entry is removed from the example config.

---
 example/dev/input/._config.yaml | Bin 0 -> 4096 bytes
 example/dev/input/._samples.csv | Bin 0 -> 4096 bytes
 example/dev/input/config.yaml   |   1 -
 example/dev/input/samples.csv   |   8 +-
 src/Snakemake/dev/._Snakefile   | Bin 4096 -> 4096 bytes
 src/Snakemake/dev/Snakefile     | 228 ++++++++++++++++++++------------
 6 files changed, 147 insertions(+), 90 deletions(-)
 create mode 100755 example/dev/input/._config.yaml
 create mode 100755 example/dev/input/._samples.csv
 mode change 100644 => 100755 example/dev/input/samples.csv

diff --git a/example/dev/input/._config.yaml b/example/dev/input/._config.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..ae14cb815afa051c5c66bbdb67dbc9b350335505
GIT binary patch
literal 4096
zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103vVs!ar
z2+_f?0H|C5O$#HC4;7b6&d=3LEGWoH)yqjNE-5WeO-V^CNmULA2I+af=5`{8PI#Kc
z3!+ECXb6mkz-S1JhQMeDjE2By2#kinXb6mkz-S1JhQMeD;0ggyXA^|MKrSRBvsj@h
zwK%`DC^=OjEx#yRAv3QeHLoNyKQA#Sr&1v&HLXM;DJL;68`u|y>Kf7%s{i3$kztVg
G{~rLwUn}we

literal 0
HcmV?d00001

diff --git a/example/dev/input/._samples.csv b/example/dev/input/._samples.csv
new file mode 100755
index 0000000000000000000000000000000000000000..8b8885a06799930cbe485aee8503a3033fc8aab4
GIT binary patch
literal 4096
zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103vVs!ar
z0M)?(R9=Cmg&AlPNSvR6K|DD>S1+-kASYEXB(<W%H7_|oB{MG_tbtJ+NC_}7NFmhZ
zBo>#H7N@49B$lKq2LwaRWmE^!kqivx2z9xsC5b>aiB{RZE<TA#sX6f(6<LYqS@}g}
zxkcH<#>J(krRC|x1=$(Jg+R@vC26`A)`rF=Muz5=7ACva+)e};{VsK;9*F*AuYa7a
z{zIj^{<N)v<@;xS)!B5q!L)8)Zh!w4yCco**Cb;;xHyR2nSa&vN%QlxfBcVDo!lj(
zxURuNd*jx978A;Seir*Ln%urxH2+@ioYn&~cTB9w-1(z>KgjS=Fd71*Aut*OqaiRF
z0;3@?8UmvsFd71*Aut*Oqai?@5MTrv1Hxb+7m|@#tWcC%oL^d$oT`wPUzDwonOBmU
pSCW~Zmza}NsgRSJR-%xUlbDwc><dG64QUG1|8TF!Fv$J?4**eqeQy8&

literal 0
HcmV?d00001

diff --git a/example/dev/input/config.yaml b/example/dev/input/config.yaml
index 7f35028..b036792 100644
--- a/example/dev/input/config.yaml
+++ b/example/dev/input/config.yaml
@@ -63,7 +63,6 @@ samples:
 
   # STRING. "ATACseq" or "ChIPseq". Only data type specific steps are executed. Currently, if set to ChIP-Seq, ATAC-seq specific steps like RSS adjustment are not executed.
   dataType: "ATACseq"
-  dataType: "ChIPseq"
 
 ###########################
 # SECTION additionalInput #
diff --git a/example/dev/input/samples.csv b/example/dev/input/samples.csv
old mode 100644
new mode 100755
index 525e4dd..dd6a42b
--- a/example/dev/input/samples.csv
+++ b/example/dev/input/samples.csv
@@ -1,4 +1,4 @@
-individual	sampleName	path_inputForward	path_inputReverse	Flowcell_ID	lane_ID	Technology	Library_ID	replicate_No
-"test1"	"test1_rep1"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_1.fastq.gz"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_2.fastq.gz"	"NA"	"NA"	"ILLUMINA"	"default"	"1"
-"test1"	"test1_rep2"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_1.fastq.gz"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_2.fastq.gz"	"NA"	"NA"	"ILLUMINA"	"default"	"1"
-"test2"	"test2_rep1"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_1.fastq.gz"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_2.fastq.gz"	"NA"	"NA"	"ILLUMINA"	"default"	"1"
+individual	sampleName	path_inputForward	path_inputReverse	Flowcell_ID	lane_ID	Technology	Library_ID	replicate_No	control_sampleName
+"test1"	"test1_rep1"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_1.fastq.gz"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_2.fastq.gz"	"NA"	"NA"	"ILLUMINA"	"default"	"1"	"NA"
+"test1"	"test1_rep2"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_1.fastq.gz"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_2.fastq.gz"	"NA"	"NA"	"ILLUMINA"	"default"	"1"	"NA"
+"test2"	"test2_rep1"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_1.fastq.gz"	"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_2.fastq.gz"	"NA"	"NA"	"ILLUMINA"	"default"	"1"	"NA"
diff --git a/src/Snakemake/dev/._Snakefile b/src/Snakemake/dev/._Snakefile
index cdc6518cd0c4459c384e7c7c85903b632d35b975..7e99ec425dc91ea9de2aec6e47f433fcae8b205e 100755
GIT binary patch
delta 75
zcmZorXi%6CsqFGem4Sip5d#Cm0w5LuVr0Mw<SdxDMv)&#|D1C@5hT}Cnx?k7kujNf
O@&v}Ko1gH>FaiKDq!9)H

delta 271
zcmZorXi%6C8Enbm%D}+)h=GCO0uZkNVr0Ms<bcFyFfb@3=jZAr78K;9>iHy=<|StY
zrxulECZ`tb`4^<-g=dyzKr}LDOsr9?&rC^8D#*z!E-^5;%*e#d!pat2mReMtnV%O@
zkXVutFCdUqnwOH33RIU@l9`s7S|q>_FQBO(S`1XBUzC}fn_pU7oT%@TTAW>yU!Wgc
znv|27o2n3!T2TUWMZAC@$UVgn;S!Lj2?xKpjJ{zk$Qc?842+Br_b{+SX&6-)a#$!4
T#5w<zfpO!;0^ZF}_+%IX@Q6n8

diff --git a/src/Snakemake/dev/Snakefile b/src/Snakemake/dev/Snakefile
index 575d3fe..a6d968c 100755
--- a/src/Snakemake/dev/Snakefile
+++ b/src/Snakemake/dev/Snakefile
@@ -20,6 +20,7 @@ from os import makedirs
 import pandas
 import numpy
 import warnings
+import re
 
 # Enforce a minimum Snakemake version because of various features
 #min_version("5.9.1")
@@ -68,8 +69,8 @@ def read_samplesTable(samplesSummaryFile):
 
     # Expect a particular number of columns, do a sanity check here
 
-    if not {'individual', 'sampleName', 'path_inputForward', 'path_inputReverse', 'Flowcell_ID', 'lane_ID', 'Technology', 'Library_ID'}.issubset(data.columns.values):
-        raise KeyError("The samples file must contain the following named columns (TAB separated!): individual, sampleName, path_inputForward, path_inputReverse, Flowcell_ID, lane_ID, Technology, Library_ID")
+    if not {'individual', 'sampleName', 'path_inputForward', 'path_inputReverse', 'Flowcell_ID', 'lane_ID', 'Technology', 'Library_ID','replicate_No', 'control_sampleName'}.issubset(data.columns.values):
+        raise KeyError("The samples file must contain the following named columns (TAB separated!): individual, sampleName, path_inputForward, path_inputReverse, Flowcell_ID, lane_ID, Technology, Library_ID, replicate_No, control_sampleName")
 
     # Make sure the individual column is a string
     data['individual'] = data['individual'].astype(str)
@@ -147,7 +148,7 @@ configDict = {
             "par_scripts":
                 ["STATS_script_withinThr", "STATS_script_outsideThr", "STATS_script_geneTypesToKeep", "FL_distr_script_cutoff"],
             "par_peakCalling":
-                ["modelNonStringent", "modelStringent", "modelStringent_minQValue", "modelNonStringent_minQValue", "modelNonStringent_slocal", "Encode_pValThreshold", "Encode_modelBroadAndGapped", "Encode_modelNarrow"],
+                ["modelNonStringent", "modelStringent", "modelStringent_minQValue", "modelNonStringent_minQValue", "modelNonStringent_slocal", "Encode_pValThreshold", "Encode_modelBroadAndGapped", "Encode_modelNarrow", "controlSample"],
             "par_deepTools":
                 ["readLength", "bamCoverage_normalizationCoverage", "bamCoverage_binSize", "bamCoverage_otherOptions"]
             }
@@ -273,6 +274,7 @@ allIndividualsUnique = numpy.unique(samplesData.loc[:,"individual"])
 nIndividualsUnique = len(allIndividualsUnique)
 allIndividualsUniqueStrSpaces  = ' '.join(allIndividualsUnique)
 allSamplesUnique     = numpy.unique(samplesData.loc[:,"sampleName"])
+nControl = len(samplesData['control_sampleName'].dropna().unique())
 nSamplesUnique = len(allSamplesUnique)
 
 # Get only two sampels for nicer graphs
@@ -281,14 +283,15 @@ nSamplesUnique = len(allSamplesUnique)
 allSamplesUniqueStr  = ','.join(allSamplesUnique)
 allSamplesUniqueStrSpaces  = ' '.join(allSamplesUnique)
 
+
 if nSamplesUnique <=10:
-    rangeOverlap = list(range(1, nSamplesUnique + 1, 1))
+    rangeOverlap = list(range(1, nSamplesUnique - nControl + 1, 1))
 elif nSamplesUnique <=50:
-    rangeOverlap = list(range(1, 5 + 1, 1)) + list(range(10, nSamplesUnique + 1, 5))
+    rangeOverlap = list(range(1, 5 + 1, 1)) + list(range(10, nSamplesUnique - nControl + 1, 5))
 elif nSamplesUnique <=100:
-    rangeOverlap = list(range(1, 5 + 1, 1)) + list(range(10, nSamplesUnique + 1, 10))
+    rangeOverlap = list(range(1, 5 + 1, 1)) + list(range(10, nSamplesUnique - nControl + 1, 10))
 else:
-    rangeOverlap = list(range(1, 10 + 1, 1)) + list(range(20, nSamplesUnique + 1, 20))
+    rangeOverlap = list(range(1, 10 + 1, 1)) + list(range(20, nSamplesUnique - nControl + 1, 20))
 
 if nSamplesUnique % 5 != 0 and nSamplesUnique not in rangeOverlap:
     rangeOverlap = rangeOverlap + list([nSamplesUnique])
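
Subtracting nControl keeps control samples out of the consensus-peak overlap
range. A minimal sketch with hypothetical numbers (not taken from the example
data):

    # e.g. 4 unique samples, one of which is an input control
    nSamplesUnique = 4
    nControl       = 1
    rangeOverlap = list(range(1, nSamplesUnique - nControl + 1, 1))
    # -> [1, 2, 3]: overlaps are only evaluated across the 3 treatment samples
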
@@ -346,7 +349,6 @@ for fileCur in filesToCheck:
 
 # Check input files
 inputFiles = numpy.asarray(samplesData.loc[:,"path_inputForward"])
-
 if pairedEnd:
     inputFiles = numpy.append(inputFiles, numpy.asarray(samplesData.loc[:, "path_inputReverse"]))
 
@@ -447,11 +449,12 @@ if doPeakCalling:
                                 GCBias = GCBiasStr,
                                 peakType = peakTypeEncode)
 
-        allResultFiles.append(individualPeaksEncode)
+        #allResultFiles.append(individualPeaksEncode)
 
     else:
         individualPeaksEncode = []
 
+    allResultFiles.append(individualPeaksEncode)
 
     individualPeaksStringent = expand('{dir}/{sample}{GCBias}.final.{analysisType}.{peakType}Peak.filtered.bed.gz', dir = PEAK_STRINGENT_dir, sample = allSamplesUnique,
                             GCBias = GCBiasStr,
@@ -470,15 +473,18 @@ if doPeakCalling:
     ##############
     # FRIP score #
     ##############
-    frip_nonStringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["nonStringent"] , sample = allSamplesUnique, GCBias = GCBiasStr, analysisType = ["nonStringent"], peakType = ["narrow"])
-    allResultFiles.append(frip_nonStringent)
+    # frip_nonStringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["nonStringent"] , sample = allSamplesUnique, GCBias = GCBiasStr, analysisType = ["nonStringent"], peakType = ["narrow"])
+    # allResultFiles.append(frip_nonStringent)
 
-    frip_stringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["stringent"] , sample = allSamplesUnique, GCBias = GCBiasStr, analysisType = ["stringent"], peakType = ["narrow"])
-    allResultFiles.append(frip_stringent)
+    # frip_stringent = expand('{dir}/{dir2}/{sample}{GCBias}.final.{analysisType}.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["stringent"] , sample = allSamplesUnique, GCBias = GCBiasStr, analysisType = ["stringent"], peakType = ["narrow"])
+    # allResultFiles.append(frip_stringent)
 
-    if encodePeaks:
-        frip_encode = expand('{dir}/{dir2}/{sample}{GCBias}.final.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["Encode"] , sample = allSamplesUnique, GCBias = GCBiasStr, peakType = peakTypeEncode)
-        allResultFiles.append(frip_encode)
+    # if encodePeaks:
+    #     frip_encode = expand('{dir}/{dir2}/{sample}{GCBias}.final.{peakType}_frip.pdf', dir = REPORTS_dir_frip, dir2 = ["Encode"] , sample = allSamplesUnique, GCBias = GCBiasStr, peakType = peakTypeEncode)
+    # else:
+    #     frip_encode = []
+    
+    # allResultFiles.append(frip_encode)
 
     ###################
     # Peak annotation #
@@ -504,10 +510,12 @@ if doPeakCalling:
                     GCBias = GCBiasStr,
                     peakType = peakTypeEncode,
                     ext = ["pdf", "csv"])
-            allResultFiles.append(annotation_individualPeaksEncode)
+          #  allResultFiles.append(annotation_individualPeaksEncode)
         else:
             annotation_individualPeaksEncode = []
 
+        allResultFiles.append(annotation_individualPeaksEncode)
+
 
     ###################
     # Consensus peaks #
@@ -521,26 +529,27 @@ if doPeakCalling:
         allResultFiles.append(consensusPeaks_nonStringent)
 
 
-        consensusPeaks_stringentNonStringent_frip = expand('{dir}/consensusPeaks/{peakType}.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = ["stringent", "nonStringent"], minOverlap = rangeOverlap)
-        allResultFiles.append(consensusPeaks_stringentNonStringent_frip)
+        # consensusPeaks_stringentNonStringent_frip = expand('{dir}/consensusPeaks/{peakType}.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = ["stringent", "nonStringent"], minOverlap = rangeOverlap)
+        # allResultFiles.append(consensusPeaks_stringentNonStringent_frip)
 
 
         if encodePeaks:
             consensusPeaks_ENCODE = expand("{dir}/consensusPeaks{GCBias}.{peakType}Peak.minOverlap{overlap}.bed.gz", dir = PEAK_ENCODE_dir, GCBias = GCBiasStr, peakType = peakTypeEncode, overlap = rangeOverlap)
 
-            consensusPeaks_encode_frip = expand('{dir}/consensusPeaks/{peakType}Peak.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = peakTypeEncode, minOverlap = rangeOverlap)
+            # consensusPeaks_encode_frip = expand('{dir}/consensusPeaks/{peakType}Peak.minOverlap{minOverlap}_frip.pdf', dir = REPORTS_dir_frip, peakType = peakTypeEncode, minOverlap = rangeOverlap)
 
             peakTypesCur = peakTypesAll
 
             allResultFiles.append(consensusPeaks_ENCODE)
-            allResultFiles.append(consensusPeaks_encode_frip)
+            # allResultFiles.append(consensusPeaks_encode_frip)
 
         else:
             consensusPeaks_ENCODE = []
-            consensusPeaks_encode_frip = []
+            # consensusPeaks_encode_frip = []
             peakTypesCur = ["stringent", "nonStringent"]
 
-
+        allResultFiles.append(consensusPeaks_ENCODE)
+        #allResultFiles.append(consensusPeaks_encode_frip)
 
         # TODO move
         PCA_peaks = expand("{dir}/{basename}_PCAPlot_peaks{GCBias}_{peakType}_minOverlap{overlap}.pdf",
@@ -553,11 +562,14 @@ if doPeakCalling:
             annotation_consensusPeaks_stringent = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.{ext}", dir = REPORTS_dir_anno_STRINGENT, GCBias = GCBiasStr, overlap = rangeOverlap, ext = ["pdf", "csv"])
             annotation_consensusPeaks_nonStringent = expand("{dir}/consensusPeaks{GCBias}.minOverlap{overlap}.{ext}", dir = REPORTS_dir_anno_NONSTRINGENT, GCBias = GCBiasStr, overlap = rangeOverlap, ext = ["pdf", "csv"])
 
-            allResultFiles.append(annotation_consensusPeaks_stringent)
-            allResultFiles.append(annotation_consensusPeaks_nonStringent)
             if encodePeaks:
                 annotation_consensusPeaks_ENCODE = expand("{dir}/consensusPeaks{GCBias}.{peakType}Peak.minOverlap{overlap}_annotation.{ext}", dir = REPORTS_dir_anno_ENCODE, GCBias = GCBiasStr, peakType = peakTypeEncode, overlap = rangeOverlap, ext = ["pdf", "csv"])
-                allResultFiles.append(annotation_consensusPeaks_ENCODE)
+            else:
+                annotation_consensusPeaks_ENCODE = []
+            
+            allResultFiles.append(annotation_consensusPeaks_stringent)
+            allResultFiles.append(annotation_consensusPeaks_nonStringent)
+            allResultFiles.append(annotation_consensusPeaks_ENCODE)
 
         # PCA files for peaks
         # TODO
@@ -593,6 +605,7 @@ if mergeReplicates and nSamplesUnique > 1:
             #allResultFiles.append(pooledPeaksEncode)
         else:
             pooledPeaksEncode = []
+        allResultFiles.append(pooledPeaksEncode)
 
         pooledPeaksStringent = expand('{dir}/{indiv}.merged{GCBias}.final.{analysisType}{peaktype2}.{peakType}Peak.filtered2.bed.gz', dir = PEAK_STRINGENT_dir, indiv = allIndividualsUnique,
                                 GCBias = GCBiasStr,
@@ -1685,13 +1698,36 @@ def getGenomeTypeMacs2(assemblyVersion):
 
     return genomeType
 
+
+# if dataType == "ChIPseq" and config["peak"]["controlSample"] != "":
+#     controlSamplePath = 
+
+
 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
 macs2_stringent_outputName     = PEAK_STRINGENT_dir  + '/{basename}' + '.stringent'
+#macs2_stringent_outputName     = expand("{dir}/{sample}.{{specifics}}.stringent", sample = [allSamplesUnique, allIndividualsUnique] if mergeReplicates else allSamplesUnique, dir = PEAK_STRINGENT_dir)
+
+def getControlSample(bam):
+
+    samples_regex = "|".join(allSamplesUnique)
+    file_base = os.path.basename(bam)
+    sample_cur = re.search(samples_regex, file_base).group(0)
+    controlSample = samplesData["control_sampleName"][samplesData["sampleName"]==sample_cur].iloc[0]
+
+    if pandas.isnull(controlSample):
+        ctrl_param = ""
+    else:
+        ctrl_param = "-c " + FINAL_OUTPUT_dir + "/" + controlSample + ".final.bam"
+    return ctrl_param
+
+   
+#def getControlSample2(bam):
+
 
 # Runs for all bam fiels in the final output folder, also the replicated one
 rule macs2_stringent:
     input:
-        bam          = expand('{dir}/{{basename}}.bam', dir = FINAL_OUTPUT_dir)
+        bam = FINAL_OUTPUT_dir + "/{basename}.bam"
     output:
         peaks_bedT   = temp(macs2_stringent_outputName + '_peaks.narrowPeak'),
         peaks_bed    = temp(macs2_stringent_outputName + '.narrowPeak.gz'),
@@ -1699,6 +1735,7 @@ rule macs2_stringent:
         summit_bedgz = macs2_stringent_outputName      + '_summits.bed.gz',
         xls          = temp(macs2_stringent_outputName + '_peaks.xls')
     log: LOG_BENCHMARK_dir + "/macs2_stringent.{basename}.log"
+    #log: LOG_BENCHMARK_dir + expand("/macs2_stringent.{sample}{{specifics}}.log", sample = [allSamplesUnique, allIndividualsUnique] if mergeReplicates else allSamplesUnique)
     message: "{ruleDisplayMessage}Run MACS2 (stringent) for {input.bam:q} ..."
     threads: 1
     # benchmark: LOG_BENCHMARK_dir +  "/macs2_stringent.{basename}.benchmark"
@@ -1712,11 +1749,13 @@ rule macs2_stringent:
         name            = lambda wildcards: wildcards.basename + '.stringent',
         outputDir       = PEAK_STRINGENT_dir,
         pairedEndString = macs2_pairedEndMode,
-        keepDuplicates  = "--keep-dup all"
+        keepDuplicates  = "--keep-dup all",
+        controlFile = lambda wildcards: getControlSample(wildcards.basename)
     shell:
         """PYTHONPATH="" &&
         macs2 callpeak \
                 --treatment {input.bam} \
+                {params.controlFile} \
                 -q {params.qValue} \
                 --outdir {params.outputDir}\
                 --name {params.name}\
@@ -1728,6 +1767,7 @@ rule macs2_stringent:
                 2> {log:q} &&
             sort -k 8gr,8gr {output.peaks_bedT} | awk 'BEGIN {{OFS = "\\t"}} ; {{$4="Peak_"NR ; print $0}}' | gzip -f > {output.peaks_bed} &&
             gzip -f < {output.summit_bed} > {output.summit_bedgz}
+
          """
 
 
@@ -1756,7 +1796,8 @@ use rule macs2_stringent as macs2_nonStringent with:
         name            = lambda wildcards: wildcards.basename + '.nonStringent',
         outputDir       = PEAK_NONSTRINGENT_dir,
         keepDuplicates  = "--keep-dup all",
-        pairedEndString = macs2_pairedEndMode
+        pairedEndString = macs2_pairedEndMode,
+        controlFile = lambda wildcards: getControlSample(wildcards.basename)
 
 
 # Define the general output file name above the rule. Wildcards are resolved in the resulting string when evaluating the rule
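
The controlFile parameter added to the MACS2 rules resolves through
getControlSample() defined above. A rough illustration with hypothetical
sample names (not part of the patch):

    # assume samples.csv maps sampleName "chip_rep1" to control_sampleName "chip_input"
    getControlSample("chip_rep1.final")
    # -> "-c " + FINAL_OUTPUT_dir + "/chip_input.final.bam"
    # if control_sampleName is missing/NaN for that sample, "" is returned
    # and macs2 callpeak runs without -c, as before
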
@@ -1797,7 +1838,8 @@ rule macs2_Encode:
         outputDir              = PEAK_ENCODE_dir,
         keepDuplicates         = "--keep-dup all",
         bdg1                   = macs2_Encode_outputName + '_control_lambda.bdg',
-        bdg2                   = macs2_Encode_outputName + '_treat_pileup.bdg'
+        bdg2                   = macs2_Encode_outputName + '_treat_pileup.bdg',
+        controlFile = lambda wildcards: getControlSample(wildcards.basename)
 
     shell:
         # 1. First produce broad and gapped peaks, then narrow ones
@@ -1961,7 +2003,8 @@ if nSamplesUnique > 1 and doPeakCalling:
         threads: 1
         params:
             GCString = GCBiasStr,
-            minOverlapValues = ",".join(map(str, rangeOverlap))
+            minOverlapValues = ",".join(map(str, rangeOverlap)),
+            controlFiles = samplesData['control_sampleName'].dropna().unique()
         script: script_consPeaks
 
     rule peaksPCA:
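
The controlFiles param added above hands script_consPeaks the unique,
non-missing control sample names, presumably so they can be distinguished
from treatment samples. Sketch of what it evaluates to (hypothetical value):

    samplesData['control_sampleName'].dropna().unique()
    # -> array(['chip_input'], dtype=object)
    # (an empty array if every control_sampleName is missing)
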
@@ -1983,33 +2026,48 @@ if nSamplesUnique > 1 and doPeakCalling:
 
 
 if annotatePeaks and doPeakCalling:
-    rule annotatePeaks:
+    rule annotatePeaks_individual:
         input:
             individual_encode       = individualPeaksEncode,
             individual_stringent    = individualPeaksStringent,
             individual_nonStringent = individualPeaksNonStringent,
-            consensus_encode        = consensusPeaks_ENCODE,
-            consensus_stringent     = consensusPeaks_stringent,
-            consensus_nonStringent  = consensusPeaks_nonStringent
             #pooled = rule.poolPeaksReplicateSamples.output.pooledPeaks,
             #replicates = rule.poolPeaksReplicateSamples.output.replicatePeaks
         output:
             annotation_individual_encode = annotation_individualPeaksEncode,
             annotation_individual_stringent = annotation_individualPeaksStringent,
             annotation_individual_nonStringent = annotation_individualPeaksNonStringent,
-            annotation_consensus_encode = annotation_consensusPeaks_ENCODE,
-            annotation_consensus_stringent = annotation_consensusPeaks_stringent,
-            annotation_consensus_nonStringent = annotation_consensusPeaks_nonStringent
-        log: expand('{dir}/annotatePeaks.R.log', dir = LOG_BENCHMARK_dir)
+        log: expand('{dir}/annotatePeaks_individual.R.log', dir = LOG_BENCHMARK_dir)
         singularity: "shub://chrarnold/Singularity_images:atac_seq_r"
-        message: "{ruleDisplayMessage} Annotating peaks for all peak files with the script {script_annoPeaks}..."
+        message: "{ruleDisplayMessage} Annotating individual peaks for all peak files with the script {script_annoPeaks}..."
         threads: 1
         params:
             tabulateOutput = "true",
             assemblyVersion = config["par_align"]["assemblyVersion"],
-            tssRegion = "-5000,5000"
+            tssRegion = "-5000,5000",
+            mode = "individual"
         script: script_annoPeaks
 
+    use rule annotatePeaks_individual as annotatePeaks_consensus with:
+        input:
+            consensus_encode        = consensusPeaks_ENCODE,
+            consensus_stringent     = consensusPeaks_stringent,
+            consensus_nonStringent  = consensusPeaks_nonStringent
+        output:
+            annotation_consensus_encode = annotation_consensusPeaks_ENCODE,
+            annotation_consensus_stringent = annotation_consensusPeaks_stringent,
+            annotation_consensus_nonStringent = annotation_consensusPeaks_nonStringent
+        log:
+            expand('{dir}/annotatePeaks_consensus.R.log', dir = LOG_BENCHMARK_dir)
+        singularity:
+            "shub://chrarnold/Singularity_images:atac_seq_r"
+        message: 
+            "{ruleDisplayMessage} Annotating consensus peaks for all peak files with the script {script_annoPeaks}..."
+        params:
+            tabulateOutput = "true",
+            assemblyVersion = config["par_align"]["assemblyVersion"],
+            tssRegion = "-5000,5000",
+            mode = "consensus"
 
 
 
@@ -2094,52 +2152,52 @@ rule fragment_length_distr:
 # FRIP score
 # For each sample, plot against the sample-specific peaks file and all consensus peak files
 # May fail for empty consensus peak files, therefore we add a "|| touch" here in the end, even though this hides ANY error
-rule deeptools_plotEnrichment:
-    input:
-        bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique),
-        beds = expand('{dir}/{{peakType}}/{{basename}}Peak' + '.filtered.bed.gz', dir = PEAKCALLING_dir)
-    output:
-        fig       = expand('{dir}/{{peakType}}/{{basename}}_frip.pdf', dir = REPORTS_dir_frip),
-        rawCounts = expand('{dir}/{{peakType}}/{{basename}}_frip.rawCounts', dir = REPORTS_dir_frip),
-    log: expand('{dir}/deeptools_plotEnrichment_{{peakType}}.{{basename}}.log', dir = LOG_BENCHMARK_dir)
-    message: "{ruleDisplayMessage} Computing FRiP scores for sample {wildcards.basename} and peak type {wildcards.peakType}"
-    threads: threadsMax
-    conda: "condaEnvironments/deepTools.yaml"
-    singularity: "shub://chrarnold/Singularity_images:atac_seq_conda"
-    params:
-        plotTitle = "FRiP score",
-        heightPDF = 15, # can be fixed, as long as sample names are not too long. Default is 10
-        widthPDF  = max(10, nSamplesUnique * 1 * 0.4) # has to be flexible due to no. of samples on X
-    shell:
-        """
-            plotEnrichment \
-                -b {input.bams} \
-                --BED {input.beds} \
-                --plotFile {output.fig} \
-                -p {threads} \
-                --plotHeight {params.heightPDF} --plotWidth {params.widthPDF} \
-                --outRawCounts {output.rawCounts} \
-                --smartLabels \
-                --plotTitle "{params.plotTitle}" 2> {log:q}  || touch {output.fig:q} {output.rawCounts:q}
-        """
+# rule deeptools_plotEnrichment:
+#     input:
+#         bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique),
+#         beds = expand('{dir}/{{peakType}}/{{basename}}Peak' + '.filtered.bed.gz', dir = PEAKCALLING_dir)
+#     output:
+#         fig       = expand('{dir}/{{peakType}}/{{basename}}_frip.pdf', dir = REPORTS_dir_frip),
+#         rawCounts = expand('{dir}/{{peakType}}/{{basename}}_frip.rawCounts', dir = REPORTS_dir_frip),
+#     log: expand('{dir}/deeptools_plotEnrichment_{{peakType}}.{{basename}}.log', dir = LOG_BENCHMARK_dir)
+#     message: "{ruleDisplayMessage} Computing FRiP scores for sample {wildcards.basename} and peak type {wildcards.peakType}"
+#     threads: threadsMax
+#     conda: "condaEnvironments/deepTools.yaml"
+#     singularity: "shub://chrarnold/Singularity_images:atac_seq_conda"
+#     params:
+#         plotTitle = "FRiP score",
+#         heightPDF = 15, # can be fixed, as long as sample names are not too long. Default is 10
+#         widthPDF  = max(10, nSamplesUnique * 1 * 0.4) # has to be flexible due to no. of samples on X
+#     shell:
+#         """
+#             plotEnrichment \
+#                 -b {input.bams} \
+#                 --BED {input.beds} \
+#                 --plotFile {output.fig} \
+#                 -p {threads} \
+#                 --plotHeight {params.heightPDF} --plotWidth {params.widthPDF} \
+#                 --outRawCounts {output.rawCounts} \
+#                 --smartLabels \
+#                 --plotTitle "{params.plotTitle}" 2> {log:q}  || touch {output.fig:q} {output.rawCounts:q}
+#         """
 
 
-use rule deeptools_plotEnrichment as deeptools_plotEnrichment_consensusPeaks with:
-    input:
-        bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique),
-        beds = expand("{dir}/{{peakType}}/consensusPeaks{GCBias}.{{optionalEncodeStr}}minOverlap{{overlap}}.bed.gz", dir = PEAKCALLING_dir, GCBias = GCBiasStr)
-    output:
-        fig       = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.pdf', dir = REPORTS_dir_frip),
-        rawCounts = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.rawCounts', dir = REPORTS_dir_frip)
-    wildcard_constraints:
-        optionalEncodeStr="|Encode.broadPeak|Encode.gappedPeak|Encode.narrowPeak",
-        peakType="stringent|nonStringent|Encode"
-    message:
-        "{ruleDisplayMessage} Computing FRiP scores for consensus peaks with overlap {wildcards.overlap} and peak type {wildcards.peakType} + {wildcards.optionalEncodeStr}"
-    log:
-        expand('{dir}/deeptools_plotEnrichment_consensusPeaks.{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}.log', dir = LOG_BENCHMARK_dir)
-    singularity:
-        "shub://chrarnold/Singularity_images:atac_seq_conda"
+# use rule deeptools_plotEnrichment as deeptools_plotEnrichment_consensusPeaks with:
+#     input:
+#         bams = expand('{dir}/{samples}.final.bam', dir = FINAL_OUTPUT_dir, samples = allSamplesUnique),
+#         beds = expand("{dir}/{{peakType}}/consensusPeaks{GCBias}.{{optionalEncodeStr}}minOverlap{{overlap}}.bed.gz", dir = PEAKCALLING_dir, GCBias = GCBiasStr)
+#     output:
+#         fig       = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.pdf', dir = REPORTS_dir_frip),
+#         rawCounts = expand('{dir}/consensusPeaks/{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}_frip.rawCounts', dir = REPORTS_dir_frip)
+#     wildcard_constraints:
+#         optionalEncodeStr="|Encode.broadPeak|Encode.gappedPeak|Encode.narrowPeak",
+#         peakType="stringent|nonStringent|Encode"
+#     message:
+#         "{ruleDisplayMessage} Computing FRiP scores for consensus peaks with overlap {wildcards.overlap} and peak type {wildcards.peakType} + {wildcards.optionalEncodeStr}"
+#     log:
+#         expand('{dir}/deeptools_plotEnrichment_consensusPeaks.{{peakType}}{{optionalEncodeStr}}.minOverlap{{overlap}}.log', dir = LOG_BENCHMARK_dir)
+#     singularity:
+#         "shub://chrarnold/Singularity_images:atac_seq_conda"
 
 
 # AdjRSS files are not needed here and are ignored in the stats file
-- 
GitLab