############################################
# Libraries, versions, authors and license #
############################################

from snakemake.utils import min_version
import subprocess
import os
import pandas
import numpy
import socket
import time

start = time.time()


# Enforce a minimum Snakemake version because of various features
min_version("5.3.1")

__author__  = "Christian Arnold & Ivan Berest"
__license__ = "MIT"


#############
# FUNCTIONS #
#############


# The onsuccess handler is executed if the workflow finished without error.
onsuccess:
    print("\n\n#################################\n#  Workflow finished, no error  #")
    print("# Check the FINAL_OUTPUT folder #\n#################################\n\n")
    print("\nRunning time in minutes: %s\n" % round((time.time() - start)/60,1))
# Else, the onerror handler is executed.
onerror:
    print("\n\n#####################\n# An error occurred #\n#####################\n\n")
    print("\nRunning time in minutes: %s\n" % round((time.time() - start)/60,1))
    #shell("mail -s "an error occurred" carnold@embl.de < {log}")

# The onstart handler is executed before the workflow starts. Note that dry runs do not trigger any of the handlers
onstart:
    print("Reading samples and metadata....\n")
    print("Running workflow for the following " + str(len(allTF)) + " TFs:\n " + ' \n '.join(map(str, allTF)))
    print("Running workflow for the following BAM files:\n " + ' \n '.join(map(str, allBamFiles)))



def read_samplesTable(samplesSummaryFile, consensusPeaks):
    """Read the samples summary table into a pandas DataFrame and check that the required columns are present."""

    data = pandas.read_table(samplesSummaryFile)

    # Sanity check: the table must contain the expected columns

    if not consensusPeaks:

        if not {'SampleID', 'bamReads', 'conditionSummary', 'Peaks'}.issubset(data.columns.values):
            raise KeyError("The samples file must contain at least the following named columns (TAB separated!): 'SampleID', 'bamReads', 'conditionSummary', 'Peaks'")

    else:

        if not {'SampleID', 'bamReads', 'conditionSummary'}.issubset(data.columns.values):
            raise KeyError("The samples file must contain at least the following named columns (TAB separated!): 'SampleID', 'bamReads', 'conditionSummary'")

    return data
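
# Illustrative example of the expected TAB-separated samples summary file (sample IDs, paths
# and condition labels below are made up; the 'Peaks' column is only required when no
# consensus peak file is provided):
#   SampleID    bamReads                conditionSummary    Peaks
#   sample1     /path/to/sample1.bam    condition1          /path/to/sample1.peaks.bed
#   sample2     /path/to/sample2.bam    condition2          /path/to/sample2.peaks.bed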

# https://stackoverflow.com/questions/26560726/python-binomial-coefficient
def fcomb0(n, k):
    '''
    Compute the number of ways to choose k elements out of a pile of n.

    Use an iterative approach with the multiplicative formula:
    \frac{n!}{k!(n - k)!} =
    \frac{n(n - 1)\dots(n - k + 1)}{k(k-1)\dots(1)} =
    \prod_{i = 1}^{k}\frac{n + 1 - i}{i}

    Also rely on the symmetry: C_n^k = C_n^{n - k}, so the product can
    be calculated up to min(k, n - k)

    :param n: the size of the pile of elements
    :param k: the number of elements to take from the pile
    :return: the number of ways to choose k elements out of a pile of n
    '''

    # When k is out of the sensible range, we should probably throw an exception.
    # For compatibility with scipy.special.{comb, binom}, return 0 instead.
    if k < 0 or k > n:
        return 0

    if k == 0 or k == n:
        return 1

    total_ways = 1
    for i in range(min(k, n - k)):
        total_ways = total_ways * (n - i) // (i + 1)

    return total_ways
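
# Worked example (illustrative only): the multiplicative formula above gives
# fcomb0(6, 3) = (6/1) * (5/2) * (4/3) = 20, matching scipy.special.comb(6, 3, exact=True).
# Edge cases behave like scipy as well, e.g.:
#   assert fcomb0(5, 0) == fcomb0(5, 5) == 1
#   assert fcomb0(5, 7) == 0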

###############################################
# Check if all parameters have been specified #
###############################################

configDict = {
            "par_general":
                ["outdir", "regionExtension", "comparisonType", "designContrast", "designVariableTypes", "conditionComparison", "nPermutations", "nCGBins", "TFs", "dir_scripts", "RNASeqIntegration", "nBootstraps"],
            "samples":
                ["summaryFile", "pairedEnd"],
            "peaks":
                ["consensusPeaks", "peakType", "minOverlap"],
            "additionalInputFiles":
                ["refGenome_fasta","dir_TFBS", "RNASeqCounts", "HOCOMOCO_mapping"]
            }
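
# For reference, the config file (typically YAML, passed to Snakemake via --configfile) is
# expected to mirror the structure above; the values below are purely illustrative:
#   par_general:
#     outdir: "output"
#     regionExtension: 100
#     ...
#   samples:
#     summaryFile: "samples.tsv"
#     pairedEnd: true
#   peaks:
#     consensusPeaks: ""
#     ...
#   additionalInputFiles:
#     refGenome_fasta: "genome.fa"
#     ...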

for sectionCur in configDict:

    # 1. Is the section defined at all?
    if sectionCur not in config:
        raise KeyError("Could not find section \"" + sectionCur + "\" in the config file or no config file has been specified.")

    # 2. Are all required parameters of the section defined?
    requiredPar = configDict[sectionCur]
    missingParameters = []

    for parCur in requiredPar:
        if parCur not in config[sectionCur]:
            missingParameters.append(parCur)

    if len(missingParameters) > 0:
        missingParStr = ",".join(missingParameters)
        raise KeyError("Could not find parameter(s) \"" + missingParStr + "\" in section \"" + sectionCur + "\" in the config file.")


#############################
# DIRECTORIES AND VARIABLES #
#############################

# Maximum number of cores per rule.
# For local computation, the minimum of this value and the --cores parameter will define the number of CPUs per rule,
# while in a cluster setting, the minimum of this value and the number of cores on the node the job runs on is used.

try:
  threadsMax = config["par_general"]["maxCoresPerRule"]
except KeyError:
  print("The parameter \"maxCoresPerRule\" in section \"par_general\" has not been defined. Jobs/rules with multithreading support will use the default of 16 cores.")
  threadsMax = 16
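
# Illustrative (optional) config entry for overriding the default, assuming a YAML config file:
#   par_general:
#     maxCoresPerRule: 8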

try:
  TFBSSorted = config["par_general"]["dir_TFBS_sorted"]
except KeyError:
  print("The parameter \"dir_TFBS_sorted\" in section \"par_general\" has not been defined. Presorted BED files will be assumed.")
  TFBSSorted = True

# Increase ulimit -n for analyses with a high number of TFs and/or input files. The standard value of 1024 may not be enough.
ulimitMax = 4096

# Suffix name for the TFBS lists
suffixTFBS = '_TFBS.bed'
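# For example, a file named CTCF_TFBS.bed in the dir_TFBS folder would provide the binding
# sites for a TF called "CTCF" (the TF name here is purely illustrative).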

# Make the output nicer and easier to follow
ruleDisplayMessage = "\n# START EXECUTING RULE #\n"

# Input files
samplesSummaryFile = config["samples"]["summaryFile"]
regionExt          = config["par_general"]["regionExtension"]
nCGBins            = config["par_general"]["nCGBins"]
extDir = "extension" + str(config["par_general"]["regionExtension"])
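# Illustrative example: with regionExtension set to 100, extDir becomes "extension100", which
# is used as a subfolder name below FINAL_OUTPUT and TEMP.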

pairedEndOptions = "-p -B -P -d 0 -D 2000 -C"
if not config["samples"]["pairedEnd"]:
    pairedEndOptions = ""
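# For reference, the options above are featureCounts settings for paired-end counting: -p counts
# fragments rather than reads, -B requires both ends to be mapped, -P together with -d 0 and
# -D 2000 restricts the allowed fragment length, and -C excludes chimeric fragments.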

ROOT_DIR            = config["par_general"]["outdir"]
FINAL_DIR           = ROOT_DIR + "/FINAL_OUTPUT/" + extDir
TF_DIR              = ROOT_DIR + "/TF-SPECIFIC"
PEAKS_DIR           = ROOT_DIR + "/PEAKS"
LOG_BENCHMARK_DIR   = ROOT_DIR + "/LOGS_AND_BENCHMARKS"
TEMP_DIR            = ROOT_DIR + "/TEMP"
TEMP_EXTENSION_DIR  = ROOT_DIR + "/TEMP/"  + extDir
TEMP_BAM_DIR        = ROOT_DIR + "/TEMP/"  + "sortedBAM"


global samplesData
samplesData = read_samplesTable(config["samples"]["summaryFile"], config["peaks"]["consensusPeaks"])

allBamFiles  = samplesData.loc[:,"bamReads"]

if config["par_general"]["comparisonType"] != "":
    compType = config["par_general"]["comparisonType"] + "."
else:
    compType = ""
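
# Illustrative example: with comparisonType set to "treatment_vs_control", all output files are
# prefixed accordingly (e.g. "treatment_vs_control.summary.circular.pdf"); with an empty
# comparisonType, no prefix is added.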

############################
# CHECK EXISTENCE OF FILES #
############################

allBamFilesBasename = []
for fileCur in allBamFiles:
    if not os.path.isfile(fileCur):
        raise IOError("File \"" + fileCur + "\" (defined in " + config["samples"]["summaryFile"] + ") not found.")
    # Index is not needed for featureCounts
    basename = os.path.splitext(os.path.basename(fileCur))[0]
    allBamFilesBasename.append(basename)

# Add a basename column to the samples table
samplesData = samplesData.assign(basename = allBamFilesBasename)

# Check the existence of all specified peak files
if not config["peaks"]["consensusPeaks"]:
    allPeakFiles = samplesData.loc[:,"Peaks"]
    for fileCur in allPeakFiles:
        if not os.path.isfile(fileCur):
            raise IOError("File \"" + fileCur + "\" (defined in " + config["samples"]["summaryFile"] +  ") not found.")
else:
    allPeakFiles = []


allTF = []

if config["par_general"]["TFs"] == "all":
    TFBS_FILES = os.popen("ls " + config["additionalInputFiles"]["dir_TFBS"]).readlines()
    for TFCur in TFBS_FILES:
        if not os.path.basename(TFCur.replace('\n', '')).endswith(suffixTFBS):
            continue
        TFCurBasename = os.path.basename(TFCur.replace('\n', '').replace(suffixTFBS, ''))
        allTF.append(TFCurBasename)
else:
    TFArray = config["par_general"]["TFs"].replace(" ", "").split(',')
    for TFCur in TFArray:
        fileCur = config["additionalInputFiles"]["dir_TFBS"] + "/" + TFCur + suffixTFBS
        if not os.path.isfile(fileCur):
            raise IOError("The TF " + TFCur + " is in the list of TFs to process, but the file \"" + fileCur + "\" is missing. Check the folder " + config["additionalInputFiles"]["dir_TFBS"] + " and \"par_general\": \"TFs\"")
        allTF.append(TFCur)

if len(allTF) == 0:
    raise WorkflowError("The list of TFs is empty. Adjust the parameter \"par_general\": \"TFs\" and verify that in the specified folder \"" + config["additionalInputFiles"]["dir_TFBS"] + "\", files with the pattern \"{TF}_TFBS.bed\" are present")


if not os.path.isfile(config["additionalInputFiles"]["refGenome_fasta"]):
    raise IOError("File \"" + config["additionalInputFiles"]["refGenome_fasta"] + "\" not found (parameter additionalInputFiles:refGenome_fasta).")

if config["par_general"]["RNASeqIntegration"]:

    filenameCur = config["additionalInputFiles"]["HOCOMOCO_mapping"]
    if not os.path.isfile(filenameCur):
        raise IOError("File \"" + filenameCur + "\" not found (parameter additionalInputFiles:HOCOMOCO_mapping).")

    filenameCur = config["additionalInputFiles"]["RNASeqCounts"]
    if not os.path.isfile(filenameCur):
        raise IOError("File \"" + filenameCur + "\" not found (parameter additionalInputFiles:RNASeqCounts).")


if config["peaks"]["consensusPeaks"]:
    filenameCur = config["peaks"]["consensusPeaks"]
    if not os.path.isfile(filenameCur):
        raise IOError("File \"" + filenameCur + "\" not found.")
    # Check whether the file contains coordinates in scientific notation
    # The || true at the end ensures that the exit status of grep is 0 even if there is no match, which would otherwise raise an error
    nHits = int(subprocess.check_output('grep -c "e+" ' + config["peaks"]["consensusPeaks"] + " || true", shell=True))
    if nHits > 0:
        raise AssertionError("File " + config["peaks"]["consensusPeaks"] + " contains at least one line with the scientific notation (e+). This will cause errors in subsequent steps. Check the file and transform all \"e+\" coordinates.")

################
# PERMUTATIONS #
################

# Adjust the number of permutations if set too high
nSamples = len(samplesData.index)
nSamplesCondition1 = len(samplesData[(samplesData['conditionSummary'] == samplesData['conditionSummary'][0])])
nPermutationsTotal = fcomb0(nSamples, nSamplesCondition1)
if nPermutationsTotal < config["par_general"]["nPermutations"]:
    nPermutationsAdjusted = nPermutationsTotal
else:
    nPermutationsAdjusted = config["par_general"]["nPermutations"]
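
# Illustrative example: with 8 samples of which 4 belong to the first condition,
# fcomb0(8, 4) = 70 distinct permutations exist, so a configured nPermutations of 1000
# would be reduced to 70 here.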

if config["par_general"]["nPermutations"] == 0 and config["par_general"]["nBootstraps"] == 0:
  raise AssertionError("Both nPermutations and nBootstraps are set to 0 in the config file. At least one of the two must have a value > 0. We recommend running diffTF with as many permutations as possible. See the documentation for details.")


###########
# SCRIPTS #
###########

# Default directory for R scripts, specified relative to the Snakefile
# Not to be confused with the dir_scripts directory, which is only used to load the correct functions.R file in all R scripts
dir_scripts = "R/"

script_checkParValidity     = "checkParameterValidity.R"
script_createConsensusPeaks = "createConsensusPeaks.R"
script_DiffPeaks            = "diffPeaks.R"
script_analyzeTF            = "analyzeTF.R"
script_summary1             = "summary1.R"
script_binningTF            = "binningTF.R"
script_summaryFinal         = "summaryFinal.R"



#########################################
#########################################
#              RULES                    #
#########################################
#########################################

# For cluster usage: the localrules directive marks rules as local, so they are not submitted to the cluster but executed on the host node instead
localrules: all,cleanUpLogFiles,filterSexChromosomesAndSortPeaks

rule all:
    input:
        plots       = expand("{dir}/{compType}summary.{plotType}.pdf", dir = FINAL_DIR, compType = compType, plotType = ["circular", "volcano"]),
        summaryLogs = LOG_BENCHMARK_DIR + "/" + compType + "all.warnings.log"


rule checkParameterValidity:
    input:
    output:
        flag      = touch(TEMP_DIR  + "/" + compType + "checkParameterValidity.done"),
        consPeaks = TEMP_DIR  + "/" + compType + "consensusPeaks.clean.bed",
    log: expand('{dir}/checkParameterValidity.R.log', dir = LOG_BENCHMARK_DIR)
    message: "{ruleDisplayMessage}Check parameter validity {script_checkParValidity}..."
    threads: 1
    priority: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_r"
    params:
    script: dir_scripts + script_checkParValidity

rule produceConsensusPeaks:
    input:
        checkFlag = ancient(rules.checkParameterValidity.output.flag),
        peaks     = allPeakFiles
    output:
        consensusPeaks_bed = TEMP_DIR + "/" + compType + "consensusPeaks.bed",
        summaryPlot        = TEMP_DIR + "/" + compType + "consensusPeaks_lengthDistribution.pdf"
    log: expand('{dir}/produceConsensusPeaks.R.log', dir = LOG_BENCHMARK_DIR)
    message: "{ruleDisplayMessage}Calculate consensus peaks for all peak files with the script {script_createConsensusPeaks}..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_r"
    params:
    script: dir_scripts + script_createConsensusPeaks

# Force the execution of the rule above only when the user did not provide a consensus peak file
def retrieveConsensusPeakFile(par_consensusPeaks):

    if not par_consensusPeaks:
        return rules.produceConsensusPeaks.output.consensusPeaks_bed
    else:
        return rules.checkParameterValidity.output.consPeaks

rule filterSexChromosomesAndSortPeaks:
    input:
        consensusPeaks = retrieveConsensusPeakFile(config["peaks"]["consensusPeaks"])
    output:
        consensusPeaks_sorted   = PEAKS_DIR + "/" + compType + "consensusPeaks.filtered.sorted.bed"
    message: "{ruleDisplayMessage}Filter sex and unassembled chromosomes..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_conda"
    shell: """
            grep -v "^chrX\|^chrY\|^chrM\|^chrUn\|random\|hap\|_gl" {input.consensusPeaks} | sort -k1,1 -k2,2n > {output.consensusPeaks_sorted}
           """


overlapPattern = "overlaps.bed.gz"

# This rule is only run when TFBSSorted is False (that is, when the config parameter dir_TFBS_sorted is set to False)
rule sortTFBSParallel:
    input:
        flag = ancient(rules.checkParameterValidity.output.flag),
        allBed = expand('{dir}/{TF}{suffix}', dir = config["additionalInputFiles"]["dir_TFBS"], TF = allTF, suffix = suffixTFBS)
    output:
        allBedSorted = expand('{dir}/{compType}{TF}_TFBS.sorted.bed', dir = TEMP_DIR, compType = compType, TF = allTF)
    message: "{ruleDisplayMessage} Sort TFBS files for all TF..."
    threads: 1
    run:
        for fi,fo in zip(input.allBed, output.allBedSorted):
          shell("sort -k1,1 -k2,2n {fi}  > {fo}")


def getBamFileFromBasename(basename):
    """Return the BAM file entry from the samples table whose basename matches the given basename."""
    hit = numpy.asarray(samplesData.loc[samplesData["basename"] == basename, "bamReads"])
    if len(hit) != 1:
        raise KeyError("Could not uniquely retrieve the BAM file for the basename \"" + basename + "\" from the file " + config["samples"]["summaryFile"])
    return hit

# Not run for single-end data
rule resortBAM:
    input:
        flag = ancient(rules.checkParameterValidity.output.flag),
        BAM = lambda wildcards: getBamFileFromBasename(wildcards.BAM)
    output:
        BAMSorted = TEMP_BAM_DIR + "/" + "{BAM}.bam"
    message: "{ruleDisplayMessage} Sort BAM file {input.BAM}..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_conda"
    params:
        compression = "-c",
        noSeqInf = "-t"
    shell:
        """repair {params.compression} {params.noSeqInf} -i {input.BAM} -o {output.BAMSorted}"""



def getBamFilesBasedOnPairedEnd(wildcards):
    """Return the re-sorted BAM files for paired-end data, or the original BAM files for single-end data."""
    if config["samples"]["pairedEnd"]:
        return expand('{dir}/{allBasenamesBAM}.bam', dir = TEMP_BAM_DIR, allBasenamesBAM = allBamFilesBasename)
    else:
        return allBamFiles


# Use an anonymous named pipe to speed up execution times
rule intersectPeaksAndBAM:
    input:
        consensusPeaks = rules.filterSexChromosomesAndSortPeaks.output.consensusPeaks_sorted,
        allBAMs        = getBamFilesBasedOnPairedEnd
    output:
        peaksBamOverlapRaw = temp(PEAKS_DIR + '/' + compType + 'allBams.peaks.overlaps.bed'),
        peaksBamOverlap    = PEAKS_DIR + '/' + compType + 'allBams.peaks.overlaps.bed.gz',
        consensusPeaksSAF  = temp(TEMP_DIR + "/" + compType + "consensusPeaks.filtered.sorted.saf"),
    log:
    message: "{ruleDisplayMessage} Intersect file {input.consensusPeaks} with all BAM files..."
    threads: threadsMax
    singularity: "shub://chrarnold/Singularity_images:difftf_conda"
    params:
        pairedEnd     = pairedEndOptions,
        readFiltering = "-Q 10",
        multiOverlap = "-O",
        ulimitMax = ulimitMax
    shell:
        """ ulimit -n {params.ulimitMax}  &&
            awk 'BEGIN {{ OFS = "\\t" }} {{print $4,$1,$2,$3,"+"}}' {input.consensusPeaks} >{output.consensusPeaksSAF} &&
            featureCounts \
            -F SAF \
            -T {threads} \
            {params.readFiltering} \
            {params.pairedEnd} \
            -s 0 \
            {params.multiOverlap}  \
            -a {output.consensusPeaksSAF} \
            -o {output.peaksBamOverlapRaw}  \
            {input.allBAMs} &&
            gzip -f < {output.peaksBamOverlapRaw} > {output.peaksBamOverlap}
        """

# TF-specific part:

def retrieveInputFilesTFBS(TFBSSorted):

    if not TFBSSorted:
        return rules.sortTFBSParallel.output.allBedSorted
    else:
        return expand('{dir}/{TF}{suffix}', dir = config["additionalInputFiles"]["dir_TFBS"], TF = allTF, suffix = suffixTFBS)

rule intersectPeaksAndTFBS:
    input:
        consensusPeaks = rules.filterSexChromosomesAndSortPeaks.output.consensusPeaks_sorted,
        allTFBS        = retrieveInputFilesTFBS(TFBSSorted)
    output:
        TFBSinPeaks_bed    = expand('{dir}/{compType}allTFBS.peaks.bed.gz', dir = TEMP_DIR, compType = compType),
        TFBSinPeaksMod_bed = expand('{dir}/{compType}allTFBS.peaks.extension.bed.gz', dir = TEMP_EXTENSION_DIR, compType = compType)
    message: "{ruleDisplayMessage} Obtain binding sites from peaks: Intersect all TFBS files and {input.consensusPeaks}..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_conda"
    params:
        extension = config["par_general"]["regionExtension"],
        ulimitMax = ulimitMax
    shell:
        """ ulimit -n {params.ulimitMax} &&
            bedtools intersect \
                -a {input.consensusPeaks} \
                -b {input.allTFBS} \
                -wa -wb \
                -sorted \
                -filenames \
                | gzip -f > {output.TFBSinPeaks_bed} &&
            gunzip -c {output.TFBSinPeaks_bed} | cut -f4,5,6,7,8,9,12 | uniq  | awk '{{OFS="\\t"}};{{ print $4, $5-{params.extension}, $6+{params.extension},$1,$2,$7,$3}}' | gzip -f > {output.TFBSinPeaksMod_bed}
        """


rule intersectTFBSAndBAM:
    input:
        bed           = rules.intersectPeaksAndTFBS.output.TFBSinPeaksMod_bed,
        allBAMs       = getBamFilesBasedOnPairedEnd
    output:
        BAMOverlapRaw = temp(TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.allBAMs.overlaps.bed"),
        BAMOverlap    =      TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.allBAMs.overlaps.bed.gz",
        saf           = temp(expand('{dir}/{compType}{{TF}}.allTFBS.peaks.extension.saf', dir = TEMP_EXTENSION_DIR, compType = compType))
    log:
    message: "{ruleDisplayMessage} Intersect file {input.bed} against all BAM files for TF {wildcards.TF}..."
    threads: 4
    singularity: "shub://chrarnold/Singularity_images:difftf_conda"
    params:
        pairedEnd = pairedEndOptions,
        readFiltering = "-Q 10",
        multiOverlap = "-O",
        ulimitMax = ulimitMax
    shell:
        """ ulimit -n {params.ulimitMax} &&
            zgrep "{wildcards.TF}_TFBS\." {input.bed} | awk 'BEGIN {{ OFS = "\\t" }} {{print $4"_"$2"-"$3,$1,$2,$3,$6}}' | sort -u -k1,1  >{output.saf} || true &&
            if [[ $(wc -l <{output.saf}) -eq "0" ]]; then
                touch {output.BAMOverlapRaw}
                echo "No TFBS found, skip featureCounts..."
            else
                featureCounts \
                -F SAF \
                -T {threads} \
                {params.readFiltering} \
                {params.pairedEnd} \
                -a {output.saf} \
                -s 0 \
                {params.multiOverlap}  \
                -o {output.BAMOverlapRaw}  \
                {input.allBAMs}
            fi &&
            gzip -f < {output.BAMOverlapRaw} > {output.BAMOverlap}

        """


name_plots = PEAKS_DIR + "/" + compType + "diagnosticPlots.peaks.pdf"

rule DiffPeaks:
    input:
        sampleData      = config["samples"]["summaryFile"],
        BAMPeakoverlaps = rules.intersectPeaksAndBAM.output.peaksBamOverlap
    output:
        sampleDataR    = PEAKS_DIR + "/" + compType + "sampleMetadata.rds",
        peakFile       = PEAKS_DIR + "/" + compType + "peaks.rds",
        peaks_tsv      = PEAKS_DIR + "/" + compType + "peaks.tsv.gz",
        condComp       = TEMP_EXTENSION_DIR  + "/" + compType + "conditionComparison.rds",
        normFacs       = PEAKS_DIR + "/" + compType + "normFacs.rds",
        normCounts     = PEAKS_DIR + "/" + compType + "countsNormalized.tsv.gz",
        plots          = name_plots,
        DESeqObj       = PEAKS_DIR + "/" + compType + "DESeq.object.rds"
    log: expand('{dir}/DiffPeaks.R.log', dir = LOG_BENCHMARK_DIR)
    message: "{ruleDisplayMessage}Run R script {script_DiffPeaks}"
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_r"
    params:
        doCyclicLoess = "true"
    script: dir_scripts + script_DiffPeaks


name_plotsDiag = TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}" + ".diagnosticPlots.pdf"

rule analyzeTF:
    input:
        overlapFile        = rules.intersectTFBSAndBAM.output.BAMOverlap,
        sampleDataR        = rules.DiffPeaks.output.sampleDataR,
        peakFile           = rules.DiffPeaks.output.peakFile,
        peakFile2          = rules.DiffPeaks.output.peaks_tsv,
        normFacs           = rules.DiffPeaks.output.normFacs,
        condComp           = rules.DiffPeaks.output.condComp
    output:
        outputTSV            = TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.output.tsv.gz",
        outputPermTSV        = TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.outputPerm.tsv.gz",
        outputRDS            = TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.summary.rds",
        plot_diagnostic      = name_plotsDiag
    log: expand('{dir}/analyzeTF.{{TF}}.R.log', dir = LOG_BENCHMARK_DIR)
    message: "{ruleDisplayMessage}Run R script {script_analyzeTF} for TF {wildcards.TF}..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_r"
    params:
        doCyclicLoess = "true",
        allBAMS       = list(allBamFiles)
    script: dir_scripts + script_analyzeTF


rule summary1:
    input:
        peaks = rules.DiffPeaks.output.peaks_tsv,
        TF    = expand('{dir}/{TF}/{ext}/{compType}{TF}.summary.rds', dir = TF_DIR, TF = allTF, ext = extDir, compType = compType)
    output:
        outputTable = FINAL_DIR + "/" + compType + "TF_vs_peak_distribution.tsv.gz"
    log: expand('{dir}/summary1.R.log', dir = LOG_BENCHMARK_DIR)
    message: "{ruleDisplayMessage}Run R script {script_summary1} ..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_r"
    script: dir_scripts + script_summary1


# Python: Read in 640 files one by one, and for each, loop through all 1000 permutations and output to the corresponding output file.


rule concatenateMotifsPerm:
    input:
        diagnosticPlots = rules.summary1.output,
        TFMotifesPerm = expand('{dir}/{TF}/{extension}/{compType}{TF}.outputPerm.tsv.gz', dir = TF_DIR, TF = allTF, extension = extDir, compType = compType)
    output:
        allMotifsLog2FC  = temp(TEMP_EXTENSION_DIR + "/" + compType + "allMotifs_log2FC_perm{perm}.tsv.gz")
    log:
    message: "{ruleDisplayMessage}Concatenate all motifs for permutation {wildcards.perm}..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_conda"
    params:
        motifsShortPerm = TF_DIR + "/*/" + extDir + "/" + compType + "*.outputPerm.tsv.gz",
        colToExtract = lambda wc: str(int(wc.perm) + 3)
    shell:
      """
        zcat {params.motifsShortPerm} | cut -f1,2,{params.colToExtract} | gzip >{output.allMotifsLog2FC}
      """


# set +o pipefail has to be used here because head closes the pipe after reading the specified number of lines, causing a 141 error (128 + 13, SIGPIPE) that Snakemake would otherwise capture
# Use an anonymous named pipe to speed up execution times
rule calcNucleotideContent:
    input:
        diagnosticPlots = rules.summary1.output,
        TFMotifes     = expand('{dir}/{TF}/{extension}/{compType}{TF}.output.tsv.gz', dir = TF_DIR, TF = allTF, extension = extDir, compType = compType)
    output:
        allMotifsDetails = FINAL_DIR + "/" + compType + "allMotifs.tsv.gz",
        bed     = TEMP_EXTENSION_DIR + "/" + compType + "motifs.coord.nucContent.bed.gz"
    log:
    message: "{ruleDisplayMessage}Calculate nucleotide content via bedtools nuc for all TFBS..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_conda"
    params:
        motifsShort = TF_DIR + "/*/" + extDir + "/" + compType + "*.output.tsv.gz",
        # TFMotifes = construct a string that resembles the call to tail,
        refGenome = config["additionalInputFiles"]["refGenome_fasta"]
    shell:
      """
            set +o pipefail; cat <(gunzip -c {input.TFMotifes[1]} | head -1) <(zgrep "$(printf '^0\\t')" {params.motifsShort}) | gzip -f > {output.allMotifsDetails} &&
            bedtools nuc -fi {params.refGenome} -bed <(gunzip -c {output.allMotifsDetails} | awk '{{OFS="\\t"}}; NR > 1 {{print $3,$4,$5,$6,$2}}') | gzip -f > {output.bed}
        """

rule binningTF:
    input:
        nucContent  = rules.calcNucleotideContent.output.bed,
        sampleDataR        = rules.DiffPeaks.output.sampleDataR,
        motifes     = expand("{dir}/{compType}allMotifs_log2FC_perm{perm}.tsv.gz", dir = TEMP_EXTENSION_DIR, compType = compType, perm = list(range(0, nPermutationsAdjusted + 1)))
    output:
        permResults  = expand('{dir}/{{TF}}/{extension}/{compType}{{TF}}.permutationResults.rds', dir = TF_DIR, extension = extDir, compType = compType),
        summary      = expand('{dir}/{{TF}}/{extension}/{compType}{{TF}}.permutationSummary.tsv.gz', dir = TF_DIR, extension = extDir, compType = compType)
    log: expand('{dir}/binningTF.{{TF}}.R.log', dir = LOG_BENCHMARK_DIR)
    message: "{ruleDisplayMessage}Run R script {script_binningTF} for TF {wildcards.TF}..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_r"
    script: dir_scripts + script_binningTF

# Determine which files are produced by the rule depending on whether the classification should be run
allDiagnosticFiles = []
allDiagnosticFiles.append(FINAL_DIR + "/" + compType + "diagnosticPlots.pdf")
if config["par_general"]["RNASeqIntegration"]:
    classificationFiles = expand("{dir}/{compType}diagnosticPlotsClassification{no}.pdf", dir = FINAL_DIR, compType = compType, no = [1,2])
    allDiagnosticFiles.append(classificationFiles)


rule summaryFinal:
    input:
        allPermutationResults = expand('{dir}/{TF}/{extension}/{compType}{TF}.permutationSummary.tsv.gz', dir = TF_DIR, TF = allTF, extension = extDir, compType = compType),
        condComp              = rules.DiffPeaks.output.condComp,
        normCounts            = rules.DiffPeaks.output.normCounts,
        sampleDataR           = rules.DiffPeaks.output.sampleDataR,
    output:
        summary         = FINAL_DIR + "/" + compType + "summary.tsv.gz",
        circularPlot    = FINAL_DIR + "/" + compType + "summary.circular.pdf",
        volcanoPlot     = FINAL_DIR + "/" + compType + "summary.volcano.pdf",
        diagnosticPlots = allDiagnosticFiles,
        plotsRDS        = FINAL_DIR + "/" + compType + "summary.circular.rds",
    log: expand('{dir}/summaryFinal.R.log', dir = LOG_BENCHMARK_DIR)
    message: "{ruleDisplayMessage}Run R script {script_summaryFinal} ..."
    threads: 1
    singularity: "shub://chrarnold/Singularity_images:difftf_r"
    params: TFs = ",".join(allTF)
    script: dir_scripts + script_summaryFinal



rule cleanUpLogFiles:
    input: rules.summaryFinal.output
    output:
        warnLog  = LOG_BENCHMARK_DIR + "/" + compType + "all.warnings.log",
        errorLog = LOG_BENCHMARK_DIR + "/" + compType + "all.errors.log"
    message: "{ruleDisplayMessage}Clean and summarize Logs_and_Benchmark directory..."
    threads: 1
    params: dir = LOG_BENCHMARK_DIR
    singularity: "shub://chrarnold/Singularity_images:difftf_conda"
    shell:
      """
        grep -i "^WARN"  {params.dir}/*.log > {output.warnLog} || true &&
        grep -i "^FATAL" {params.dir}/*.log > {output.errorLog} || true &&
        rm {params.dir}/*.out {params.dir}/*.err || true
      """