Commit dba60d42 authored by Christian Arnold's avatar Christian Arnold

Major Update: featureCounts integration, TFBS naming updates, various other changes

parent 1c3e7071
No preview for this file type
......@@ -30,13 +30,10 @@
"additionalInputFiles":
{
"refGenome_fasta" : "referenceGenome/mm10.fa",
"dir_PWMScan" : "PWMscan",
"dir_TFBS" : "mm10/PWMScan_HOCOMOCOv10",
"RNASeqCounts" : "data/RNA-Seq/RNA.counts.tsv",
"HOCOMOCO_mapping" : "../../src/HOCOMOCO/translationTable_mouse.csv"
"HOCOMOCO_mapping" : "../../src/TF_Gene_TranslationTables/HOCOMOCO_v10/translationTable_mm10.csv"
}
}
......@@ -7,5 +7,5 @@ wget -O $filename https://www.embl.de/download/zaugg/diffTF/$filename && tar xvz
filename="mm10.fa.tar.gz"
wget -O $filename https://www.embl.de/download/zaugg/diffTF/referenceGenome/$filename && mkdir referenceGenome && tar xvzf $filename -C referenceGenome --overwrite && rm $filename
filename="PWMscan.mouse.tar.gz"
wget -O $filename https://www.embl.de/download/zaugg/diffTF/PWMScan/$filename && tar xvzf $filename --overwrite && rm $filename
filename="TFBS_mm10_PWMScan_HOCOMOCOv10.tar.gz"
wget -O $filename https://www.embl.de/download/zaugg/diffTF/TFBS/$filename && tar xvzf $filename --overwrite && rm $filename
SampleID Condition bamReads Peaks conditionSummary
GMP.WT.1 GMP data/bam/GMP.WT.1.final.sort.CGcorr.chr1.bam data/peaks/GMP.WT.1.final.sort.CGcorr_peaks.narrowPeak GMP
GMP.WT.2 GMP data/bam/GMP.WT.2.final.sort.CGcorr.chr1.bam data/peaks/GMP.WT.2.final.sort.CGcorr_peaks.narrowPeak GMP
GMP.WT.3 GMP data/bam/GMP.WT.3.final.sort.CGcorr.chr1.bam data/peaks/GMP.WT.3.final.sort.CGcorr_peaks.narrowPeak GMP
GMP.WT.4 GMP data/bam/GMP.WT.4.final.sort.CGcorr.chr1.bam data/peaks/GMP.WT.4.final.sort.CGcorr_peaks.narrowPeak GMP
MPP.WT.1 MPP data/bam/MPP.WT.1.final.sort.CGcorr.chr1.bam data/peaks/MPP.WT.1.final.sort.CGcorr_peaks.narrowPeak MPP
MPP.WT.2 MPP data/bam/MPP.WT.2.final.sort.CGcorr.chr1.bam data/peaks/MPP.WT.2.final.sort.CGcorr_peaks.narrowPeak MPP
MPP.WT.3 MPP data/bam/MPP.WT.3.final.sort.CGcorr.chr1.bam data/peaks/MPP.WT.3.final.sort.CGcorr_peaks.narrowPeak MPP
MPP.WT.4 MPP data/bam/MPP.WT.4.final.sort.CGcorr.chr1.bam data/peaks/MPP.WT.4.final.sort.CGcorr_peaks.narrowPeak MPP
GMP.WT.1 GMP /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/bam/GMP.WT.1.chr1.bam /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/peaks/GMP.WT.1.narrowPeak GMP
GMP.WT.2 GMP /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/bam/GMP.WT.2.chr1.bam /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/peaks/GMP.WT.2.narrowPeak GMP
GMP.WT.3 GMP /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/bam/GMP.WT.3.chr1.bam /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/peaks/GMP.WT.3.narrowPeak GMP
GMP.WT.4 GMP /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/bam/GMP.WT.4.chr1.bam /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/peaks/GMP.WT.4.narrowPeak GMP
MPP.WT.1 MPP /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/bam/MPP.WT.1.chr1.bam /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/peaks/MPP.WT.1.narrowPeak MPP
MPP.WT.2 MPP /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/bam/MPP.WT.2.chr1.bam /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/peaks/MPP.WT.2.narrowPeak MPP
MPP.WT.3 MPP /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/bam/MPP.WT.3.chr1.bam /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/peaks/MPP.WT.3.narrowPeak MPP
MPP.WT.4 MPP /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/bam/MPP.WT.4.chr1.bam /g/scb2/zaugg/carnold/Projects/diffTF/example/input/data/peaks/MPP.WT.4.narrowPeak MPP
......@@ -17,4 +17,4 @@ echo "# INCREASING THE NUMBER OF CORES SPEEDS UP THE ANALYSIS #"
echo "#########################################################"
# Real run, using 2 cores
snakemake --snakefile ../../src/Snakefile --cores 2 --configfile config.json --timestamp --directory .
snakemake --snakefile ../../src/Snakefile --cores 2 --configfile config.json --timestamp --directory . --forceall
# Record the script start time (presumably reported at the end of the full script — not visible in this chunk)
start.time <- Sys.time()
#########################
# LIBRARY AND FUNCTIONS #
#########################
# checkmate provides the assert*/check* validation helpers used throughout this script
library("checkmate")
# This script is invoked by Snakemake, which injects the S4 `snakemake` object (input/output/config/log slots)
assertClass(snakemake, "Snakemake")
assertDirectoryExists(snakemake@config$par_general$dir_scripts)
# Load the pipeline's shared helper functions (initFunctionsScript, checkAndLoadPackages, createDebugFile, ...)
source(paste0(snakemake@config$par_general$dir_scripts, "/functions.R"))
initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE)
# Packages needed by this script itself; Rsamtools supplies indexFa/scanFaIndex/scanBamHeader used below
checkAndLoadPackages(c("tidyverse", "futile.logger", "checkmate", "Rsamtools"), verbose = FALSE)
########################################################################
# SAVE SNAKEMAKE S4 OBJECT THAT IS PASSED ALONG FOR DEBUGGING PURPOSES #
########################################################################
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} and {TF} correspondingly.
# NOTE(review): the example path below references 6.binningTF, but this script looks like the input/parameter
# check step — confirm the correct debug-file name for this script.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/6.binningTF.{TF}.R.rds")
createDebugFile(snakemake)
# Script-local parameter list; populated from the snakemake object in the verification section below
par.l = list()
par.l$verbose = TRUE
par.l$log_minlevel = "INFO"
#####################
# VERIFY PARAMETERS #
#####################
assertClass(snakemake, "Snakemake")
## OUTPUT ##
# Expected named outputs: consPeaks (cleaned consensus peak file) and flag (completion marker)
assertList(snakemake@output, min.len = 1)
assertSubset(names(snakemake@output), c("", "consPeaks", "flag"))
par.l$output_peaksClean = snakemake@output$consPeaks
par.l$output_flag = snakemake@output$flag
## CONFIG ##
assertList(snakemake@config, min.len = 1)
# Optional user-supplied consensus peak file; "" means the pipeline derives peaks itself (see below)
file_peaks = snakemake@config$peaks$consensusPeaks
TFBS_dir = snakemake@config$additionalInputFiles$dir_TFBS
# NOTE(review): this checks the PARENT of dir_TFBS, not dir_TFBS itself, yet files are listed
# from TFBS_dir later — confirm whether assertDirectoryExists(TFBS_dir) was intended.
assertDirectoryExists(dirname(TFBS_dir), access = "r")
fastaFile = snakemake@config$additionalInputFiles$refGenome_fasta
assertFileExists(fastaFile)
# Write access is needed because indexFa() creates the .fai next to the fasta (samtools faidx limitation)
assertDirectoryExists(dirname(fastaFile), access = "w")
par.l$file_input_sampleData = snakemake@config$samples$summaryFile
checkAndLogWarningsAndErrors(par.l$file_input_sampleData, checkFileExists(par.l$file_input_sampleData, access = "r"))
## LOG ##
assertList(snakemake@log, min.len = 1)
par.l$file_log = snakemake@log[[1]]
# Create the log directory if it does not exist yet
allDirs = c(dirname(par.l$file_log))
testExistanceAndCreateDirectoriesRecursively(allDirs)
######################
# FINAL PREPARATIONS #
######################
# Start the file logger at the configured minimum level; any previous log is discarded
startLogger(par.l$file_log, par.l$log_minlevel, removeOldLog = TRUE)
printParametersLog(par.l)
######################
# LOAD ALL LIBRARIES #
######################
# Load the packages required by EVERY downstream pipeline step up front,
# presumably so that missing packages make the whole pipeline fail early here
# rather than hours into a run — confirm against the Snakefile's step scripts.
# Step 1
checkAndLoadPackages(c("tidyverse", "futile.logger", "DiffBind", "checkmate", "stats"), verbose = FALSE)
# Step 2
checkAndLoadPackages(c("tidyverse", "futile.logger", "DESeq2", "vsn", "csaw", "checkmate", "limma", "tools", "EDASeq", "geneplotter", "RColorBrewer", "BiocParallel", "rlist"), verbose = FALSE)
# Step 3
checkAndLoadPackages(c("tidyverse", "futile.logger", "DESeq2", "vsn", "modeest", "checkmate", "limma", "geneplotter", "RColorBrewer", "tools"), verbose = FALSE)
# Step 4
checkAndLoadPackages(c("tidyverse", "futile.logger", "modeest", "checkmate", "ggrepel"), verbose = FALSE)
# Step 5
checkAndLoadPackages(c("tidyverse", "futile.logger", "checkmate", "tools", "methods"), verbose = FALSE)
# Step 6
checkAndLoadPackages(c("tidyverse", "futile.logger", "lsr", "ggrepel", "checkmate", "tools", "methods", "boot"), verbose = TRUE)
#############################
# CHECK FASTA AND BAM FILES #
#############################
# Read the sample summary table; the bamReads column lists one BAM file per sample
sampleData.df = read_tsv(par.l$file_input_sampleData, col_names = TRUE, col_types = cols())
# Build the fasta index. Requires write access to the folder where the fasta is stored (limitation of samtools faidx)
indexFa(fastaFile)
indexes.df = as.data.frame(scanFaIndex(fastaFile))
indexes.df$seqnames = as.character(indexes.df$seqnames)
# Check individually for each BAM file that its chromosome lengths agree with the reference fasta
for (bamCur in sampleData.df$bamReads) {
  bamHeader = scanBamHeader(bamCur)
  chrBAM.df = as.data.frame(bamHeader[[1]]$targets)
  chrBAM.df$seqnames = rownames(chrBAM.df)
  colnames(chrBAM.df) = c("width", "seqnames")
  # full_join keeps chromosomes present in only one of the two files (their width is NA on the other side)
  merged.df = full_join(chrBAM.df, indexes.df, by = "seqnames")
  # Filter chromosomes and retain only those we keep in the pipeline
  discardMatches = grepl("^chrX|^chrY|^chrM|^chrUn|random|hap|_gl", merged.df$seqnames, perl = TRUE)
  # BUGFIX: grepl() returns a logical vector of length nrow(merged.df), so the
  # original test length(discardMatches) > 0 was always TRUE; any() tests
  # whether at least one chromosome actually matched the discard pattern.
  # Also: the original paste0(..., collapse = " ", sep = "\n") passed sep into
  # paste0's "..." (paste0 has no sep argument); build the separator explicitly.
  if (any(discardMatches)) {
    chrNamesDiscarded = paste0(merged.df$seqnames[discardMatches], collapse = "\n ")
    flog.info(paste0("Discard the following chromosomes for compatibility comparisons because they are filtered in later steps anyway:\n ", chrNamesDiscarded))
    merged.df = merged.df[!discardMatches,]
  }
  # Compare widths from the BAM header (width.x) and the fasta index (width.y);
  # which() drops NA comparisons, i.e. chromosomes present in only one file are not flagged here
  mismatches = merged.df$seqnames[which(merged.df$width.x != merged.df$width.y)]
  if (length(mismatches) > 0) {
    chrStr = paste0(mismatches, collapse = ",")
    message = paste0("Chromosome lengths differ between the fasta file ", fastaFile, " and the BAM file ", bamCur, " for chromosome(s) ", chrStr)
    checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  } else {
    flog.info(paste0("Reference fasta file ", fastaFile, " and BAM file ", bamCur, " are fully compatible."))
  }
}
###################
# CHECK PEAK FILE #
###################
# A user-supplied consensus peak file is optional; an empty string means the
# pipeline derives consensus peaks itself, and only a placeholder file is written.
if (file_peaks != "") {
  assertFileExists(snakemake@config$peaks$consensusPeaks)
  peaks.df = read_tsv(snakemake@config$peaks$consensusPeaks, col_names = FALSE)
  if (nrow(problems(peaks.df)) > 0) {
    # BUGFIX: the parsing problems belong to peaks.df; the original referenced
    # overlapsAll.df, an object that does not exist in this script.
    flog.fatal(paste0("Parsing errors: "), problems(peaks.df), capture = TRUE)
    stop("Parsing errors with file ", snakemake@config$peaks$consensusPeaks, ". See the log file for more information")
  }
  # A BED-like file needs at least chr, start, end
  if (ncol(peaks.df) < 3) {
    message = paste0("At least 3 columns are required, but only ", ncol(peaks.df), " columns have been found in the file ", snakemake@config$peaks$consensusPeaks)
    checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  }
  # Column sanity: chromosome is character, coordinates are integer-like
  assertCharacter(peaks.df$X1)
  assertIntegerish(peaks.df$X2)
  assertIntegerish(peaks.df$X3)
  # End coordinates must not be smaller than start coordinates
  assertIntegerish(peaks.df$X3 - peaks.df$X2, lower = 0)
  # Pad missing optional columns (name, score) so that exactly 5 columns remain
  if (ncol(peaks.df) == 3) {
    peaks.df$X4 = "TODO"
    peaks.df$X5 = 0
  } else if (ncol(peaks.df) == 4) {
    peaks.df$X5 = 0
  } else if (ncol(peaks.df) == 5) {
    # Already the expected 5-column layout, nothing to do
  } else {
    message = paste0("More than 5 columns found. Only the first 5 are needed, the remaining ones will be dropped for subsequent analysis...")
    checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
    peaks.df = peaks.df[,1:5]
  }
  # Overwrite original annotation, we use our own from here on
  peaks.df$X4 = paste0(peaks.df$X1, ":", peaks.df$X2, "-", peaks.df$X3)
  colnamesFinal = c("chr", "start", "end", "annotation", "score")
  colnames(peaks.df) = colnamesFinal
  # Drop duplicated positions (identical chr:start-end annotation)
  rowsBefore = nrow(peaks.df)
  peaks.df = dplyr::distinct(peaks.df, annotation, .keep_all = TRUE)
  nRowsFiltered = rowsBefore - nrow(peaks.df)
  if (par.l$verbose & nRowsFiltered > 0) flog.info(paste0("Filtered ", nRowsFiltered, " non-unique positions out of ", rowsBefore, " from peaks."))
  write_tsv(peaks.df, path = par.l$output_peaksClean, col_names = FALSE)
} else {
  # No consensus peaks provided: write a placeholder so the Snakemake output file exists
  cat("DUMMY FILE, DO NOT DELETE", file = par.l$output_peaksClean)
}
# Collect all TFBS bed files (one per TF, named *_TFBS.bed) from the TFBS directory
TFs = createFileList(TFBS_dir, "_TFBS.bed", verbose = FALSE)
if (length(TFs) == 0) {
  message = paste0("No files with pattern _TFBS.bed found in directory ", TFBS_dir, " as specified by the parameter \"dir_TFBS\"")
  checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
# Verify each TFBS file parses cleanly and has exactly 6 columns (BED6 layout)
for (TFCur in TFs) {
  tableCur.df = read_tsv(TFCur, col_names = FALSE, col_types = cols())
  if (nrow(problems(tableCur.df)) > 0) {
    flog.fatal(paste0("Parsing errors: "), problems(tableCur.df), capture = TRUE)
    stop("Parsing errors for file ", TFCur, ". See the log file for more information")
  }
  ncols = ncol(tableCur.df)
  if (ncols != 6) {
    message = paste0("Number of columns must be 6, but file ", TFCur, " has ", ncols, " instead")
    checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
  }
}
# BUGFIX: futile.logger's flog.info() treats its first argument as an sprintf
# format string and drops extra arguments without format specifiers, so the
# original call flog.info("Found ", length(TFs), ...) only logged "Found ".
# Build the full message with paste0 first, as done everywhere else in the file.
flog.info(paste0("Found ", length(TFs), " TF PWM bed files."))
# Write to dummy file to signal successful completion of all input checks
cat("DONE", file = par.l$output_flag)
......@@ -41,7 +41,7 @@ assertClass(snakemake, "Snakemake")
## INPUT ##
assertList(snakemake@input, min.len = 1)
assertSubset(names(snakemake@input), c("", "peaks"))
assertSubset(names(snakemake@input), c("", "peaks", "checkFlag"))
par.l$file_input_peaks = snakemake@input$peaks
......@@ -120,12 +120,12 @@ g = ggplot(consensusPeaks.df) + geom_density(aes(x = length)) + scale_x_continuo
ggsave(plot = g, filename = par.l$file_output_plot)
consensusPeaks.df$name = paste0(consensusPeaks.df$chr, ":", consensusPeaks.df$start, "-", consensusPeaks.df$end)
consensusPeaks.df$id = rownames(consensusPeaks.df)
consensusPeaks.df$annotation = paste0(consensusPeaks.df$chr, ":", consensusPeaks.df$start, "-", consensusPeaks.df$end)
# Delete columns we don't need
consensusPeaks.df$pvalue = NULL
consensusPeaks.df$length = NULL
# Deactivated now
# consensusPeaks.df$strand = "+"
# Reorganize columns
consensusPeaks.df = consensusPeaks.df[,c("chr", "start", "end", "annotation", "pvalue")]
write_tsv(consensusPeaks.df, path = par.l$file_output_consensusPeaks, col_names = FALSE)
......
......@@ -5,6 +5,10 @@ start.time <- Sys.time()
# LIBRARY AND FUNCTIONS #
#########################
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/2.DESeqPeaks.R.rds")
library("checkmate")
assertClass(snakemake, "Snakemake")
assertDirectoryExists(snakemake@config$par_general$dir_scripts)
......@@ -14,9 +18,7 @@ source(paste0(snakemake@config$par_general$dir_scripts, "/functions.R"))
# SAVE SNAKEMAKE S4 OBJECT THAT IS PASSED ALONG FOR DEBUGGING PURPOSES #
########################################################################
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/2.DESeqPeaks.R.rds")
createDebugFile(snakemake)
initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE)
......@@ -41,13 +43,13 @@ checkAndLogWarningsAndErrors(snakemake, checkClass(snakemake, "Snakemake"))
## INPUT ##
checkAndLogWarningsAndErrors(snakemake@input, checkList(snakemake@input, min.len = 1))
checkAndLogWarningsAndErrors(snakemake@input, checkSubset(names(snakemake@input), c("", "sampleData", "peaks")))
checkAndLogWarningsAndErrors(snakemake@input, checkSubset(names(snakemake@input), c("", "sampleData", "BAMPeakoverlaps")))
par.l$file_input_sampleData = snakemake@input$sampleData
checkAndLogWarningsAndErrors(par.l$file_input_sampleData, checkFileExists(par.l$file_input_sampleData, access = "r"))
par.l$files_input_peaks = snakemake@input$peaks
par.l$file_input_peakOverlaps = snakemake@input$BAMPeakoverlaps
for (fileCur in par.l$files_input_TF_summary) {
checkAndLogWarningsAndErrors(fileCur, checkFileExists(fileCur, access = "r"))
......@@ -180,63 +182,42 @@ if (datatypeVariableToPermute == "factor" & nLevels != 2) {
# ITERATE THROUGH PEAK FILES #
##############################
peaks.df = NULL
coverageAll.m = NULL
coverageAll.df = read_tsv(par.l$file_input_peakOverlaps, col_names = TRUE, comment = "#")
# TODO: Peak annotation verification: Really unique? Number of columns correct? Must be between 3 and 6
if (nrow(coverageAll.df) == 0) {
message = paste0("Empty file ", par.l$file_input_peaks, ".")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
flog.info(paste0("Iterating over ", length(par.l$files_input_peaks), " peak files "))
## transform as matrix data frame with counts
coverageAll.m = as.matrix(dplyr::select(coverageAll.df, -one_of("Geneid", "Chr", "Start", "End", "Strand", "Length")))
for (fileCur in par.l$files_input_peaks) {
# Take the basenames of the files, which have not been modified in the sorted versions of the BAMs as they are just located in different folders
sampleIDs = sampleData.df$SampleID[which(basename(sampleData.df$bamReads) %in% basename(colnames(coverageAll.m)))]
# TODO: Change accordingly: Between 4 and 7 here, set column names accordingly and
peaks.df = read_tsv(fileCur, col_names = c("chr", "PSS", "PES", "annotation", "ID", "coverage"), col_types = cols())
flog.info(paste0("Parsed peak file ", fileCur, " with ", nrow(peaks.df)," rows"))
if (nrow(peaks.df) == 0) {
message = paste0("The file ", fileCur, " is empty, no overlaps have been identified. This file will be skipped.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
} else {
#TODO: Needed?
peaks.df$identifier = paste0(peaks.df$chr,":", peaks.df$PSS,"-", peaks.df$PES)
# Filter and retain only unique identifiers
peaks.filtered.df = distinct(peaks.df, identifier, .keep_all = TRUE)
nRowsFiltered = nrow(peaks.df) - nrow(peaks.filtered.df)
if (par.l$verbose & nRowsFiltered > 0) flog.info(paste0("Filtered ", nRowsFiltered, " non-unique positions out of ", nrow(peaks.df), " from peaks table."))
peaks.df = peaks.filtered.df
# concatenate results from COV from each iteration
coverageAll.m = cbind(coverageAll.m, peaks.df$coverage)
}
}
if (length(unique(sampleIDs)) != nrow(sampleData.df)) {
message = paste0("Colnames mismatch.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
colnames(coverageAll.m) = sampleIDs
rownames(coverageAll.m) = coverageAll.df$Geneid
if (is.null(coverageAll.m)) {
message = paste0("All overlap files are empty. Cannot continue. A different peak file may solve the issue.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
peaks.df = dplyr::select(coverageAll.df, one_of(c("Chr", "Start", "End", "Geneid")))
colnames(peaks.df) = c("chr", "PSS", "PES", "annotation")
# Filter and retain only unique identifiers
peaks.filtered.df = distinct(peaks.df, annotation, .keep_all = TRUE)
nRowsFiltered = nrow(peaks.df) - nrow(peaks.filtered.df)
if (par.l$verbose & nRowsFiltered > 0) flog.info(paste0("Filtered ", nRowsFiltered, " non-unique positions out of ", nrow(peaks.df), " from peaks table."))
peaks.df = peaks.filtered.df
# Save the last, they are all identical anyway except for the count column
saveRDS(peaks.filtered.df, file = par.l$file_output_peaks)
## transform as matrix data frame with counts
coverageAll.m = as.matrix(coverageAll.m)
colnames(coverageAll.m) = sampleData.df$SampleID
rownames(coverageAll.m) = peaks.df$identifier # Take the first element as representative for nameCur, they are all identical anyway
#############################
# SAMPLE LABEL PERMUTATIONS #
#############################
......
......@@ -5,6 +5,11 @@ start.time <- Sys.time()
# LIBRARY AND FUNCTIONS #
#########################
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} and {TF} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/3.analyzeTF.{TF}.R.rds")
library("checkmate")
assertClass(snakemake, "Snakemake")
assertDirectoryExists(snakemake@config$par_general$dir_scripts)
......@@ -17,10 +22,7 @@ checkAndLoadPackages(c("tidyverse", "futile.logger", "DESeq2", "vsn", "modeest",
# SAVE SNAKEMAKE S4 OBJECT THAT IS PASSED ALONG FOR DEBUGGING PURPOSES #
########################################################################
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} and {TF} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/3.analyzeTF.{TF}.R.rds")
# snakemake = readRDS("/scratch/carnold/CLL/27ac_TF/output/Logs_and_Benchmarks/3.analyzeTF.R_TF=MAFK.S.rds")
createDebugFile(snakemake)
###################
......@@ -136,9 +138,10 @@ sampleData.l = readRDS(par.l$file_input_metadata)
# Initiate data structures that are populated hereafter
res_DESeq.l = list()
# TODO: Remove one of the annotation columns
TF_output.df = tribble(~permutation, ~TF, ~chr, ~MSS, ~MES, ~strand, ~PSS, ~PES, ~annotation, ~ID, ~identifier, ~baseMean, ~log2FoldChange, ~lfcSE, ~stat, ~pvalue, ~padj)
outputSummary.df = tribble(~permutation, ~TF, ~Pos_l2FC, ~Mean_l2FC, ~Median_l2FC, ~Mode_l2FC, ~sd, ~Ttest_pval, ~Modeskewness, ~T_statistic, ~TFBS_num)
outputSummary.df = tribble(~permutation, ~TF, ~Pos_l2FC, ~Mean_l2FC, ~Median_l2FC, ~Mode_l2FC, ~sd_l2FC, ~pvalue_raw, ~skewness_l2FC, ~T_statistic, ~TFBS_num)
# TODO
coverageAll.df = NULL
......@@ -159,21 +162,26 @@ if (length(colnamesNew) != nrow(sampleData.df)) {
message = "Could not grep sampleIDs from filenames."
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
overlapsAll.df = read_tsv(par.l$file_input_peakTFOverlaps, col_names = FALSE, col_types = cols())
overlapsAll.df = read_tsv(par.l$file_input_peakTFOverlaps, col_names = TRUE, col_types = cols(), comment = "#")
if (nrow(problems(overlapsAll.df)) > 0) {
flog.fatal(paste0("Parsing errors: "), problems(overlapsAll.df), capture = TRUE)
stop("Error when parsing the file ", fileCur, ", see warnings")
}
colnames(overlapsAll.df) = c("chr","MSS","MES","annotation","ID","strand","fileOrigin", colnamesNew)
colnames(overlapsAll.df) = c("annotation", "chr","MSS","MES", "strand","length", colnamesNew)
overlapsAll.df = overlapsAll.df %>%
dplyr::mutate(identifier = paste0(chr,":", MSS, "-",MES)) %>%
dplyr::mutate(mean = apply(dplyr::select(overlapsAll.df, one_of(colnamesNew)), 1, mean)) %>%
dplyr::distinct(identifier, .keep_all = TRUE) %>%
dplyr::select(-one_of("fileOrigin"))
dplyr::mutate(TFBSID = paste0(chr,":", MSS, "-",MES)) %>%
dplyr::mutate(mean = apply(dplyr::select(overlapsAll.df, one_of(colnamesNew)), 1, mean),
peakID = sapply(strsplit(overlapsAll.df$annotation, split = "_", fixed = TRUE),"[[", 1)) %>%
dplyr::distinct(TFBSID, .keep_all = TRUE) %>%
dplyr::select(-one_of("length"))
if (nrow(overlapsAll.df) > 0) {
......@@ -181,12 +189,12 @@ if (nrow(overlapsAll.df) > 0) {
# Group by ID
# take only the maximum row mean of all samples, sample with biggest coverage
coverageAll_grouped.df = overlapsAll.df %>%
dplyr::group_by(ID) %>%
dplyr::group_by(peakID) %>%
dplyr::slice(which.max(mean))
TF.table.m = as.matrix(coverageAll_grouped.df[,sampleData.df$SampleID])
colnames(TF.table.m) = sampleData.df$SampleID
rownames(TF.table.m) = coverageAll_grouped.df$identifier
rownames(TF.table.m) = coverageAll_grouped.df$TFBSID
......@@ -258,22 +266,32 @@ for (permutationCur in 0:par.l$nPermutations) {
normFacs = normFacs.l[[permutationName]]
# Sanity check
if (length(which(!coverageAll_grouped.df$annotation %in% rownames(normFacs))) > 0) {
if (length(which(!coverageAll_grouped.df$peakID %in% rownames(normFacs))) > 0) {
errorMessage <<- "Inconsistency detected between the normalization factor rownames and the object coverageAll_grouped.df"
checkAndLogWarningsAndErrors(NULL, errorMessage, isWarning = TRUE)
checkAndLogWarningsAndErrors(NULL, errorMessage, isWarning = FALSE)
}
if (!identical(colnames(TF.cds), colnames(normFacs))) {
errorMessage <<- "Column names differenht between TF.cds and normFacs"
checkAndLogWarningsAndErrors(NULL, errorMessage, isWarning = TRUE)
errorMessage <<- "Column names different between TF.cds and normFacs"
checkAndLogWarningsAndErrors(NULL, errorMessage, isWarning = FALSE)
}
if (!identical(coverageAll_grouped.df$TFBSID, rownames(TF.cds))) {
errorMessage <<- "Row names different between coverageAll_grouped.df and TF.cds"
checkAndLogWarningsAndErrors(NULL, errorMessage, isWarning = FALSE)
}
# assertSubset(rownames(normFacs))
if (par.l$doCyclicLoess) {
# coverageAll_grouped.df$TFBSID and rownames(TF.cds) are identical
matchingOrder = match(coverageAll_grouped.df$peakID,rownames(normFacs))
normalizationFactors(TF.cds) <- normFacs[matchingOrder,, drop = FALSE]
rownamesTFs = which(rownames(normFacs) %in% coverageAll_grouped.df$annotation)
normalizationFactors(TF.cds) <- normFacs[rownamesTFs,, drop = FALSE]
} else {
sizeFactors(TF.cds) = normFacs
}
......@@ -375,23 +393,24 @@ for (permutationCur in 0:par.l$nPermutations) {
assertSubset(rownames(res_DESeq.df), overlapsAll.df$identifier)
assertSubset(rownames(res_DESeq.df), overlapsAll.df$TFBSID)
# todo: check the coverage columns
rm_col = c("chr.y","annotation.y","identifier.y" )
order = c("permutation", "TF", "chr","MSS","MES","strand", "PSS","PES","annotation","ID", "identifier","baseMean", "log2FoldChange","lfcSE","stat", "pvalue","padj")
rm_col = c("chr.y")
# TODO
order = c("permutation", "TF", "chr","MSS","MES","TFBSID", "strand", "PSS","PES","peakID","baseMean", "log2FoldChange","lfcSE","stat", "pvalue","padj")
# dplyr::mutate(TF = par.l$TF) gives the following weird error message: Error: Unsupported type NILSXP for column "TF"
TFCur = par.l$TF
TF_outputCur.df = res_DESeq.df %>%
rownames_to_column(var = "identifier") %>%
dplyr::full_join(overlapsAll.df,by = c("identifier")) %>%
dplyr::full_join(peaksFiltered.df, by = "ID") %>%
rownames_to_column(var = "TFBSID") %>%
dplyr::full_join(overlapsAll.df,by = c("TFBSID")) %>%
dplyr::full_join(peaksFiltered.df, by = c("peakID" = "annotation")) %>%
dplyr::filter(!is.na(baseMean)) %>%
dplyr::rename(annotation = annotation.x, chr = chr.x, identifier = identifier.x) %>%
dplyr::rename(chr = chr.x) %>%
dplyr::select(-one_of(rm_col)) %>%
dplyr::mutate(TF = TFCur, permutation = permutationCur) %>%
dplyr::select(one_of(order)) %>%
......@@ -417,9 +436,9 @@ for (permutationCur in 0:par.l$nPermutations) {
Mean_l2FC = mean(final.TF.df$D2_l2FC, na.rm = TRUE),
Median_l2FC = median(final.TF.df$D2_l2FC, na.rm = TRUE),
Mode_l2FC = modeNum[[1]],
sd = sd(final.TF.df$D2_l2FC, na.rm = TRUE),
Ttest_pval = Ttest$p.value,
Modeskewness = modeNum[[2]],
sd_l2FC = sd(final.TF.df$D2_l2FC, na.rm = TRUE),
pvalue_raw = Ttest$p.value,
skewness_l2FC = modeNum[[2]],
T_statistic = Ttest$statistic[[1]],
TFBS_num = nrow(final.TF.df)
)
......
......@@ -54,7 +54,6 @@ for (fileCur in par.l$files_input_TF_summary) {
## OUTPUT ##
assertList(snakemake@output, min.len = 1)
#assertSubset(names(snakemake@output), c("", "volcanoPlot", "outputTable"))
assertSubset(names(snakemake@output), c("", "outputTable"))
#par.l$file_output_volcanoPlot = snakemake@output$volcanoPlot
......@@ -66,8 +65,7 @@ par.l$file_log = snakemake@log[[1]]
allDirs = c(#dirname(par.l$file_output_volcanoPlot),
dirname(par.l$file_output_table),
allDirs = c(dirname(par.l$file_output_table),
dirname(par.l$file_log)
)
......@@ -101,7 +99,7 @@ for (fileCur in par.l$files_input_TF_summary) {
peaks.df = read_tsv(par.l$file_input_peaks, col_types = cols())
assertSubset(colnames(peaks.df), c("permutation", "position", "D2_baseMean", "D2_l2FC", "D2_ldcSE", "D2_stat", "D2_pval", "D2_padj")) #, "vst_diff"))
assertSubset(colnames(peaks.df), c("permutation", "position", "D2_baseMean", "D2_l2FC", "D2_ldcSE", "D2_stat", "D2_pval", "D2_padj"))
summary.df = NULL
......@@ -117,6 +115,11 @@ for (fileCur in par.l$files_input_TF_summary) {
stats.df = readRDS(fileCur)
if (nrow(stats.df) == 0) {
message = paste0("File ", fileCur, " contains no rows, this TF will be skipped thereafter")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
}
if (is.null(summary.df)) {
summary.df = stats.df
} else {
......@@ -126,44 +129,39 @@ for (fileCur in par.l$files_input_TF_summary) {
} else {
message = paste0("File missing: ", fileCur)
flog.warn(message)
warning(message)
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
}
}
# TODO: Why same values for both TFs
nRowsSummary = nrow(summary.df)
nTF = length(unique(summary.df$TF))
flog.info(paste0(" Imported ", nRowsSummary, " TFs out of a list of ", nTFs, ". Missing: ", nTFs - nRowsSummary))
flog.info(paste0(" Imported ", nTF, " TFs out of a list of ", nTFs, ". Missing: ", nTFs - nTF))
nTFMissing = length(which(is.na(summary.df$Pos_l2FC)))
if (nTFMissing == nrow(summary.df)) {
error = "All TF have missing data. Cannot continue. Add more samples or change the peaks."
flog.fatal(error)
stop(error)
checkAndLogWarningsAndErrors(NULL, error, isWarning = FALSE)
}
# Replace p-values of 0 with the smallest p-value on the system
summary.df$Ttest_pval[summary.df$Ttest_pval == 0] = .Machine$double.xmin
summary.df$pvalue_raw[summary.df$pvalue_raw == 0] = .Machine$double.xmin
# TODO: was previously assigned to modeNum, but why?
mode_peaks = mlv(round(peaks.df$D2_l2FC, 2), method = "mfv", na.rm = TRUE)
mode_peaks = mlv(peaks.df$D2_l2FC, method = "mfv", na.rm = TRUE)
summary.df = summary.df %>%
dplyr::mutate(
adj_pvalue = p.adjust(Ttest_pval, method = "fdr"),
Diff_mean = Mean_l2FC - mean (peaks.df$D2_l2FC, na.rm = TRUE),
DiffMedian = Median_l2FC - median(peaks.df$D2_l2FC, na.rm = TRUE),
Diff_mode = Mode_l2FC - mode_peaks[[1]],
Diff_skew = Modeskewness - mode_peaks[[2]]) %>%
adj_pvalue = p.adjust(pvalue_raw, method = "fdr"),
Diff_mean = Mean_l2FC - mean(peaks.df$D2_l2FC, na.rm = TRUE),
DiffMedian = Median_l2FC - median(peaks.df$D2_l2FC, na.rm = TRUE),
Diff_mode = Mode_l2FC - mode_peaks[[1]],
Diff_skew = skewness_l2FC - mode_peaks[[2]]) %>%
na.omit(summary.df)