-
Christian Arnold authoredChristian Arnold authored
summary1.R 4.82 KiB
start.time <- Sys.time()
#########################
# LIBRARY AND FUNCTIONS #
#########################
library("checkmate")
assertClass(snakemake, "Snakemake")
assertDirectoryExists(snakemake@config$par_general$dir_scripts)
source(paste0(snakemake@config$par_general$dir_scripts, "/functions.R"))
initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE)
checkAndLoadPackages(c("tidyverse", "futile.logger", "modeest", "checkmate", "ggrepel"), verbose = FALSE)
########################################################################
# SAVE SNAKEMAKE S4 OBJECT THAT IS PASSED ALONG FOR DEBUGGING PURPOSES #
########################################################################
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/4.summary1.R.rds")
createDebugFile(snakemake)
###################
#### PARAMETERS ###
###################
par.l = list()
# Hard-coded parameters
par.l$verbose = TRUE
par.l$min_pValue = .Machine$double.xmin
par.l$FDR_threshold = 0.05
par.l$plot_min_diffMean = 0.02
par.l$log_minlevel = "INFO"
#####################
# VERIFY PARAMETERS #
#####################
assertClass(snakemake, "Snakemake")
## INPUT ##
assertList(snakemake@input, min.len = 1)
assertSubset(names(snakemake@input), c("", "peaks", "TF", "analysesTFOutput"))
par.l$file_input_peaks = snakemake@input$peaks
assertFileExists(par.l$file_input_peaks, access = "r")
par.l$files_input_TF_summary = snakemake@input$TF
for (fileCur in par.l$files_input_TF_summary) {
assertFileExists(fileCur, access = "r")
}
## OUTPUT ##
assertList(snakemake@output, min.len = 1)
assertSubset(names(snakemake@output), c("", "outputTable"))
#par.l$file_output_volcanoPlot = snakemake@output$volcanoPlot
par.l$file_output_table = snakemake@output$outputTable
## LOG ##
assertList(snakemake@log, min.len = 1)
par.l$file_log = snakemake@log[[1]]
allDirs = c(dirname(par.l$file_output_table),
dirname(par.l$file_log)
)
testExistanceAndCreateDirectoriesRecursively(allDirs)
######################
# FINAL PREPARATIONS #
######################
startLogger(par.l$file_log, par.l$log_minlevel, removeOldLog = TRUE)
printParametersLog(par.l)
################
# Collect data #
################
par.l$filesTransl.l = list()
nTF = length(par.l$files_input_TF_summary)
# Create translation table
for (fileCur in par.l$files_input_TF_summary) {
elements = strsplit(fileCur, split = "/", fixed = TRUE)[[1]]
hit = which(grepl(pattern = "^extension", elements))
stopifnot(length(hit) == 1 & hit != 1)
par.l$filesTransl.l[[fileCur]] = elements[hit - 1]
}
peaks.df = read_tsv(par.l$file_input_peaks, col_types = cols())
summary.df = NULL
nTFs = length(par.l$filesTransl.l)
flog.info(paste0("Using ", nTFs, " TFs"))
for (fileCur in par.l$files_input_TF_summary) {
TF = par.l$filesTransl.l[[fileCur]]
stopifnot(!is.null(TF))
if (file.exists(fileCur)) {
stats.df = readRDS(fileCur)
if (nrow(stats.df) == 0) {
message = paste0("File ", fileCur, " contains no rows, this TF will be skipped thereafter")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
}
if (is.null(summary.df)) {
summary.df = stats.df
} else {
summary.df = rbind(summary.df, stats.df)
}
} else {
message = paste0("File missing: ", fileCur)
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
}
}
nTF = length(unique(summary.df$TF))
flog.info(paste0(" Imported ", nTF, " TFs out of a list of ", nTFs, ". Missing: ", nTFs - nTF))
nTFMissing = length(which(is.na(summary.df$Pos_l2FC)))
if (nTFMissing == nrow(summary.df)) {
error = "All TF have missing data. Cannot continue. Add more samples or change the peaks."
checkAndLogWarningsAndErrors(NULL, error, isWarning = FALSE)
}
# Replace p-values of 0 with the smallest p-value on the system
summary.df$pvalue_raw[summary.df$pvalue_raw == 0] = .Machine$double.xmin
mode_peaks = mlv(peaks.df$l2FC, method = "mfv", na.rm = TRUE)
summary.df = summary.df %>%
dplyr::mutate(
adj_pvalue = p.adjust(pvalue_raw, method = "fdr"),
Diff_mean = Mean_l2FC - mean(peaks.df$l2FC, na.rm = TRUE),
Diff_median = Median_l2FC - median(peaks.df$l2FC, na.rm = TRUE),
Diff_mode = Mode_l2FC - mode_peaks[[1]],
Diff_skew = skewness_l2FC - mode_peaks[[2]]) %>%
na.omit(summary.df)
summary.df = mutate_if(summary.df, is.numeric, as.character)
write_tsv(summary.df, par.l$file_output_table) # TODO: check the dec = "." parameter
.printExecutionTime(start.time)
flog.info("Session info: ", sessionInfo(), capture = TRUE)