start.time <- Sys.time() ######################### # LIBRARY AND FUNCTIONS # ######################### library("checkmate") assertClass(snakemake, "Snakemake") assertDirectoryExists(snakemake@config$par_general$dir_scripts) source(paste0(snakemake@config$par_general$dir_scripts, "/functions.R")) initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE) checkAndLoadPackages(c("tidyverse", "futile.logger", "modeest", "checkmate", "ggrepel"), verbose = FALSE) ######################################################################## # SAVE SNAKEMAKE S4 OBJECT THAT IS PASSED ALONG FOR DEBUGGING PURPOSES # ######################################################################## # Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes) # Replace {outputFolder} correspondingly. # snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/4.summary1.R.rds") createDebugFile(snakemake) ################### #### PARAMETERS ### ################### par.l = list() # Hard-coded parameters par.l$verbose = TRUE par.l$min_pValue = .Machine$double.xmin par.l$FDR_threshold = 0.05 par.l$plot_min_diffMean = 0.02 par.l$log_minlevel = "INFO" ##################### # VERIFY PARAMETERS # ##################### assertClass(snakemake, "Snakemake") ## INPUT ## assertList(snakemake@input, min.len = 1) assertSubset(names(snakemake@input), c("", "peaks", "TF", "analysesTFOutput")) par.l$file_input_peaks = snakemake@input$peaks assertFileExists(par.l$file_input_peaks, access = "r") par.l$files_input_TF_summary = snakemake@input$TF for (fileCur in par.l$files_input_TF_summary) { assertFileExists(fileCur, access = "r") } ## OUTPUT ## assertList(snakemake@output, min.len = 1) assertSubset(names(snakemake@output), c("", "outputTable")) #par.l$file_output_volcanoPlot = snakemake@output$volcanoPlot par.l$file_output_table = snakemake@output$outputTable ## LOG ## assertList(snakemake@log, min.len = 1) par.l$file_log = snakemake@log[[1]] allDirs = c(dirname(par.l$file_output_table), dirname(par.l$file_log) ) testExistanceAndCreateDirectoriesRecursively(allDirs) ###################### # FINAL PREPARATIONS # ###################### startLogger(par.l$file_log, par.l$log_minlevel, removeOldLog = TRUE) printParametersLog(par.l) ################ # Collect data # ################ par.l$filesTransl.l = list() nTF = length(par.l$files_input_TF_summary) # Create translation table for (fileCur in par.l$files_input_TF_summary) { elements = strsplit(fileCur, split = "/", fixed = TRUE)[[1]] hit = which(grepl(pattern = "^extension", elements)) stopifnot(length(hit) == 1 & hit != 1) par.l$filesTransl.l[[fileCur]] = elements[hit - 1] } peaks.df = read_tsv(par.l$file_input_peaks, col_types = cols()) summary.df = NULL nTFs = length(par.l$filesTransl.l) flog.info(paste0("Using ", nTFs, " TFs")) for (fileCur in par.l$files_input_TF_summary) { TF = par.l$filesTransl.l[[fileCur]] stopifnot(!is.null(TF)) if (file.exists(fileCur)) { stats.df = readRDS(fileCur) if (nrow(stats.df) == 0) { message = paste0("File ", fileCur, " contains no rows, this TF will be skipped thereafter") checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE) } if (is.null(summary.df)) { summary.df = stats.df } else { summary.df = rbind(summary.df, stats.df) } } else { message = paste0("File missing: ", fileCur) checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE) } } nTF = length(unique(summary.df$TF)) flog.info(paste0(" Imported ", nTF, " TFs out of a list of ", nTFs, ". Missing: ", nTFs - nTF)) nTFMissing = length(which(is.na(summary.df$Pos_l2FC))) if (nTFMissing == nrow(summary.df)) { error = "All TF have missing data. Cannot continue. Add more samples or change the peaks." checkAndLogWarningsAndErrors(NULL, error, isWarning = FALSE) } # Replace p-values of 0 with the smallest p-value on the system summary.df$pvalue_raw[summary.df$pvalue_raw == 0] = .Machine$double.xmin mode_peaks = mlv(peaks.df$l2FC, method = "mfv", na.rm = TRUE) summary.df = summary.df %>% dplyr::mutate( adj_pvalue = p.adjust(pvalue_raw, method = "fdr"), Diff_mean = Mean_l2FC - mean(peaks.df$l2FC, na.rm = TRUE), Diff_median = Median_l2FC - median(peaks.df$l2FC, na.rm = TRUE), Diff_mode = Mode_l2FC - mode_peaks[[1]], Diff_skew = skewness_l2FC - mode_peaks[[2]]) %>% na.omit(summary.df) summary.df = mutate_if(summary.df, is.numeric, as.character) write_tsv(summary.df, par.l$file_output_table) # TODO: check the dec = "." parameter .printExecutionTime(start.time) flog.info("Session info: ", sessionInfo(), capture = TRUE)