summary1.R 5.29 KB
Newer Older
Christian Arnold's avatar
Christian Arnold committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
start.time  <-  Sys.time()


#########################
# LIBRARY AND FUNCTIONS #
#########################
library("checkmate")
assertClass(snakemake, "Snakemake")
assertDirectoryExists(snakemake@config$par_general$dir_scripts)
source(paste0(snakemake@config$par_general$dir_scripts, "/functions.R"))

initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE)
checkAndLoadPackages(c("tidyverse", "futile.logger", "modeest", "checkmate", "ggrepel"), verbose = FALSE)

########################################################################
# SAVE SNAKEMAKE S4 OBJECT THAT IS PASSED ALONG FOR DEBUGGING PURPOSES #
########################################################################

Christian Arnold's avatar
Christian Arnold committed
19 20
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} correspondingly.
21
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/summary1.R.rds")
22
createDebugFile(snakemake)
Christian Arnold's avatar
Christian Arnold committed
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67

###################
#### PARAMETERS ###
###################
par.l = list()

# Hard-coded parameters
par.l$verbose = TRUE
par.l$min_pValue   = .Machine$double.xmin
par.l$FDR_threshold = 0.05
par.l$plot_min_diffMean = 0.02
par.l$log_minlevel = "INFO"


#####################
# VERIFY PARAMETERS #
#####################

assertClass(snakemake, "Snakemake")

## INPUT ##
assertList(snakemake@input, min.len = 1)
assertSubset(names(snakemake@input), c("", "peaks", "TF", "analysesTFOutput"))

par.l$file_input_peaks = snakemake@input$peaks
assertFileExists(par.l$file_input_peaks, access = "r")

par.l$files_input_TF_summary = snakemake@input$TF
for (fileCur in par.l$files_input_TF_summary) {
  assertFileExists(fileCur, access = "r")
}

## OUTPUT ##
assertList(snakemake@output, min.len = 1)
assertSubset(names(snakemake@output), c("", "outputTable"))

#par.l$file_output_volcanoPlot = snakemake@output$volcanoPlot
par.l$file_output_table       = snakemake@output$outputTable

## LOG ##
assertList(snakemake@log, min.len = 1)
par.l$file_log = snakemake@log[[1]]



68
allDirs = c(dirname(par.l$file_output_table),
Christian Arnold's avatar
Christian Arnold committed
69 70 71 72 73 74 75 76 77 78
            dirname(par.l$file_log)
          )

testExistanceAndCreateDirectoriesRecursively(allDirs)


######################
# FINAL PREPARATIONS #
######################

Christian Arnold's avatar
Christian Arnold committed
79
startLogger(par.l$file_log, par.l$log_minlevel, removeOldLog = TRUE)
Christian Arnold's avatar
Christian Arnold committed
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
printParametersLog(par.l)


################
# Collect data #
################

par.l$filesTransl.l = list()
nTF = length(par.l$files_input_TF_summary)

# Create translation table
for (fileCur in par.l$files_input_TF_summary) {
  elements = strsplit(fileCur, split = "/",  fixed = TRUE)[[1]]
  hit =  which(grepl(pattern = "^extension", elements))
  stopifnot(length(hit) == 1 & hit != 1)
  par.l$filesTransl.l[[fileCur]] = elements[hit - 1]  
}



peaks.df = read_tsv(par.l$file_input_peaks, col_types = cols())

summary.df = NULL

nTFs = length(par.l$filesTransl.l)

flog.info(paste0("Using ", nTFs, " TFs"))

for (fileCur in par.l$files_input_TF_summary) {
  
  TF = par.l$filesTransl.l[[fileCur]]
  stopifnot(!is.null(TF))
  if (file.exists(fileCur)) {
    
    stats.df = readRDS(fileCur)
    
116 117 118 119 120
    if (nrow(stats.df) == 0) {
        message = paste0("File ", fileCur, " contains no rows, this TF will be skipped thereafter")
        checkAndLogWarningsAndErrors(NULL,  message, isWarning = TRUE)
    }
    
Christian Arnold's avatar
Christian Arnold committed
121 122 123 124 125 126 127 128 129
    if (is.null(summary.df)) {
      summary.df = stats.df
    } else {
      summary.df = rbind(summary.df, stats.df)
    }
    

  } else {
    message = paste0("File missing: ", fileCur)
130
    checkAndLogWarningsAndErrors(NULL,  message, isWarning = TRUE)
Christian Arnold's avatar
Christian Arnold committed
131 132 133 134
  }
  
}

135
nTF = length(unique(summary.df$TF))
Christian Arnold's avatar
Christian Arnold committed
136

137
flog.info(paste0(" Imported ", nTF, " TFs out of a list of ", nTFs, ". Missing: ", nTFs - nTF))
Christian Arnold's avatar
Christian Arnold committed
138 139 140 141

nTFMissing = length(which(is.na(summary.df$Pos_l2FC)))
if (nTFMissing == nrow(summary.df)) {
  error = "All TF have missing data. Cannot continue. Add more samples or change the peaks."
142
  checkAndLogWarningsAndErrors(NULL,  error, isWarning = FALSE)
Christian Arnold's avatar
Christian Arnold committed
143 144 145 146
}
  

# Replace p-values of 0 with the smallest p-value on the system
147
summary.df$pvalue_raw[summary.df$pvalue_raw == 0] = .Machine$double.xmin
Christian Arnold's avatar
Christian Arnold committed
148

149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
# Check the version of modeest, because version 2.3.2 introduced an implementation change that breaks things

if (packageVersion("modeest") < "2.3.2") {
    
    mode_peaks = mlv(peaks.df$l2FC, method = "mfv", na.rm = TRUE)
    
    stopifnot(is.list(mode_peaks))
    l2fc_mode = ifelse(is.null(mode_peaks$M), NA, mode_peaks$M)
    l2fc_skewness = ifelse(is.null(mode_peaks$skewness), NA, mode_peaks$skewness)
} else {
    
    l2fc_mode = mlv(peaks.df$l2FC, method = "mfv", na.rm = TRUE)[1]
    l2fc_skewness = skewness(peaks.df$l2FC, na.rm = TRUE)[1]
}

164

Christian Arnold's avatar
Christian Arnold committed
165 166 167

summary.df = summary.df %>%
              dplyr::mutate(
168 169 170
                  adj_pvalue  = p.adjust(pvalue_raw, method = "fdr"),
                  Diff_mean   = Mean_l2FC    -   mean(peaks.df$l2FC, na.rm = TRUE), 
                  Diff_median = Median_l2FC  - median(peaks.df$l2FC, na.rm = TRUE),
171 172
                  Diff_mode   = Mode_l2FC    - l2fc_mode,    
                  Diff_skew   = skewness_l2FC - l2fc_skewness)  %>%
Christian Arnold's avatar
Christian Arnold committed
173 174
              na.omit(summary.df)

175
summary.df = mutate_if(summary.df, is.numeric, as.character)
176
write_tsv(summary.df, par.l$file_output_table) # TODO: check the dec = "." parameter
Christian Arnold's avatar
Christian Arnold committed
177 178 179 180

.printExecutionTime(start.time)

flog.info("Session info: ", sessionInfo(), capture = TRUE)