Commit 7ca183fa authored by Christian Arnold's avatar Christian Arnold

Decreased running time and memory footprint for prepareBinning step

parent dba60d42
......@@ -152,6 +152,18 @@ if (file_peaks != "") {
stop("Parsing errors with file ", snakemake@config$peaks$consensusPeaks, ". See the log file for more information")
}
flog.info(paste0("Peak file contains ", nrow(peaks.df), " peaks."))
if (nrow(peaks.df) > 100000) {
message = paste0("The number of peaks is very high, subsequent steps may be slow, particularly in the prepareBinning and binningTF steps. Make sure the preparingBinning step has enough memory available. We recommend at least 50 GB. Alternatively, consider decreasing the number of peaks for improved performance.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
if (snakemake@config$par_general$nPermutations > 5) {
message = paste0("In addition to the high number of peaks, more than 5 permutations have been selected, which will further increase running times and memory footprint. Consider decreasing the number of peaks for improved performance.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
}
}
if (ncol(peaks.df) < 3) {
message = paste0("At least 3 columns are required, but only ", ncol(peaks.df), " columns have been found in the file ", snakemake@config$peaks$consensusPeaks)
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
......
......@@ -168,7 +168,7 @@ overlapsAll.df = read_tsv(par.l$file_input_peakTFOverlaps, col_names = TRUE, col
if (nrow(problems(overlapsAll.df)) > 0) {
flog.fatal(paste0("Parsing errors: "), problems(overlapsAll.df), capture = TRUE)
stop("Error when parsing the file ", fileCur, ", see warnings")
stop("Error when parsing the file ", fileCur, ", see errors above")
}
......
......@@ -11,7 +11,7 @@ assertDirectoryExists(snakemake@config$par_general$dir_scripts)
source(paste0(snakemake@config$par_general$dir_scripts, "/functions.R"))
initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE)
checkAndLoadPackages(c("tidyverse", "futile.logger", "checkmate", "tools", "methods"), verbose = FALSE)
checkAndLoadPackages(c("tidyverse", "futile.logger", "checkmate", "tools", "methods", "BiocParallel", "rlist"), verbose = FALSE)
# methods needed here because Rscript does not loads this package automatically, see http://stackoverflow.com/questions/19468506/rscript-could-not-find-function
......@@ -106,80 +106,96 @@ if (nrow(TF.motifs.ori) == 0) {
stop(message)
}
# TODO: eventuell löschen: chr, MSS, MES, PSS, PES
#Filter permutations in the original files that the user does not want anymore
TF.motifs.ori = filter(TF.motifs.ori, permutation <= par.l$nPermutations)
TF.motifs.ori$CG.identifier = paste0(TF.motifs.ori$TF,":",TF.motifs.ori$TFBSID)
#########################
# READ ALL NUC CG FILES #
#########################
TF.motifs.CG = NULL
for (permutationCur in 0:par.l$nPermutations) {
permutationName = paste0("permutation", permutationCur)
if (permutationCur > 0) {
flog.info(paste0("Reading file from permutation ", permutationCur))
}
fileCur = par.l$files_input_nucContentGenome[permutationCur+1]
TF.motifs.CG.cur = read_tsv(fileCur, col_names = TRUE, col_types = cols())
if (ncol(TF.motifs.CG.cur) != 14) {
error = paste0("Expected 14 columns in file ", fileCur, ", but found ", ncol(TF.motifs.CG.cur), " instead. Aborting.")
checkAndLogWarningsAndErrors(NULL, error, isWarning = FALSE)
}
colnames(TF.motifs.CG.cur) = c("chr","MSS","MES","strand","TF","AT","CG","A","C","G","T","N","other_nucl","length")
nRowsNA = length(which(is.na(TF.motifs.CG.cur$CG)))
if (nRowsNA > 0) {
message <- paste0("The file ", fileCur, " contains ", nRowsNA, " rows out of ", nrow(TF.motifs.CG.cur), " with a missing value for the CG content, which most likely results from an assembly discordance between the BAM files and the specified fasta file. These regions will be removed in subsequent steps. The first 10 are printed with their first 3 columns here for debugging purposes:")
message = paste0(message, paste0(unlist(TF.motifs.CG.cur[1:10,"chr"]), ":", unlist(TF.motifs.CG.cur[1:10,"MSS"]), "-", unlist(TF.motifs.CG.cur[1:10,"MES"]), collapse = ", "))
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
TF.motifs.CG.cur = TF.motifs.CG.cur[-nRowsNA,]
}
TF.motifs.CG.cur$permutation = permutationCur
# Rbind and add permutation column
if (is.null(TF.motifs.CG)){
TF.motifs.CG = TF.motifs.CG.cur
} else {
TF.motifs.CG = rbind(TF.motifs.CG, TF.motifs.CG.cur)
}
}
readFile <- function(permutationCur, par.l) {
fileCur = par.l$files_input_nucContentGenome[permutationCur + 1]
if (permutationCur > 0) {
flog.info(paste0("Reading and processing file ", fileCur, " from permutation ", permutationCur))
} else {
flog.info(paste0("Reading and processing file ", fileCur, " from original data ", permutationCur))
}
TF.motifs.CG.cur = read_tsv(fileCur, col_names = TRUE,
col_types = list(
col_skip(), # "chr"
col_skip(), # "MSS"
col_skip(), # "MES"
col_character(), # "strand"
col_character(), # "TF",
col_skip(), # "AT"
col_number(), # "CG"
col_skip(), # "A"
col_skip(), # "C"
col_skip(), # "G"
col_skip(), # "T"
col_skip(), # "N"
col_skip(), # "other_nucl"
col_skip() # "length"
)
)
if (nrow(problems(TF.motifs.CG.cur)) > 0) {
flog.fatal(paste0("Parsing errors: "), problems(TF.motifs.CG.cur), capture = TRUE)
stop("Error when parsing the file ", fileCur, ", see errors above")
}
colnames(TF.motifs.CG.cur) = c("TFBSID","TF","CG")
nRowsNA = length(which(is.na(TF.motifs.CG.cur$CG)))
if (nRowsNA > 0) {
message <- paste0("The file ", fileCur, " contains ", nRowsNA, " rows out of ", nrow(TF.motifs.CG.cur), " with a missing value for the CG content, which most likely results from an assembly discordance between the BAM files and the specified fasta file. These regions will be removed in subsequent steps. The first 10 are printed with their first 3 columns here for debugging purposes:")
message = paste0(message, paste0(unlist(TF.motifs.CG.cur[1:10,"chr"]), ":", unlist(TF.motifs.CG.cur[1:10,"MSS"]), "-", unlist(TF.motifs.CG.cur[1:10,"MES"]), collapse = ", "))
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
TF.motifs.CG.cur = TF.motifs.CG.cur[-nRowsNA,]
}
TF.motifs.CG.cur = TF.motifs.CG.cur %>%
mutate(CG.identifier = paste0(TF,":" ,TFBSID), permutation = permutationCur) %>%
select(-one_of("TFBSID", "TF"))
return(TF.motifs.CG.cur)
}
TF.motifs.CG$CG.identifier = paste0(TF.motifs.CG$TF,":" ,TF.motifs.CG$chr ,":", TF.motifs.CG$MSS, "-", TF.motifs.CG$MES)
#########
# MERGE #
#########
#TF.motifs.ori = filter(TF.motifs.ori, permutation<6)
nCores = snakemake@threads
results.l = .execInParallelGen(nCores,
returnAsList = TRUE, listNames = paste0("permutation", 0:par.l$nPermutations),
iteration = 0:par.l$nPermutations,
abortIfErrorParallel = TRUE,
verbose = TRUE,
readFile, par.l)
# Remove duplicated rows
#TF.motifs.ori = TF.motifs.ori[!duplicated(TF.motifs.ori),]
TF.motifs.CG = do.call("rbind", results.l)
# concatenate the data in one df
drop.cols = c( "A","C","G","T","N","other_nucl","length","chr.y","MSS.y","MES.y","strand.y","AT","CG.identifier","TF.y")
#########
# MERGE #
#########
CGBins = seq(0,1, 1/par.l$nBins)
TF.motifs.all = TF.motifs.ori %>%
full_join(TF.motifs.CG, by = c("CG.identifier", "permutation")) %>%
select(-one_of(drop.cols)) %>%
mutate(CG.bins = cut(CG, breaks = CGBins, labels = paste0(round(CGBins[-1] * 100,0),"%"), include.lowest = TRUE)) %>%
#rename(TF = TF.x, chr = chr.x, MSS = MSS.x, MES = MES.x, strand = strand.x)
dplyr::rename(TF = TF.x, chr = chr.x, MSS = MSS.x, MES = MES.x)
select(-one_of("CG.identifier")) %>%
mutate(CG.bins = cut(CG, breaks = CGBins, labels = paste0(round(CGBins[-1] * 100,0),"%"), include.lowest = TRUE))
# Not needed anymore, delete
rm(TF.motifs.CG)
......@@ -188,6 +204,8 @@ rm(TF.motifs.ori)
# remove duplicated TFBS from different TFs to use in the permuations
TF.motifs.all.unique = TF.motifs.all[!duplicated(TF.motifs.all[,c("permutation", "TFBSID")]),]
# TODO: TFBSID and peakID needed?
flog.info(paste0("Found ", nrow(TF.motifs.all) - nrow(TF.motifs.all.unique), " duplicated TFBS across all TF."))
saveRDS(TF.motifs.all, file = par.l$file_output_allTF)
......
......@@ -528,7 +528,7 @@ rule prepareBinning:
allTFUniqueData = TEMP_EXTENSION_DIR + "/" + compType + "allTFUniqueData_processedForPermutations.rds"
log: expand('{dir}/5.prepareBinning.R.log', dir = LOG_BENCHMARK_DIR)
message: "{ruleDisplayMessage}Run R script {script_prepareBinning} ..."
threads: 1
threads: min(threadsMax, config["par_general"]["nPermutations"] + 1)
params:
nCGBins = nCGBins
script: dir_scripts + script_prepareBinning
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment