Commit 86df1be5 authored by Christian Arnold's avatar Christian Arnold

Version 1.1.2, see Changelog

parent fedb66d6
......@@ -31,6 +31,9 @@ We also put the paper on *bioRxiv*, please read all methodological details here:
Change log
============================
Version 1.1.2 (2018-08-03)
- fixed a bug that made the ``3.analyzeTF`` script fail in case when the number of permutations has been changed throughout the analysis or when the value is higher than the actual maximum number (thanks to Jonas Ungerbeck)
Version 1.1.1 (2018-08-01)
- Documentation updates (referenced the bioRxiv paper, extended the section about errors)
- updated the information on how to load the snakemake object into the R workspace in the corresponding R scripts
......
......@@ -8,7 +8,7 @@ start.time <- Sys.time()
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} and {TF} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/3.analyzeTF.{TF}.R.rds")
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/analyzeTF.{TF}.R.rds")
library("checkmate")
assertClass(snakemake, "Snakemake")
......@@ -144,6 +144,18 @@ if (length(sampleData.l) == 0) {
}
# Adjust the number of permutations in case less have been computed
if (par.l$nPermutations + 1 < length(sampleData.l)) {
message = paste0("In the output objects, more permutations seem to be stored. They will be ignored and the currently specified value of nPermutations will be used")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
} else if (par.l$nPermutations + 1 > length(sampleData.l)) {
valueNew = length(sampleData.l) - 1
message = paste0("The value of the parameter nPermutations differs from what is saved in the output objects. The value of nPermutations will be adjusted to ", valueNew)
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
par.l$nPermutations = valueNew
}
sampleData.df = sampleData.l[["permutation0"]]
colnamesNew = sampleData.df$SampleID[which(sampleData.df$bamReads %in% par.l$allBAMS)]
if (length(colnamesNew) != nrow(sampleData.df)) {
......@@ -303,19 +315,7 @@ if (skipTF) {
################################
# ITERATE THROUGH PERMUTATIONS #
################################
# Adjust the number of permutations in case less have been computed
if (par.l$nPermutations + 1 < length(sampleData.l)) {
message = paste0("In the output objects, more permutations seem to be stored. They will be ignored and the uoriginal value of nPermutations will be used")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
} else if (par.l$nPermutations + 1 > length(sampleData.l)) {
valueNew = length(sampleData.l) - 1
message = paste0("The value of the parameter nPermutations differs from what is saved in the output objects. The value of nPermutations will be adjusted to ", valueNew)
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
par.l$nPermutations = valueNew
}
# Calculate the log2 counts once
if (par.l$nPermutations > 0) {
......
......@@ -21,7 +21,7 @@ checkAndLoadPackages(c("tidyverse", "futile.logger", "lsr", "ggrepel", "checkmat
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} and {TF} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/5.binningTF.{TF}.R.rds")
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/binningTF.{TF}.R.rds")
createDebugFile(snakemake)
......@@ -45,7 +45,10 @@ assertClass(snakemake, "Snakemake")
## INPUT ##
assertList(snakemake@input, min.len = 1)
assertSubset(names(snakemake@input), c("", "nucContent", "motifes"))
assertSubset(names(snakemake@input), c("", "sampleDataR", "nucContent", "motifes"))
par.l$file_input_metadata = snakemake@input$sampleDataR
assertFileExists(par.l$file_input_metadata, access = "r")
par.l$file_input_nucContentGenome = snakemake@input$nucContent
assertFileExists(par.l$file_input_nucContentGenome , access = "r")
......@@ -129,6 +132,25 @@ if (calculateVariance && par.l$nBootstraps < 1000) {
flog.warn(paste0("The value for nBootstraps is < 1000. We strongly recommend using a higher value in order to reliably estimate the statistical variance."))
}
sampleData.l = readRDS(par.l$file_input_metadata)
if (length(sampleData.l) == 0) {
message = "Length of sampleData.l list is 0 but is has to be at least 1. Rerun the rule DiffPeaks."
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
# Adjust the number of permutations in case less have been computed
if (par.l$nPermutations + 1 < length(sampleData.l)) {
message = paste0("In the output objects, more permutations seem to be stored. They will be ignored and the currently specified value of nPermutations will be used")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
} else if (par.l$nPermutations + 1 > length(sampleData.l)) {
valueNew = length(sampleData.l) - 1
message = paste0("The value of the parameter nPermutations differs from what is saved in the output objects. The value of nPermutations will be adjusted to ", valueNew)
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
par.l$nPermutations = valueNew
par.l$files_input_TF_allMotives = par.l$files_input_TF_allMotives[1:(par.l$nPermutations+1)]
}
####################
# READ NUC CG FILE #
......@@ -188,44 +210,8 @@ CGBins = seq(0,1, 1/par.l$nBins)
# PERMUTATIONS #
################
# TODO: Take at most x TFBS per TF for background, not all
# TODO: Binning preparation and storing all TFBS in one object is too large and not feasible
# TODO:Possibility 1: For each permutation, read in all 640 TF files with specific columns. Results in 640 * 640x1000 > 400 Million read_tsv calls.
#TODO: Alternative: Use the new column layout and modify the shell call to extract only the required columns: 1,2, and 2+permutationCur rather than using grep. Results in 1000 extra jobs but only 640*1000 = 640,000 read_tsv calls and acceptable memory
#for (permutationCur in 0:par.l$nPermutations)
for (fileCur in par.l$files_input_TF_allMotives) {
# Init the col type list to skip colums directly
# colType.l = vector("list", length = 2 + par.l$nPermutations + 1)
# colType.l[[1]] = col_character() # "TF"
# colType.l[[2]] = col_character() # "TFBSID"
# for (n in 1:permutationCur) {
# indexLast = 2+n
# colType.l[[indexLast]] = col_skip()
# }
# colType.l[[indexLast+1]] = col_double() # "log2FoldChange", important to have double here as col_number would not parse numbers in scientific notation
# for (n in 1:(par.l$nPermutations - permutationCur )) {
# colType.l[[indexLast + 1 + n]] = col_skip()
# }
#
#
# TF.motifs.ori = tribble(~permutation, ~TF, ~TFBSID, ~log2FoldChange)
# # Iterate over all TF
# for (fileCur in allFiles) {
# TF.motifs.ori = read_tsv(fileCur, col_types = colType.l)
#
# if (nrow(problems(TF.motifs.ori)) > 0) {
# flog.fatal(paste0("Parsing errors: "), problems(TF.motifs.ori), capture = TRUE)
# stop("Error when parsing the file ", fileCur, ", see errors above")
# }
#
# }
#
#
# Log 2 fold-changes from the particular permutation
TF.motifs.ori = read_tsv(fileCur, col_names = TRUE,
col_types = list(
......@@ -245,6 +231,11 @@ for (fileCur in par.l$files_input_TF_allMotives) {
message = paste0("The file ", fileCur, " is empty. Something went wrong before. Make sure the previous steps succeeded.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
if (ncol(TF.motifs.ori) != 3) {
message = paste0("The file ", fileCur, " does not have 3 columns. Something is wrong with the number of permutations. We recommend restarting the pipeline from the DiffPeaks step.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
colnames(TF.motifs.ori) = c("TF", "TFBSID", "log2FoldChange")
......@@ -260,9 +251,9 @@ for (fileCur in par.l$files_input_TF_allMotives) {
#Filter permutations in the original files that the user does not want anymore
TF.motifs.ori = TF.motifs.ori %>%
filter(permutation <= par.l$nPermutations) %>%
dplyr::filter(permutation <= par.l$nPermutations) %>%
mutate(CG.identifier = paste0(TF,":",TFBSID)) %>%
select(-one_of("TF"))
dplyr::select(-one_of("TF"))
#########
......@@ -273,7 +264,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
TF.motifs.all = TF.motifs.ori %>%
full_join(TF.motifs.CG, by = c("CG.identifier")) %>%
mutate(CG.bins = cut(CG, breaks = CGBins, labels = paste0(round(CGBins[-1] * 100,0),"%"), include.lowest = TRUE)) %>%
select(-one_of("CG.identifier", "CG"))
dplyr::select(-one_of("CG.identifier", "CG"))
# Not needed anymore, delete
......@@ -489,7 +480,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
plot.df = filter(perm.l[[TFCur]], !is.na(meanDifference)) %>%
mutate(binNo = as.numeric(gsub("%", "", bin))) %>%
select(one_of("binNo", "meanDifference", "ratio_TFBS")) %>%
dplyr::select(one_of("binNo", "meanDifference", "ratio_TFBS")) %>%
dplyr::rename(weight = ratio_TFBS)
plot.new.df = reshape2::melt(plot.df, id = "binNo")
......
......@@ -19,7 +19,7 @@ checkAndLoadPackages(c("tidyverse", "futile.logger", "checkmate", "Rsamtools"),
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/0.checkParameters.R.rds")
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/checkParameters.R.rds")
createDebugFile(snakemake)
par.l = list()
......
......@@ -20,7 +20,7 @@ checkAndLoadPackages(c("tidyverse", "futile.logger", "DiffBind", "checkmate", "s
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/1.createConsensusPeaks.R.rds")
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/createConsensusPeaks.R.rds")
createDebugFile(snakemake)
###################
......
......@@ -7,7 +7,7 @@ start.time <- Sys.time()
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/2.DiffPeaks.R.rds")
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/diffPeaks.R.rds")
library("checkmate")
assertClass(snakemake, "Snakemake")
......
......@@ -18,7 +18,7 @@ checkAndLoadPackages(c("tidyverse", "futile.logger", "modeest", "checkmate", "gg
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/4.summary1.R.rds")
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/summary1.R.rds")
createDebugFile(snakemake)
###################
......
......@@ -14,7 +14,7 @@ source(paste0(snakemake@config$par_general$dir_scripts, "/functions.R"))
# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
# Replace {outputFolder} correspondingly.
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/6.summaryFinal.R.rds")
# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/summaryFinal.R.rds")
createDebugFile(snakemake)
initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE)
......
......@@ -611,6 +611,7 @@ rule calcNucleotideContent:
rule binningTF:
input:
nucContent = rules.calcNucleotideContent.output.bed,
sampleDataR = rules.DiffPeaks.output.sampleDataR,
motifes = expand("{dir}/{compType}allMotifs_log2FC_perm{perm}.tsv.gz", dir = TEMP_EXTENSION_DIR, compType = compType, perm = list(range(0, nPermutationsAdjusted + 1)))
output:
permResults = expand('{dir}/{{TF}}/{extension}/{compType}{{TF}}.permutationResults.rds', dir = TF_DIR, extension = extDir, compType = compType),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment