Commit c2843bcf authored by Christian Arnold's avatar Christian Arnold

Version 1.4, see Changelog for details

parent 2943e208
......@@ -44,7 +44,7 @@ Principally, there are two ways of installing *diffTF* and the proper tools:
.. code-block:: Bash
git clone https://git.embl.de/grp-zaugg/diffTF
git clone https://git.embl.de/grp-zaugg/diffTF.git
If you receive an error, *Git* may not be installed on your system. If you run Ubuntu, try the following command:
......
......@@ -55,9 +55,9 @@ author = 'Christian Arnold, Ivan Berest, Judith B. Zaugg'
# built documents.
#
# The short X.Y version.
version = '1.3'
version = '1.4'
# The full version, including alpha/beta/rc tags.
release = '1.3.3'
release = '1.4'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......
......@@ -53,6 +53,14 @@ We also put the paper on *bioRxiv*, please read all methodological details here:
Change log
============================
Version 1.4 (2019-09-24)
- Various small improvements for increasing user experience
- For the peaks and TF-specific diagnostic plots, the pairwise mean-average plots between pairs of samples has been disabled by default. It was set to a maximum of 5 previously, but due to its time consuming nature and limited usage, we feel this is not needed for most users. We might add a parameter in the future to adjust this more flexibly, contact us if you want to have them back.
- various small improvements for the various diagnostic plots from the last step (summaryFinal), most of which concern classification-related plots and plots for the permutation-based approach
- For all output tables, diffTF now outputs only 2-3 significant digits, which reduces the size of some output tables significantly. THe memory footprint is thus overall decreased. More digits are not needed in our opinion.
- added diagnostic plots for the RNA-Seq data that is used for the classification. These include MA plots (based on original and shrunken log2 fold changes, (non)normalized log count density plots across all samples, and a mean-sd plot),
- added preliminary support for interaction terms in the design formula. This was not possible before but should now work without diffTF failing. if you continue having issues, please let us know.
Version 1.3.3 (2019-07-25)
- Fixed a bug that caused erroneously small p-values for the permutation-based approach in cases when the number of permutations that was actually done was smaller than what was specified in the configuration file (e.g., if 1000 permutations have been specified in the configuration file, but only 70 were actually done such as for a typical 4 vs 4 analysis). This is now corrected, and the last step of the pipeline (*summaryFinal*) should be rerun to correct for it. Although this happened in the special circumstances as described above (small number of samples and yet using the permutation-based approach), we apologize for this oversight. Thanks to Frauke Huth for noticing!
......
......@@ -475,9 +475,9 @@ if (skipTF) {
pdf(par.l$file_output_plot_diagnostic)
if (par.l$nPermutations == 0) {
plotDiagnosticPlots(TF.cds.filt, res_DESeq, conditionComparison, filename = NULL, maxPairwiseComparisons = 5)
plotDiagnosticPlots(TF.cds.filt, res_DESeq, conditionComparison, filename = NULL, maxPairwiseComparisons = 0, plotMA = FALSE)
} else {
plotDiagnosticPlots(TF.cds.filt, fit, conditionComparison, filename = NULL, maxPairwiseComparisons = 5)
plotDiagnosticPlots(TF.cds.filt, fit, conditionComparison, filename = NULL, maxPairwiseComparisons = 0, plotMA = FALSE)
}
......@@ -525,19 +525,31 @@ if (skipTF) {
# Check the version of modeest, because version 2.3.2 introduced an implementation change that breaks things
if (packageVersion("modeest") < "2.3.2") {
if (nrow(dplyr::filter(final.TF.df, !is.na(l2FC))) > 0) {
modeNum = mlv(final.TF.df$l2FC, method = "mfv", na.rm = TRUE)
if (packageVersion("modeest") < "2.3.2") {
modeNum = mlv(final.TF.df$l2FC, method = "mfv", na.rm = TRUE)
stopifnot(is.list(modeNum))
l2fc_mode = ifelse(is.null(modeNum$M), NA, modeNum$M)
l2fc_skewness = ifelse(is.null(modeNum$skewness), NA, modeNum$skewness)
} else {
l2fc_mode = mlv(final.TF.df$l2FC, method = "mfv", na.rm = TRUE)[1]
l2fc_skewness = skewness(final.TF.df$l2FC, na.rm = TRUE)[1]
}
stopifnot(is.list(modeNum))
l2fc_mode = ifelse(is.null(modeNum$M), NA, modeNum$M)
l2fc_skewness = ifelse(is.null(modeNum$skewness), NA, modeNum$skewness)
} else {
l2fc_mode = mlv(final.TF.df$l2FC, method = "mfv", na.rm = TRUE)[1]
l2fc_skewness = skewness(final.TF.df$l2FC, na.rm = TRUE)[1]
message = paste0("l2fc values could not calculated for any TFBS, this may happen for complex design formulas in combination with particular permutations. For permutation ", permutationCur, ", all values are NA.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
# For rare occassions, if only NA data are available, set l2fc_mode and l2fc_skewness to NA also
l2fc_mode = l2fc_skewness = NA
}
# We have to filter now by NAs because for some permutations, limma might have been unable to calculate the coefficients
final.TF.filtered.df = dplyr::filter(final.TF.df, !is.na(l2FC))
......@@ -592,7 +604,7 @@ if (skipTF) {
} # end for each permutation
if (!skipTF) {
TF_outputInclPerm.df = as.tibble(log2fc.m) %>%
TF_outputInclPerm.df = as_tibble(log2fc.m) %>%
add_column(TF = TFCur , TFBSID = TF_outputCur.df$TFBSID, .before = 1)
colnames(TF_outputInclPerm.df)[3:ncol(TF_outputInclPerm.df)] = paste0("log2fc_perm", 0:par.l$nPermutations)
......@@ -600,11 +612,18 @@ if (skipTF) {
} # end if !skipTF
TF_output.df = mutate_if(TF_output.df, is.numeric, as.character)
# Do it separately for each column because different rounding schemes might be needed
TF_output.df = TF_output.df %>%
mutate_at(c("l2FC", "limma_avgExpr", "limma_B", "limma_t_stat", "DESeq_ldcSE", "DESeq_stat", "DESeq_baseMean"), signif, 3) %>%
mutate_at(c("pval", "pval_adj"), formatC, format = "g", digits = 3)
# TF_output.df = mutate_if(TF_output.df, is.numeric, as.character)
write_tsv(TF_output.df, path = par.l$file_output_summaryAll)
TF_outputInclPerm.df = mutate_if(TF_outputInclPerm.df, is.numeric, as.character)
# All numeric columns can be treated the same way
TF_outputInclPerm.df = mutate_if(TF_outputInclPerm.df, is.numeric, signif, 2)
write_tsv(TF_outputInclPerm.df, path = par.l$file_outputPerm_summaryAll, col_names = FALSE)
saveRDS(outputSummary.df, file = par.l$file_output_summaryStats)
......
......@@ -113,10 +113,9 @@ checkAndLoadPackages(c("tidyverse", "futile.logger", "checkmate", "tools", "meth
# Step 6: summaryFinal
checkAndLoadPackages(c("tidyverse", "futile.logger","ggrepel", "checkmate", "tools", "methods", "grDevices", "pheatmap"), verbose = FALSE)
# Only needed if RNA-Seq integration is activated
if (par.l$plotRNASeqClassification) {
# Require some more packages here
checkAndLoadPackages(c( "lsr", "DESeq2", "matrixStats", "pheatmap", "preprocessCore"), verbose = FALSE)
checkAndLoadPackages(c( "lsr", "DESeq2", "matrixStats", "pheatmap", "preprocessCore", "apeglm"), verbose = FALSE)
}
# Check the version of readr, at least 1.1.0 is required to properly write gz files
......@@ -207,10 +206,16 @@ for (bamCur in sampleData.df$bamReads) {
chrBAM.df$seqnames = rownames(chrBAM.df)
colnames(chrBAM.df) = c("width", "seqnames")
# Check whether BAM file contains chromosomes with the "chr" notation
if (length(which(grepl("^chr", chrBAM.df$seqnames))) == 0) {
message = paste0("File ", bamCur, " does not have the correct chromosome names. The \"chr\" prefix is required for proper chromosome names, but they were not found. Check your BAM files and use samtools and sed to add \"chr\" to each chromosome name")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
merged.df = full_join(chrBAM.df, indexes.df, by = "seqnames")
# Filter chromosomes and retain only those we keep in the pipeline
discardMatches = grepl("^chrX|^chrY|^chrM|^chrUn|random|hap|_gl", merged.df$seqnames, perl = TRUE)
discardMatches = grepl("^chrX|^chrY|^chrM|^chrUn|random|hap|_gl|^GL", merged.df$seqnames, perl = TRUE)
if (length(discardMatches) > 0) {
chrNamesDiscarded = paste0(merged.df$seqnames[discardMatches], collapse = " ", sep = "\n")
......
......@@ -135,7 +135,8 @@ if (components3types["conditionSummary"] == "logical" | components3types["condit
}
# Read and modify samples metadata
sampleData.df = mutate(sampleData.df, name = file_path_sans_ext(basename(sampleData.df$bamReads)))
sampleData.df = mutate(sampleData.df, name = file_path_sans_ext(basename(sampleData.df$bamReads)),
SampleID = as.character(SampleID))
# Check and change column types as specified in the design formula
......@@ -429,7 +430,7 @@ if (par.l$nPermutations == 0) {
)
plotDiagnosticPlots(cds.peaks.filt, fit, comparisonDESeq, par.l$file_output_plots, maxPairwiseComparisons = 20)
plotDiagnosticPlots(cds.peaks.filt, fit, comparisonDESeq, par.l$file_output_plots, maxPairwiseComparisons = 0, plotMA = TRUE)
} else {
......@@ -510,7 +511,13 @@ if (par.l$nPermutations > 0) {
saveRDS(sampleData.l, par.l$file_output_metadata)
final.peaks.df = mutate_if(final.peaks.df, is.numeric, as.character)
# Do it separately for each column because different rounding schemes might be needed
final.peaks.df = final.peaks.df %>%
mutate(permutation = as.integer(permutation)) %>%
mutate_at(c("DESeq_baseMean", "l2FC", "DESeq_ldcSE", "DESeq_stat"), signif, 3) %>%
mutate_at(c("pval", "pval_adj"), formatC, format = "g", digits = 3)
write_tsv(final.peaks.df, path = par.l$file_output_peaksTSV)
#final.peaks.perm.df = mutate_if(final.peaks.perm.df, is.numeric, as.character)
......
......@@ -404,6 +404,26 @@ convertToFormula <- function(userFormula, validColnames = NULL) {
# Check colmn names
if (!is.null(colnames)) {
formulaVariables = attr(terms(designFormula), "term.labels")
indexInteractionTerms = which(grepl(":", formulaVariables))
if (length(indexInteractionTerms) > 0) {
interactionTerms = formulaVariables[indexInteractionTerms]
# Check whether all individual items have been specified
for (interactionTermCur in interactionTerms) {
componentsCur = strsplit(interactionTermCur, ":")[[1]]
if (!all(componentsCur %in% formulaVariables)) {
message = paste0("Design formula is incorrect, not all terms from the interactions (", paste0(componentsCur, collapse = ","), ") have been specified individually.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
}
formulaVariables = formulaVariables[-indexInteractionTerms]
}
assertSubset(formulaVariables, validColnames)
}
......@@ -437,10 +457,10 @@ myMAPlot <- function(M, idx, main, minMean = 0) {
}
plotDiagnosticPlots <- function(dd, differentialResults, conditionComparison, filename = NULL, maxPairwiseComparisons = 5, alpha = 0.05) {
plotDiagnosticPlots <- function(dd, differentialResults, conditionComparison, filename = NULL, maxPairwiseComparisons = 5, plotMA = FALSE, alpha = 0.05) {
checkAndLoadPackages(c("tidyverse", "checkmate", "geneplotter", "DESeq2", "vsn", "RColorBrewer", "limma"), verbose = FALSE)
flog.info(paste0("Plotting various diagnostic plots"))
assertClass(dd, "DESeqDataSet")
......@@ -452,17 +472,38 @@ plotDiagnosticPlots <- function(dd, differentialResults, conditionComparison, fi
pdf(filename)
}
if (testClass(differentialResults, "MArrayLM")) {
title = paste0("limma results\n", conditionComparison[1], " vs. ", conditionComparison[2])
isSign = ifelse(p.adjust(differentialResults$p.value[,ncol(differentialResults$p.value)], method = "BH") < alpha, paste0("sign. (BH, ", alpha, ")"), "not-significant")
limma::plotMA(differentialResults, main = title, status = isSign)
} else {
if (plotMA) {
# TODO: DeSEQ diagnostic plots
if (testClass(differentialResults, "MArrayLM")) {
title = paste0("limma results\n", conditionComparison[1], " vs. ", conditionComparison[2])
isSign = ifelse(p.adjust(differentialResults$p.value[,ncol(differentialResults$p.value)], method = "BH") < alpha, paste0("sign. (BH, ", alpha, ")"), "not-significant")
flog.info(paste0(" Plotting MA plots from limma..."))
limma::plotMA(differentialResults, main = title, status = isSign)
} else {
# DESeq2 specific MA plots
flog.info(paste0(" Plotting MA plot from DESeq (1)..."))
# 1. Regular MA plot:
# shows the log2 fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet
DESeq2::plotMA(dd, main = "Regular MA plot")
# 2. MA plot based on shrunken log2 fold changes
# Shrinkage of effect size (LFC estimates) is useful for visualization and ranking of genes. To shrink the LFC, we pass the dds object to the function lfcShrink. Below we specify to use the apeglm method for effect size shrinkage (Zhu, Ibrahim, and Love 2018), which improves on the previous estimator.
# It is more useful visualize the MA-plot for the shrunken log2 fold changes, which remove the noise associated with log2 fold changes from low count genes without requiring arbitrary filtering thresholds.
flog.info(paste0(" Plotting MA plot from DESeq (2)..."))
dd_LFC <- lfcShrink(dd, coef=resultsNames(dd)[length(resultsNames(dd))], type="apeglm")
DESeq2::plotMA(dd_LFC, main = "MA plot based on shrunken log2 fold changes")
}
}
......@@ -473,6 +514,7 @@ plotDiagnosticPlots <- function(dd, differentialResults, conditionComparison, fi
colors = colorRampPalette(brewer.pal(9, "Set1"))(nSamples)
xlabCur = "Mean log counts"
flog.info(paste0(" Plotting multidensity plot..."))
if (nSamples < 10) {
......@@ -493,38 +535,40 @@ plotDiagnosticPlots <- function(dd, differentialResults, conditionComparison, fi
multiecdf(log(DESeq2::counts(dd, normalized = TRUE) + 0.5) , xlab = xlabCur, main = "Normalized log counts", legend = NULL, col = colors)
}
# 3. Pairwise sample comparisons.
# To further assess systematic differences between the samples, we can also plot pairwise mean–average plots: We plot the average of the log–transformed counts vs the fold change per gene for each of the sample pairs.
MA.idx = t(combn(seq_len(dim(colData(dd))[1]), 2))
if (nrow(MA.idx) > maxPairwiseComparisons) {
flog.info(paste0("The number of pairwise comparisons to plot exceeds the current maximum of ", maxPairwiseComparisons, ". Only ", maxPairwiseComparisons, " pairwise comparisons will be shown in the PDF."))
MA.idx.filt = MA.idx[1:maxPairwiseComparisons,, drop = FALSE]
} else {
MA.idx.filt = MA.idx
}
for (i in seq_along(MA.idx.filt[,1])) {
if (maxPairwiseComparisons > 0) {
flog.info(paste0(" Plotting pairwise sample comparisons. This amy take a while."))
# 3. Pairwise sample comparisons.
# To further assess systematic differences between the samples, we can also plot pairwise mean–average plots: We plot the average of the log–transformed counts vs the fold change per gene for each of the sample pairs.
MA.idx = t(combn(seq_len(dim(colData(dd))[1]), 2))
if (nrow(MA.idx) > maxPairwiseComparisons) {
flog.info(paste0("The number of pairwise comparisons to plot exceeds the current maximum of ", maxPairwiseComparisons, ". Only ", maxPairwiseComparisons, " pairwise comparisons will be shown in the PDF."))
MA.idx.filt = MA.idx[1:maxPairwiseComparisons,, drop = FALSE]
flog.info(paste0(" Plotting pairwise comparison ", i, " out of ", nrow(MA.idx.filt)))
label = paste0(colnames(dd)[MA.idx.filt[i,1]], " vs ", colnames(dd)[MA.idx.filt[i,2]])
suppressWarnings(print(myMAPlot(DESeq2::counts(dd, normalized = TRUE), c(MA.idx[i,1], MA.idx.filt[i,2]), main = label)))
}
} else {
MA.idx.filt = MA.idx
}
for (i in seq_along(MA.idx.filt[,1])) {
flog.info(paste0(" Plotting pairwise comparison ", i, " out of ", nrow(MA.idx.filt)))
label = paste0(colnames(dd)[MA.idx.filt[i,1]], " vs ", colnames(dd)[MA.idx.filt[i,2]])
suppressWarnings(print(myMAPlot(DESeq2::counts(dd, normalized = TRUE), c(MA.idx[i,1], MA.idx.filt[i,2]), main = label)))
}
# Show an empty page with a warning if plots have been omitted
if (nrow(MA.idx) > nrow(MA.idx.filt)) {
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
message = paste0("All remaining pairwise comparisons plots\nbetween samples have been omitted\nfor time and memory reasons.\nThe current maximum is set to ", maxPairwiseComparisons, ".")
text(x = 0.5, y = 0.5, message, cex = 1.6, col = "red")
}
# Show an empty page with a warning if plots have been omitted
if (nrow(MA.idx) > nrow(MA.idx.filt)) {
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
message = paste0("All remaining pairwise comparisons plots\nbetween samples have been omitted\nfor time and memory reasons.\nThe current maximum is set to ", maxPairwiseComparisons, ".")
text(x = 0.5, y = 0.5, message, cex = 1.6, col = "red")
}
# 4. Mean SD plot: Plot row standard deviations versus row means
notAllZeroPeaks <- (rowSums(DESeq2::counts(dd)) > 0)
......@@ -691,7 +735,7 @@ heatmap.act.rep <- function(df.tf.peak.matrix, HOCOMOCO_mapping.df.exp, cor.m, p
missingGenes = which(!HOCOMOCO_mapping.df.exp$ENSEMBL %in% colnames(cor.m))
if (length(missingGenes) > 0) {
HOCOMOCO_mapping.df.exp = filter(HOCOMOCO_mapping.df.exp, ENSEMBL %in% colnames(cor.m))
HOCOMOCO_mapping.df.exp = dplyr::filter(HOCOMOCO_mapping.df.exp, ENSEMBL %in% colnames(cor.m))
}
cor.r.pearson.m <- cor.m[,as.character(HOCOMOCO_mapping.df.exp$ENSEMBL)]
......@@ -809,6 +853,26 @@ checkDesignIntegrity <- function(snakemake, par.l, sampleData.df, useRNA = FALSE
}
formulaVariables = attr(terms(designFormula), "term.labels")
indexInteractionTerms = which(grepl(":", formulaVariables))
if (length(indexInteractionTerms) > 0) {
interactionTerms = formulaVariables[indexInteractionTerms]
# Check whether all individual items have been specified
for (interactionTermCur in interactionTerms) {
componentsCur = strsplit(interactionTermCur, ":")[[1]]
if (!all(componentsCur %in% formulaVariables)) {
message = paste0("Design formula is incorrect, not all terms from the interactions (", paste0(componentsCur, collapse = ","), ") have been specified individually.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
}
formulaVariables = formulaVariables[-indexInteractionTerms]
}
checkAndLogWarningsAndErrors(components, checkVector(components, min.len = length(formulaVariables)))
# Extract the variable that defines the contrast. Always the last element in the formula
......
......@@ -168,11 +168,15 @@ summary.df = summary.df %>%
adj_pvalue = p.adjust(pvalue_raw, method = "fdr"),
Diff_mean = Mean_l2FC - mean(peaks.df$l2FC, na.rm = TRUE),
Diff_median = Median_l2FC - median(peaks.df$l2FC, na.rm = TRUE),
Diff_mode = Mode_l2FC - l2fc_mode,
Diff_skew = skewness_l2FC - l2fc_skewness) %>%
na.omit(summary.df)
Diff_mode = Mode_l2FC - l2fc_mode,
Diff_skew = skewness_l2FC - l2fc_skewness) %>%
na.omit(summary.df) %>%
mutate_at(c("Diff_mean", "Diff_median", "Diff_mode", "Diff_skew", "Mean_l2FC",
"Median_l2FC", "Mode_l2FC", "skewness_l2FC"), signif, 3) %>%
mutate_at(c("pvalue_raw", "adj_pvalue"), formatC, format = "g", digits = 3)
summary.df = mutate_if(summary.df, is.numeric, as.character)
# summary.df = mutate_if(summary.df, is.numeric, as.character)
write_tsv(summary.df, par.l$file_output_table) # TODO: check the dec = "." parameter
.printExecutionTime(start.time)
......
......@@ -19,7 +19,7 @@ source(paste0(snakemake@config$par_general$dir_scripts, "/functions.R"))
createDebugFile(snakemake)
initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE)
checkAndLoadPackages(c("tidyverse", "futile.logger", "ggrepel", "checkmate", "tools", "grDevices", "locfdr"), verbose = FALSE)
checkAndLoadPackages(c("tidyverse", "futile.logger", "ggrepel", "checkmate", "tools", "grDevices", "locfdr", "apeglm"), verbose = FALSE)
###################
......@@ -42,6 +42,7 @@ par.l$thresholds_CohensD = c(0.1, 0.5, 0.8)
par.l$corMethod = "pearson" # Expression-peak count correlation method. As we quantile normalize now, should be pearson
par.l$regressionMethod = "glm" # for correlating RNA-Seq classification with TF activity
par.l$filter_minCountsPerCondition = 5 # For filtering RNA-seq genes, see Documentation
par.l$thresholds_pvalue_Wilcoxon = 0.05
# 4. Volcano plot settings
par.l$maxTFsToLabel = 150 # Maximum Tfs to label in the Volcano plot
......@@ -207,7 +208,10 @@ if (par.l$nPermutations > 0) {
xrange = range(output.global.TFs.orig$weighted_meanDifference, na.rm = TRUE)
plot(density(dataPerm.df$weighted_meanDifference, na.rm = TRUE), col = "black", main = "Weighted mean difference values (black = permuted)", xlim = xrange)
maxY = max(density(dataPerm.df$weighted_meanDifference, na.rm = TRUE)$y, density(dataReal.df$weighted_meanDifference, na.rm = TRUE)$y)
plot(density(dataPerm.df$weighted_meanDifference, na.rm = TRUE), col = "black", main = "Weighted mean difference values (black = permuted)",
xlim = xrange, ylim = c(0,maxY))
lines(density(dataReal.df$weighted_meanDifference, na.rm = TRUE), col = "red")
for (TFCur in unique(output.global.TFs.orig$TF)) {
......@@ -226,7 +230,9 @@ if (par.l$nPermutations > 0) {
rangeX = range(dataCur.df$weighted_meanDifference)
rowCur = which(output.global.TFs.orig$TF == TFCur)
g = ggplot(dataPerm.df, aes(weighted_meanDifference)) + geom_density() + geom_vline(xintercept = dataReal.df$weighted_meanDifference[1], color = "red") + ggtitle(TFCur) + xlim(c(rangeX * 1.5)) # + scale_x_continuous(limits = c(min(dataCur.df$weighted_meanDifference) - 0.5, max(dataCur.df$weighted_meanDifference) + 0.5))
g = ggplot(dataPerm.df, aes(weighted_meanDifference)) + geom_density() +
geom_vline(xintercept = dataReal.df$weighted_meanDifference[1], color = "red") + ggtitle(TFCur) + xlim(c(rangeX * c(0.5,1.5)))
# + scale_x_continuous(limits = c(min(dataCur.df$weighted_meanDifference) - 0.5, max(dataCur.df$weighted_meanDifference) + 0.5))
diagPlots.l[[TFCur]] = g
plot(g)
......@@ -330,8 +336,10 @@ for (pValueCur in c(par.l$significanceThresholds , 1)) {
stats.df = group_by(output.global.TFs.orig, permutation) %>% summarise(max = max(weighted_meanDifference), min = min(weighted_meanDifference))
ggplot(stats.df, aes(min)) + geom_density()
ggplot(stats.df, aes(max)) + geom_density()
dev.off()
#################################
# mode of change quantification #
#################################
......@@ -396,7 +404,7 @@ if (par.l$plotRNASeqClassification) {
nFiltRows = nrow(sampleData.l[["permutation0"]]) - nrow(sampleData.df)
if (nFiltRows > 0) {
flog.warn(paste0("Filtered ", nFiltRows, " sample IDs after comparising sample names with RNA-Seq table"))
flog.warn(paste0("Filtered ", nFiltRows, " sample IDs after comparising sample names with RNA-Seq table. Remaining: ", nrow(sampleData.df)))
}
# The design formula for RNA-Seq is different from the one we used before for ATAC-Seq
......@@ -431,6 +439,7 @@ if (par.l$plotRNASeqClassification) {
dd = estimateSizeFactors(dd)
# dd = DESeq(dd)
dd_counts = DESeq2::counts(dd, normalized=TRUE)
######################################
# Filtering of lowly expressed genes #
......@@ -454,21 +463,22 @@ if (par.l$plotRNASeqClassification) {
nFiltered = length(which(idx == FALSE))
if (nFiltered > 0) {
flog.info(paste0("Filtered ", nFiltered, " genes from RNA-Seq table because of low counts."))
flog.info(paste0("Filtered ", nFiltered, " genes from RNA-Seq table because of low counts. Remaining: ", length(idx)))
dd.filt = dd[idx,]
} else {
dd.filt = dd
}
dd.filt <- DESeq(dd.filt)
dd_counts.filt = DESeq2::counts(dd.filt, normalized=TRUE)
RNA.counts.filt.df = dd_counts.filt %>% as.data.frame() %>% rownames_to_column("ENSEMBL") %>% as.tibble()
RNA.counts.filt.df = dd_counts.filt %>% as.data.frame() %>% rownames_to_column("ENSEMBL") %>% as_tibble()
# Raw counts, used for other types of normalization thereafter
dd_counts.raw.filt = DESeq2::counts(dd.filt, normalized=FALSE)
dd_counts.filt.quantile = normalize.quantiles(as.matrix(dd_counts.raw.filt))
RNA.counts.quantile.df.all = dd_counts.filt.quantile %>% as.data.frame() %>% as.tibble()
RNA.counts.quantile.df.all = dd_counts.filt.quantile %>% as.data.frame() %>% as_tibble()
# Fix row and column names
colnames(RNA.counts.quantile.df.all) = colnames(dd_counts.raw.filt)
......@@ -663,20 +673,6 @@ if (par.l$plotRNASeqClassification) {
}
# AR.data = as.data.frame(median.cor.tfs)
# AR.data$TF = rownames(AR.data)
#
# output.global.TFs = merge(output.global.TFs, AR.data, by = "TF",all.x = TRUE)
#
# output.global.TFs$classification = ifelse(is.na(output.global.TFs$median.cor.tfs), "not-expressed",
# ifelse(output.global.TFs$median.cor.tfs <= act.rep.thres[1], "repressor",
# ifelse(output.global.TFs$median.cor.tfs > act.rep.thres[2], "activator", "undetermined")))
#
# output.global.TFs$classification = factor(output.global.TFs$classification, levels = names(par.l$colorCategories))
#
#
#
#
# TODO: for each TFBS, a p-value and a correlation value
colnameClassification = paste0("classification")
......@@ -731,7 +727,7 @@ if (par.l$plotRNASeqClassification) {
# POST-FILTER: CHANGE SOME TFs TO UNDETERMINED #
################################################
par.l$thresholds_pvalue_Wilcoxon = 0.05
# Change the classification with the p-value from the distribution test
colnameClassificationPVal = paste0("classification_distr_rawP")
......@@ -759,17 +755,13 @@ if (par.l$plotRNASeqClassification) {
}
####################
####################
# DIAGNOSTIC PLOTS #
####################
####################
pdf(file = par.l$files_plotDiagnostic[2], width = 3, height = 8)
pdf(file = par.l$files_plotDiagnostic[2], width = 4, height = 8)
xlab="median pearson correlation (r)"
ylab=""
xlim= c(-max(abs(range(median.cor.tfs))) - 0.05, max(abs(range(median.cor.tfs))) + 0.05)
......@@ -783,32 +775,41 @@ if (par.l$plotRNASeqClassification) {
thresCur_upper = (1 - as.numeric(thresCur)) * 100
thresCur_lower = as.numeric(thresCur) * 100
mainCur = paste0("Stringency: ", thresCur)
plot(median.cor.tfs.non[names(median.cor.tfs)], 1:length(median.cor.tfs.non),
xlim=xlim, ylim=ylim, main="", xlab=xlab, ylab=ylab,
xlim=xlim, ylim=ylim, main=mainCur, xlab=xlab, ylab=ylab,
col=adjustcolor("darkgrey",alpha=1), pch = 16, cex = 0.5,axes = FALSE)
points(median.cor.tfs, 1:length(median.cor.tfs),
pch=16, cex=0.5,
col=ifelse(median.cor.tfs>thresCur.v[2], par.l$colorCategories["activator"] ,ifelse(median.cor.tfs<thresCur.v[1], par.l$colorCategories["repressor"], par.l$colorCategories["undetermined"]))
)
text(x =c((thresCur.v[1]-0.01),(thresCur.v[2]+0.01)),
y=c((length(median.cor.tfs.non)+5),(length(median.cor.tfs.non)+5)), pos=c(2,4),
labels =c(paste0(thresCur_lower, "\npercentile"), paste0(thresCur_upper, "\npercentile")),cex=0.7, col=c("black","black"))
text(x =c((thresCur.v[1]),(thresCur.v[2])),
y=c((length(median.cor.tfs.non)),(length(median.cor.tfs.non))), pos=c(2,4),
labels =c(paste0(thresCur_lower, " percentile\n", "(", round(thresCur.v[1],5), ")"),
paste0(thresCur_upper, " percentile\n", "(", round(thresCur.v[2],5), ")")),
cex=0.7, col=c("black","black"))
abline(v=thresCur.v[1], col=par.l$colorCategories["repressor"])
abline(v=thresCur.v[2], col=par.l$colorCategories["activator"])
axis(side = 1, lwd = 1, line = 0, at = c(-0.2,0,0.2), cex=1)
dataPoints = c(median.cor.tfs.non[names(median.cor.tfs)], median.cor.tfs)
yAxisLimits = c(min(dataPoints) * 1.1, max(dataPoints) * 1.1)
# Set the limits dynmaically
defaultLimits = seq(-1,1,0.2)
defaultLimits = defaultLimits[-c(which(defaultLimits < 0 & defaultLimits < yAxisLimits[1]-0.2), which(defaultLimits > 0 & defaultLimits < yAxisLimits[1]+0.2))]
axis(side = 1, at = defaultLimits, lwd = 1, line = 0, cex=1)
}
heatmap.act.rep(TF.peakMatrix.df, HOCOMOCO_mapping.df.exp, cor.m, par.l, median.cor.tfs, median.cor.tfs.non, act.rep.thres.l)
dev.off()
# Process results
res.peaks.filt = results(dd.filt) %>% as.data.frame() %>% rownames_to_column("ENSEMBL") %>% as.tibble()
res.peaks.filt = results(dd.filt) %>% as.data.frame() %>% rownames_to_column("ENSEMBL") %>% as_tibble()
# TODO
......@@ -826,6 +827,13 @@ if (par.l$plotRNASeqClassification) {
output.global.TFs$weighted_meanDifference = as.numeric(output.global.TFs$weighted_meanDifference)
pdf(par.l$files_plotDiagnostic[3])
# Diagnostic plots for DeSEQ2
plotDiagnosticPlots(dd.filt, dd.filt, conditionComparison, filename = NULL, maxPairwiseComparisons = 0, alpha = 0.05, plotMA = TRUE)