Commit 91811391 authored by Christian Arnold

Updated to version 1.1. See changelog

parent e8af1357
......@@ -25,9 +25,15 @@ Ivan Berest*, Christian Arnold*, Armando Reyes-Palomares, Giovanni Palla, Kasper
Change log
============================
Version 1.1 (2018-07-27)
- updated the TFBS files that are available via download (some files were not presorted correctly)
- added a new parameter *dir_TFBS_sorted* in the config file to specify that the TFBS input files are already sorted, which saves some computation time by not resorting them
- added support for single-end BAM files. There is a new parameter *pairedEnd* in the config file that specifies whether reads are paired-end or not.
- restructured some of the permutation-related output files to save space and computation time. The rule *concatenateMotifsPerm* should now be much faster, and the TF-specific *...outputPerm.tsv.gz* files are now much smaller due to an improved column structure
Version 1.0.1 (2018-07-25)
- fixed a bug in *2.DiffPeaks.R* that sometimes caused the step to fail, thanks to Jonas Ungerbeck for letting us know
- fixed a bug in *3.analyzeTF* for rare corner cases when *DESeq* fails
Version 1.0 (2018-07-01)
- released stable version
......
{
"par_general": {
"outdir": "../output",
"maxCoresPerRule": 2,
"dir_TFBS_sorted":true,
"regionExtension": 100,
"comparisonType": "GMPvsMPP.all",
"conditionComparison": "GMP,MPP",
......@@ -15,7 +17,8 @@
"RNASeqIntegration": true
},
"samples": {
"summaryFile": "sampleData.tsv"
"summaryFile": "sampleData.tsv",
"pairedEnd": true
},
"peaks": {
"consensusPeaks": "",
......
library(matrixStats)
??matrixStats
?matrixStats
......@@ -65,8 +65,6 @@ assertIntegerish(par.l$nBootstraps, len = 1)
par.l$file_input_sampleData = snakemake@config$samples$summaryFile
checkAndLogWarningsAndErrors(par.l$file_input_sampleData, checkFileExists(par.l$file_input_sampleData, access = "r"))
par.l$designFormulaVariableTypes = snakemake@config$par_general$designVariableTypes
par.l$designFormula = snakemake@config$par_general$designContrast
## LOG ##
assertList(snakemake@log, min.len = 1)
par.l$file_log = snakemake@log[[1]]
......@@ -121,76 +119,23 @@ if (par.l$nPermutations == 0 && par.l$nBootstraps < 1000) {
}
#############################
# CHECK PARAMETER VALIDITY #
#############################
sampleData.df = read_tsv(par.l$file_input_sampleData, col_names = TRUE, col_types = cols())
# Check the sample table
designFormula = as.formula(par.l$designFormula)
formulaVariables = attr(terms(designFormula), "term.labels")
# Extract the variable that defines the contrast. Always the last element in the formula
variableToPermute = formulaVariables[length(formulaVariables)]
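# Illustrative example (hypothetical design): for designContrast "~Treatment + conditionSummary",
# formulaVariables is c("Treatment", "conditionSummary") and the contrast variable
# that will be permuted is "conditionSummary".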
par.l$designFormulaVariableTypes = gsub(" ", "", par.l$designFormulaVariableTypes)
components = strsplit(par.l$designFormulaVariableTypes, ",")[[1]]
checkAndLogWarningsAndErrors(components, checkVector(components, len = length(formulaVariables)))
# Split further
components2 = strsplit(components, ":")
if (!all(sapply(components2,length) == 2)) {
message = "The parameter \"designVariableTypes\" has not been specified correctly. It must contain all the variables that appear in the parameter \"designContrast\". See the documentation for details"
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
components3 = unlist(lapply(components2, "[[", 1))
components3types = tolower(unlist(lapply(components2, "[[", 2)))
names(components3types) = components3
checkAndLogWarningsAndErrors(sort(formulaVariables), checkSetEqual(sort(formulaVariables), sort(components3)))
checkAndLogWarningsAndErrors(components3types, checkSubset(components3types, c("factor", "integer", "numeric", "logical")))
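# Illustrative example (hypothetical value): designVariableTypes = "Treatment:factor, conditionSummary:factor"
# is parsed into components3 = c("Treatment", "conditionSummary") and
# components3types = c(Treatment = "factor", conditionSummary = "factor").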
datatypeVariableToPermute = components3types[variableToPermute]
nDistValues = length(unique(sampleData.df$conditionSummary))
if (datatypeVariableToPermute %in% c("integer", "numeric")) {
if (nDistValues < 3) {
message = paste0("The column 'conditionSummary' must contain at least 3 different values for the linear model, but ", nDistValues, " were found.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
} else {
if (nDistValues != 2) {
message = paste0("The column 'conditionSummary' must contain exactly 2 different values, but ", nDistValues, " were found.")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
if (!testSubset(unique(sampleData.df$conditionSummary), conditionComparison)) {
message = paste0("The elements specified in 'conditionComparison' in the config file must be a subset of the values in the column 'conditionSummary' in the sample file")
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
}
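# Illustrative example: with conditionComparison = "GMP,MPP" (as in the example config),
# the column 'conditionSummary' of the sample table may only contain the values "GMP" and "MPP".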
#############################
# CHECK FASTA AND BAM FILES #
#############################
# Build the fasta index. Requires write access to the folder where the fasta is stored (limitation of samtools faidx)
fastaIndex = paste0(fastaFile, ".fai")
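# The index is written next to the FASTA file (e.g., genome.fa -> genome.fa.fai),
# which is why write access to that folder is required.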
......
......@@ -150,7 +150,8 @@ if (length(colnamesNew) != nrow(sampleData.df)) {
# Initiate data structures that are populated hereafter
TF_output.df = tribble(~permutation, ~TF, ~chr, ~MSS, ~MES, ~TFBSID, ~strand, ~peakID, ~limma_avgExpr, ~l2FC, ~limma_B, ~limma_t_stat, ~DESeq_ldcSE, ~DESeq_stat, ~DESeq_baseMean, ~pval, ~pval_adj)
TF_outputInclPerm.df = tribble(~permutation, ~TF, ~TFBSID, ~l2FC)
outputSummary.df = tribble(~permutation, ~TF, ~Pos_l2FC, ~Mean_l2FC, ~Median_l2FC, ~Mode_l2FC, ~sd_l2FC, ~pvalue_raw, ~skewness_l2FC, ~T_statistic, ~TFBS_num)
......@@ -176,6 +177,9 @@ overlapsAll.df = overlapsAll.df %>%
nTFBS = nrow(overlapsAll.df)
skipTF = FALSE
......@@ -183,11 +187,12 @@ skipTF = FALSE
if (nTFBS >= par.l$minNoDatapoints) {
# Group by peak ID
# For each peak, keep only the row with the highest mean coverage across all samples
coverageAll_grouped.df = overlapsAll.df %>%
dplyr::group_by(peakID) %>%
dplyr::slice(which.max(mean)) %>%
dplyr::ungroup()
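# Illustrative example (hypothetical values): if a peak overlaps three TFBS rows with
# mean coverages 4.2, 7.9 and 1.3, only the row with mean 7.9 is kept for that peak.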
TF.table.m = as.matrix(coverageAll_grouped.df[,sampleData.df$SampleID])
colnames(TF.table.m) = sampleData.df$SampleID
......@@ -204,10 +209,15 @@ if (skipTF) {
# write a dummy pdf file
pdf(par.l$file_output_plot_diagnostic)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
message = paste0("Insufficient data to run analysis.\n", message)
message = paste0("Insufficient data to run analysis.\n", message, "\nThis TF will be ignored in subsequent steps.")
text(x = 0.5, y = 0.5, message, cex = 1.6, col = "red")
dev.off()
TF_outputInclPerm.df = as.data.frame(matrix(nrow = 0, ncol = 2 + par.l$nPermutations + 1))
colnames(TF_outputInclPerm.df) = c("TF", "TFBSID", paste0("log2fc_perm", 0:par.l$nPermutations))
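# Illustrative example: with nPermutations = 2, this dummy table has the columns
# TF, TFBSID, log2fc_perm0, log2fc_perm1 and log2fc_perm2.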
} else {
# Create formula based on user-defined design
......@@ -273,6 +283,11 @@ if (skipTF) {
# Remove TFBS with low read counts: keep only rows with a mean count > 0 across samples
TF.cds.filt = TF.cds[rowMeans(counts(TF.cds)) > 0, ]
nPeaks = nrow(TF.cds.filt)
# Preallocate the log2FC matrix so no expensive reallocation has to be done
log2fc.m = matrix(NA, nrow = nPeaks , ncol = par.l$nPermutations + 1)
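# Illustrative example (hypothetical sizes): with 5000 peaks and 50 permutations,
# log2fc.m is a 5000 x 51 matrix (column 1 = real data, columns 2-51 = permutations).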
peaks.df = read_tsv(par.l$file_input_peak2, col_types = cols())
if (nrow(problems(peaks.df)) > 0) {
......@@ -298,6 +313,19 @@ if (skipTF) {
par.l$nPermutations = valueNew
}
# Calculate the log2 counts once
if (par.l$nPermutations > 0) {
# Generate normalized counts for limma analysis
countsNorm = counts(TF.cds.filt, norm = TRUE)
countsNorm.transf = log2(countsNorm + par.l$pseudocountAddition)
rownames(countsNorm.transf) = rownames(TF.cds.filt)
}
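# Illustrative example: with pseudocountAddition = 1, a normalized count of 0 is
# transformed to log2(0 + 1) = 0, which avoids -Inf values.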
# dplyr::mutate(TF = par.l$TF) gives the following weird error message: Error: Unsupported type NILSXP for column "TF"
TFCur = par.l$TF
for (permutationCur in 0:par.l$nPermutations) {
if (permutationCur == 0) {
......@@ -310,14 +338,11 @@ if (skipTF) {
##############################
# RUN EITHER LIMMA OR DESEQ2 #
##############################
skipTF = FALSE
if (par.l$nPermutations > 0) {
# model.matrix uses the first level in the specified column as reference, and so the corresponding column name and values are relative to that reference.
# That is, if the levels are "GMP" and "MPP", then all log2 fc will be the log2fc of MPP as compared to GMP.
fit <- eBayes(lmFit(countsNorm.transf, design = model.matrix(designFormula, data = sampleData.df)))
results.df <- topTable(fit, coef = colnames(fit$design)[ncol(fit$design)], number = Inf, sort.by = "none")
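# Illustrative example (hypothetical design): for designContrast "~Treatment + conditionSummary"
# with levels c("GMP", "MPP"), the last design column is "conditionSummaryMPP", so the
# coefficient extracted above is the log2 fold-change of MPP relative to GMP.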
......@@ -337,6 +362,7 @@ if (skipTF) {
sampleData.df = sampleData.l[[paste0("permutation0")]]
# We already set the factors for conditionSummary explicitly. The reference level is the first level for DESeq.
# Run the local fit first; if that throws an error, try the default fit type
res_DESeq = tryCatch( {
......@@ -352,22 +378,17 @@ if (skipTF) {
}, error = function(e) {
errorMessage <<- "Could not run DESeq with regular fitting either, set all values to NA."
checkAndLogWarningsAndErrors(NULL, errorMessage, isWarning = TRUE)
}
)
res_DESeq
}
)
if (class(res_DESeq) == "character") skipTF = TRUE
if (!skipTF) {
res_DESeq.df <- as.data.frame(DESeq2::results(res_DESeq))
final.TF.df = data_frame("TFBSID" = rownames(res_DESeq.df),
"DESeq_baseMean" = res_DESeq.df$baseMean,
"l2FC" = res_DESeq.df$log2FoldChange,
......@@ -380,7 +401,6 @@ if (skipTF) {
"limma_t_stat" = NA
)
}
}
##################################
......@@ -388,10 +408,7 @@ if (skipTF) {
##################################
if (!skipTF) {
order = c("permutation", "TF", "chr", "MSS", "MES", "TFBSID", "strand", "peakID", "l2FC", "limma_avgExpr", "limma_B", "limma_t_stat", "DESeq_ldcSE", "DESeq_stat", "DESeq_baseMean", "pval", "pval_adj")
TF_outputCur.df = final.TF.df %>%
dplyr::left_join(overlapsAll.df,by = c("TFBSID")) %>%
dplyr::left_join(peaksFiltered.df, by = c("peakID" = "annotation")) %>%
......@@ -400,8 +417,8 @@ if (skipTF) {
dplyr::arrange(chr) %>%
dplyr::mutate(TF = TFCur, permutation = permutationCur, l2FC = signif(l2FC, par.l$roundLog2FCDigits)) %>%
dplyr::select(one_of(order))
if (permutationCur == 0) {
TF_output.df = rbind(TF_output.df, TF_outputCur.df)
......@@ -457,10 +474,9 @@ if (skipTF) {
}
# Execute this ALWAYS, as also the values for the real data should be stored for easier later retrieval
log2fc.m[,permutationCur + 1] = TF_outputCur.df$l2FC
# d) Comparisons between peaks and binding sites
modeNum = mlv(final.TF.df$l2FC, method = "mfv", na.rm = TRUE)
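# mlv() is the mode estimator from the 'modeest' package; method "mfv" estimates the
# mode as the most frequent value of the log2 fold-changes.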
......@@ -504,25 +520,34 @@ if (skipTF) {
TFBS_num = nrow(final.TF.df)
)
}
if (!skipTF) {
TF_outputInclPerm.df = as.tibble(log2fc.m) %>%
add_column(TF = TFCur , TFBSID = TF_outputCur.df$TFBSID, .before = 1)
} else { # if skipTF
message <- paste0("DESeq could not be run.")
pdf(par.l$file_output_plot_diagnostic)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
message = paste0("Insufficient data to run analysis.\n", message, "\nThis TF will be ignored in subsequent steps.")
text(x = 0.5, y = 0.5, message, cex = 1.6, col = "red")
dev.off()
}
} # end for each permutation
colnames(TF_outputInclPerm.df)[3:ncol(TF_outputInclPerm.df)] = paste0("log2fc_perm", 0:par.l$nPermutations)
}
} # end if !skipTF
TF_output.df = mutate_if(TF_output.df, is.numeric, as.character)
write_tsv(TF_output.df, path = par.l$file_output_summaryAll)
TF_outputInclPerm.df = mutate_if(TF_outputInclPerm.df, is.numeric, as.character)
write_tsv(TF_outputInclPerm.df, path = par.l$file_outputPerm_summaryAll, col_names = FALSE)
saveRDS(outputSummary.df, file = par.l$file_output_summaryStats)
# saveRDS(res_DESeq.l, file = par.l$file_output_DESeq)
......
......@@ -34,6 +34,9 @@ par.l$verbose = TRUE
par.l$log_minlevel = "INFO"
par.l$minNoDatapoints = 5
# Used for plotting
par.l$includePlots = FALSE
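# If set to TRUE, additional per-TF diagnostic plots (densities of the binned log2 fold-changes)
# are produced further below.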
#####################
# VERIFY PARAMETERS #
#####################
......@@ -171,7 +174,7 @@ if (nRowsNA > 0) {
TF.motifs.CG = TF.motifs.CG %>%
mutate(CG.identifier = paste0(TF,":" ,TFBSID)) %>%
dplyr::select(-one_of("TFBSID"))
CGBins = seq(0,1, 1/par.l$nBins)
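# Illustrative example: with nBins = 10, CGBins contains the 11 boundaries 0, 0.1, ..., 1,
# which define 10 equally sized CG-content bins.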
......@@ -185,12 +188,47 @@ CGBins = seq(0,1, 1/par.l$nBins)
# PERMUTATIONS #
################
# TODO: Take at most x TFBS per TF for background, not all
# TODO: Binning preparation and storing all TFBS in one object is too large and not feasible
# TODO: Possibility 1: For each permutation, read in all 640 TF files with specific columns. Results in 640 * 640x1000 > 400 million read_tsv calls.
# TODO: Alternative: Use the new column layout and modify the shell call to extract only the required columns (1, 2, and 2 + permutationCur) rather than using grep. Results in 1000 extra jobs but only 640 * 1000 = 640,000 read_tsv calls and acceptable memory usage
#for (permutationCur in 0:par.l$nPermutations)
for (fileCur in par.l$files_input_TF_allMotives) {
# Init the col type list to skip columns directly
# colType.l = vector("list", length = 2 + par.l$nPermutations + 1)
# colType.l[[1]] = col_character() # "TF"
# colType.l[[2]] = col_character() # "TFBSID"
# for (n in 1:permutationCur) {
# indexLast = 2+n
# colType.l[[indexLast]] = col_skip()
# }
# colType.l[[indexLast+1]] = col_double() # "log2FoldChange", important to have double here as col_number would not parse numbers in scientific notation
# for (n in 1:(par.l$nPermutations - permutationCur )) {
# colType.l[[indexLast + 1 + n]] = col_skip()
# }
#
#
# TF.motifs.ori = tribble(~permutation, ~TF, ~TFBSID, ~log2FoldChange)
# # Iterate over all TF
# for (fileCur in allFiles) {
# TF.motifs.ori = read_tsv(fileCur, col_types = colType.l)
#
# if (nrow(problems(TF.motifs.ori)) > 0) {
# flog.fatal(paste0("Parsing errors: "), problems(TF.motifs.ori), capture = TRUE)
# stop("Error when parsing the file ", fileCur, ", see errors above")
# }
#
# }
#
#
# Log 2 fold-changes from the particular permutation
TF.motifs.ori = read_tsv(fileCur, col_names = TRUE,
col_types = list(
col_integer(), # "permutation"
col_character(), # "TF",
col_character(), # "TFBSID"
col_double() # "log2FoldChange", important to have double here as col_number would not parse numbers in scientific notation correctly
......@@ -208,9 +246,10 @@ for (fileCur in par.l$files_input_TF_allMotives) {
checkAndLogWarningsAndErrors(NULL, message, isWarning = FALSE)
}
colnames(TF.motifs.ori) = c("permutation", "TF", "TFBSID", "log2FoldChange")
colnames(TF.motifs.ori) = c("TF", "TFBSID", "log2FoldChange")
permutationCur = as.numeric(gsub(".*perm([0-9]+).tsv.gz", '\\1', fileCur))
TF.motifs.ori$permutation = permutationCur
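# Illustrative example (hypothetical file name): a file ending in "...perm12.tsv.gz"
# yields permutationCur = 12; permutation 0 corresponds to the real (non-permuted) data.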
if (permutationCur > 0) {
flog.info(paste0("Running permutation ", permutationCur))
......@@ -230,6 +269,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
# MERGE #
#########
# TODO: full join necessary?
TF.motifs.all = TF.motifs.ori %>%
full_join(TF.motifs.CG, by = c("CG.identifier")) %>%
mutate(CG.bins = cut(CG, breaks = CGBins, labels = paste0(round(CGBins[-1] * 100,0),"%"), include.lowest = TRUE)) %>%
......@@ -260,7 +300,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
if (calculateVariance)
boostrapResults.l[[TFCur]][[as.character(permutationCur)]] = list()
if (par.l$includePlots) binnedCombined.df = tribble(~bin, ~type, ~value)
for (bin in uniqueBins) {
......@@ -324,8 +364,11 @@ for (fileCur in par.l$files_input_TF_allMotives) {
Tstat = Tstat
)
if (par.l$includePlots) {
binnedCombined.df = add_row(binnedCombined.df, bin = bin, type = paste0(TFCur, "-only"), value = binned.curTF.df$log2FoldChange)
binnedCombined.df = add_row(binnedCombined.df, bin = bin, type = paste0("all_other"), value = binned.allTF.df$log2FoldChange)
}
......@@ -435,9 +478,8 @@ for (fileCur in par.l$files_input_TF_allMotives) {
TFBS = nRowsTF,
variance = varianceFinal)
if (par.l$includePlots) {
xlabStr = paste0("log2 fold-change of TFBS")
binnedCombined.df$bin = factor(binnedCombined.df$bin, levels = levels(uniqueBins))
g1 = ggplot(binnedCombined.df, aes(value, fill = type)) + geom_density(alpha = 0.5) +
......
......@@ -20,7 +20,7 @@ createDebugFile(snakemake)
initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE)
checkAndLoadPackages(c("tidyverse", "futile.logger", "lsr", "DESeq2", "matrixStats", "ggrepel", "checkmate", "tools", "grDevices", "locfdr", "pheatmap"), verbose = FALSE)
# TODO: Is grDevices really needed? Only used for adjustcolor so far
###################
#### PARAMETERS ###
......@@ -523,7 +523,7 @@ if (par.l$nPermutations > 0) {
}
# TODO: Diagnostic plot for pvalue
plot(ggplot(output.global.TFs.orig, aes(pvalue)) + geom_density() + ggtitle("Local fdr density across all TF"))
......@@ -564,12 +564,12 @@ if (par.l$nPermutations > 0) {
output.global.TFs.orig$weighted_Tstat_centralized = output.global.TFs.orig$weighted_Tstat - MLE.delta
output.global.TFs.orig$variance = as.numeric(output.global.TFs.orig$variance)
output.global.TFs.orig$pvalue2 = 2*pnorm(-abs(output.global.TFs.orig$weighted_Tstat_centralized), sd = sqrt(output.global.TFs.orig$variance))
# Handle extreme cases with p-values that are practically 0 and would cause subsequent issues
index0 = which(output.global.TFs.orig$pvalue2 < .Machine$double.xmin)
if (length(index0) > 0) {
output.global.TFs.orig$pvalue2[index0] = .Machine$double.xmin
}
}
......@@ -586,8 +586,12 @@ output.global.TFs$Cohend_factor = ifelse(output.global.TFs$weighted_CD < par.l$t
output.global.TFs$Cohend_factor = factor(output.global.TFs$Cohend_factor, levels = par.l$classes_CohensD, labels = seq_len(length(par.l$classes_CohensD)))
# TODO move?
output.global.TFs$pvalueAdj = p.adjust(output.global.TFs$pvalue, method = "BH")
#output.global.TFs$pvalueAdj2 = p.adjust(output.global.TFs$pvalue2, method = "BH")
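# Illustrative example: p.adjust(c(0.01, 0.02, 0.03), method = "BH") returns
# c(0.03, 0.03, 0.03), i.e. the Benjamini-Hochberg adjustment is monotone in the ranks.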
colnamesToPlot = colnames(output.global.TFs)[-which(colnames(output.global.TFs) %in% c("TF", "sign", "classification"))]
......@@ -913,7 +917,9 @@ if (par.l$plotRNASeqClassification) {
heatmap.act.rep(TF.peakMatrix.df, HOCOMOCO_mapping.df.exp)
dev.off()
# TODO: TF_names_update = read_delim("/g/scb2/zaugg/berest/Projects/CLL/PREPARE/Armando.AR/52samplesAR/HOCOTFID2ENSEMBL.txt",delim = " ")
# Filter genes
samples_cond1 = colData(dd)$SampleID[which(colData(dd)$conditionSummary == levels(colData(dd)$conditionSummary)[1])]
samples_cond2 = colData(dd)$SampleID[which(colData(dd)$conditionSummary == levels(colData(dd)$conditionSummary)[2])]
......@@ -930,8 +936,17 @@ if (par.l$plotRNASeqClassification) {
# expresssion.df.all = plyr::join_all(list(expression.df.filt,res.peaks.filt), by = 'ENSEMBL', type = 'inner')
# TODO
# In Ivan's version, he used a non-filtered HOCOMOCO table. I, however, filter it beforehand, so some ENSEMBL IDs might already be filtered
TF.specific = left_join(HOCOMOCO_mapping.subset.df, res.peaks.filt, by = "ENSEMBL") %>% filter(!is.na(baseMean))
# TODO: deal with NAs; where do they come from?
output.global.TFs$weighted_meanDifference = as.numeric(output.global.TFs$weighted_meanDifference)
par.l$minPointSize = 0.3
......@@ -1019,14 +1034,15 @@ output.global.TFs.origReal = output.global.TFs
# PLOT FOR DIFFERENT P VALUE THRESHOLDS #
#########################################
# TODO: change pvalue to pvalueAdj ?
# Set the page dimensions to the maximum across all plotted variants
output.global.TFs.filteredSummary = filter(output.global.TFs, pvalue <= max(par.l$significanceThresholds))
nTF_label = nrow(output.global.TFs.filteredSummary)
TFLabelSize = ifelse(nTF_label < 20, 8,
ifelse(nTF_label < 40, 7,
ifelse(nTF_label < 60, 6,
ifelse(nTF_label < 60, 5,
ifelse(nTF_label < 60, 4, 3)))))
......@@ -1078,19 +1094,24 @@ for (significanceThresholdCur in par.l$significanceThresholds) {
g = ggplot()
if (par.l$plotRNASeqClassification) {
g = g + geom_point(data = output.global.TFs, aes(weighted_meanDifference, pValueAdj_log10, alpha = pValue_sig, size = TFBS, fill = classification), shape=21, stroke = 0.5, color = "black") + scale_fill_manual("TF class", values = par.l$colorCategories)
} else {
g = g + geom_point(data = output.global.TFs, aes(weighted_meanDifference, pValueAdj_log10, alpha = pValue_sig, size = TFBS), shape=21, stroke = 0.5, color = "black")
}
g = g +
geom_rect(aes(xmin = 0,
xmax = Inf,
ymin = -Inf,
ymax = Inf, fill = "condition1"),
alpha = .3) +
geom_rect(aes(xmin = -Inf,
xmax = 0,
ymin = -Inf,
ymax = Inf, fill = "condition2"),
alpha = .3) +
scale_fill_manual(name = 'TF activity higher in', values = par.l$colorConditions, labels = conditionComparison) +
ylim(-0.1,ymax) +
ylab(paste0(transform_yValues_caption(), " (adj. p-value)")) +
xlab("weighted mean difference") +
......@@ -1099,7 +1120,7 @@ for (significanceThresholdCur in par.l$significanceThresholds) {
if (par.l$plotRNASeqClassification) {
g = g + geom_label_repel(data = ggrepel_df, aes(weighted_meanDifference, pValueAdj_log10, label = TF, fill = classification),
size = TFLabelSize, fontface = 'bold', color = 'black',
segment.size = 0.3, box.padding = unit(0.2, "lines"), max.iter = 5000,
label.padding = unit(0.2, "lines"), # padding around the label text
nudge_y = 0.05, nudge_x = 0, # how far from center points
......@@ -1123,15 +1144,12 @@ for (significanceThresholdCur in par.l$significanceThresholds) {
if (par.l$plotRNASeqClassification) {
g = g + guides(alpha = guide_legend(override.aes = list(size=5), order = 2),
fill = guide_legend(override.aes = list(size=5), order = 3))
allPlots.l[["volcano"]] [[pValThrStr]] [[paste0(showClasses,collapse = "-")]] = g
} else {
g = g + guides(alpha = guide_legend(override.aes = list(size=5), order = 2),
fill = guide_legend(override.aes = list(size=5), order = 3))
allPlots.l[["volcano"]] [[pValThrStr]] = g
}
......
......@@ -421,40 +421,26 @@ myMAPlot <- function(M, idx, main, minMean = 0) {
}
plotDiagnosticPlots <- function(dd, differentialResults, conditionComparison, filename = NULL, maxPairwiseComparisons = 5, alpha = 0.05) {
checkAndLoadPackages(c("tidyverse", "checkmate", "geneplotter", "DESeq2", "vsn", "RColorBrewer", "limma"), verbose = FALSE)
assertClass(dd, "DESeqDataSet")
assert(testClass(differentialResults, "MArrayLM"), testClass(differentialResults, "DESeqDataSet"))
assertVector(conditionComparison, len = 2)
assert(checkNull(filename), checkDirectory(dirname(filename), access = "w"))
if (!is.null(filename)) {
pdf(filename)
}
if (testClass(differentialResults, "MArrayLM")) {
title = paste0("limma results\n", conditionComparison[1], " vs. ", conditionComparison[2])
isSign = ifelse(p.adjust(differentialResults$p.value[,ncol(differentialResults$p.value)], method = "BH") < alpha, paste0("sign. (BH, ", alpha, ")"), "not-significant")
if (!is.null(conditionComparison)) {
title = paste0("limma results\n", conditionComparison[1], " vs. ", conditionComparison[2])
} else {
title = paste0("limma results\n")
}
for (alphaCur in alpha) {
isSign = ifelse(p.adjust(differentialResults$p.value[,ncol(differentialResults$p.value)], method = "BH") < alphaCur,
paste0("sign. (BH, ", alphaCur, ")"),
"not-significant")
limma::plotMA(differentialResults, main = title, status = isSign, bg.cex = 0.5, hl.cex = 0.5)
}
} else {
......