Commit d8d37de5 authored by Christian Arnold

Version 1.1.7, see Changelog for details

parent 95b75e18
......@@ -31,6 +31,10 @@ We also put the paper on *bioRxiv*, please read all methodological details here:
Change log
============================
Version 1.1.7 (2018-10-25)
- the default value of the minimum number of data points for a CG bin to be included has been raised from 5 to 20 to make the variance calculation more reliable
- various small updates to the ``summaryFinal.R`` script
Version 1.1.6 (2018-10-11)
- fixed small issue in ``checkParameterValidity.R`` when not having sufficient permissions for the folder in which the fasta file is located
- updated the ``summaryFinal.R`` script. Now, for the Volcano plot PDF, the raw p-values are also plotted at the end, in addition to the adj. p-values. This might be helpful for datasets with a small signal for which no adj. p-value is significant. In addition, labeling of TFs is now skipped when the number of TFs to label exceeds 150. This makes the step faster and the PDF smaller and less crowded.
......
......@@ -32,7 +32,9 @@ createDebugFile(snakemake)
par.l = list()
par.l$verbose = TRUE
par.l$log_minlevel = "INFO"
par.l$minNoDatapoints = 5
# This value was determined empirically. Below 20, the variance estimated from the bootstrap is artificially high and not reliable enough
par.l$minNoDatapoints = 20
# Used for plotting
par.l$includePlots = FALSE
......@@ -109,11 +111,12 @@ if (calculateVariance) {
boostrapResults.l[[TFCur]] = list()
}
output.global.TFs = tribble(~permutation, ~TF, ~weighted_meanDifference, ~weighted_CD, ~TFBS, ~weighted_Tstat, ~variance)
output.global.TFs = tribble(~permutation, ~TF, ~weighted_meanDifference, ~weighted_CD, ~TFBS, ~weighted_Tstat, ~variance)
perm.l[[TFCur]] = tribble(~permutation, ~bin, ~meanDifference, ~nDataAll, ~nDataBin, ~ratio_TFBS, ~cohensD, ~variance, ~df, ~pvalue, ~Tstat)
summaryCov.df = tribble(~permutation, ~bin1, ~bin2, ~weight1, ~weight2, ~cov)
######################
# FINAL PREPARATIONS #
######################
......@@ -377,6 +380,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
message = paste0(" Not enough data for any of the ", nBins, " bins, this TF will be skipped in subsequent steps")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
calculateVariance = FALSE
} else {
flog.info(paste0(" Finished calculation across bins successfully for ", nBinsWithData, " out of ", nBins, " bins"))
}
......@@ -442,7 +446,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
# see the paper for a derivation of the formula
varianceFinal = sum(weights^2 * varianceIndividual) + (2 * sum(summaryCov.filt.df$weight1 * summaryCov.filt.df$weight2 * summaryCov.filt.df$cov))
} else {
message = paste0("Could not calculate variance due to missing values. Set variance to NA")
......@@ -461,17 +465,27 @@ for (fileCur in par.l$files_input_TF_allMotives) {
perm.filtered.df = filter(perm.filtered.df, !is.na(df))
}
wmd = weighted.mean(perm.filtered.df$meanDifference, perm.filtered.df$ratio_TFBS, na.rm = TRUE)
if (nrow(perm.filtered.df) > 0) {
wmd = weighted.mean(perm.filtered.df$meanDifference, perm.filtered.df$ratio_TFBS, na.rm = TRUE)
weighted_CD = weighted.mean(perm.filtered.df$cohensD, perm.filtered.df$ratio_TFBS, na.rm = TRUE)
weighted_Tstat = weighted.mean(perm.filtered.df$Tstat , perm.filtered.df$ratio_TFBS, na.rm = TRUE)
} else {
wmd = weighted_CD = weighted_Tstat = NA
}
output.global.TFs = add_row(output.global.TFs,
permutation = permutationCur,
TF = TFCur,
weighted_meanDifference = wmd,
weighted_CD = weighted.mean(perm.filtered.df$cohensD, perm.filtered.df$ratio_TFBS, na.rm = TRUE),
weighted_Tstat = weighted.mean(perm.filtered.df$Tstat , perm.filtered.df$ratio_TFBS, na.rm = TRUE),
weighted_CD = weighted_CD,
weighted_Tstat = weighted_Tstat,
TFBS = nRowsTF,
variance = varianceFinal)
variance = varianceFinal
)
if (par.l$includePlots) {
xlabStr = paste0("log2 fold-change of TFBS")
......@@ -500,7 +514,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
# Save objects
saveRDS(perm.l, file = par.l$file_output_permResults)
saveRDS(list( binSummary = perm.l, covarianceSummary = summaryCov.df), file = par.l$file_output_permResults)
# Convert all numeric data types to character in order to prevent any scientific notation
output.global.TFs = mutate_if(output.global.TFs, is.numeric, as.character)
......
This diff is collapsed.
......@@ -324,8 +324,7 @@ rule checkParameterValidity:
rule produceConsensusPeaks:
input:
checkFlag = ancient(rules.checkParameterValidity.output.flag),
peaks = allPeakFiles,
sampleFile= config["samples"]["summaryFile"]
peaks = allPeakFiles
output:
consensusPeaks_bed = TEMP_DIR + "/" + compType + "consensusPeaks.bed",
summaryPlot = TEMP_DIR + "/" + compType + "consensusPeaks_lengthDistribution.pdf"
......@@ -407,8 +406,7 @@ def getBamFilesBasedOnPairedEnd(wildcards):
rule intersectPeaksAndBAM:
input:
consensusPeaks = rules.filterSexChromosomesAndSortPeaks.output.consensusPeaks_sorted,
allBAMs = getBamFilesBasedOnPairedEnd,
sampleFile = config["samples"]["summaryFile"]
allBAMs = getBamFilesBasedOnPairedEnd
output:
peaksBamOverlapRaw = temp(PEAKS_DIR + '/' + compType + 'allBams.peaks.overlaps.bed'),
peaksBamOverlap = PEAKS_DIR + '/' + compType + 'allBams.peaks.overlaps.bed.gz',
......@@ -474,8 +472,7 @@ rule intersectPeaksAndTFBS:
rule intersectTFBSAndBAM:
input:
bed = rules.intersectPeaksAndTFBS.output.TFBSinPeaksMod_bed,
allBAMs = getBamFilesBasedOnPairedEnd,
sampleFile = config["samples"]["summaryFile"]
allBAMs = getBamFilesBasedOnPairedEnd
output:
BAMOverlapRaw = temp(TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.allBAMs.overlaps.bed"),
BAMOverlap = TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.allBAMs.overlaps.bed.gz",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment