Commit d8d37de5 authored by Christian Arnold's avatar Christian Arnold

Version 1.1.7, see Changelog for details

parent 95b75e18
...@@ -31,6 +31,10 @@ We also put the paper on *bioRxiv*, please read all methodological details here: ...@@ -31,6 +31,10 @@ We also put the paper on *bioRxiv*, please read all methodological details here:
Change log Change log
============================ ============================
Version 1.1.7 (2018-10-25)
- the default value of the minimum number of data points for a CG bin to be included has been raised from 5 to 20 to make the variance calculation more reliable
- various small updates to the ``summaryFinal.R`` script
Version 1.1.6 (2018-10-11) Version 1.1.6 (2018-10-11)
- fixed small issue in ``checkParameterValidity.R`` when not having sufficient permissions for the folder in which the fasta file is located - fixed small issue in ``checkParameterValidity.R`` when not having sufficient permissions for the folder in which the fasta file is located
- updated the ``summaryFinal.R`` script. Now, for the Volcano plot PDF, in addition to adj. p-values, also the raw p-values are plotted in the end. This might be helpful for datasets with small signal when no adj. p-value is significant. In addition, labeling of TFs is now skipped when the number of TFs to label exceeds 150. This makes the step faster and the PDF smaller and less crowded. - updated the ``summaryFinal.R`` script. Now, for the Volcano plot PDF, in addition to adj. p-values, also the raw p-values are plotted in the end. This might be helpful for datasets with small signal when no adj. p-value is significant. In addition, labeling of TFs is now skipped when the number of TFs to label exceeds 150. This makes the step faster and the PDF smaller and less crowded.
......
...@@ -32,7 +32,9 @@ createDebugFile(snakemake) ...@@ -32,7 +32,9 @@ createDebugFile(snakemake)
par.l = list() par.l = list()
par.l$verbose = TRUE par.l$verbose = TRUE
par.l$log_minlevel = "INFO" par.l$log_minlevel = "INFO"
par.l$minNoDatapoints = 5
# This value was determined empirically. Below 20, the variance estimated from the bootstrap tends to be artificially high and not reliable enough
par.l$minNoDatapoints = 20
# Used for plotting # Used for plotting
par.l$includePlots = FALSE par.l$includePlots = FALSE
...@@ -109,11 +111,12 @@ if (calculateVariance) { ...@@ -109,11 +111,12 @@ if (calculateVariance) {
boostrapResults.l[[TFCur]] = list() boostrapResults.l[[TFCur]] = list()
} }
output.global.TFs = tribble(~permutation, ~TF, ~weighted_meanDifference, ~weighted_CD, ~TFBS, ~weighted_Tstat, ~variance) output.global.TFs = tribble(~permutation, ~TF, ~weighted_meanDifference, ~weighted_CD, ~TFBS, ~weighted_Tstat, ~variance)
perm.l[[TFCur]] = tribble(~permutation, ~bin, ~meanDifference, ~nDataAll, ~nDataBin, ~ratio_TFBS, ~cohensD, ~variance, ~df, ~pvalue, ~Tstat) perm.l[[TFCur]] = tribble(~permutation, ~bin, ~meanDifference, ~nDataAll, ~nDataBin, ~ratio_TFBS, ~cohensD, ~variance, ~df, ~pvalue, ~Tstat)
summaryCov.df = tribble(~permutation, ~bin1, ~bin2, ~weight1, ~weight2, ~cov) summaryCov.df = tribble(~permutation, ~bin1, ~bin2, ~weight1, ~weight2, ~cov)
###################### ######################
# FINAL PREPARATIONS # # FINAL PREPARATIONS #
###################### ######################
...@@ -377,6 +380,7 @@ for (fileCur in par.l$files_input_TF_allMotives) { ...@@ -377,6 +380,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
message = paste0(" Not enough data for any of the ", nBins, " bins, this TF will be skipped in subsequent steps") message = paste0(" Not enough data for any of the ", nBins, " bins, this TF will be skipped in subsequent steps")
checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE) checkAndLogWarningsAndErrors(NULL, message, isWarning = TRUE)
calculateVariance = FALSE
} else { } else {
flog.info(paste0(" Finished calculation across bins successfully for ", nBinsWithData, " out of ", nBins, " bins")) flog.info(paste0(" Finished calculation across bins successfully for ", nBinsWithData, " out of ", nBins, " bins"))
} }
...@@ -442,7 +446,7 @@ for (fileCur in par.l$files_input_TF_allMotives) { ...@@ -442,7 +446,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
# see the paper for a derivation of the formula # see the paper for a derivation of the formula
varianceFinal = sum(weights^2 * varianceIndividual) + (2 * sum(summaryCov.filt.df$weight1 * summaryCov.filt.df$weight2 * summaryCov.filt.df$cov)) varianceFinal = sum(weights^2 * varianceIndividual) + (2 * sum(summaryCov.filt.df$weight1 * summaryCov.filt.df$weight2 * summaryCov.filt.df$cov))
} else { } else {
message = paste0("Could not calculate variance due to missing values. Set variance to NA") message = paste0("Could not calculate variance due to missing values. Set variance to NA")
...@@ -461,17 +465,27 @@ for (fileCur in par.l$files_input_TF_allMotives) { ...@@ -461,17 +465,27 @@ for (fileCur in par.l$files_input_TF_allMotives) {
perm.filtered.df = filter(perm.filtered.df, !is.na(df)) perm.filtered.df = filter(perm.filtered.df, !is.na(df))
} }
wmd = weighted.mean(perm.filtered.df$meanDifference, perm.filtered.df$ratio_TFBS, na.rm = TRUE) if (nrow(perm.filtered.df) > 0) {
wmd = weighted.mean(perm.filtered.df$meanDifference, perm.filtered.df$ratio_TFBS, na.rm = TRUE)
weighted_CD = weighted.mean(perm.filtered.df$cohensD, perm.filtered.df$ratio_TFBS, na.rm = TRUE)
weighted_Tstat = weighted.mean(perm.filtered.df$Tstat , perm.filtered.df$ratio_TFBS, na.rm = TRUE)
} else {
wmd = weighted_CD = weighted_Tstat = NA
}
output.global.TFs = add_row(output.global.TFs, output.global.TFs = add_row(output.global.TFs,
permutation = permutationCur, permutation = permutationCur,
TF = TFCur, TF = TFCur,
weighted_meanDifference = wmd, weighted_meanDifference = wmd,
weighted_CD = weighted.mean(perm.filtered.df$cohensD, perm.filtered.df$ratio_TFBS, na.rm = TRUE), weighted_CD = weighted_CD,
weighted_Tstat = weighted.mean(perm.filtered.df$Tstat , perm.filtered.df$ratio_TFBS, na.rm = TRUE), weighted_Tstat = weighted_Tstat,
TFBS = nRowsTF, TFBS = nRowsTF,
variance = varianceFinal) variance = varianceFinal
)
if (par.l$includePlots) { if (par.l$includePlots) {
xlabStr = paste0("log2 fold-change of TFBS") xlabStr = paste0("log2 fold-change of TFBS")
...@@ -500,7 +514,7 @@ for (fileCur in par.l$files_input_TF_allMotives) { ...@@ -500,7 +514,7 @@ for (fileCur in par.l$files_input_TF_allMotives) {
# Save objects # Save objects
saveRDS(perm.l, file = par.l$file_output_permResults) saveRDS(list( binSummary = perm.l, covarianceSummary = summaryCov.df), file = par.l$file_output_permResults)
# Convert all numeric data types to character in order to prevent any scientific notation # Convert all numeric data types to character in order to prevent any scientific notation
output.global.TFs = mutate_if(output.global.TFs, is.numeric, as.character) output.global.TFs = mutate_if(output.global.TFs, is.numeric, as.character)
......
This diff is collapsed.
...@@ -324,8 +324,7 @@ rule checkParameterValidity: ...@@ -324,8 +324,7 @@ rule checkParameterValidity:
rule produceConsensusPeaks: rule produceConsensusPeaks:
input: input:
checkFlag = ancient(rules.checkParameterValidity.output.flag), checkFlag = ancient(rules.checkParameterValidity.output.flag),
peaks = allPeakFiles, peaks = allPeakFiles
sampleFile= config["samples"]["summaryFile"]
output: output:
consensusPeaks_bed = TEMP_DIR + "/" + compType + "consensusPeaks.bed", consensusPeaks_bed = TEMP_DIR + "/" + compType + "consensusPeaks.bed",
summaryPlot = TEMP_DIR + "/" + compType + "consensusPeaks_lengthDistribution.pdf" summaryPlot = TEMP_DIR + "/" + compType + "consensusPeaks_lengthDistribution.pdf"
...@@ -407,8 +406,7 @@ def getBamFilesBasedOnPairedEnd(wildcards): ...@@ -407,8 +406,7 @@ def getBamFilesBasedOnPairedEnd(wildcards):
rule intersectPeaksAndBAM: rule intersectPeaksAndBAM:
input: input:
consensusPeaks = rules.filterSexChromosomesAndSortPeaks.output.consensusPeaks_sorted, consensusPeaks = rules.filterSexChromosomesAndSortPeaks.output.consensusPeaks_sorted,
allBAMs = getBamFilesBasedOnPairedEnd, allBAMs = getBamFilesBasedOnPairedEnd
sampleFile = config["samples"]["summaryFile"]
output: output:
peaksBamOverlapRaw = temp(PEAKS_DIR + '/' + compType + 'allBams.peaks.overlaps.bed'), peaksBamOverlapRaw = temp(PEAKS_DIR + '/' + compType + 'allBams.peaks.overlaps.bed'),
peaksBamOverlap = PEAKS_DIR + '/' + compType + 'allBams.peaks.overlaps.bed.gz', peaksBamOverlap = PEAKS_DIR + '/' + compType + 'allBams.peaks.overlaps.bed.gz',
...@@ -474,8 +472,7 @@ rule intersectPeaksAndTFBS: ...@@ -474,8 +472,7 @@ rule intersectPeaksAndTFBS:
rule intersectTFBSAndBAM: rule intersectTFBSAndBAM:
input: input:
bed = rules.intersectPeaksAndTFBS.output.TFBSinPeaksMod_bed, bed = rules.intersectPeaksAndTFBS.output.TFBSinPeaksMod_bed,
allBAMs = getBamFilesBasedOnPairedEnd, allBAMs = getBamFilesBasedOnPairedEnd
sampleFile = config["samples"]["summaryFile"]
output: output:
BAMOverlapRaw = temp(TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.allBAMs.overlaps.bed"), BAMOverlapRaw = temp(TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.allBAMs.overlaps.bed"),
BAMOverlap = TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.allBAMs.overlaps.bed.gz", BAMOverlap = TF_DIR + "/{TF}/" + extDir + "/" + compType + "{TF}.allBAMs.overlaps.bed.gz",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment