From 9e73bbed6b4de91310d4e52a74cbf5e4badd59c7 Mon Sep 17 00:00:00 2001 From: Bernd Klaus Date: Wed, 22 Nov 2017 15:29:00 +0100 Subject: [PATCH] added first section on factor analysis --- factor_ana_testing_ml.R | 44 +++- factor_ana_testing_ml.Rmd | 72 ++++-- factor_ana_testing_ml.html | 340 ++++++++++++++++++++-------- get_citations_stat_methods_bioinf.R | 5 + stat_methods_bioinf.bib | 33 +++ 5 files changed, 372 insertions(+), 122 deletions(-) diff --git a/factor_ana_testing_ml.R b/factor_ana_testing_ml.R index 8bfd555..b2c9904 100644 --- a/factor_ana_testing_ml.R +++ b/factor_ana_testing_ml.R @@ -110,7 +110,7 @@ res_dds_batch <- results(dds_batch) res_dds_batch ## ----rnaSeqPvalHist------------------------------------------------------ -res_dds_batch <- as_data_frame(res_dds_batch) +res_dds_batch <- as.data.frame(res_dds_batch) ggplot(res_dds_batch, aes(x = pvalue)) + geom_histogram(binwidth = 0.05 ) @@ -161,6 +161,27 @@ ggplot(tibble(pv), aes(x = pv)) + subtitle = "variance of null distribution too low") +## ----holm---------------------------------------------------------------- +pv <- 2 - 2 * pnorm(abs(z), sd = sd.true) +pv_holm <- p.adjust(pv, method = "holm") +head(pv_holm[pv_holm < 0.1]) +z[pv_holm < 0.1] + +## ----holmrnaseq---------------------------------------------------------- +res_dds_batch$padj_holm <- p.adjust(res_dds_batch$pvalue, + method = "holm") +table(res_dds_batch$padj_holm < 0.1) +res_dds_batch[res_dds_batch$padj_holm < 0.1,] + +## ----bh------------------------------------------------------------------ +pv <- 2 - 2 * pnorm(abs(z), sd = sd.true) +pv_fdr <- p.adjust(pv, method = "fdr") +table(pv_fdr < 0.1) +z[pv_fdr < 0.1] + +## ----rna_fdr------------------------------------------------------------- +summary(results(dds_batch)) + ## ----remove_batch_sim---------------------------------------------------- cleaned_data <- removeBatchEffect(rld_batch, @@ -187,15 +208,24 @@ clean_pca_plot <- ggplot(cleaned_data_pca$pca, aes(x = PC1, y = PC2, clean_pca_plot +## ----chgdesign----------------------------------------------------------- +design(dds_batch) <- ~ sex + condition +dds_batch_sex <- DESeq(dds_batch) +res_sex <- results(dds_batch_sex) +resultsNames(dds_batch_sex) + +## ----pwrblocking--------------------------------------------------------- +summary(res_sex) + ## ----session_info, cache = FALSE----------------------------------------- sessionInfo() ## ----unloaAll, echo=FALSE, message=FALSE, eval = FALSE------------------- -## -## pkgs <- loaded_packages() %>% -## filter(package != "devtools") %>% -## {.$path} -## -## walk(pkgs, unload) + + pkgs <- loaded_packages() %>% + filter(package != "devtools") %>% + {.$path} + + walk(pkgs, unload) diff --git a/factor_ana_testing_ml.Rmd b/factor_ana_testing_ml.Rmd index a58ee42..cd43d19 100755 --- a/factor_ana_testing_ml.Rmd +++ b/factor_ana_testing_ml.Rmd @@ -22,7 +22,7 @@ bibliography: stat_methods_bioinf.bib

LAST UPDATE AT

-
   [1] "Wed Nov 22 08:18:53 2017"
+
   [1] "Wed Nov 22 15:26:41 2017"

1 Required packages and other preparations

library("readxl")
@@ -342,7 +347,7 @@ batch_pca_plot <- ggplot(batc
        scale_colour_brewer(palette="Dark2")
        
 batch_pca_plot 
-

+

We have a sex–related batch effect in our data: PC2 separates the samples by sex. However, PC1 clearly separates the two conditions so we do expect some differentially expressed genes. DESeq2 performs a test based on generalized linear model (glm). A generalized linear model is conceptually identical to an ordinary linear model. However, it allows for non–normally distributed responses. DESeq2 uses the negative binomial distribution to model count data. We can conveniently obtain the results of the test like so:

load(file.path(data_dir, "dds_batch.RData"))
 dds_batch <- DESeq(dds_batch)
@@ -372,12 +377,11 @@ res_dds_batch
gene999 1291 -0.138 0.262 -0.526 0.5990 0.832 gene1000 2093 -0.199 0.214 -0.932 0.3511 0.687

An import diagnostic plot is the histogram of p–values.

-
res_dds_batch <- as_data_frame(res_dds_batch)
-
   Warning in as.data.frame(x, row.names = NULL, optional = optional, ...):
-   Arguments in '...' ignored
-
ggplot(res_dds_batch, aes(x = pvalue)) +
+
res_dds_batch <- as.data.frame(res_dds_batch)
+
+ggplot(res_dds_batch, aes(x = pvalue)) +
   geom_histogram(binwidth = 0.05 )
-

+

We see a more or less uniform background, corresponding to genes that are not differentially expressed and a peak near zero, which represents the differentially expressed genes^See also David Robinson’s detailed blog post on that matter. We will now explore typical p–value histograms

@@ -410,7 +414,7 @@ z <- get.random.zscore( xlab("p-values") + ggtitle("Histogram of p-values, correct null distribution")
   `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-

We see that our \(p\)–values are uniformly distributed under the null hypotheses. Computing the \(p\)–values assuming a \(N(0,2)\) null distribution changes the picture.

+

We see that our \(p\)–values are uniformly distributed under the null hypotheses. Computing the \(p\)–values assuming a \(N(0,2)\) null distribution changes the picture.

pv <- 2 - 2*pnorm(abs(z), sd = 2)
 
 ggplot(tibble(pv),  aes(x = pv)) +
@@ -419,7 +423,7 @@ z <- get.random.zscore(      ggtitle("Histogram of p-values", 
               subtitle = "variance of null distribution too high") 
   `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-

+

If the assumed variance of the null distribution is too high, we often see hill–shaped \(p\)–value histogram. The p–values are to high on average and we cannot identify the true hits.

If the variance is too low, we get an extreme enrichment in small p–values, while we potentially loose the uniform background.

@@ -433,7 +437,7 @@ z <- get.random.zscore( ggtitle("Histogram of p-values", subtitle = "variance of null distribution too low")
   `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-