factor_ana_testing_ml.html 4.64 MB
 Bernd Klaus committed Nov 22, 2017 1 2 3 4 5 6 7 8 9 10 11 12 13   Bernd Klaus committed Apr 17, 2018 14   Bernd Klaus committed Nov 22, 2017 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167  Factor analysis, testing and machine learning for bioinformatics

Contents

LAST UPDATE AT

 Bernd Klaus committed Apr 17, 2018 232 
[1] "Mon Apr 16 10:10:11 2018"
 Bernd Klaus committed Nov 22, 2017 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 

1 Required packages and other preparations

library("readxl") library("BiocStyle") library("knitr") library("MASS") library("RColorBrewer") library("stringr") library("pheatmap") library("matrixStats") library("purrr") library("readr") library("magrittr") library("entropy") library("forcats") library("DESeq2") library("broom") library("tidyverse") library("limma") 
Bernd Klaus committed Nov 29, 2017  252                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             library("ggthemes") 
Bernd Klaus committed Dec 06, 2017  253     254     255     256     257     258     259     260                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     library("corpcor") library("sva") library("zinbwave") library("clusterExperiment") library("clue") library("sda") library("crossval") library("randomForest") 
Bernd Klaus committed Jan 31, 2018  261                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             library("keras") 
Bernd Klaus committed Nov 22, 2017  262                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
Bernd Klaus committed Dec 06, 2017  263                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             theme_set(theme_solarized(base_size = 18)) 
Bernd Klaus committed Nov 22, 2017  264     265                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
Bernd Klaus committed Dec 06, 2017  266                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             data_dir <- file.path("data")
 Bernd Klaus committed Nov 22, 2017 267 268 269 270 271 272 273 274 275 

1.1 mTEC single cell-RNA-Seq data import

We first import the mTEC data again.

 Bernd Klaus committed Dec 06, 2017 276 

1.2 Function to compute a PCA

 Bernd Klaus committed Nov 22, 2017 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 
compute_pca <- function(data_mat, ntop = 500, ...){      pvars <- rowVars(data_mat)   select <- order(pvars, decreasing = TRUE)[seq_len(min(ntop,                                                         length(pvars)))]      PCA <- prcomp(t(data_mat)[, select], center = TRUE, scale. = FALSE)   percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)         return(list(pca = data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],                       PC3 = PCA$x[,3], PC4 = PCA$x[,4], ...),               perc_var = percentVar,               selected_vars = select))  }

2 Statistical testing for high–throughput data

 Bernd Klaus committed Dec 06, 2017 297 

In high throughput data we often measure multiple features (e.g. genes) for which we then perform statistical tests (e.g. for differential expression). Choosing a p–value significance cutoff $$\alpha$$ of e.g. 10%, classical testing theory tells us that we will get a false positive result in 10% of the time. This is no longer true if multiple tests are performed simultaneously. At any $$\alpha$$ the probability of making no false rejection goes to zero quite rapidly:

 Bernd Klaus committed Nov 22, 2017 298 299 300 

$\underbrace{(1-\alpha)\cdot(1-\alpha)\cdot \dotso \cdot (1-\alpha)}_\text{many--times} \rightarrow 0$

 Bernd Klaus committed Dec 06, 2017 301 

For an $$\alpha$$ of 10% and 50 tests, we get a probability of 0.9^50 = 0.005 to not make a false rejection. So it is clear that we need different error measures for the multiple testing situation. Before we can define them, we need to look at the different error Types.

 Bernd Klaus committed Nov 22, 2017 302 303 

2.1 Error types and error rates

 Bernd Klaus committed Dec 06, 2017 304 305 306 307 308 

Suppose you are testing a hypothesis that a fold change $$\beta$$ equals zero versus the alternative that it does not equal zero. Let us assume that there are $$m_0$$ non–differentially expressed genes out of $$m$$ genes and that we declare $$R$$ genes as differentially expressed. (i.e. reject the null hypothesis). These are four possible outcomes:

 Bernd Klaus committed Nov 22, 2017 309 310 311 312 313 314 315 316 317 318 
• Type I error or false positive — Say that the fold change does not equal zero when it does

• Type II error or false negative — Say that the fold change equals zero when it doesn’t

Just like ordinary significance testing tries to control the false positive rate, there are two types of rates commonly used in multiple testing procedures:

• Family wise error rate (FWER) – The probability of at least one false positive $$\text{Pr}(FP \geq 1)$$

• False discovery rate (FDR) – The rate of false positives among all rejected hypothesis $$E\left[\frac{FP}{FP + TP}\right]$$

The FWER is conceptually closely related to classical significance testing, as it controls the probability of making any false rejection. However, as we will also see below, using a procedure to control it is generally too conservative in large scale problems.

 Bernd Klaus committed Dec 06, 2017 319 

The FDR allows for a certain rate of false positives and is the “typical” error rate controlled in large scale multiple testing problems.

 Bernd Klaus committed Nov 22, 2017 320 321 322 

2.2 Statistical testing with RNA–Seq data

 Bernd Klaus committed Dec 06, 2017 323 

Consider the following simulated RNA–Seq data, where we have simulated two groups with 8 samples in each group and a 1000 genes. Each group contains 2 batches: males and females. On top of this, there is genetic information on the subjects which we will use later on in the discussion of batch effects and unwanted variation.

 Bernd Klaus committed Nov 22, 2017 324 325 

The data is stored in an DESeqDataSet, and we apply a variance stabilization before we obtain the PCA plot.

load(file.path(data_dir, "dds_batch.RData")) 
Bernd Klaus committed Dec 06, 2017  326     327     328                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             dds_batch <- DESeq(dds_batch)  rld_batch <- assay(rlogTransformation(dds_batch, blind=TRUE)) 
Bernd Klaus committed Nov 22, 2017  329     330     331     332     333     334     335     336     337     338     339     340     341     342     343     344     345     346     347     348     349                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               batch_pca <- compute_pca(rld_batch, ntop = 500,                          condition = colData(dds_batch)$condition, sex = colData(dds_batch)$sex,                          gen = colData(dds_batch)$gen_back) sd_ratio <- sqrt(batch_pca$perc_var[2] / batch_pca$perc_var[1]) batch_pca_plot <- ggplot(batch_pca$pca, aes(x = PC1,  y = PC2,                 color =  condition,                 shape = sex)) +        geom_point(size = 4) +        ggtitle("PC1 vs PC2, top variable genes") +        labs(x = paste0("PC1, VarExp:", round(batch_pca$perc_var[1],4)), y = paste0("PC2, VarExp:", round(batch_pca$perc_var[2],4))) +        coord_fixed(ratio = sd_ratio) +        scale_colour_brewer(palette="Dark2")         batch_pca_plot
 Bernd Klaus committed Apr 17, 2018 350