Commit a71e4242 authored by Bernd Klaus's avatar Bernd Klaus

finished multiple testing section

parent 42c44da3
......@@ -4,3 +4,4 @@ material_stat_methods/
*_files/
SRP022054/
rse_gene.Rdata
stockori_tmp.Rmd
## create simulated data
## Builds a simulated RNA-seq data set (16 samples, two conditions) with an
## artificial sex-linked batch effect; the saved object is used later in the
## batch-effect teaching material.
set.seed(123)
library(DESeq2)
library(ggplot2)
### create example data set with some differential expression between the two groups
dds_batch <- makeExampleDESeqDataSet(m=16, betaSD = 0.25, interceptMean = 12,
interceptSD = 3)
## add sex as a batch effect; m/f labels alternate across the 16 samples,
## giving 8 of each overall
## NOTE(review): the original comment claimed "two males and two females in
## each group", which does not match rep(c("m", "f"), 8) -- confirm intent
colData(dds_batch)$sex <- factor(rep(c("m", "f"), 8))
## modify counts to add a batch effect: add normally distributed random noise
## (mean 8, sd 1 -- see the rnorm() call below; an earlier comment said
## "mean 2") to randomly selected genes of male samples, then round
cts <- counts(dds_batch)
## NOTE(review): floor(runif(300) * 1000) can produce index 0 (a silent
## no-op when subsetting) and duplicate gene indices; sample(1000, 300)
## would be robust, but the expression is kept to preserve the seeded
## simulation results.
ranGenes <- floor(runif(300)*1000)
for(i in ranGenes){
cts[i, colData(dds_batch)$sex == "m"] <- as.integer(cts[i, colData(dds_batch)$sex == "m"] +
round(rnorm(1,8, sd = 1)))
}
# # simulate fold changes (disabled draft, kept for reference)
# for(i in seq_len(300)){
# cts[i, colData(dds_batch)$condition == "B"] <- as.integer(round(cts[i, colData(dds_batch)$condition == "B"] *
# abs(rnorm(1, 0.1))))
# }
## spot-check: log2 fold change of gene 50 between conditions B and A
log2(mean(cts[50, colData(dds_batch)$condition == "B"])
/ mean(cts[50, colData(dds_batch)$condition == "A"]))
## write the modified counts back and run the DESeq2 pipeline
counts(dds_batch) <- cts
dds_batch <- DESeq(dds_batch, betaPrior = TRUE)
results(dds_batch)[50,]
########################################################################
## add genotype confounding
## Add a genetic-background covariate (dosages 0/1/2) correlated with
## condition. NOTE(review): gen has length 8 but there are 16 samples;
## the colData assignment relies on recycling -- confirm this is intended.
gen <- c(0,1,1,1,0,2,2,2)
# gen <- 2^gen
colData(dds_batch)$gen_back <- factor(gen)
## re-test after adding the genotype annotation
res <- results(DESeq(dds_batch))
summary(res)
## genes with |log2FC| > 0.8 get their condition effect attenuated below
idx_fc <- abs(res$log2FoldChange) > 0.8
cts <- counts(dds_batch)
# adapt fold changes to reflect genetic bias: shrink the counts of the
# higher-expressed condition by a factor depending on 2^gen, so the
# apparent condition effect is partly explained by genotype
for(i in which(idx_fc)){
if(res[i, "log2FoldChange"] > 0){
cts[i, colData(dds_batch)$condition == "B"] <- as.integer(round(cts[i,
colData(dds_batch)$condition == "B"] / 2^(res[i, "log2FoldChange"]
/ (2^gen))))
} else {
cts[i, colData(dds_batch)$condition == "A"] <- as.integer(round(cts[i,
colData(dds_batch)$condition == "A"] * 2^(res[i, "log2FoldChange"]
/ (2^gen))))
}
}
set.seed(292827)
# disabled draft: add Poisson noise scaled by genotype
# poisson <- Vectorize(function(x) {rpois(1, x)})
# c2 <- counts(dds_batch)[idx_fc] + t(base::apply(round(counts(dds_batch) / 1000)* gen,
# 1, poisson))
## refit on the confounded counts and save for use in the course material
counts(dds_batch) <- cts
dds_batch <- estimateSizeFactors(dds_batch)
#sizeFactors(dds_batch)
dds_batch <- DESeq(dds_batch)
summary(results(dds_batch))
save(dds_batch, file = "dds_batch.RData")
\ No newline at end of file
## ----options, include=FALSE----------------------------------------------
## Global knitr and output configuration for the rendered document.
library(knitr)
options(digits = 3, width = 80)
# figures use a golden-ratio width:height relation
golden_ratio <- (1 + sqrt(5)) / 2
opts_chunk$set(
  echo = TRUE,
  tidy = FALSE,
  include = TRUE,
  cache = TRUE,
  dev = c("png", "pdf", "svg"),
  fig.height = 5,
  fig.width = 4 * golden_ratio,
  dpi = 300,
  comment = " "
)
## ---- echo=FALSE, cache=FALSE--------------------------------------------
# record the rendering date in the output
print(date())
## ----required_packages_and_data, echo = TRUE, cache=FALSE, message=FALSE----
## Attach all required packages once. Duplicated library() calls from the
## original list (readxl, forcats, matrixStats, pheatmap) were removed:
## repeated library() calls are no-ops, so the attach order -- and therefore
## any function masking -- is unchanged.
library("readxl")
library("scran")
library("BiocStyle")
library("knitr")
library("MASS")
library("RColorBrewer")
library("stringr")
library("pheatmap")
library("matrixStats")
library("purrr")
library("readr")
library("factoextra")
library("magrittr")
library("entropy")
library("forcats")
library("DESeq2")
library("broom")
library("locfit")
library("recount")
library("psych")
library("vsn")
library("tidyverse")
library("Rtsne")
library("limma")
## ggplot2 defaults and location of the input data
theme_set(theme_gray(base_size = 18))
data_dir <- file.path("data/")
## ----import_data---------------------------------------------------------
## Load preprocessed objects from the data directory. File names suggest
## mTEC single-cell counts, cell/gene annotation and a TRA gene list --
## NOTE(review): object contents are not visible here; confirm against the
## data files.
load(file.path(data_dir, "mtec_counts.RData"))
load(file.path(data_dir, "mtec_cell_anno.RData"))
load(file.path(data_dir, "mtec_gene_anno.RData"))
load(file.path(data_dir, "tras.RData"))
## ----scran_norm----------------------------------------------------------
## Compute scran pooling-based size factors from the count matrix (column 1
## is dropped -- presumably a gene-id column; confirm against mtec_counts)
## and attach them to the cell annotation table, matched by cell id.
mtec_scran_sf <- tibble(scran_sf = computeSumFactors(as.matrix(mtec_counts[, -1])),
cell_id = colnames(mtec_counts)[-1])
mtec_cell_anno <- left_join(mtec_cell_anno, mtec_scran_sf,
by = c("cellID" = "cell_id"))
## ----compute_pca---------------------------------------------------------
## PCA on the `ntop` most variable rows (genes) of a numeric matrix.
## data_mat: genes x samples matrix; ntop: number of top-variance rows to
## keep; ...: extra per-sample columns to carry along in the score table.
## Returns a list with the first four PC scores (`pca`), the percentage of
## variance per component (`perc_var`) and the chosen row indices
## (`selected_vars`).
compute_pca <- function(data_mat, ntop = 500, ...) {
  # rank rows by variance and keep at most `ntop` of them
  row_var <- rowVars(data_mat)
  n_keep <- min(ntop, length(row_var))
  top_idx <- order(row_var, decreasing = TRUE)[seq_len(n_keep)]
  # samples become rows for prcomp; data are centered but not scaled
  fit <- prcomp(t(data_mat)[, top_idx], center = TRUE, scale. = FALSE)
  pct_var <- round(100 * fit$sdev^2 / sum(fit$sdev^2), 1)
  scores <- data.frame(PC1 = fit$x[, 1], PC2 = fit$x[, 2],
                       PC3 = fit$x[, 3], PC4 = fit$x[, 4], ...)
  list(pca = scores,
       perc_var = pct_var,
       selected_vars = top_idx)
}
## ----sim_rna_seq---------------------------------------------------------
## PCA of the simulated batch-effect data set on the rlog scale.
load(file.path(data_dir, "dds_batch.RData"))
dds_batch <- DESeq(dds_batch)
# blind = TRUE: the rlog transformation does not use the design
rld_batch <- assay(rlogTransformation(dds_batch, blind=TRUE))
batch_pca <- compute_pca(rld_batch, ntop = 500,
condition = colData(dds_batch)$condition,
sex = colData(dds_batch)$sex,
gen = colData(dds_batch)$gen_back)
# fix the aspect ratio to the sd ratio of PC2 vs PC1 so that plotted
# distances reflect the variance actually explained
sd_ratio <- sqrt(batch_pca$perc_var[2] / batch_pca$perc_var[1])
batch_pca_plot <- ggplot(batch_pca$pca, aes(x = PC1, y = PC2,
color = condition,
shape = sex)) +
geom_point(size = 4) +
ggtitle("PC1 vs PC2, top variable genes") +
labs(x = paste0("PC1, VarExp:", round(batch_pca$perc_var[1],4)),
y = paste0("PC2, VarExp:", round(batch_pca$perc_var[2],4))) +
coord_fixed(ratio = sd_ratio) +
scale_colour_brewer(palette="Dark2")
batch_pca_plot
## ----rna_seq_pval--------------------------------------------------------
## Re-fit the confounded simulated data set and inspect the test results.
load(file.path(data_dir, "dds_batch.RData"))
dds_batch <- DESeq(dds_batch)
res_dds_batch <- results(dds_batch)
res_dds_batch
## ----rnaSeqPvalHist------------------------------------------------------
## p-value histogram of the results table.
## as_data_frame() is deprecated in the tibble package; coerce the S4
## DESeqResults explicitly via as.data.frame() and then as_tibble().
res_dds_batch <- as_tibble(as.data.frame(res_dds_batch))
ggplot(res_dds_batch, aes(x = pvalue)) +
geom_histogram(binwidth = 0.05 )
## ----simzscores----------------------------------------------------------
## Simulate z-scores from a two-component mixture: a fraction eta0.true of
## true nulls drawn from N(0, sd.true) and the remainder alternatives drawn
## from N(2, 1).
sd.true <- 1
eta0.true <- 0.75
## Draw m z-scores (m0 nulls followed by m - m0 alternatives).
## Bug fix: the original passed m * eta0.true straight to rnorm(); rnorm()
## silently truncates a non-integer n, so e.g. m = 10 returned only 9
## values (7 + 2). floor() makes the two groups always sum to m. For the
## m = 200 call used below, m0 = 150 exactly as before, so the seeded
## results are unchanged.
get.random.zscore <- function(m = 200)
{
  m0 <- floor(m * eta0.true)  # number of null z-scores
  m1 <- m - m0                # number of alternative z-scores
  z <- c(rnorm(m0, mean = 0, sd = sd.true),
         rnorm(m1, mean = 2, sd = 1))
  return(z)
}
set.seed(555)
z <- get.random.zscore(200)
## ----pvaluehistogram-----------------------------------------------------
## Two-sided p-values under the correct null N(0, sd.true).
## 2 * pnorm(abs(z), ..., lower.tail = FALSE) is mathematically identical
## to the original 2 - 2 * pnorm(abs(z), ...) but numerically stable: it
## avoids catastrophic cancellation for large |z|.
pv <- 2 * pnorm(abs(z), sd = sd.true, lower.tail = FALSE)
ggplot(tibble(pv), aes(x = pv)) +
geom_histogram(aes(fill = I("navyblue"))) +
xlab("p-values") +
ggtitle("Histogram of p-values, correct null distribution")
## ----pvaluehistogramwrongnull--------------------------------------------
## Null sd overstated (2 instead of 1): p-values pile up near 1.
pv <- 2 * pnorm(abs(z), sd = 2, lower.tail = FALSE)
ggplot(tibble(pv), aes(x = pv)) +
geom_histogram(aes(fill = I("coral2"))) +
xlab("p-values") +
ggtitle("Histogram of p-values",
subtitle = "variance of null distribution too high")
## ----pvaluehistogramwrongnull2-------------------------------------------
## Null sd understated (0.5): excess of small p-values, spurious hits.
pv <- 2 * pnorm(abs(z), sd = 0.5, lower.tail = FALSE)
ggplot(tibble(pv), aes(x = pv)) +
geom_histogram(aes(fill = I("chartreuse4"))) +
xlab("p-values") +
ggtitle("Histogram of p-values",
subtitle = "variance of null distribution too low")
## ----remove_batch_sim----------------------------------------------------
## Remove the sex batch effect from the rlog data with limma, protecting
## the condition effect through the design matrix, then redo the PCA.
cleaned_data <- removeBatchEffect(rld_batch,
batch = batch_pca$pca$sex,
design = model.matrix( ~ batch_pca$pca$condition))
# ntop is left at its default of 500, matching the PCA before removal
cleaned_data_pca <- compute_pca(cleaned_data,
condition = batch_pca$pca$condition,
sex = batch_pca$pca$sex)
# aspect ratio again tied to the PC2/PC1 sd ratio
sd_ratio <- sqrt(cleaned_data_pca$perc_var[2] / cleaned_data_pca$perc_var[1])
clean_pca_plot <- ggplot(cleaned_data_pca$pca, aes(x = PC1, y = PC2,
color = condition,
shape = sex)) +
geom_point(size = 4) +
ggtitle("PC1 vs PC2, top variable genes") +
labs(x = paste0("PC1, VarExp:", round(cleaned_data_pca$perc_var[1],4)),
y = paste0("PC2, VarExp:", round(cleaned_data_pca$perc_var[2],4))) +
coord_fixed(ratio = sd_ratio) +
scale_colour_brewer(palette="Paired")
clean_pca_plot
## ----session_info, cache = FALSE-----------------------------------------
## Record package versions for reproducibility.
sessionInfo()
## ----unloaAll, echo=FALSE, message=FALSE, eval = FALSE-------------------
## eval = FALSE chunk: when run manually, would unload every attached
## package except devtools.
##
## pkgs <- loaded_packages() %>%
## filter(package != "devtools") %>%
## {.$path}
##
## walk(pkgs, unload)
This diff is collapsed.
This diff is collapsed.
citep("10.1198/106186008X318440")
citep("10.1007/BF02289565")
citep("10.1101/125112")
# Perraudeau et. al. , 2017
citep("10.12688/f1000research.12122.1")
write.bibtex(file = "stat_methods_bioinf.bib")
knitcitations::cleanbib()
# van der Maaten and Hinton, 2008
write("\n", file = "stat_methods_bioinf.bib", append = TRUE)
write("@article{vanDerMaaten_2008,
author = {van der Maaten, Laurens and Hinton, Geoffrey},
......@@ -55,3 +61,35 @@ write("@article{vanDerMaaten_2008,
}", file = "stat_methods_bioinf.bib", append = TRUE)
write("\n", file = "stat_methods_bioinf.bib", append = TRUE)
# Risso et. al., 2017
write("\n", file = "stat_methods_bioinf.bib", append = TRUE)
write("@article {Risso_2017,
author = {Risso, Davide and Perraudeau, Fanny and Gribkova, Svetlana and Dudoit, Sandrine and Vert, Jean-Philippe},
title = {ZINB-WaVE: A general and flexible method for signal extraction from single-cell RNA-seq data},
year = {2017},
doi = {10.1101/125112},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Single-cell RNA sequencing (scRNA-seq) is a powerful high-throughput technique that enables researchers to measure genome-wide transcription levels at the resolution of single cells. Because of the low amount of RNA present in a single cell, some genes may fail to be detected even though they are expressed; these genes are usually referred to as dropouts. Here, we present a general and flexible zero-inflated negative binomial model (ZINB-WaVE), which leads to low-dimensional representations of the data that account for zero inflation (dropouts), over-dispersion, and the count nature of the data. We demonstrate, with simulated and real data, that the model and its associated estimation procedure are able to give a more stable and accurate low-dimensional representation of the data than principal component analysis (PCA) and zero-inflated factor analysis (ZIFA), without the need for a preliminary normalization step.},
URL = {https://www.biorxiv.org/content/early/2017/11/02/125112},
eprint = {https://www.biorxiv.org/content/early/2017/11/02/125112.full.pdf},
journal = {bioRxiv}
}", file = "stat_methods_bioinf.bib", append = TRUE)
write("\n", file = "stat_methods_bioinf.bib", append = TRUE)
# Benjamini and Hochberg, 1995
write("\n", file = "stat_methods_bioinf.bib", append = TRUE)
write("@article{Benjamini_1995,
ISSN = {00359246},
URL = {http://www.jstor.org/stable/2346101},
abstract = {The common approach to the multiplicity problem calls for controlling the familywise error rate (FWER). This approach, though, has faults, and we point out a few. A different approach to problems of multiple significance testing is presented. It calls for controlling the expected proportion of falsely rejected hypotheses-the false discovery rate. This error rate is equivalent to the FWER when all hypotheses are true but is smaller otherwise. Therefore, in problems where the control of the false discovery rate rather than that of the FWER is desired, there is potential for a gain in power. A simple sequential Bonferroni-type procedure is proved to control the false discovery rate for independent test statistics, and a simulation study shows that the gain in power is substantial. The use of the new procedure and the appropriateness of the criterion are illustrated with examples.},
author = {Yoav Benjamini and Yosef Hochberg},
journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
number = {1},
pages = {289-300},
publisher = {[Royal Statistical Society, Wiley]},
title = {Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing},
volume = {57},
year = {1995}
}", file = "stat_methods_bioinf.bib", append = TRUE)
write("\n", file = "stat_methods_bioinf.bib", append = TRUE)
\ No newline at end of file
......@@ -123,6 +123,19 @@
journal = {Genome Biology},
}
@Article{Perraudeau_2017,
doi = {10.12688/f1000research.12122.1},
url = {https://doi.org/10.12688/f1000research.12122.1},
year = {2017},
month = {jul},
publisher = {F1000 Research, Ltd.},
volume = {6},
pages = {1158},
author = {Fanny Perraudeau and Davide Risso and Kelly Street and Elizabeth Purdom and Sandrine Dudoit},
title = {Bioconductor workflow for single-cell {RNA} sequencing: Normalization, dimensionality reduction, clustering, and lineage inference},
journal = {F1000Research},
}
@Article{Pinto_2013,
doi = {10.1073/pnas.1308311110},
url = {https://doi.org/10.1073/pnas.1308311110},
......@@ -166,3 +179,35 @@
}
@article {Risso_2017,
author = {Risso, Davide and Perraudeau, Fanny and Gribkova, Svetlana and Dudoit, Sandrine and Vert, Jean-Philippe},
title = {ZINB-WaVE: A general and flexible method for signal extraction from single-cell RNA-seq data},
year = {2017},
doi = {10.1101/125112},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Single-cell RNA sequencing (scRNA-seq) is a powerful high-throughput technique that enables researchers to measure genome-wide transcription levels at the resolution of single cells. Because of the low amount of RNA present in a single cell, some genes may fail to be detected even though they are expressed; these genes are usually referred to as dropouts. Here, we present a general and flexible zero-inflated negative binomial model (ZINB-WaVE), which leads to low-dimensional representations of the data that account for zero inflation (dropouts), over-dispersion, and the count nature of the data. We demonstrate, with simulated and real data, that the model and its associated estimation procedure are able to give a more stable and accurate low-dimensional representation of the data than principal component analysis (PCA) and zero-inflated factor analysis (ZIFA), without the need for a preliminary normalization step.},
URL = {https://www.biorxiv.org/content/early/2017/11/02/125112},
eprint = {https://www.biorxiv.org/content/early/2017/11/02/125112.full.pdf},
journal = {bioRxiv}
}
@article{Benjamini_1995,
ISSN = {00359246},
URL = {http://www.jstor.org/stable/2346101},
abstract = {The common approach to the multiplicity problem calls for controlling the familywise error rate (FWER). This approach, though, has faults, and we point out a few. A different approach to problems of multiple significance testing is presented. It calls for controlling the expected proportion of falsely rejected hypotheses-the false discovery rate. This error rate is equivalent to the FWER when all hypotheses are true but is smaller otherwise. Therefore, in problems where the control of the false discovery rate rather than that of the FWER is desired, there is potential for a gain in power. A simple sequential Bonferroni-type procedure is proved to control the false discovery rate for independent test statistics, and a simulation study shows that the gain in power is substantial. The use of the new procedure and the appropriateness of the criterion are illustrated with examples.},
author = {Yoav Benjamini and Yosef Hochberg},
journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
number = {1},
pages = {289-300},
publisher = {[Royal Statistical Society, Wiley]},
title = {Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing},
volume = {57},
year = {1995}
}
zinba.png

307 KB

Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment