Commit bc325de9 authored by Bernd Klaus's avatar Bernd Klaus

first final version, without slides

parent 80c7cb31
......@@ -6,3 +6,14 @@ SRP022054/
rse_gene.Rdata
stockori_tmp.Rmd
factanal.R
compare_to_scLVM.R
scran_offset_zinbawave.R
clustering.pdf
clustering_org.pdf
clustering_cl_10.pdf
data/cl_mtec.RData
data/genes_for_clustering.RData
data/nomarkerCellsClustering.RData
data/zinb.RData
get_clustering_aljeandro.R
norm.pdf
......@@ -4,14 +4,13 @@ options(digits=3, width=80)
golden_ratio <- (1 + sqrt(5)) / 2
opts_chunk$set(echo=TRUE,tidy=FALSE,include=TRUE,
dev=c('png', 'pdf', 'svg'), fig.height = 5, fig.width = 4 * golden_ratio, comment = ' ', dpi = 300,
cache = TRUE)
cache = TRUE, message = FALSE)
## ---- echo=FALSE, cache=FALSE--------------------------------------------
print(date())
## ----required_packages_and_data, echo = TRUE, cache=FALSE, message=FALSE----
library("readxl")
library("scran")
library("BiocStyle")
library("knitr")
library("MASS")
......@@ -21,29 +20,28 @@ library("pheatmap")
library("matrixStats")
library("purrr")
library("readr")
library("factoextra")
library("magrittr")
library("entropy")
library("forcats")
library("forcats")
library("readxl")
library("DESeq2")
library("broom")
library("locfit")
library("recount")
library("psych")
library("vsn")
library("matrixStats")
library("pheatmap")
library("tidyverse")
library("Rtsne")
library("limma")
library("ggthemes")
library("corpcor")
library("sva")
library("zinbwave")
library("clusterExperiment")
library("clue")
library("sda")
library("crossval")
library("randomForest")
theme_set(theme_gray(base_size = 18))
theme_set(theme_solarized(base_size = 18))
data_dir <- file.path("data/")
data_dir <- file.path("data")
## ----import_data---------------------------------------------------------
......@@ -52,13 +50,6 @@ load(file.path(data_dir, "mtec_cell_anno.RData"))
load(file.path(data_dir, "mtec_gene_anno.RData"))
load(file.path(data_dir, "tras.RData"))
## ----scran_norm----------------------------------------------------------
mtec_scran_sf <- tibble(scran_sf = computeSumFactors(as.matrix(mtec_counts[, -1])),
cell_id = colnames(mtec_counts)[-1])
mtec_cell_anno <- left_join(mtec_cell_anno, mtec_scran_sf,
by = c("cellID" = "cell_id"))
## ----compute_pca---------------------------------------------------------
compute_pca <- function(data_mat, ntop = 500, ...){
......@@ -104,6 +95,17 @@ batch_pca_plot <- ggplot(batch_pca$pca, aes(x = PC1, y = PC2,
batch_pca_plot
ggplot(batch_pca$pca, aes(sex, PC2, color = sex)) +
geom_jitter(height = 0, width = 0.1) +
ggtitle("PC2 by sex") +
geom_boxplot(alpha = 0.1) +
scale_color_fivethirtyeight()
## ----pc2ttest, results='hide', echo=FALSE--------------------------------
t.test(PC2 ~ sex, data = (batch_pca$pca))
## ----rna_seq_pval--------------------------------------------------------
load(file.path(data_dir, "dds_batch.RData"))
dds_batch <- DESeq(dds_batch)
......@@ -114,7 +116,8 @@ res_dds_batch
res_dds_batch <- as.data.frame(res_dds_batch)
ggplot(res_dds_batch, aes(x = pvalue)) +
geom_histogram(binwidth = 0.05 )
geom_histogram(binwidth = 0.025, boundary = 0 ) +
ggtitle("p-value histogram for simulated RNA-Seq data")
## ----simzscores----------------------------------------------------------
......@@ -139,7 +142,8 @@ z <- get.random.zscore(200)
pv <- 2 - 2*pnorm(abs(z), sd = sd.true)
ggplot(tibble(pv), aes(x = pv)) +
geom_histogram(aes(fill = I("navyblue"))) +
geom_histogram(aes(fill = I(tableau_color_pal('tableau10light')(3)[1])),
boundary = 0, binwidth = 0.025) +
xlab("p-values") +
ggtitle("Histogram of p-values, correct null distribution")
......@@ -147,16 +151,19 @@ ggplot(tibble(pv), aes(x = pv)) +
pv <- 2 - 2*pnorm(abs(z), sd = 2)
ggplot(tibble(pv), aes(x = pv)) +
geom_histogram(aes(fill = I("coral2"))) +
geom_histogram(aes(fill = I(tableau_color_pal('tableau10light')(3)[2])),
boundary = 0, binwidth = 0.025) +
xlab("p-values") +
ggtitle("Histogram of p-values",
subtitle = "variance of null distribution too high")
## ----pvaluehistogramwrongnull2-------------------------------------------
## ----pvaluehistogramwrongnull2, results="hide", fig.show="hide"----------
pv <- 2 - 2*pnorm(abs(z), sd = 0.5)
ggplot(tibble(pv), aes(x = pv)) +
geom_histogram(aes(fill = I("chartreuse4"))) +
geom_histogram(aes(
fill = I(tableau_color_pal('tableau10light')(3)[3])),
boundary = 0, binwidth = 0.025) +
xlab("p-values") +
ggtitle("Histogram of p-values",
subtitle = "variance of null distribution too low")
......@@ -200,13 +207,20 @@ clean_pca_plot <- ggplot(cleaned_data_pca$pca, aes(x = PC1, y = PC2,
color = condition,
shape = sex)) +
geom_point(size = 4) +
ggtitle("PC1 vs PC2, top variable genes") +
ggtitle("PC1 vs PC2, cleaned data, top variable genes") +
labs(x = paste0("PC1, VarExp:", round(cleaned_data_pca$perc_var[1],4)),
y = paste0("PC2, VarExp:", round(cleaned_data_pca$perc_var[2],4))) +
coord_fixed(ratio = sd_ratio) +
scale_colour_brewer(palette="Paired")
clean_pca_plot
scale_colour_brewer(palette="Set2")
clean_pca_plot
ggplot(cleaned_data_pca$pca, aes(sex, PC2, color = sex)) +
geom_jitter(height = 0, width = 0.1) +
ggtitle("PC2 by sex") +
geom_boxplot(alpha = 0.1) +
scale_color_fivethirtyeight()
## ----chgdesign-----------------------------------------------------------
......@@ -218,15 +232,397 @@ resultsNames(dds_batch_sex)
## ----pwrblocking---------------------------------------------------------
summary(res_sex)
## ----runfactor-----------------------------------------------------------
factor_ana <- function(X, ntop = 500, ...){
cen_scaled_input <- scale(t(X))
pvars <- rowVars(cen_scaled_input)
select <- order(pvars, decreasing = TRUE)[seq_len(min(ntop,
length(pvars)))]
cen_scaled_input <- cen_scaled_input[select, ]
cor_matrix <- cor.shrink(cen_scaled_input)
rld_batch_fac <- factanal(covmat = cor_matrix, factors = 4,
n.obs = ncol(cen_scaled_input),
rotation = "none")
loadings <- rld_batch_fac$loadings[, 1:4]
factors <- as_data_frame(cen_scaled_input %*% solve(cor_matrix, loadings)) %>%
add_column(...)
return(factors)
}
## ----runfactanal---------------------------------------------------------
factors_rld_batch <- factor_ana(rld_batch, ntop = 500,
sex = batch_pca$pca$sex,
condition = batch_pca$pca$condition,
gen = batch_pca$pca$gen)
batch_fac_plot <- ggplot(factors_rld_batch, aes(x = Factor1, y = Factor2,
color = condition,
shape = sex)) +
geom_point(size = 4) +
ggtitle("Factor 1 vs Factor 2, top 500 genes") +
coord_fixed(ratio = sd_ratio) +
scale_colour_tableau(palette = "tableau10")
batch_fac_plot
## ----facvspca, results="hide", fig.show="hide"---------------------------
ggplot(factors_rld_batch, aes(sex, Factor2, color = sex)) +
geom_jitter(height = 0, width = 0.1) +
ggtitle("Factor2 by sex") +
geom_boxplot(alpha = 0.1) +
scale_color_fivethirtyeight()
t.test(Factor2 ~ sex, data = factors_rld_batch)
ggplot(factors_rld_batch, aes(condition, Factor2, color = condition)) +
geom_jitter(height = 0, width = 0.1) +
ggtitle("Factor2 by condition") +
geom_boxplot(alpha = 0.1) +
scale_color_fivethirtyeight()
t.test(Factor2 ~ condition, data = factors_rld_batch)
cor(as.numeric(batch_pca$pca$sex), factors_rld_batch$Factor2)
cor(as.numeric(batch_pca$pca$condition), factors_rld_batch$Factor2)
cor(as.numeric(factors_rld_batch$gen), factors_rld_batch$Factor4)
cor(as.numeric(factors_rld_batch$gen), batch_pca$pca$PC1)
## ----factordesign--------------------------------------------------------
colData(dds_batch) <- cbind(colData(dds_batch),
select(factors_rld_batch, 1:4))
design(dds_batch) <- ~ sex + Factor1 + Factor3 + Factor4 + condition
dds_batch_f <- DESeq(dds_batch)
res_f <- results(dds_batch_f)
summary(res_f)
summary(res_sex)
## ----factorpcaoverlap, results='hide'------------------------------------
idx_joint <- (res_f$padj < 0.1) & (res_sex$padj < 0.1)
res_f[idx_joint, ]
res_sex[idx_joint , ]
res_f[setdiff(which((res_sex$padj < 0.1)), which(idx_joint)), ]
res_sex[setdiff(which((res_sex$padj < 0.1)), which(idx_joint)) , ]
res_f[setdiff(which((res_f$padj < 0.1)), which(idx_joint)), ]
res_sex[setdiff(which((res_f$padj < 0.1)), which(idx_joint)) , ]
## ----secondfactor--------------------------------------------------------
ggplot(factors_rld_batch, aes(condition, Factor2, color = condition)) +
geom_jitter(height = 0, width = 0.1) +
ggtitle("Factor2 by condition") +
geom_boxplot(alpha = 0.1) +
scale_color_fivethirtyeight()
t.test(Factor2 ~ condition, data = factors_rld_batch)
## ----mult_reg_confounding------------------------------------------------
tidy(lm(cbind(as.numeric(colData(dds_batch)$sex),
as.numeric(colData(dds_batch)$gen_back)) ~ as.numeric(colData(dds_batch)$condition)))
## ----sva_sim-------------------------------------------------------------
mod <- model.matrix(~ condition, data = colData(dds_batch))
mod0 <- model.matrix(~ 1, data = colData(dds_batch))
n.sv <- num.sv(rld_batch, mod, method = "be", B = 100, vfilter = 500)
sva_res <- sva(rld_batch, mod, mod0, n.sv = n.sv, vfilter = 500, B = 20)
ggplot(data_frame(condition = factors_rld_batch$gen,
sv = as.vector(sva_res$sv)),
aes(condition, sv, color = condition)) +
geom_jitter(height = 0, width = 0.1) +
ggtitle("Surrogate variable by genetic background") +
geom_boxplot(alpha = 0.1) +
scale_color_fivethirtyeight()
anova(lm(as.vector(sva_res$sv) ~ factors_rld_batch$condition))
## ----svdesign------------------------------------------------------------
colData(dds_batch)$sv <- as.vector(sva_res$sv)
design(dds_batch) <- ~ sv + condition
dds_batch_sva <- DESeq(dds_batch)
res_sva <- results(dds_batch_sva)
summary(res_sva)
## ----runzinbawave--------------------------------------------------------
no_marker_cells <- filter(mtec_cell_anno, SurfaceMarker == "None")
count_matrix_nomarker <- as.matrix(mtec_counts[, no_marker_cells$cellID])
rownames(count_matrix_nomarker) <- mtec_counts$ensembl_id
#
# zinb <- zinbFit(count_matrix_nomarker,
# K=1, epsilon=1000)
# save(zinb,file = file.path(data_dir, "zinb.RData"))
## ----zinbanormcorrected--------------------------------------------------
load(file.path(data_dir, "zinb.RData"))
mtec_norm <- zinbwave(SummarizedExperiment(count_matrix_nomarker),
fitted_model=zinb,
normalizedValues=TRUE,
residuals = TRUE)
# keep only the residuals in the summarized experiment
assay(mtec_norm, 1) <- NULL
assay(mtec_norm, 1) <- NULL
colData(mtec_norm) <- DataFrame(no_marker_cells)
rowData(mtec_norm) <- DataFrame(filter(mtec_gene_anno,
ensembl_id %in% rownames(mtec_norm)))
mtec_norm
## ----checkscnorm---------------------------------------------------------
mtec_norm_tidy <- as.data.frame(assay(mtec_norm)) %>% {
colnames(.) = colData(mtec_norm)$cellID
.[, sample(ncol(mtec_norm), 25)]
} %>%
gather(key = "cell", value = "expression")
medians <- mtec_norm_tidy %>% group_by(cell) %>%
summarize(med = median(expression)) %>%
arrange(med)
mtec_norm_tidy$cell %<>% factor(levels = medians$cell)
pl <- ggplot(mtec_norm_tidy, aes(cell, expression, color = cell)) +
geom_boxplot() +
guides(color = FALSE) +
ggtitle("zinbawave normalized/cleaned expression data [log]") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
pl
#ggsave("norm.pdf", pl, width = 28, height = 14)
## ----clusteringprep------------------------------------------------------
load(file.path(data_dir, "genes_for_clustering.RData"))
load(file.path(data_dir,"nomarkerCellsClustering.RData"))
mtec_norm_sub <- mtec_norm[genes_for_clustering, ]
rowData(mtec_norm_sub)$pub_cls <- cl_class_ids(
nomarkerCellsClustering[["consensus"]])
leftover_genes <- rowData(mtec_norm_sub)$pub_cls == 12
data_for_cl <- assay(mtec_norm_sub[!leftover_genes,])
colnames(data_for_cl) <- colData(mtec_norm_sub)$cellID
## ----runscclustering, eval=FALSE-----------------------------------------
cl_mtec <- clusterSingle(t(data_for_cl),
subsample = TRUE,
isCount = FALSE,
clusterLabel = "mtec_sub",
dimReduce = "PCA",
ndims = 10,
mainClusterArgs = list(minSize = 10,
clusterFunction = "pam",
clusterArgs=list(k=11)),
subsampleArgs = list(clusterFunction = "pam",
clusterArgs=list(k=11),
largeDataset = TRUE,
random.seed = 1234,
ncores = 1,
resamp.num = 10))
# save(cl_mtec, file = file.path(data_dir, "cl_mtec.RData"))
## ----validateclustering--------------------------------------------------
load(file = file.path(data_dir, "cl_mtec.RData"))
row_order <- tibble(genes = rownames(data_for_cl),
clusters = as.numeric(primaryClusterNamed(cl_mtec)),
clusters_org = rowData(mtec_norm_sub)$pub_cls[
rowData(mtec_norm_sub)$pub_cls!=12]) %>%
arrange(clusters) %>%
as.data.frame(.) %>%
{rownames(.) = .$genes
.
}
data_cor <- cor(t( data_for_cl[row_order$genes,]))
br <- seq(-1, 1, length.out=101) ** 3
cols <-
colorRampPalette(brewer.pal(9, "RdBu"), interpolate="spline", space="Lab")(100)
ann_colors = list(
clusters = tableau_color_pal(palette = "tableau20")(11),
clusters_org = tableau_color_pal(palette = "bluered12")(11)
)
pdf("clustering.pdf", width = 14, height = 14,
title = "single cell clustering using zinbawave corrected data")
pheatmap(data_cor,
color = cols,
breaks = br,
border_color = "grey60", cellwidth = NA, cellheight = NA, scale = "none",
cluster_rows = FALSE, cluster_cols = FALSE, legend = TRUE,
legend_labels = NA,
annotation_row = row_order[, "clusters", drop = FALSE],
show_rownames = FALSE,
show_colnames = FALSE,
annotation_colors = ann_colors
)
dev.off()
## ----heatorg, results='hide'---------------------------------------------
row_order <- arrange(row_order, clusters_org) %>%
{rownames(.) = .$genes
.
}
pdf("clustering_org.pdf", width = 14, height = 14,
title = "original clustering using zinbawave corrected data")
pheatmap(data_cor[row_order$genes[1:1000], row_order$genes[1:1000]],
color = cols,
breaks = br,
border_color = "grey60", cellwidth = NA, cellheight = NA, scale = "none",
cluster_rows = FALSE, cluster_cols = FALSE, legend = TRUE,
legend_labels = NA,
annotation_row = row_order[, "clusters_org", drop = FALSE],
show_rownames = FALSE,
show_colnames = FALSE,
annotation_colors = ann_colors
)
dev.off()
## ----cl10heat, results="hide"--------------------------------------------
idx_10 <- which(row_order$clusters == 10)
pdf("clustering_cl_10.pdf", width = 14, height = 14,
title = "single cell clustering using zinbawave corrected data")
pheatmap(data_cor[idx_10, idx_10],
color = cols,
breaks = br,
border_color = "grey60", cellwidth = NA, cellheight = NA, scale = "none",
cluster_rows = FALSE, cluster_cols = FALSE, legend = TRUE,
legend_labels = NA,
annotation_row = row_order[, "clusters", drop = FALSE],
show_rownames = FALSE,
show_colnames = FALSE,
annotation_colors = ann_colors
)
dev.off()
## ----prepareDataMl-------------------------------------------------------
genes_clusters <- row_order[rownames(data_for_cl), ] %>%
mutate(labs = factor(ifelse(clusters == 10, "cl10",
"other")))
class_priors <- prop.table(table(genes_clusters$labs))
class_priors
## ----testRF--------------------------------------------------------------
rf_fit <- randomForest(x = data_for_cl, y = genes_clusters$labs,
ntree = 500, nodesize = 5,
mtry = floor(sqrt(ncol(data_for_cl))),
classwt = class_priors,
do.trace = 100)
rf_fit$confusion
# acc <- sum(rf_fit$confusion[, "class.error"] * class_priors)
# acc
## ----compareToRandom-----------------------------------------------------
random_cf <- ifelse(rbernoulli(nrow(data_for_cl),
class_priors[1]), "cl10", "other")
random_confusion <- table(random_cf, genes_clusters$labs)
random_confusion <- cbind(random_confusion,
c(random_confusion["cl10", "other"] /
sum(random_confusion["cl10", ]),
random_confusion["other", "cl10"] /
sum(random_confusion["other", ])))
colnames(random_confusion)[3] <- "class.error"
random_confusion
## ----predfunForRF--------------------------------------------------------
predfun_rf <- function(train.x, train.y, test.x, test.y, negative){
rf_fit <- randomForest(x = train.x, y = as.factor(train.y),
ntree = 500, nodesize = 5,
mtry = floor(sqrt(ncol(train.x))),
classwt = class_priors,
do.trace = FALSE)
#browser()
ynew <- predict(rf_fit, test.x)
conf <- table(ynew, test.y)
err_rates <- c(conf["cl10", "other"] /
sum(conf["cl10", ]),
conf["other", "cl10"] /
sum(conf["other", ]))
names(err_rates) <- c("cl10", "other")
return(err_rates)
}
set.seed(7891)
train_idx <- sample(nrow(data_for_cl), 700)
test_idx <- setdiff(seq_len(nrow(data_for_cl)), train_idx)
predfun_rf(data_for_cl[train_idx,], genes_clusters$labs[train_idx],
data_for_cl[test_idx, ], genes_clusters$labs[test_idx])
## ----doCrossValForRF-----------------------------------------------------
set.seed(789)
rf_out <- crossval(predfun_rf, X = data_for_cl, Y = genes_clusters$labs,
K = 5, B = 10, negative="other", verbose = FALSE)
cv_res <- as.data.frame(rf_out$stat.cv) %>%
rownames_to_column( var = "BF") %>%
extract(col = BF, into = c("rep", "fold"),
regex = "([[:alnum:]]+).([[:alnum:]]+)" ) %>%
mutate_if( is.character, as_factor) %>%
gather(key = "class", value = "pred_error", cl10, other)
cv_plot <- ggplot(cv_res, aes(x = rep, y = pred_error, color = class)) +
geom_jitter(height = 0, width = 0.2) +
ggtitle("CV prediction error across folds") +
scale_color_tableau()
cv_plot
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo()
## ----unloaAll, echo=FALSE, message=FALSE, eval = FALSE-------------------
##
## pkgs <- loaded_packages() %>%
## filter(package != "devtools") %>%
## {.$path}
##
## walk(pkgs, unload)
pkgs <- loaded_packages() %>%
filter(package != "devtools") %>%
{.$path}
walk(pkgs, unload)
This diff is collapsed.
This diff is collapsed.
......@@ -2,6 +2,8 @@ library(knitcitations)
library(bibtex)
# Kyewski and Klein, 2006
citep("10.1146/annurev.immunol.23.021704.115601")
......@@ -41,8 +43,6 @@ citep("10.1101/125112")
# Perraudeau et. al. , 2017
citep("10.12688/f1000research.12122.1")
write.bibtex(file = "stat_methods_bioinf.bib")
knitcitations::cleanbib()
# Nygaard et. al, 2015
citep("10.1093/biostatistics/kxv027")
......@@ -50,10 +50,41 @@ citep("10.1093/biostatistics/kxv027")
# Jaffe et. al., 2015