Commit 517b7002 authored by Bernd Klaus

added keras / neural network example to the ML analysis

parent 5d90876f
@@ -19,3 +19,4 @@ Slides_stat_methods_bioinf/slides_factor_ana_testing_ml_cache
Slides_stat_methods_bioinf/slides_graphics_bioinf_cache
Slides_stat_methods_bioinf/SRP022054
grouped_mutated.R
keras_test.RData
---
title: "Factor analysis, testing and machine learning for bioinformatics"
author: "Bernd Klaus"
date: "December 11, 2017"
date: "January 30th, 2017"
output:
slidy_presentation:
df_print: paged
@@ -602,11 +602,35 @@ rf_fit <- randomForest(x = data_for_cl, y = genes_clusters$labs,
## cross validation errors
<img src="doCrossValForRF-1.png" style="width: 30%; height: 30%" >
<img src="doCrossValForRF-1.png" style="width: 50%; height: 50%" >
* error rate estimates are high for cluster 10
* low for "other"
## Neural Networks
<img src="nn_training.png" style="width: 60%; height: 60%" >
## A deep network for the sc data
* Keras model
```{r eval=FALSE, echo=TRUE, include=TRUE}
nn <- keras_model_sequential() %>%
layer_dense(units = 32, input_shape = c(203),
name = "input_layer",
kernel_regularizer = regularizer_l1(l = 0.3),
activation = "relu") %>%
layer_dropout(rate = 0.5, name = "input_dropout") %>%
layer_dense(units = 16, name = "hidden_layer_1", activation = "relu") %>%
layer_dense(units = 8, name = "hidden_layer_2", activation = "relu") %>%
layer_dense(bias_initializer = initializer_constant(-5),
units = 1, name = "output_layer", activation = "sigmoid")
```
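
* The model is then compiled and trained as in the accompanying script
  (binary cross-entropy loss, Adam optimizer; `train.x` / `train.y` denote the
  training matrix and the 0/1 labels used there)

```{r eval=FALSE, echo=TRUE, include=TRUE}
nn %>%
  compile(optimizer = 'adam',
          loss = loss_binary_crossentropy,
          metrics = 'binary_accuracy')

nn %>%
  fit(train.x, train.y, epochs = 50, batch_size = 64, verbose = 0)
```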
## CV results for neural network
<img src="nn_cv-1.png" style="width: 60%; height: 60%" >
* error rate estimates are highly variable for cluster 10
* Figure 1b of the original paper: clustering is largely driven by a small number of single cells
* If these are selected, prediction works well
* "other" class is easily predictable, indicating that
cluster 10 indeed contains "structure"
@@ -36,6 +36,7 @@ library("clue")
library("sda")
library("crossval")
library("randomForest")
library("keras")
theme_set(theme_solarized(base_size = 18))
@@ -538,8 +539,8 @@ rf_fit <- randomForest(x = data_for_cl, y = genes_clusters$labs,
rf_fit$confusion
# acc <- sum(rf_fit$confusion[, "class.error"] * class_priors)
# acc
acc <- 1-sum(rf_fit$confusion[, "class.error"] * class_priors)
acc
## ----compareToRandom-----------------------------------------------------
@@ -548,10 +549,10 @@ random_cf <- ifelse(rbernoulli(nrow(data_for_cl),
random_confusion <- table(random_cf, genes_clusters$labs)
random_confusion <- cbind(random_confusion,
c(random_confusion["cl10", "other"] /
sum(random_confusion["cl10", ]),
random_confusion["other", "cl10"] /
sum(random_confusion["other", ])))
c(random_confusion["other", "cl10"] /
sum(random_confusion[, "cl10"]),
random_confusion["cl10", "other"] /
sum(random_confusion[, "other" ])))
colnames(random_confusion)[3] <- "class.error"
random_confusion
@@ -566,28 +567,31 @@ predfun_rf <- function(train.x, train.y, test.x, test.y, negative){
mtry = floor(sqrt(ncol(train.x))),
classwt = class_priors,
do.trace = FALSE)
#browser()
ynew <- predict(rf_fit, test.x)
conf <- table(ynew, test.y)
err_rates <- c(conf["cl10", "other"] /
sum(conf["cl10", ]),
conf["other", "cl10"] /
sum(conf["other", ]))
err_rates <- c(conf["other", "cl10"] /
sum(conf[, "cl10" ]),
conf["cl10", "other"] /
sum(conf[, "other"]))
names(err_rates) <- c("cl10", "other")
return(err_rates)
}
set.seed(7891)
set.seed(123)
train_idx <- sample(nrow(data_for_cl), 700)
test_idx <- setdiff(seq_len(nrow(data_for_cl)), train_idx)
predfun_rf(data_for_cl[train_idx,], genes_clusters$labs[train_idx],
data_for_cl[test_idx, ], genes_clusters$labs[test_idx])
train.x <- data_for_cl[train_idx,]
train.y <- genes_clusters$labs[train_idx]
test.x <- data_for_cl[test_idx, ]
test.y <- genes_clusters$labs[test_idx]
predfun_rf(train.x, train.y,
test.x, test.y)
## ----doCrossValForRF-----------------------------------------------------
set.seed(789)
@@ -603,12 +607,125 @@ cv_res <- as.data.frame(rf_out$stat.cv) %>%
cv_plot <- ggplot(cv_res, aes(x = rep, y = pred_error, color = class)) +
geom_jitter(height = 0, width = 0.2) +
ggtitle("CV prediction error by repitions") +
ggtitle("CV prediction error by repetitions") +
scale_color_tableau()
cv_plot
## ----keras_nn, eval=FALSE------------------------------------------------
## nn <- keras_model_sequential() %>%
## layer_dense(units = 32, input_shape = c(203),
## name = "input_layer",
## kernel_regularizer = regularizer_l1(l = 0.3),
## activation = "relu") %>%
## layer_dropout(rate = 0.5, name = "input_dropout") %>%
## layer_dense(units = 16, name = "hidden_layer_1", activation = "relu") %>%
## layer_dense(units = 8, name = "hidden_layer_2", activation = "relu") %>%
## layer_dense(bias_initializer = initializer_constant(-5),
## units = 1, name = "output_layer", activation = "sigmoid")
##
## ----nn_cv, eval=TRUE----------------------------------------------------
predfun_nn <- function(train.x, train.y, test.x, test.y, negative){
# create a custom callback that will stop model training if the
# overall accuracy is greater than some threshold
# this is checked per batch
acc_stop <- R6::R6Class("acc_stop",
inherit = KerasCallback,
public = list(
accs = NULL,
cl10errors = NULL,
on_batch_end = function(batch, logs = list()) {
self$accs <- c(self$accs, logs[["binary_accuracy"]])
self$cl10errors <- c(self$cl10errors, logs[["cl10_errors"]])
if(logs[["binary_accuracy"]] > 0.6){
self$model$stop_training = TRUE
}
}
))
call_acc_stop <- acc_stop$new()
nn <- keras_model_sequential() %>%
layer_dense(units = 32, input_shape = c(203),
name = "input_layer",
kernel_regularizer = regularizer_l1(l = 0.3),
activation = "relu") %>%
layer_dropout(rate = 0.5, name = "input_dropout") %>%
layer_dense(units = 16, name = "hidden_layer_1", activation = "relu") %>%
layer_dense(units = 8, name = "hidden_layer_2", activation = "relu") %>%
layer_dense(bias_initializer = initializer_constant(-5),
units = 1, name = "output_layer", activation = "sigmoid")
nn %>%
compile(optimizer = 'adam',
loss = loss_binary_crossentropy,
metrics = 'binary_accuracy')
nn %>%
fit(train.x,
train.y,
epochs=50, batch_size=64, verbose = 0,
callbacks = list(call_acc_stop))
ynew <- predict_classes(nn, test.x)
rm(nn)
k_clear_session()
conf <- table(ynew, test.y)
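# predict_classes() returns 0/1 labels; if a class is missing from the
# predictions or from the test fold, table() drops that row/column,
# so pad the confusion table with zeros before indexing it by name below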
if(nrow(conf) != 2){
conf <- rbind(conf, c(0,0))
}
if(ncol(conf) != 2){
conf <- cbind(conf, c(0,0))
}
colnames(conf) <- rownames(conf) <- c("cl10", "other")
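# per-class error rates: fraction of true cl10 genes predicted as "other",
# and fraction of true "other" genes predicted as cl10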
err_rates <- c(conf["other", "cl10"] /
sum(conf[, "cl10"]),
conf["cl10", "other"] /
sum(conf[, "other"]))
names(err_rates) <- c("cl10", "other")
return(err_rates)
}
set.seed(789)
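# keras expects numeric 0/1 labels: convert the two-level factor (cl10 / other) to 0/1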
labs_nn <- as.numeric(genes_clusters$labs) - 1
nn_out <- crossval(predfun_nn, X = data_for_cl, Y = labs_nn,
K = 5, B = 10, negative="other", verbose = FALSE)
cv_res_nn <- as.data.frame(nn_out$stat.cv) %>%
rownames_to_column( var = "BF") %>%
extract(col = BF, into = c("rep", "fold"),
regex = "([[:alnum:]]+).([[:alnum:]]+)" ) %>%
mutate_if( is.character, as_factor) %>%
gather(key = "class", value = "pred_error", cl10, other)
cv_plot_nn <- ggplot(cv_res_nn, aes(x = rep, y = pred_error, color = class)) +
geom_jitter(height = 0, width = 0.2) +
ggtitle("CV prediction error by repetitions") +
scale_colour_gdocs()
cv_plot_nn
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo()
@@ -94,6 +94,9 @@ citep("10.1186/s13059-015-0844-5")
citep("10.1186/s13059-017-1334-8")
# Angermüller et al., 2016
citep("10.15252/msb.20156651")
### export citations
@@ -110,6 +113,16 @@ add_manually <- function(entry){
}
# Goodfellow et al., 2016
add_manually("@book{Goodfellow_2016,
title={Deep Learning},
author={Ian Goodfellow and Yoshua Bengio and Aaron Courville},
publisher={MIT Press},
note={\\url{http://www.deeplearningbook.org}},
year={2016}
}")
# van der Maaten and Hinton, 2008
add_manually("@article{vanDerMaaten_2008,
@@ -197,3 +210,19 @@ add_manually("
title = {Sincell: an R/Bioconductor package for statistical assessment of cell-state hierarchies from single-cell {RNA}-seq: Fig. 1.},
journal = {Bioinformatics},
}")
# Ching et al., 2018
add_manually("
@article {Ching_2018,
author = {Ching, Travers and Himmelstein, Daniel S. and Beaulieu-Jones, Brett K. and Kalinin, Alexandr A. and Do, Brian T. and Way, Gregory P. and Ferrero, Enrico and Agapow, Paul-Michael and Zietz, Michael and Hoffman, Michael M and Xie, Wei and Rosen, Gail L. and Lengerich, Benjamin J. and Israeli, Johnny and Lanchantin, Jack and Woloszynek, Stephen and Carpenter, Anne E. and Shrikumar, Avanti and Xu, Jinbo and Cofer, Evan M. and Lavender, Christopher A and Turaga, Srinivas C and Alexandari, Amr M and Lu, Zhiyong and Harris, David J. and DeCaprio, Dave and Qi, Yanjun and Kundaje, Anshul and Peng, Yifan and Wiley, Laura K. and Segler, Marwin H. S. and Boca, Simina M and Swamidass, S. Joshua and Huang, Austin and Gitter, Anthony and Greene, Casey S.},
title = {Opportunities And Obstacles For Deep Learning In Biology And Medicine},
year = {2018},
doi = {10.1101/142760},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Deep learning, which describes a class of machine learning algorithms, has recently showed impressive results across a variety of domains. Biology and medicine are data rich, but the data are complex and often ill-understood. Problems of this nature may be particularly well-suited to deep learning techniques. We examine applications of deep learning to a variety of biomedical problems - patient classification, fundamental biological processes, and treatment of patients - and discuss whether deep learning will transform these tasks or if the biomedical sphere poses unique challenges. We find that deep learning has yet to revolutionize or definitively resolve any of these problems, but promising advances have been made on the prior state of the art. Even when improvement over a previous baseline has been modest, we have seen signs that deep learning methods may speed or aid human investigation. More work is needed to address concerns related to interpretability and how to best model each problem. Furthermore, the limited amount of labeled data for training presents problems in some domains, as do legal and privacy constraints on work with sensitive health records. Nonetheless, we foresee deep learning powering changes at both bench and bedside with the potential to transform several areas of biology and medicine.},
URL = {https://www.biorxiv.org/content/early/2018/01/19/142760},
eprint = {https://www.biorxiv.org/content/early/2018/01/19/142760.full.pdf},
journal = {bioRxiv}
}")
@@ -12,6 +12,20 @@
journal = {The Annals of Applied Statistics},
}
@Article{Angermueller_2016,
doi = {10.15252/msb.20156651},
url = {https://doi.org/10.15252/msb.20156651},
year = {2016},
month = {jul},
publisher = {{EMBO}},
volume = {12},
number = {7},
pages = {878},
author = {Christof Angermueller and Tanel P{\"a}rnamaa and Leopold Parts and Oliver Stegle},
title = {Deep learning for computational biology},
journal = {Molecular Systems Biology},
}
@Article{Brennecke_2013,
doi = {10.1038/nmeth.2645},
url = {https://doi.org/10.1038/nmeth.2645},
@@ -285,6 +299,17 @@
}
@book{Goodfellow_2016,
title={Deep Learning},
author={Ian Goodfellow and Yoshua Bengio and Aaron Courville},
publisher={MIT Press},
note={\url{http://www.deeplearningbook.org}},
year={2016}
}
@article{vanDerMaaten_2008,
author = {van der Maaten, Laurens and Hinton, Geoffrey},
interhash = {370ba8b9e1909b61880a6f47c93bcd49},
@@ -376,3 +401,19 @@
}
@article {Ching_2018,
author = {Ching, Travers and Himmelstein, Daniel S. and Beaulieu-Jones, Brett K. and Kalinin, Alexandr A. and Do, Brian T. and Way, Gregory P. and Ferrero, Enrico and Agapow, Paul-Michael and Zietz, Michael and Hoffman, Michael M and Xie, Wei and Rosen, Gail L. and Lengerich, Benjamin J. and Israeli, Johnny and Lanchantin, Jack and Woloszynek, Stephen and Carpenter, Anne E. and Shrikumar, Avanti and Xu, Jinbo and Cofer, Evan M. and Lavender, Christopher A and Turaga, Srinivas C and Alexandari, Amr M and Lu, Zhiyong and Harris, David J. and DeCaprio, Dave and Qi, Yanjun and Kundaje, Anshul and Peng, Yifan and Wiley, Laura K. and Segler, Marwin H. S. and Boca, Simina M and Swamidass, S. Joshua and Huang, Austin and Gitter, Anthony and Greene, Casey S.},
title = {Opportunities And Obstacles For Deep Learning In Biology And Medicine},
year = {2018},
doi = {10.1101/142760},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Deep learning, which describes a class of machine learning algorithms, has recently showed impressive results across a variety of domains. Biology and medicine are data rich, but the data are complex and often ill-understood. Problems of this nature may be particularly well-suited to deep learning techniques. We examine applications of deep learning to a variety of biomedical problems - patient classification, fundamental biological processes, and treatment of patients - and discuss whether deep learning will transform these tasks or if the biomedical sphere poses unique challenges. We find that deep learning has yet to revolutionize or definitively resolve any of these problems, but promising advances have been made on the prior state of the art. Even when improvement over a previous baseline has been modest, we have seen signs that deep learning methods may speed or aid human investigation. More work is needed to address concerns related to interpretability and how to best model each problem. Furthermore, the limited amount of labeled data for training presents problems in some domains, as do legal and privacy constraints on work with sensitive health records. Nonetheless, we foresee deep learning powering changes at both bench and bedside with the potential to transform several areas of biology and medicine.},
URL = {https://www.biorxiv.org/content/early/2018/01/19/142760},
eprint = {https://www.biorxiv.org/content/early/2018/01/19/142760.full.pdf},
journal = {bioRxiv}
}