Commit 517b7002 authored by Bernd Klaus

added keras / neural network example to the ML analysis

parent 5d90876f
......@@ -19,3 +19,4 @@ Slides_stat_methods_bioinf/slides_factor_ana_testing_ml_cache
Slides_stat_methods_bioinf/slides_graphics_bioinf_cache
Slides_stat_methods_bioinf/SRP022054
grouped_mutated.R
keras_test.RData
---
title: "Factor analysis, testing and machine learning for bioinformatics"
author: "Bernd Klaus"
date: "December 11, 2017"
date: "January 30th, 2017"
output:
slidy_presentation:
df_print: paged
......@@ -602,11 +602,35 @@ rf_fit <- randomForest(x = data_for_cl, y = genes_clusters$labs,
## cross validation errors
<img src="doCrossValForRF-1.png" style="width: 30%; height: 30%" >
<img src="doCrossValForRF-1.png" style="width: 50%; height: 50%" >
* error rate estimates are high for cluster 10
* low for "other"
## Neural Networks
<img src="nn_training.png" style="width: 60%; height: 60%" >
## A deep network for the sc data
* Keras model
```{r eval=FALSE, echo=TRUE, include=TRUE}
nn <- keras_model_sequential() %>%
layer_dense(units = 32, input_shape = c(203),
name = "input_layer",
kernel_regularizer = regularizer_l1(l = 0.3),
activation = "relu") %>%
layer_dropout(rate = 0.5, name = "input_dropout") %>%
layer_dense(units = 16, name = "hidden_layer_1", activation = "relu") %>%
layer_dense(units = 8, name = "hidden_layer_2", activation = "relu") %>%
layer_dense(bias_initializer = initializer_constant(-5),
units = 1, name = "output_layer", activation = "sigmoid")
```
## CV results for neural network
<img src="nn_cv-1.png" style="width: 60%; height: 60%" >
* error rate estimates are highly variable for cluster 10
* Figure 1b of the original paper: clustering largely driven by small
number of single cells
* If these are selected, prediction works well
* "other" class is easily predictable, indicating that
cluster 10 indeed contains "structure"
......@@ -36,6 +36,7 @@ library("clue")
library("sda")
library("crossval")
library("randomForest")
library("keras")
theme_set(theme_solarized(base_size = 18))
......@@ -538,8 +539,8 @@ rf_fit <- randomForest(x = data_for_cl, y = genes_clusters$labs,
rf_fit$confusion
# acc <- sum(rf_fit$confusion[, "class.error"] * class_priors)
# acc
acc <- 1-sum(rf_fit$confusion[, "class.error"] * class_priors)
acc
## ----compareToRandom-----------------------------------------------------
......@@ -548,10 +549,10 @@ random_cf <- ifelse(rbernoulli(nrow(data_for_cl),
random_confusion <- table(random_cf, genes_clusters$labs)
random_confusion <- cbind(random_confusion,
c(random_confusion["cl10", "other"] /
sum(random_confusion["cl10", ]),
random_confusion["other", "cl10"] /
sum(random_confusion["other", ])))
c(random_confusion["other", "cl10"] /
sum(random_confusion[, "cl10"]),
random_confusion["cl10", "other"] /
sum(random_confusion[, "other" ])))
colnames(random_confusion)[3] <- "class.error"
random_confusion
......@@ -566,28 +567,31 @@ predfun_rf <- function(train.x, train.y, test.x, test.y, negative){
mtry = floor(sqrt(ncol(train.x))),
classwt = class_priors,
do.trace = FALSE)
#browser()
ynew <- predict(rf_fit, test.x)
conf <- table(ynew, test.y)
err_rates <- c(conf["cl10", "other"] /
sum(conf["cl10", ]),
conf["other", "cl10"] /
sum(conf["other", ]))
err_rates <- c(conf["other", "cl10"] /
sum(conf[, "cl10" ]),
conf["cl10", "other"] /
sum(conf[, "other"]))
names(err_rates) <- c("cl10", "other")
return(err_rates)
}
set.seed(7891)
set.seed(123)
train_idx <- sample(nrow(data_for_cl), 700)
test_idx <- setdiff(seq_len(nrow(data_for_cl)), train_idx)
predfun_rf(data_for_cl[train_idx,], genes_clusters$labs[train_idx],
data_for_cl[test_idx, ], genes_clusters$labs[test_idx])
train.x <- data_for_cl[train_idx,]
train.y <- genes_clusters$labs[train_idx]
test.x <- data_for_cl[test_idx, ]
test.y <- genes_clusters$labs[test_idx]
predfun_rf(train.x, train.y,
test.x, test.y)
## ----doCrossValForRF-----------------------------------------------------
set.seed(789)
......@@ -603,12 +607,125 @@ cv_res <- as.data.frame(rf_out$stat.cv) %>%
cv_plot <- ggplot(cv_res, aes(x = rep, y = pred_error, color = class)) +
geom_jitter(height = 0, width = 0.2) +
ggtitle("CV prediction error by repitions") +
ggtitle("CV prediction error by repetitions") +
scale_color_tableau()
cv_plot
## ----keras_nn, eval=FALSE------------------------------------------------
## nn <- keras_model_sequential() %>%
## layer_dense(units = 32, input_shape = c(203),
## name = "input_layer",
## kernel_regularizer = regularizer_l1(l = 0.3),
## activation = "relu") %>%
## layer_dropout(rate = 0.5, name = "input_dropout") %>%
## layer_dense(units = 16, name = "hidden_layer_1", activation = "relu") %>%
## layer_dense(units = 8, name = "hidden_layer_2", activation = "relu") %>%
## layer_dense(bias_initializer = initializer_constant(-5),
## units = 1, name = "output_layer", activation = "sigmoid")
##
## ----nn_cv, eval=TRUE----------------------------------------------------
predfun_nn <- function(train.x, train.y, test.x, test.y, negative){
# create a custom callback that will stop model training if the
# overall accuracy is greater than some threshold
# this is checked per batch
acc_stop <- R6::R6Class("acc_stop",
inherit = KerasCallback,
public = list(
accs = NULL,
cl10errors = NULL,
on_batch_end = function(batch, logs = list()) {
self$accs <- c(self$accs, logs[["binary_accuracy"]])
self$cl10errors <- c(self$cl10errors, logs[["cl10_errors"]])
if(logs[["binary_accuracy"]] > 0.6){
self$model$stop_training = TRUE
}
}
))
call_acc_stop <- acc_stop$new()
nn <- keras_model_sequential() %>%
layer_dense(units = 32, input_shape = c(203),
name = "input_layer",
kernel_regularizer = regularizer_l1(l = 0.3),
activation = "relu") %>%
layer_dropout(rate = 0.5, name = "input_dropout") %>%
layer_dense(units = 16, name = "hidden_layer_1", activation = "relu") %>%
layer_dense(units = 8, name = "hidden_layer_2", activation = "relu") %>%
layer_dense(bias_initializer = initializer_constant(-5),
units = 1, name = "output_layer", activation = "sigmoid")
nn %>%
compile(optimizer = 'adam',
loss = loss_binary_crossentropy,
metrics = 'binary_accuracy')
nn %>%
fit(train.x,
train.y,
epochs=50, batch_size=64, verbose = 0,
callbacks = list(call_acc_stop))
ynew <- predict_classes(nn, test.x)
rm(nn)
k_clear_session()
conf <- table(ynew, test.y)
if(nrow(conf) != 2){
conf <- rbind(conf, c(0,0))
}
if(ncol(conf) != 2){
conf <- cbind(conf, c(0,0))
}
colnames(conf) <- rownames(conf) <- c("cl10", "other")
err_rates <- c(conf["other", "cl10"] /
sum(conf[, "cl10"]),
conf["cl10", "other"] /
sum(conf[, "other"]))
names(err_rates) <- c("cl10", "other")
return(err_rates)
}
set.seed(789)
labs_nn <- as.numeric(genes_clusters$labs) - 1
nn_out <- crossval(predfun_nn, X = data_for_cl, Y = labs_nn,
K = 5, B = 10, negative="other", verbose = FALSE)
cv_res_nn <- as.data.frame(nn_out$stat.cv) %>%
rownames_to_column( var = "BF") %>%
extract(col = BF, into = c("rep", "fold"),
regex = "([[:alnum:]]+).([[:alnum:]]+)" ) %>%
mutate_if( is.character, as_factor) %>%
gather(key = "class", value = "pred_error", cl10, other)
cv_plot_nn <- ggplot(cv_res_nn, aes(x = rep, y = pred_error, color = class)) +
geom_jitter(height = 0, width = 0.2) +
ggtitle("CV prediction error by repetitions") +
scale_colour_gdocs()
cv_plot_nn
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo()
......
......@@ -75,6 +75,7 @@ library("clue")
library("sda")
library("crossval")
library("randomForest")
library("keras")
theme_set(theme_solarized(base_size = 18))
......@@ -263,7 +264,7 @@ are not differentially expressed and a peak near zero, which represents
the differentially expressed genes. Separating the background from the
peak at zero by eye--balling, we would expect around 40 differentially expressed
genes. We do however appreciate the strong dependencies between the values,
seens as little "hills" in the histogram.
seen as little "hills" in the histogram.
We will now explore typical p--value histograms^[See also David Robinson's
[detailed blog post](http://varianceexplained.org/statistics/interpreting-pvalue-histogram/) on that matter.]
......@@ -859,7 +860,7 @@ summary(res_sva)
Unfortunately, although sva is able to capture the genetic background, it
is too confounded with condition leaving us with zero detected genes.
Ultimately, genetic background and experimental conditions are too confounded
with each other, so that we cannot really desintangle the two. It is remarkable
with each other, so that we cannot really disentangle the two. It is remarkable
though that sva can actually identify the genetic background from the data.
# Latent factors for sc--RNA Seq: The ZINB-WaVE model
......@@ -1290,8 +1291,8 @@ rf_fit <- randomForest(x = data_for_cl, y = genes_clusters$labs,
rf_fit$confusion
# acc <- sum(rf_fit$confusion[, "class.error"] * class_priors)
# acc
acc <- 1-sum(rf_fit$confusion[, "class.error"] * class_priors)
acc
```
As we can see, in the out of bag samples, we classify
......@@ -1313,10 +1314,10 @@ random_cf <- ifelse(rbernoulli(nrow(data_for_cl),
random_confusion <- table(random_cf, genes_clusters$labs)
random_confusion <- cbind(random_confusion,
c(random_confusion["cl10", "other"] /
sum(random_confusion["cl10", ]),
random_confusion["other", "cl10"] /
sum(random_confusion["other", ])))
c(random_confusion["other", "cl10"] /
sum(random_confusion[, "cl10"]),
random_confusion["cl10", "other"] /
sum(random_confusion[, "other" ])))
colnames(random_confusion)[3] <- "class.error"
random_confusion
......@@ -1348,35 +1349,38 @@ predfun_rf <- function(train.x, train.y, test.x, test.y, negative){
mtry = floor(sqrt(ncol(train.x))),
classwt = class_priors,
do.trace = FALSE)
#browser()
ynew <- predict(rf_fit, test.x)
conf <- table(ynew, test.y)
err_rates <- c(conf["cl10", "other"] /
sum(conf["cl10", ]),
conf["other", "cl10"] /
sum(conf["other", ]))
err_rates <- c(conf["other", "cl10"] /
sum(conf[, "cl10" ]),
conf["cl10", "other"] /
sum(conf[, "other"]))
names(err_rates) <- c("cl10", "other")
return(err_rates)
}
set.seed(7891)
set.seed(123)
train_idx <- sample(nrow(data_for_cl), 700)
test_idx <- setdiff(seq_len(nrow(data_for_cl)), train_idx)
predfun_rf(data_for_cl[train_idx,], genes_clusters$labs[train_idx],
data_for_cl[test_idx, ], genes_clusters$labs[test_idx])
train.x <- data_for_cl[train_idx,]
train.y <- genes_clusters$labs[train_idx]
test.x <- data_for_cl[test_idx, ]
test.y <- genes_clusters$labs[test_idx]
predfun_rf(train.x, train.y,
test.x, test.y)
```
The error rate estimate for cluster 10 is considerably lower than the out of bag
The error rate estimate for cluster 10 is similar to the out of bag
error. Let's see whether we can confirm this via cross validation. We split
the data set repeatedly into K = 10 folds, and use each of theses folds once for
the data set repeatedly into K = 5 folds, and use each of these folds once for
prediction (and the others for training). The number of repeats is B = 10 times,
giving us 100 estimates of the two error rates in total.
giving us 50 estimates of the two error rates in total.
In general, choosing a low number of folds will increase the bias, while
a large number of repetitions will decrease the variability of the estimate.
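A sketch of such a repeated K--fold run with the crossval package (this mirrors
the call used later for the neural network; the exact arguments for rf_out are
in the full chunk below):
```{r eval=FALSE}
# repeated K-fold CV: K = 5 folds, B = 10 repetitions; predfun_rf returns
# the two per-class error rates for each fold, "negative" is passed through
rf_out <- crossval(predfun_rf, X = data_for_cl, Y = genes_clusters$labs,
                   K = 5, B = 10, negative = "other", verbose = FALSE)
```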
......@@ -1398,28 +1402,248 @@ cv_res <- as.data.frame(rf_out$stat.cv) %>%
cv_plot <- ggplot(cv_res, aes(x = rep, y = pred_error, color = class)) +
geom_jitter(height = 0, width = 0.2) +
ggtitle("CV prediction error by repitions") +
ggtitle("CV prediction error by repetitions") +
scale_color_tableau()
cv_plot
```
We can see that the error rate estimates are highly variable for cluster 10,
it seems that some genes in it are harder to predict than others. This might be
in line with Figure 2b of the original paper, which indicates that the
wihtin--cluster correlations are potentially largely driven by just a small
number of single cells: If they are in the training and test set,
the prediction works well,
otherwise it does not.
Therefore, due to the bimodal nature of the error estimates,
one should probably rather summarize
the repetitions by the maximum to be on the safe side.
On the other hand, the "other" class is easily predictable, indicating that
cluster 10 indeed contains "structure" (which might be driven by a subset of
single cells though).
We can see that the error rate estimates are both large and quite
variable for cluster 10, with the mean being close to the out--of--bag
estimate from the random forest.
In general, cluster 10 seems to be hard to predict and the clustering does not
seem to be very strong. On the other hand, the "other" class is easily
predictable, indicating that cluster 10 indeed contains "structure".
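To summarize the 50 estimates per class, a quick sketch using the cv_res data
frame constructed above (dplyr verbs, as used elsewhere in this document; not
part of the original analysis):
```{r eval=FALSE}
# mean and spread of the cross validation error estimates, per class
cv_res %>%
  group_by(class) %>%
  summarize(mean_error = mean(pred_error),
            sd_error   = sd(pred_error))
```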
# Neural networks
Recent years have seen an increasing interest in (deep) neural networks.
An artificial neural network, initially inspired by neural networks in the brain,
consists of layers of interconnected compute units (neurons).
<!-- <img src="Colored_neural_network.svg.png" style="width: 50%; height: 50%" > -->
![A Neural Network](Colored_neural_network.svg.png)
In the canonical configuration shown in the figure^[image source: [Wikipedia Commons](https://commons.wikimedia.org/wiki/File:Colored_neural_network.svg)],
the network receives data in
an input layer; these are then transformed in a nonlinear way through (multiple)
hidden layers before final outputs are computed in the output layer.^[multiple hidden layers = "deep learning"]
Each neuron computes a weighted sum of its inputs and applies a nonlinear
activation function to calculate its output:
<span style="font-size:large;"> output = activation( input * weights + bias) </span>
The most
popular activation function is the rectified linear unit (ReLU)^[See the
[Wikipedia article](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))],
which thresholds negative signals to 0 and passes positive signals through unchanged.
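As a tiny, purely didactic illustration in plain R (not part of the analysis),
a single ReLU neuron computing the formula above could look like this:
```{r eval=FALSE}
# rectified linear unit: negative inputs become 0, positive ones pass through
relu <- function(x) pmax(x, 0)

# one neuron: weighted sum of the inputs plus a bias, then the activation
input   <- c(0.2, -1.5, 3.0)    # hypothetical inputs
weights <- c(0.4,  0.1, -0.3)   # hypothetical weights
bias    <- 0.5
relu(sum(input * weights) + bias)  # 0 here, since the weighted sum plus bias is negative
```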
The weights between neurons are free parameters that capture the model’s
representation of the data and are learned from input/output samples.
Learning minimizes a loss function that measures the fit of the model output
to the true label of a sample. This minimization is
challenging, since the loss is a non--convex function of a high--dimensional
parameter vector, similar to a landscape with many hills and valleys. The loss
function is optimized by changing the weights along its negative gradient. The
gradient points in the direction of steepest ascent, but as minimization of the
loss function is desired, the negative gradient is used.
The gradient is evaluated via the chain rule for derivatives ("backpropagation").
This, together with optimization on random batches^[batch = random subset of the
data on which the current model is updated] of the data, is called
"stochastic gradient descent" and allows
for efficient training of neural networks. Usually one optimizes the
model during multiple "epochs", where an epoch is one full pass through the
data.^[E.g. if there are 1000 samples and the batch size is 32, an epoch
consists of training on roughly 31 batches.]
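As a minimal sketch of these ideas (purely illustrative and not how Keras works
internally), mini--batch gradient descent for a linear model with squared--error
loss could be written as:
```{r eval=FALSE}
set.seed(1)
n <- 1000; p <- 5
x <- matrix(rnorm(n * p), ncol = p)        # hypothetical inputs
y <- x %*% c(1, -2, 0.5, 0, 3) + rnorm(n)  # hypothetical outputs
w <- rep(0, p)                             # weights to be learned
lr <- 0.01                                 # learning rate
batch_size <- 32

for (epoch in seq_len(5)) {
  # one epoch = one full pass through the (shuffled) data in batches
  batches <- split(sample(n), ceiling(seq_len(n) / batch_size))
  for (idx in batches) {
    xb <- x[idx, , drop = FALSE]
    yb <- y[idx]
    # gradient of the mean squared error, evaluated on the current batch only
    grad <- -2 * t(xb) %*% (yb - xb %*% w) / length(idx)
    # step along the negative gradient
    w <- w - lr * grad
  }
}
w
```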
Alternative architectures to such fully connected feedforward networks, which
differ in the way the neurons are arranged, have been developed for specific
applications. These include convolutional neural networks, which are widely used
for image analysis, recurrent neural networks for sequential
data, and autoencoders for unsupervised learning.
A concise review of deep learning in biology is given by @Angermueller_2016;
a comprehensive overview of the topic is provided by the book of
@Goodfellow_2016.^[See also [this](https://keras.rstudio.com/articles/learn.html)
RStudio page with a list of deep learning resources.]
@Ching_2018 provides an extensive, "crowd--sourced" overview of deep learning
applications in biology.
## Fitting neural networks with Keras
Creating and fitting neural network architectures used to be technically
challenging. In order to alleviate this task, powerful frameworks have
been developed that implement the "backend" efficiently, using the full power
of today's CPUs and GPUs. Here, we use
[TensorFlow](https://www.tensorflow.org/)^[TensorFlow, the TensorFlow logo and any related marks are trademarks of Google Inc.]; another popular framework
is [Theano](http://www.deeplearning.net/software/theano/). While these
frameworks make it easier to create and train neural networks^[For a simple
TensorFlow example, see the [paper by P.
Goldsborough](https://arxiv.org/abs/1610.01178)], they still require quite some
expert knowledge. This is where [Keras](https://keras.io/) comes
in: it builds on top of e.g. TensorFlow in order to provide a high--level
interface that allows for a "natural" definition of neural network architectures.
Keras is written in Python, but there is an [R interface as well](https://keras.rstudio.com/).
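A minimal setup sketch for the R interface (install_keras() also installs
TensorFlow into a dedicated Python environment):
```{r eval=FALSE}
# one-time setup of the R package and the underlying Python libraries
install.packages("keras")
library("keras")
install_keras()
```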
## A deep feed forward network for the single cell data
Instead of using a random forest for prediction, we can also try to train
a deep feed--forward network on the single cell data. We define the network
as follows:
```{r keras_nn, eval=FALSE}
nn <- keras_model_sequential() %>%
layer_dense(units = 32, input_shape = c(203),
name = "input_layer",
kernel_regularizer = regularizer_l1(l = 0.3),
activation = "relu") %>%
layer_dropout(rate = 0.5, name = "input_dropout") %>%
layer_dense(units = 16, name = "hidden_layer_1", activation = "relu") %>%
layer_dense(units = 8, name = "hidden_layer_2", activation = "relu") %>%
layer_dense(bias_initializer = initializer_constant(-5),
units = 1, name = "output_layer", activation = "sigmoid")
```
We have 203 single cells to predict our genes, so the input
layer accepts a vector with 203 entries, which the first hidden
layer turns into an output vector of length 32. We put a penalty
on the weights of the input layer in order to avoid overfitting.
A dropout--layer on the output of the input layer means that
only 50% of the units are used at any single training step. This also
helps to avoid overfitting.^[[Overview article on dropout](https://medium.com/@amarbudhiraja/https-medium-com-amarbudhiraja-learning-less-to-learn-better-dropout-in-deep-machine-learning-74334da4bfc5)]
We then add two ordinary hidden layers and finally a sigmoidal^[[Wikipedia article](https://en.wikipedia.org/wiki/Sigmoid_function)] output layer
that maps real values to the 0--1 range and that we use to obtain our
classification prediction.
Note that we initialize the bias of the sigmoid to a negative number; this
means that before training, our neural network will favor genes assigned
to cluster 10. This bias will be "unlearned" during the training, and we
then stop the training after having obtained a reasonable overall accuracy
(here set to 60%).^[This is implemented via a "[callback](https://keras.rstudio.com/articles/training_callbacks.html)".]
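As a small illustration of this initial bias (not part of the model code): the
sigmoid squashes large negative inputs towards 0, so with a bias of -5 the
untrained network outputs values close to 0, which, following the reasoning
above, corresponds to favoring cluster 10 (assuming "cl10" is encoded as 0).
```{r eval=FALSE}
# logistic sigmoid, as used by the output layer
sigmoid <- function(x) 1 / (1 + exp(-x))
sigmoid(-5)  # ~ 0.0067, i.e. outputs start out very close to 0
```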
This way, we hope to obtain a better per--class error for cluster 10 than
the 60% we get from the random forest. We again assess this via
cross--validation.^[Note that we need to turn the text labels into
zeros and ones, as this is the output format of our network.]
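The label conversion itself is a one--liner; a sketch, assuming the factor
levels are ordered c("cl10", "other") so that "cl10" becomes 0:
```{r eval=FALSE}
# 0/1 encoding of the class labels expected by the network
labs_nn <- as.numeric(genes_clusters$labs) - 1
table(labs_nn, genes_clusters$labs)
```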
```{r nn_cv, eval=TRUE}
predfun_nn <- function(train.x, train.y, test.x, test.y, negative){
# create a custom callback that will stop model training if the
# overall accuracy is greater than some threshold
# this is checked per batch
acc_stop <- R6::R6Class("acc_stop",
inherit = KerasCallback,
public = list(
accs = NULL,
cl10errors = NULL,
on_batch_end = function(batch, logs = list()) {
self$accs <- c(self$accs, logs[["binary_accuracy"]])
self$cl10errors <- c(self$cl10errors, logs[["cl10_errors"]])
if(logs[["binary_accuracy"]] > 0.6){
self$model$stop_training = TRUE
}
}
))
call_acc_stop <- acc_stop$new()
nn <- keras_model_sequential() %>%
layer_dense(units = 32, input_shape = c(203),
name = "input_layer",
kernel_regularizer = regularizer_l1(l = 0.3),
activation = "relu") %>%
layer_dropout(rate = 0.5, name = "input_dropout") %>%
layer_dense(units = 16, name = "hidden_layer_1", activation = "relu") %>%
layer_dense(units = 8, name = "hidden_layer_2", activation = "relu") %>%
layer_dense(bias_initializer = initializer_constant(-5),
units = 1, name = "output_layer", activation = "sigmoid")
nn %>%
compile(optimizer = 'adam',
loss = loss_binary_crossentropy,
metrics = 'binary_accuracy')
nn %>%
fit(train.x,
train.y,
epochs=50, batch_size=64, verbose = 0,
callbacks = list(call_acc_stop))
ynew <- predict_classes(nn, test.x)
rm(nn)
k_clear_session()
conf <- table(ynew, test.y)
if(nrow(conf) != 2){
conf <- rbind(conf, c(0,0))
}
if(ncol(conf) != 2){
conf <- cbind(conf, c(0,0))
}
colnames(conf) <- rownames(conf) <- c("cl10", "other")
err_rates <- c(conf["other", "cl10"] /
sum(conf[, "cl10"]),
conf["cl10", "other"] /
sum(conf[, "other"]))
names(err_rates) <- c("cl10", "other")