Commit b8384062 authored by Bernd Klaus's avatar Bernd Klaus

added some more info on scatterplot smoothers, fixed some hickups in the graphics part

parent bc432e8c
......@@ -74,6 +74,22 @@ citep("10.1214/09-AOAS277")
citep("10.1186/s13059-015-0844-5")
# Risso et. al., 2018
citep("10.1038/s41467-017-02554-5")
# add_manually("@article {Risso_2017,
# author = {Risso, Davide and Perraudeau, Fanny and Gribkova, Svetlana and Dudoit, Sandrine and Vert, Jean-Philippe},
# title = {ZINB-WaVE: A general and flexible method for signal extraction from single-cell RNA-seq data},
# year = {2017},
# doi = {10.1101/125112},
# publisher = {Cold Spring Harbor Laboratory},
# abstract = {Single-cell RNA sequencing (scRNA-seq) is a powerful high-throughput technique that enables researchers to measure genome-wide transcription levels at the resolution of single cells. Because of the low amount of RNA present in a single cell, some genes may fail to be detected even though they are expressed; these genes are usually referred to as dropouts. Here, we present a general and flexible zero-inflated negative binomial model (ZINB-WaVE), which leads to low-dimensional representations of the data that account for zero inflation (dropouts), over-dispersion, and the count nature of the data. We demonstrate, with simulated and real data, that the model and its associated estimation procedure are able to give a more stable and accurate low-dimensional representation of the data than principal component analysis (PCA) and zero-inflated factor analysis (ZIFA), without the need for a preliminary normalization step.},
# URL = {https://www.biorxiv.org/content/early/2017/11/02/125112},
# eprint = {https://www.biorxiv.org/content/early/2017/11/02/125112.full.pdf},
# journal = {bioRxiv}
# }")
### export citations
......@@ -104,18 +120,7 @@ add_manually("@article{vanDerMaaten_2008,
year = 2008
}")
# Risso et. al., 2017
add_manually("@article {Risso_2017,
author = {Risso, Davide and Perraudeau, Fanny and Gribkova, Svetlana and Dudoit, Sandrine and Vert, Jean-Philippe},
title = {ZINB-WaVE: A general and flexible method for signal extraction from single-cell RNA-seq data},
year = {2017},
doi = {10.1101/125112},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Single-cell RNA sequencing (scRNA-seq) is a powerful high-throughput technique that enables researchers to measure genome-wide transcription levels at the resolution of single cells. Because of the low amount of RNA present in a single cell, some genes may fail to be detected even though they are expressed; these genes are usually referred to as dropouts. Here, we present a general and flexible zero-inflated negative binomial model (ZINB-WaVE), which leads to low-dimensional representations of the data that account for zero inflation (dropouts), over-dispersion, and the count nature of the data. We demonstrate, with simulated and real data, that the model and its associated estimation procedure are able to give a more stable and accurate low-dimensional representation of the data than principal component analysis (PCA) and zero-inflated factor analysis (ZIFA), without the need for a preliminary normalization step.},
URL = {https://www.biorxiv.org/content/early/2017/11/02/125112},
eprint = {https://www.biorxiv.org/content/early/2017/11/02/125112.full.pdf},
journal = {bioRxiv}
}")
# Benjamini and Hochberg, 1995
add_manually("@article{Benjamini_1995,
......
......@@ -10,8 +10,8 @@ cache = TRUE, warning = FALSE)
print(date())
## ----required_packages_and_data, echo = TRUE, cache=FALSE, message=FALSE----
library("readxl")
library("scran")
library("readxl")
library("BiocStyle")
library("knitr")
library("MASS")
......@@ -25,7 +25,6 @@ library("factoextra")
library("magrittr")
library("entropy")
library("forcats")
library("forcats")
library("readxl")
library("DESeq2")
library("broom")
......@@ -39,6 +38,7 @@ library("tidyverse")
library("Rtsne")
library("devtools")
library("ggthemes")
library("scar")
theme_set(theme_solarized(base_size = 18))
......@@ -51,13 +51,13 @@ load(file.path(data_dir, "mtec_cell_anno.RData"))
load(file.path(data_dir, "mtec_gene_anno.RData"))
load(file.path(data_dir, "tras.RData"))
## ----mtec_count_table, results="asis"------------------------------------
## ----mtec_count_table----------------------------------------------------
mtec_counts
## ----mtec_cell_anno, results="asis"--------------------------------------
## ----mtec_cell_anno------------------------------------------------------
mtec_cell_anno
## ----trasENSEMBL, results="asis"-----------------------------------------
## ----trasENSEMBL---------------------------------------------------------
tras
mtec_gene_anno
......@@ -127,14 +127,17 @@ ggplot(aes(x, y), data = sim_data) +
## ----loessExercise, dependson="loessExampleLinFit", echo=FALSE, fig.show="hide", results="hide"----
sim_data$locFit <- predict(locfit(y~lp(x, nn=0.1, deg=1), data=sim_data),
sim_data$locFit <- predict(locfit(y~lp(x, nn=0.2, deg=1), data=sim_data),
newdata = sim_data$x)
scar_fit <- scar(sim_data$x, sim_data$y, shape = c("in"))
sim_data$scar <- scar_fit$componentfit + scar_fit$constant
ggplot(aes(x, y), data = sim_data) +
geom_point() +
geom_smooth() +
ggtitle("Linear vs. local regression") +
geom_line(aes(x = x, y = locFit), color = "coral3")
geom_point() +
ggtitle("Local vs. shape constrained regression") +
geom_line(aes(x = x, y = locFit), color = "coral3") +
geom_line(aes(x = x, y = scar), color = "royalblue")
## ----compCells-----------------------------------------------------------
......@@ -369,10 +372,8 @@ mean_sd
## ----varStabCountData----------------------------------------------------
counts_norm_vst_crc <- varianceStabilizingTransformation(counts_crc)
mean_sd_crc <- meanSdPlot(counts_norm_vst_crc)$gg +
ylim(c(0, 5))
mean_sd_crc
meanSdPlot(counts_norm_vst_crc)
## ----pcaCRCdata----------------------------------------------------------
......@@ -543,7 +544,7 @@ ggplot(data_dist, aes(x = org_distance, y = mds_distance)) +
## ----ex_sheppard_plot, echo=FALSE, results="hide"------------------------
ggplot(data_dist, aes(x = org_distance, y = mds_distance)) +
stat_density2d(aes(fill = ..level..), geom = "polygon",
n = 100, h = c(0.1, 0.1)) +
n = 100, h = c(0.2, 0.2)) +
geom_smooth(color = "grey10") +
scale_fill_distiller(palette = "YlGn", direction = 1) +
ggtitle(label = "Shepard plot",
......@@ -649,7 +650,7 @@ ggplot(data_dist_tSNE, aes(x = org_distance, y = tSNE_distance)) +
geom_hex(binwidth = c(0.05, 0.3)) +
geom_smooth(color = "grey10", method = "loess", span = 0.5) +
scale_fill_distiller(palette = "BrBG", direction = 1) +
ggtitle(label = "Shepard plot for t--SNE",
ggtitle(label = "Shepard plot for t-SNE",
subtitle = "Original vs tSNE distances") +
coord_fixed(1/(max(data_dist_tSNE$tSNE_distance)
- min(data_dist_tSNE$tSNE_distance))) +
......
......@@ -3,9 +3,8 @@ title: "Visual exploration for bioinformatics"
author: "Bernd Klaus"
date: "`r doc_date()`"
output:
BiocStyle::html_document2:
BiocStyle::html_document:
toc: true
highlight: tango
self_contained: true
toc_float: false
code_download: true
......@@ -13,7 +12,6 @@ output:
toc_depth: 2
BiocStyle::pdf_document:
toc: true
highlight: tango
toc_depth: 2
bibliography: stat_methods_bioinf.bib
---
......@@ -48,8 +46,8 @@ print(date())
```{r required_packages_and_data, echo = TRUE, cache=FALSE, message=FALSE}
library("readxl")
library("scran")
library("readxl")
library("BiocStyle")
library("knitr")
library("MASS")
......@@ -63,7 +61,6 @@ library("factoextra")
library("magrittr")
library("entropy")
library("forcats")
library("forcats")
library("readxl")
library("DESeq2")
library("broom")
......@@ -77,6 +74,7 @@ library("tidyverse")
library("Rtsne")
library("devtools")
library("ggthemes")
library("scar")
theme_set(theme_solarized(base_size = 18))
......@@ -103,13 +101,13 @@ with the number of sequenced fragments whose genomic alignment overlaps with the
genomic coordinates of the genes. It looks like this:
```{r mtec_count_table, results="asis"}
```{r mtec_count_table}
mtec_counts
```
The annotation of the cells is stored in a table called `mtec_cell_anno`
```{r mtec_cell_anno, results="asis"}
```{r mtec_cell_anno}
mtec_cell_anno
```
......@@ -122,7 +120,7 @@ and we only retain those genes for further analysis.
We also import a table with genes annotated as tissue restricted antigens (`tras`)
from @Pinto_2013 and annotation for mouse genes from ENSEMBL (`mtec_gene_anno`).
```{r trasENSEMBL, results="asis"}
```{r trasENSEMBL}
tras
mtec_gene_anno
```
......@@ -354,25 +352,40 @@ ggplot(aes(x, y), data = sim_data) +
```
R offers very complex packages for scatterplot smoothing, such as
`r CRANpkg("fda")` (Functional data analysis) or `r CRANpkg("mgcv")`,
which work by smoothing a set of (spline--) basis functions that depend
on the x--variable(s).
There is another noteworthy recent package called `r CRANpkg("scar")`,
which allows shape--constrained regression without any tuning paramters.
I.e. one can obtain a monotone fit for something like growth--curves. The
obtained fit however is not neccessarily smooth.
### Exercise: Local fit experiments
### Exercise: Local and shape--constrained fit experiments
1. Check the help for `geom_smooth` to find out what `r CRANpkg("ggplot2")`
uses as default smoothing methods.
2. Play with the `nn` parameter of the locfit in the plot, how does affect the fit?
3. Produce a monotone fit using the `r CRANpkg("scar")` package. Hint:
the scar fit consists of a fit with mean zero + an intercept. You have
to add up the two components to get the actual fit.
```{r loessExercise, dependson="loessExampleLinFit", echo=FALSE, fig.show="hide", results="hide"}
sim_data$locFit <- predict(locfit(y~lp(x, nn=0.1, deg=1), data=sim_data),
sim_data$locFit <- predict(locfit(y~lp(x, nn=0.2, deg=1), data=sim_data),
newdata = sim_data$x)
scar_fit <- scar(sim_data$x, sim_data$y, shape = c("in"))
sim_data$scar <- scar_fit$componentfit + scar_fit$constant
ggplot(aes(x, y), data = sim_data) +
geom_point() +
geom_smooth() +
ggtitle("Linear vs. local regression") +
geom_line(aes(x = x, y = locFit), color = "coral3")
geom_point() +
ggtitle("Local vs. shape constrained regression") +
geom_line(aes(x = x, y = locFit), color = "coral3") +
geom_line(aes(x = x, y = scar), color = "royalblue")
```
......@@ -896,7 +909,7 @@ called overdispersion, i.e. the variance is greater than than the mean.
(For \(\alpha = 0\), the NB distribution becomes the Poisson distribution)
Single cell RNA-Seq data contains many zeros, due to limited capture
efficiency, thus zero inflated models (@Finak_2015, @Risso_2017) have been
efficiency, thus zero inflated models (@Finak_2015, @Risso_2018) have been
proposed to account for this.
## Variance stabilization by log + pseudocount
......@@ -928,10 +941,8 @@ transform called `rlog`.]
```{r varStabCountData}
counts_norm_vst_crc <- varianceStabilizingTransformation(counts_crc)
mean_sd_crc <- meanSdPlot(counts_norm_vst_crc)$gg +
ylim(c(0, 5))
mean_sd_crc
meanSdPlot(counts_norm_vst_crc)
```
We can see from the plot that the transformation has worked, the data
......@@ -1369,7 +1380,7 @@ to avoid overplotting?
```{r ex_sheppard_plot, echo=FALSE, results="hide"}
ggplot(data_dist, aes(x = org_distance, y = mds_distance)) +
stat_density2d(aes(fill = ..level..), geom = "polygon",
n = 100, h = c(0.1, 0.1)) +
n = 100, h = c(0.2, 0.2)) +
geom_smooth(color = "grey10") +
scale_fill_distiller(palette = "YlGn", direction = 1) +
ggtitle(label = "Shepard plot",
......@@ -1663,7 +1674,7 @@ ggplot(data_dist_tSNE, aes(x = org_distance, y = tSNE_distance)) +
geom_hex(binwidth = c(0.05, 0.3)) +
geom_smooth(color = "grey10", method = "loess", span = 0.5) +
scale_fill_distiller(palette = "BrBG", direction = 1) +
ggtitle(label = "Shepard plot for t--SNE",
ggtitle(label = "Shepard plot for t-SNE",
subtitle = "Original vs tSNE distances") +
coord_fixed(1/(max(data_dist_tSNE$tSNE_distance)
- min(data_dist_tSNE$tSNE_distance))) +
......
This diff is collapsed.
No preview for this file type
......@@ -245,6 +245,19 @@
journal = {{PLoS} {ONE}},
}
@Article{Risso_2018,
doi = {10.1038/s41467-017-02554-5},
url = {https://doi.org/10.1038/s41467-017-02554-5},
year = {2018},
month = {jan},
publisher = {Springer Nature},
volume = {9},
number = {1},
author = {Davide Risso and Fanny Perraudeau and Svetlana Gribkova and Sandrine Dudoit and Jean-Philippe Vert},
title = {A general and flexible method for signal extraction from single-cell {RNA}-seq data},
journal = {Nature Communications},
}
@Article{Sch_fer_2005,
doi = {10.2202/1544-6115.1175},
url = {https://doi.org/10.2202/1544-6115.1175},
......@@ -288,21 +301,6 @@
@article {Risso_2017,
author = {Risso, Davide and Perraudeau, Fanny and Gribkova, Svetlana and Dudoit, Sandrine and Vert, Jean-Philippe},
title = {ZINB-WaVE: A general and flexible method for signal extraction from single-cell RNA-seq data},
year = {2017},
doi = {10.1101/125112},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Single-cell RNA sequencing (scRNA-seq) is a powerful high-throughput technique that enables researchers to measure genome-wide transcription levels at the resolution of single cells. Because of the low amount of RNA present in a single cell, some genes may fail to be detected even though they are expressed; these genes are usually referred to as dropouts. Here, we present a general and flexible zero-inflated negative binomial model (ZINB-WaVE), which leads to low-dimensional representations of the data that account for zero inflation (dropouts), over-dispersion, and the count nature of the data. We demonstrate, with simulated and real data, that the model and its associated estimation procedure are able to give a more stable and accurate low-dimensional representation of the data than principal component analysis (PCA) and zero-inflated factor analysis (ZIFA), without the need for a preliminary normalization step.},
URL = {https://www.biorxiv.org/content/early/2017/11/02/125112},
eprint = {https://www.biorxiv.org/content/early/2017/11/02/125112.full.pdf},
journal = {bioRxiv}
}
@article{Benjamini_1995,
ISSN = {00359246},
URL = {http://www.jstor.org/stable/2346101},
......
......@@ -9,6 +9,6 @@ stat_pkgs <- c("readxl", "scran", "BiocStyle", "knitr", "MASS", "RColorBrewer",
"stringr", "pheatmap", "matrixStats", "purrr", "fdrtool", "readr", "gtools",
"factoextra", "magrittr", "entropy", "forcats", "plotly", "corrplot", "car", "forcats",
"openxlsx", "readxl", "limma", "ggthemes","corpcor","sva","zinbwave","clusterExperiment",
"clue","sda","crossval","randomForest")
"clue","sda","crossval","randomForest", "scar")
biocLite(unique(stat_pkgs))
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment