Commit 5a1a1d49 authored by Bernd Klaus's avatar Bernd Klaus

included refs, did spell checking, version used for predo 17

parent 66de9b3a
......@@ -9,7 +9,7 @@ cache = TRUE)
## ---- echo=FALSE, cache=FALSE--------------------------------------------
print(date())
## ----required_packages_and_data, echo = TRUE, cache=FALSE----------------
## ----required_packages_and_data, echo = TRUE, cache=FALSE, message=FALSE----
library("readxl")
library("BiocStyle")
library("knitr")
......@@ -145,7 +145,7 @@ pat_tiny <- filter(pat, Height < 1.7)
select(pat_tiny, PatientId, Height, Gender)
## ----light_and_small_patients, echo = TRUE-----------------------------
filter(pat, Height < 1.5, Gender == "f")
filter(pat, Height < 1.7, Gender == "f")
## ----light_or_small_patients, echo = TRUE------------------------------
filter(pat, (Height < 1.5) | (Gender == "f"))
......
......@@ -11,10 +11,11 @@ output:
code_download: true
df_print: paged
toc_depth: 2
BiocStyle::pdf_document2:
BiocStyle::pdf_document:
toc: true
highlight: tango
toc_depth: 2
bibliography: stat_methods_bioinf.bib
---
<!--
......@@ -22,7 +23,7 @@ graphics.off();rm(list=ls());rmarkdown::render('data_handling_bioinf.Rmd',
output_format = "all");purl('data_handling_bioinf.Rmd')
rmarkdown::render('data_handling_bioinf.Rmd', output_format =
BiocStyle::pdf_document2(toc = TRUE, highlight = "tango",
BiocStyle::pdf_document(toc = TRUE, highlight = "tango",
df_print = "paged", toc_depth = 2))
-->
......@@ -46,7 +47,7 @@ print(date())
# Required packages and other preparations
```{r required_packages_and_data, echo = TRUE, cache=FALSE}
```{r required_packages_and_data, echo = TRUE, cache=FALSE, message=FALSE}
library("readxl")
library("BiocStyle")
library("knitr")
......@@ -113,14 +114,14 @@ a broad scope of tolerance before T cells circulate through the body.
This "training" of the T--cells is neccessary to prevent autoimunity. It
happens by the "promiscuous" expression of numerous tissue-specific antigens
(TRAs) in medullary thymic epithelial cells (mTECs) [Kyewski and Klein, 2006].
(TRAs) in medullary thymic epithelial cells (mTECs, @Kyewski_2006).
However, it is poorly understood how this process is regulated in
single mTECs and is coordinated at the population level. Brennecke et. al.
single mTECs and is coordinated at the population level. @Brennecke_2015
obtained scRNA--Seq data of mTECs and find evidence of numerous recurring
TRA--co--expression patterns, each present in only a subset of mTECs.
Thus, they can show that the gene expression in mTEC cells rather "mosaic"
and not entirely stochastic.
Thus, they could show that the gene expression in mTEC cells rather "mosaic"
and coordianted rather than entirely stochastic.
Before we dive more deeply into the data, we review important R basics.
......@@ -459,7 +460,7 @@ For example, we can easily get light and female patients via:
```{r light_and_small_patients, echo = TRUE}
filter(pat, Height < 1.5, Gender == "f")
filter(pat, Height < 1.7, Gender == "f")
```
We can also retrieve small OR female patients via
......@@ -1012,6 +1013,7 @@ group_by(hq_cells, batch) %>%
sessionInfo()
```
# References
This diff is collapsed.
No preview for this file type
library(knitcitations)
library(bibtex)
# Kyewski and Klein, 2006
citep("10.1146/annurev.immunol.23.021704.115601")
# Brennecke et. al., 2015
citep("10.1038/ni.3246")
# Pinto et. al. 2013
citep("10.1073/pnas.1308311110")
# recount2 resource
citep("10.1038/nbt.3838")
# Löhr et. al. 2013
citep("10.1371/journal.pone.0067461")
# Brennecke et. al., 2013
citep("10.1038/nmeth.2645")
# [Lun et. al.](http://dx.doi.org/10.1186/s13059-016-0947-7)
citep("10.1186/s13059-016-0947-7")
# cf. Leek et. al. 2010
citep("10.1038/nrg2825")
# [Buettner et. al.](http://dx.doi.org/10.1038/nbt.3102)
citep("10.1038/nbt.3102")
# Buja et. al. , 2007
citep("10.1198/106186008X318440")
# Kruskal, 1964
citep("10.1007/BF02289565")
write.bibtex(file = "stat_methods_bioinf.bib")
knitcitations::cleanbib()
# van der Maaten and Hinton, 2008
write("\n", file = "stat_methods_bioinf.bib", append = TRUE)
write("@article{vanDerMaaten_2008,
author = {van der Maaten, Laurens and Hinton, Geoffrey},
interhash = {370ba8b9e1909b61880a6f47c93bcd49},
intrahash = {8b9aebb404ad4a4c6a436ea413550b30},
journal = {Journal of Machine Learning Research},
pages = {2579--2605},
title = {Visualizing Data using {t-SNE} },
url = {http://www.jmlr.org/papers/v9/vandermaaten08a.html},
volume = 9,
year = 2008
}", file = "stat_methods_bioinf.bib", append = TRUE)
write("\n", file = "stat_methods_bioinf.bib", append = TRUE)
......@@ -9,7 +9,7 @@ cache = TRUE)
## ---- echo=FALSE, cache=FALSE--------------------------------------------
print(date())
## ----required_packages_and_data, echo = TRUE, cache=FALSE----------------
## ----required_packages_and_data, echo = TRUE, cache=FALSE, message=FALSE----
library("readxl")
library("scran")
library("BiocStyle")
......@@ -36,57 +36,53 @@ library("vsn")
library("matrixStats")
library("pheatmap")
library("tidyverse")
#library("printr")
library("Rtsne")
library("devtools")
theme_set(theme_gray(base_size = 18))
library(BiocParallel)
multicoreParam <- MulticoreParam(workers = round(parallel::detectCores() * 0.5))
register(multicoreParam)
data_dir <- file.path("data/")
## ----import_gene_expression, echo=FALSE, eval=FALSE----------------------
# Here we import the gene expression data using only the subset of highly
# variable genes, resave it
load(file.path(data_dir, "deGenesNone.RData"))
load(file.path(data_dir, "mTECdxd.RData"))
mtec_counts <- counts(dxd)[deGenesNone, ]
mtec_counts <- as_tibble(rownames_to_column(as.data.frame(mtec_counts),
var = "ensembl_id"))
mtec_cell_anno <- as_tibble(as.data.frame(colData(dxd))) %>%
modify_if(.p = is.factor, as.character)
data("biotypes")
data("geneNames")
mtec_gene_anno <- tibble(biotype) %>%
add_column(ensembl_id = names(biotype),
gene_name = geneNames,
.before = "biotype")
mtec_cell_anno <- colData(dxd)
save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
compress = "xz")
save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
compress = "xz")
save(mtec_gene_anno, file = file.path(data_dir, "mtec_gene_anno.RData"),
compress = "xz")
tras <- as_tibble(tras)
save(tras, file = file.path(data_dir, "tras.RData"),
compress = "xz")
## # Here we import the gene expression data using only the subset of highly
## # variable genes, resave it
## load(file.path(data_dir, "deGenesNone.RData"))
## load(file.path(data_dir, "mTECdxd.RData"))
##
## mtec_counts <- counts(dxd)[deGenesNone, ]
##
## mtec_counts <- as_tibble(rownames_to_column(as.data.frame(mtec_counts),
## var = "ensembl_id"))
##
## mtec_cell_anno <- as_tibble(as.data.frame(colData(dxd))) %>%
## modify_if(.p = is.factor, as.character)
##
## data("biotypes")
## data("geneNames")
##
## mtec_gene_anno <- tibble(biotype) %>%
## add_column(ensembl_id = names(biotype),
## gene_name = geneNames,
## .before = "biotype")
##
## mtec_cell_anno <- colData(dxd)
##
## save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
## compress = "xz")
##
## save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
## compress = "xz")
##
##
## save(mtec_gene_anno, file = file.path(data_dir, "mtec_gene_anno.RData"),
## compress = "xz")
##
## tras <- as_tibble(tras)
##
## save(tras, file = file.path(data_dir, "tras.RData"),
## compress = "xz")
## ----import_data---------------------------------------------------------
load(file.path(data_dir, "mtec_counts.RData"))
......@@ -136,7 +132,7 @@ scatter_tra
scatter_tra +
geom_rug(alpha = I(0.2))
## ----ex_geom, echo=FALSE-------------------------------------------------
## ----ex_geom, echo=FALSE, results="hide", fig.show="hide"----------------
ggplot(tra_detected, aes(x = total_detected, y = tra))
scatter_tra +
......@@ -168,7 +164,7 @@ ggplot(aes(x, y), data = sim_data) +
geom_line(aes(x = x, y = locFit), color = "coral3")
## ----loessExercise, dependson="loessExampleLinFit", echo=FALSE-----------
## ----loessExercise, dependson="loessExampleLinFit", echo=FALSE, fig.show="hide", results="hide"----
sim_data$locFit <- predict(locfit(y~lp(x, nn=0.1, deg=1), data=sim_data),
newdata = sim_data$x)
......@@ -219,7 +215,7 @@ counts_crc <- assay(crc_data)
counts_crc[1:5, 1:5]
## ----crc_filtering-------------------------------------------------------
counts_crc <- counts_crc[rowMeans(counts_crc) >= 5, ]
counts_crc <- counts_crc[rowMeans(counts_crc) >= 20, ]
dim(counts_crc)
## ----pre_crc_genes-------------------------------------------------------
......@@ -266,7 +262,7 @@ count_boxplot <- ggplot(counts_crc_tidy,
count_boxplot
## ----exp_boxplot---------------------------------------------------------
## ----exp_boxplot, results="hide", echo=FALSE, fig.show="hide"------------
count_boxplot_tissue <- ggplot(counts_crc_tidy,
aes(x = sample_id_by_median,
y = log2(count),
......@@ -300,7 +296,64 @@ counts_crc_tidy <- left_join(counts_crc_tidy,
## ----norm_data-----------------------------------------------------------
counts_crc_tidy <- mutate(counts_crc_tidy, count_norm = count / sf )
## ----data_norm_plot------------------------------------------------------
## ----MAplot_function-----------------------------------------------------
glog2 <- function(x) ((asinh(x)-log(2))/log(2))
ref_sample <- group_by(counts_crc_tidy, ensembl_id) %>%
summarize(ref_sample = mean(glog2(count_norm )))
counts_crc_tidy <- left_join(counts_crc_tidy, ref_sample) %>%
mutate(M = ref_sample - glog2(count_norm),
A = (ref_sample + glog2(count_norm))/ 2)
## ----createMAs, fig.wide="TRUE"------------------------------------------
colorscale <- scale_fill_gradientn(
colors = rev(brewer.pal(9, "YlGnBu")),
values = c(0, exp(seq(-5, 0, length.out = 200))^0.5))
ma_pl <- ggplot(aes(A, M), data = sample_frac(counts_crc_tidy,
size = 0.1)) +
facet_wrap(~ sample_id) +
geom_hex(binwidth = c(0.5, 0.5)) +
geom_smooth(method = "loess", se = FALSE, col = "#5D84C5", span = .4) +
geom_hline(aes(yintercept = 0), col = "#9850C3") +
ylim(c(-5, 5)) +
coord_fixed(ratio = 2) +
colorscale +
ggtitle("MA plots (vs reference sample)")
ma_pl
## ----ex_data_norm_plot, results="hide", fig.show="hide", echo=FALSE------
comp_data <- tibble(x = seq(0, 10, by = 0.1),
log2 = log2(x),
glog2 = glog2(x)) %>%
gather(key = "func",
value = "value", log2, glog2)
ggplot(data = comp_data,
aes(x = x, y = value, color = func)) +
geom_line()
ma_pl_points <- ggplot(aes(A, M), data = sample_frac(counts_crc_tidy,
size = 0.1)) +
facet_wrap(~ sample_id) +
geom_point(alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, col = "#5D84C5", span = .4) +
geom_hline(aes(yintercept = 0), col = "#9850C3") +
ylim(c(-5, 5)) +
coord_fixed(ratio = 2) +
ggtitle("MA plot (vs ref sample)")
ma_pl_points
## ----scran_norm----------------------------------------------------------
......@@ -310,12 +363,12 @@ mtec_scran_sf <- tibble(scran_sf = computeSumFactors(as.matrix(mtec_counts[, -1]
mtec_cell_anno <- left_join(mtec_cell_anno, mtec_scran_sf,
by = c("cellID" = "cell_id"))
## ----compar_sf-----------------------------------------------------------
ggplot(mtec_cell_anno, aes(x = sizeFactor, y = scran_sf)) +
ggtitle("SCRAN vs. classical size factors") +
geom_point() +
geom_smooth()
## ----compar_sf, echo=FALSE, eval=FALSE-----------------------------------
##
## ggplot(mtec_cell_anno, aes(x = sizeFactor, y = scran_sf)) +
## ggtitle("SCRAN vs. classical size factors") +
## geom_point() +
## geom_smooth()
## ----createNormCounts----------------------------------------------------
counts_norm_crc_mat <- select(counts_crc_tidy, sample_id, ensembl_id, count_norm) %>%
......@@ -328,14 +381,17 @@ counts_norm_crc_mat <- select(counts_crc_tidy, sample_id, ensembl_id, count_norm
select(-ensembl_id) %>%
as.matrix()
meanSdPlot(counts_norm_crc_mat)$gg +
ylim(c(0, 2000))
mean_sd <- meanSdPlot(counts_norm_crc_mat, plot = FALSE)$gg +
ylim(c(0, 1000))
mean_sd
## ----varStabCountData----------------------------------------------------
counts_norm_vst_crc <- varianceStabilizingTransformation(counts_crc)
meanSdPlot(counts_norm_vst_crc)$gg +
ylim(c(0, 5))
mean_sd_crc <- meanSdPlot(counts_norm_vst_crc)$gg +
ylim(c(0, 5))
mean_sd_crc
## ----pcaCRCdata----------------------------------------------------------
......@@ -362,7 +418,7 @@ pca <- compute_pca(counts_norm_vst_crc,
patient = col_data_crc$patient,
tissue = col_data_crc$tissue)
pca
names(pca)
sum(pca$perc_var[1:4])
## ----vizPca, fig.height = 3, fig.width = 10------------------------------
......@@ -382,7 +438,7 @@ pca_plot <- ggplot(pca$pca, aes(x = PC1, y = PC2,
pca_plot
## ----pca23---------------------------------------------------------------
## ----pca23, echo=FALSE, results="hide"-----------------------------------
sd_ratio_2 <- sqrt(pca$perc_var[3] / pca$perc_var[2])
......@@ -446,7 +502,7 @@ pheatmap(create_dist_mat(cleaned_data,
## ----scalingSingleCell---------------------------------------------------
tcell_log_counts <- as.data.frame(
read_csv("http://www-huber.embl.de/users/klaus/nbt.3102-S7.csv"))
read_csv(file.path(data_dir, "nbt.3102-S7.csv")))
rownames(tcell_log_counts) <- tcell_log_counts$X1
tcell_log_counts$X1 <- NULL
tcell_log_counts <- as.matrix(tcell_log_counts)
......@@ -477,7 +533,7 @@ mds_plot_tcells <- ggplot(scaling_tcells, aes(x = MDS_dimension_1,
MDS_dimension_2,
color = fct_rev(gata3_group))) +
geom_point(size = 3) +
ggtitle("Metric MDS of the T-cell single cell data") +
ggtitle("Kruskal MDS of the T-cell single cell data") +
scale_color_brewer(palette = "RdBu", direction = 1) +
coord_equal()
......@@ -503,7 +559,7 @@ ggplot(data_dist, aes(x = org_distance, y = mds_distance)) +
coord_equal()
## ----ex_sheppard_plot----------------------------------------------------
## ----ex_sheppard_plot, echo=FALSE, results="hide"------------------------
ggplot(data_dist, aes(x = org_distance, y = mds_distance)) +
stat_density2d(aes(fill = ..level..), geom = "polygon",
n = 100, h = c(0.1, 0.1)) +
......@@ -514,38 +570,157 @@ ggplot(data_dist, aes(x = org_distance, y = mds_distance)) +
coord_equal()
## ----sammonScaling-------------------------------------------------------
scaling_tcells_sam <- as_tibble(sammon(dist_tcells)$points)
colnames(scaling_tcells_sam) <- c("MDS_dimension_1", "MDS_dimension_2")
scaling_tcells_sam <- add_column(scaling_tcells_sam, cell_id = labels(dist_tcells),
.before = "MDS_dimension_1")
## ----sammonScaling, eval=FALSE, echo=FALSE-------------------------------
## scaling_tcells_sam <- as_tibble(sammon(dist_tcells)$points)
## colnames(scaling_tcells_sam) <- c("MDS_dimension_1", "MDS_dimension_2")
## scaling_tcells_sam <- add_column(scaling_tcells_sam, cell_id = labels(dist_tcells),
## .before = "MDS_dimension_1")
##
## mds_plot_tcells_sam <- ggplot(scaling_tcells_sam, aes(x = MDS_dimension_1,
## MDS_dimension_2,
## color = fct_rev(gata3_group))) +
## geom_point(size = 3) +
## ggtitle("Sammon Scaling of the T-cell single cell data") +
## scale_color_brewer(palette = "RdBu", direction = 1) +
## coord_equal()
## mds_plot_tcells_sam
##
## dist_tcells_mds_sam <- get_dist(select(scaling_tcells_sam,
## MDS_dimension_1, MDS_dimension_2),
## method = "euclidean")
##
## data_dist_sam <- tibble(org_distance = as.vector(dist_tcells),
## mds_distance = as.vector(dist_tcells_mds_sam))
##
##
## ggplot(data_dist_sam, aes(x = org_distance, y = mds_distance)) +
## geom_hex(binwidth = 0.1) +
## geom_smooth(color = "grey10") +
## scale_fill_distiller(palette = "RdPu", direction = 1) +
## ggtitle(label = "Shepard plot Sammon",
## subtitle = "Original vs MDS distances") +
## coord_equal()
##
## ----runtSNE-------------------------------------------------------------
run_tsne <- function(X, perplexity = 20, pca = FALSE, max_iter = 5000,
verbose = FALSE, is_distance = TRUE, seed=123L, ...){
set.seed(seed)
tX <- Rtsne(X, perplexity = perplexity, pca = pca, max_iter = max_iter,
verbose = verbose, is_distance = is_distance)$Y
if(class(X) == "dist"){
labs <- labels(X)
} else {
labs <- rownames(X)
}
# browser()
colnames(tX) <- c("tSNE_dimension_1", "tSNE_dimension_2")
tX <- add_column(as_tibble(tX),
cell_id = labs,
.before = "tSNE_dimension_1")
tX
}
mds_plot_tcells_sam <- ggplot(scaling_tcells_sam, aes(x = MDS_dimension_1,
MDS_dimension_2,
## ----tSNEtCell-----------------------------------------------------------
tcell_tsne <- run_tsne(dist_tcells, perplexity = 5,
pca = FALSE, max_iter = 5000,
verbose = FALSE, is_distance = TRUE, seed = 123)
tcell_tsne
tsne_plot_tcells_5 <- ggplot(tcell_tsne,
aes(x = tSNE_dimension_1,
tSNE_dimension_2,
color = fct_rev(gata3_group))) +
geom_point(size = 3) +
ggtitle("Sammon Scaling of the T-cell single cell data") +
ggtitle("t-SNE plot of the T-cell single cell data") +
scale_color_brewer(palette = "RdBu", direction = 1) +
coord_equal()
mds_plot_tcells_sam
dist_tcells_mds_sam <- get_dist(select(scaling_tcells_sam,
MDS_dimension_1, MDS_dimension_2),
method = "euclidean")
tsne_plot_tcells_5
data_dist_sam <- tibble(org_distance = as.vector(dist_tcells),
mds_distance = as.vector(dist_tcells_mds_sam))
## ----compareOrgtSNE, fig.wide="TRUE"-------------------------------------
dist_tcells_tSNE <- get_dist(select(tcell_tsne,
tSNE_dimension_1,
tSNE_dimension_2),
method = "euclidean")
ggplot(data_dist_sam, aes(x = org_distance, y = mds_distance)) +
geom_hex(binwidth = 0.1) +
geom_smooth(color = "grey10") +
scale_fill_distiller(palette = "RdPu", direction = 1) +
ggtitle(label = "Shepard plot Sammon",
subtitle = "Original vs MDS distances") +
coord_equal()
data_dist_tSNE <- tibble(org_distance = as.vector(dist_tcells),
tSNE_distance = as.vector(dist_tcells_tSNE))
ggplot(data_dist_tSNE, aes(x = org_distance, y = tSNE_distance)) +
geom_hex(binwidth = c(0.05, 0.3)) +
geom_smooth(color = "grey10", method = "loess", span = 0.5) +
scale_fill_distiller(palette = "BrBG", direction = 1) +
ggtitle(label = "Shepard plot for t--SNE",
subtitle = "Original vs tSNE distances") +
coord_fixed(1/(max(data_dist_tSNE$tSNE_distance)
- min(data_dist_tSNE$tSNE_distance))) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## ----tSNE, eval=FALSE, echo=FALSE, fig.show="hide"-----------------------
## tcell_tsne_20 <- run_tsne(dist_tcells, perplexity = 20,
## pca = FALSE, max_iter = 5000,
## verbose = FALSE, is_distance = TRUE, seed = 123)
##
##
##
## tsne_plot_tcells_20 <- ggplot(tcell_tsne_20,
## aes(x = tSNE_dimension_1,
## tSNE_dimension_2,
## color = fct_rev(gata3_group))) +
## geom_point(size = 3) +
## ggtitle("t-SNE plot of the T-cell single cell data") +
## scale_color_brewer(palette = "RdBu", direction = 1) +
## coord_equal()
##
## tsne_plot_tcells_5
## tsne_plot_tcells_20
##
## tcell_tsne_org <- run_tsne(tcell_log_counts, perplexity = 5,
## pca = FALSE, max_iter = 5000,
## verbose = FALSE, is_distance = FALSE, seed = 123)
##
##
##
## tsne_plot_tcells_org <- ggplot(tcell_tsne_org,
## aes(x = tSNE_dimension_1,
## tSNE_dimension_2,
## color = fct_rev(gata3_group))) +
## geom_point(size = 3) +
## ggtitle("t-SNE plot of the T-cell single cell data") +
## scale_color_brewer(palette = "RdBu", direction = 1) +
## coord_equal()
##
##
##
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo()
## ----unloaAll, echo=FALSE, message=FALSE, eval = FALSE-------------------
##
## pkgs <- loaded_packages() %>%
## filter(package != "devtools") %>%
## {.$path}
##
## walk(pkgs, unload)
This diff is collapsed.
This diff is collapsed.
No preview for this file type
@Article{Brennecke_2013,
doi = {10.1038/nmeth.2645},
url = {https://doi.org/10.1038/nmeth.2645},
year = {2013},
month = {sep},
publisher = {Springer Nature},
volume = {10},
number = {11},
pages = {1093--1095},
author = {Philip Brennecke and Simon Anders and Jong Kyoung Kim and Aleksandra A Ko{\l}odziejczyk and Xiuwei Zhang and Valentina Proserpio and Bianka Baying and Vladimir Benes and Sarah A Teichmann and John C Marioni and Marcus G Heisler},
title = {Accounting for technical noise in single-cell {RNA}-seq experiments},
journal = {Nature Methods},
}
@Article{Brennecke_2015,
doi = {10.1038/ni.3246},
url = {https://doi.org/10.1038/ni.3246},
year = {2015},
month = {aug},
publisher = {Springer Nature},
volume = {16},
number = {9},
pages = {933--941},
author = {Philip Brennecke and Alejandro Reyes and Sheena Pinto and Kristin Rattay and Michelle Nguyen and Rita K{\"u}chler and Wolfgang Huber and Bruno Kyewski and Lars M Steinmetz},
title = {Single-cell transcriptome analysis reveals coordinated ectopic gene-expression patterns in medullary thymic epithelial cells},
journal = {Nature Immunology},
}
@Article{Buettner_2015,