Commit 6314585b authored by Bernd Klaus's avatar Bernd Klaus

added TODOs to the data handling document, sc data import section for graphics document

parent 9e125bb3
......@@ -2,3 +2,4 @@ material_stat_methods/
*_cache/
data/
.Rhistory
*_files/
......@@ -77,6 +77,11 @@ data_dir <- file.path("data/")
# Essential data handling using single cell data qc data
TODO:
* import / export of data
* relational operations, (joins!)
## Introduction to single cell RNA--Seq data
TODO, explain:
......
## ----options, include=FALSE----------------------------------------------
library(knitr)
options(digits=3, width=80)
golden_ratio <- (1 + sqrt(5)) / 2
opts_chunk$set(echo=TRUE,tidy=FALSE,include=TRUE,
dev=c('png', 'pdf', 'svg'), fig.height = 5, fig.width = 4 * golden_ratio, comment = ' ', dpi = 300,
cache = TRUE)
## ---- echo=FALSE, cache=FALSE--------------------------------------------
print(date())
## ----required_packages_and_data, echo = TRUE, cache=FALSE----------------
library("readxl")
library("BiocStyle")
library("knitr")
library("tidyverse")
library("RColorBrewer")
library("stringr")
library("pheatmap")
library("matrixStats")
library("purrr")
library("fdrtool")
library("readr")
library("gtools")
library("factoextra")
library("magrittr")
library("entropy")
library("forcats")
library("plotly")
library("corrplot")
library("car")
library("forcats")
library("openxlsx")
library("readxl")
library("limma")
library("Single.mTEC.Transcriptomes")
library("DESeq2")
library("tibble")
theme_set(theme_gray(base_size = 18))
library(BiocParallel)
multicoreParam <- MulticoreParam(workers = round(parallel::detectCores() * 0.5))
register(multicoreParam)
data_dir <- file.path("data/")
## ----import_gene_expression, echo=FALSE, eval=FALSE----------------------
# Here we import the gene expression data using only the subset of highly
# variable genes, resave it
load(file.path(data_dir, "deGenesNone.RData"))
load(file.path(data_dir, "mTECdxd.RData"))
mtec_counts <- counts(dxd)[deGenesNone, ]
mtec_counts <- as_tibble(rownames_to_column(as.data.frame(mtec_counts),
var = "ensembl_id"))
mtec_cell_anno <- as_tibble(as.data.frame(colData(dxd))) %>%
modify_if(.p = is.factor, as.character)
data("biotypes")
data("geneNames")
mtec_gene_anno <- tibble(biotype) %>%
add_column(ensembl_id = names(biotype),
gene_name = geneNames,
.before = "biotype")
save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
compress = "xz")
save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
compress = "xz")
save(mtec_gene_anno, file = file.path(data_dir, "mtec_gene_anno.RData"),
compress = "xz")
tras <- as_tibble(tras)
save(tras, file = file.path(data_dir, "tras.RData"),
compress = "xz")
## ----import_data---------------------------------------------------------
load(file.path(data_dir, "mtec_counts.RData"))
load(file.path(data_dir, "mtec_cell_anno.RData"))
load(file.path(data_dir, "mtec_gene_anno.RData"))
load(file.path(data_dir, "tras.RData"))
## ----mtec_count_table----------------------------------------------------
mtec_counts
## ----tidy_count----------------------------------------------------------
mtec_counts_tidy <- gather(mtec_counts, key = "cell_id", value = "count",
-ensembl_id) %>%
mutate(is_tra = ensembl_id %in% tras$gene.ids,
is_detected = count > 0) %>%
left_join(mtec_cell_anno,
by = c("cell_id" = "cellID"))
## ----tra_per_cell--------------------------------------------------------
tra_detected <- filter(mtec_counts_tidy, is_detected == TRUE,
SurfaceMarker == "None") %>%
mutate(is_tra = ifelse(is_tra, "tra", "not_a_tra")) %>%
group_by(cell_id, is_tra) %>%
tally() %>%
spread(key = is_tra, value = n) %>%
mutate(total_detected = sum(tra, not_a_tra))
tra_detected
## ----tra_vs_all----------------------------------------------------------
ggplot(tra_detected, aes(x = total_detected, y = tra))+
geom_point() +
coord_equal()
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo()
---
title: "Visual exploration for bioinformatics"
author: "Bernd Klaus"
date: "`r doc_date()`"
output:
BiocStyle::html_document2:
toc: true
highlight: tango
self_contained: true
toc_float: false
code_download: true
---
<!--<
To compile this document
graphics.off();rm(list=ls());rmarkdown::render('graphics_bioinf.Rmd');purl('graphics_bioinf.Rmd')
-->
```{r options, include=FALSE}
library(knitr)
options(digits=3, width=80)
golden_ratio <- (1 + sqrt(5)) / 2
opts_chunk$set(echo=TRUE,tidy=FALSE,include=TRUE,
dev=c('png', 'pdf', 'svg'), fig.height = 5, fig.width = 4 * golden_ratio, comment = ' ', dpi = 300,
cache = TRUE)
```
**LAST UPDATE AT**
```{r, echo=FALSE, cache=FALSE}
print(date())
```
# Required packages and other preparations
```{r required_packages_and_data, echo = TRUE, cache=FALSE}
library("readxl")
library("BiocStyle")
library("knitr")
library("tidyverse")
library("RColorBrewer")
library("stringr")
library("pheatmap")
library("matrixStats")
library("purrr")
library("fdrtool")
library("readr")
library("gtools")
library("factoextra")
library("magrittr")
library("entropy")
library("forcats")
library("plotly")
library("corrplot")
library("car")
library("forcats")
library("openxlsx")
library("readxl")
library("limma")
library("Single.mTEC.Transcriptomes")
library("DESeq2")
library("tibble")
theme_set(theme_gray(base_size = 18))
library(BiocParallel)
multicoreParam <- MulticoreParam(workers = round(parallel::detectCores() * 0.5))
register(multicoreParam)
data_dir <- file.path("data/")
```
```{r import_gene_expression, echo=FALSE, eval=FALSE}
# Here we import the gene expression data using only the subset of highly
# variable genes, resave it
load(file.path(data_dir, "deGenesNone.RData"))
load(file.path(data_dir, "mTECdxd.RData"))
mtec_counts <- counts(dxd)[deGenesNone, ]
mtec_counts <- as_tibble(rownames_to_column(as.data.frame(mtec_counts),
var = "ensembl_id"))
mtec_cell_anno <- as_tibble(as.data.frame(colData(dxd))) %>%
modify_if(.p = is.factor, as.character)
data("biotypes")
data("geneNames")
mtec_gene_anno <- tibble(biotype) %>%
add_column(ensembl_id = names(biotype),
gene_name = geneNames,
.before = "biotype")
save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
compress = "xz")
save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
compress = "xz")
save(mtec_gene_anno, file = file.path(data_dir, "mtec_gene_anno.RData"),
compress = "xz")
tras <- as_tibble(tras)
save(tras, file = file.path(data_dir, "tras.RData"),
compress = "xz")
```
# mTEC single cell-RNA-Seq data
We summarize single-cell transcriptomes in the form of count matrices in which each row
represents a gene and each column represents one cell. The matrix is filled
with the number of sequenced fragments whose genomic alignment overlaps with the
genomic coordinates of the genes. For this, we only use cells for which more
than 40% of the sequenced fragments could be uniquely assigned to the Mouse
reference genome. We also discarded 28 cells from the batch 4 (since they were
from another cell type that was sequenced together with some of the mTECs).
It is sensible to only use highly variable genes in the analysis of
single cell data (HVG), the original publication identified more than
9,000 as highly variable using a method published in Brennecke et. al. 2013.
We retain those genes for further analysis.
We also import a table with genes annotated as tissue restricted antigens (tras)
from Shinto et. al. 2013 and annotation for mouse genes from ENSEMBL.
```{r import_data}
load(file.path(data_dir, "mtec_counts.RData"))
load(file.path(data_dir, "mtec_cell_anno.RData"))
load(file.path(data_dir, "mtec_gene_anno.RData"))
load(file.path(data_dir, "tras.RData"))
```
# Graphics in R
# ggplot2
`r CRANpkg("ggplot2")` is a package by Hadley Wickham that implements the idea of
*grammar of graphics* -- a concept created by Leland Wilkinson in his book of the same name. Comprehensive documentation for the package can
be found on <http://docs.ggplot2.org>. The online documentation includes
example use cases for each of the graphic types that are introduced in this lab (and
many more) and is an invaluable resource when creating figures.
Single cell RNA-Seq data has many zero due to a limited capture efficiency
and biases in the reverse transcription. It is thus interesting to see how
the total number of genes expressed (at least one count) relates to
the number of TRA encoding genes expressed.
The single cell data also contains mTEC cells select based on surface markers,
we exclude those and make sure to inspect only protein coding genes.
In order to do this, we first turn the count table:
```{r mtec_count_table}
mtec_counts
```
which contains the counts for each cell in separte columns into a tidy
data frame. We `gather` all columns except for the first one and then
add a column telling us whether a specific genes is coding for a TRA
and whether it was detected or not.
Then, we join the annotation to the tidy table
```{r tidy_count}
mtec_counts_tidy <- gather(mtec_counts, key = "cell_id", value = "count",
-ensembl_id) %>%
mutate(is_tra = ensembl_id %in% tras$gene.ids,
is_detected = count > 0) %>%
left_join(mtec_cell_anno,
by = c("cell_id" = "cellID"))
```
Now we can compute the number of TRA genes detected per cell and filter
the non--protein coding genes as well as the cells that have been processed
using FACS based on a surface marker.
Finally, we obtain a tidy table with two columns per cell: the number of
tras and non-tras expressed. There is a subtle change in observations:
we go from genes within a single cell as our unit to single cells.
```{r tra_per_cell}
tra_detected <- filter(mtec_counts_tidy, is_detected == TRUE,
SurfaceMarker == "None") %>%
mutate(is_tra = ifelse(is_tra, "tra", "not_a_tra")) %>%
group_by(cell_id, is_tra) %>%
tally() %>%
spread(key = is_tra, value = n) %>%
mutate(total_detected = sum(tra, not_a_tra))
tra_detected
```
We visualize this relationship in a scatterplot with ggplot2.
```{r tra_vs_all}
ggplot(tra_detected, aes(x = total_detected, y = tra))+
geom_point() +
coord_equal()
```
# Session Info
```{r session_info, cache = FALSE}
sessionInfo()
```
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment