Commit c610234e authored by Bernd Klaus's avatar Bernd Klaus

added TODOs to the data handling document, sc data import section for graphics document

parent 9e125bb3
......@@ -77,6 +77,11 @@ data_dir <- file.path("data/")
# Essential data handling using single cell data qc data
TODO:
* import / export of data
* relational operations, (joins!)
## Introduction to single cell RNA--Seq data
TODO, explain:
......
## ----options, include=FALSE----------------------------------------------
library(knitr)
options(digits=3, width=80)
golden_ratio <- (1 + sqrt(5)) / 2
opts_chunk$set(echo=TRUE,tidy=FALSE,include=TRUE,
dev=c('png', 'pdf', 'svg'), fig.height = 5, fig.width = 4 * golden_ratio, comment = ' ', dpi = 300,
cache = TRUE)
## ---- echo=FALSE, cache=FALSE--------------------------------------------
print(date())
## ----required_packages_and_data, echo = TRUE, cache=FALSE----------------
library("readxl")
library("BiocStyle")
library("knitr")
library("tidyverse")
library("RColorBrewer")
library("stringr")
library("pheatmap")
library("matrixStats")
library("purrr")
library("fdrtool")
library("readr")
library("gtools")
library("factoextra")
library("magrittr")
library("entropy")
library("forcats")
library("plotly")
library("corrplot")
library("car")
library("forcats")
library("openxlsx")
library("readxl")
library("limma")
library("Single.mTEC.Transcriptomes")
library("DESeq2")
theme_set(theme_gray(base_size = 18))
library(BiocParallel)
multicoreParam <- MulticoreParam(workers = round(parallel::detectCores() * 0.5))
register(multicoreParam)
data_dir <- file.path("data/")
## ----load_inspect_qc_data------------------------------------------------
load(file.path(data_dir, "sc_qc_stats.RData"))
sc_qc_stats
## ----import_gene_expression, echo=FALSE, eval=FALSE----------------------
# Here we import the gene expression data using only the subset of highly
# variable genes, resave it
load(file.path(data_dir, "deGenesNone.RData"))
load(file.path(data_dir, "mTECdxd.RData"))
mtec_counts <- counts(dxd)[deGenesNone, ]
mtec_cell_anno <- colData(dxd)
save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
compress = "xz")
save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
compress = "xz")
## ----import_data---------------------------------------------------------
load(file.path(data_dir, "mtec_counts.RData"))
load(file.path(data_dir, "mtec_cell_anno.RData"))
load(file.path(data_dir, "tras.RData"))
## ----xxx-----------------------------------------------------------------
# ggplot(DNase, aes(x = conc, y = density)) + geom_point()
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo()
---
title: "Visual exploration for bioinformatics"
author: "Bernd Klaus"
date: "`r doc_date()`"
output:
BiocStyle::html_document2:
toc: true
highlight: tango
self_contained: true
toc_float: false
code_download: true
---
<!--<
To compile this document
graphics.off();rm(list=ls());rmarkdown::render('graphics_bioinf.Rmd');purl('graphics_bioinf.Rmd')
-->
```{r options, include=FALSE}
library(knitr)
options(digits=3, width=80)
golden_ratio <- (1 + sqrt(5)) / 2
opts_chunk$set(echo=TRUE,tidy=FALSE,include=TRUE,
dev=c('png', 'pdf', 'svg'), fig.height = 5, fig.width = 4 * golden_ratio, comment = ' ', dpi = 300,
cache = TRUE)
```
**LAST UPDATE AT**
```{r, echo=FALSE, cache=FALSE}
print(date())
```
# Required packages and other preparations
```{r required_packages_and_data, echo = TRUE, cache=FALSE}
library("readxl")
library("BiocStyle")
library("knitr")
library("tidyverse")
library("RColorBrewer")
library("stringr")
library("pheatmap")
library("matrixStats")
library("purrr")
library("fdrtool")
library("readr")
library("gtools")
library("factoextra")
library("magrittr")
library("entropy")
library("forcats")
library("plotly")
library("corrplot")
library("car")
library("forcats")
library("openxlsx")
library("readxl")
library("limma")
library("Single.mTEC.Transcriptomes")
library("DESeq2")
theme_set(theme_gray(base_size = 18))
library(BiocParallel)
multicoreParam <- MulticoreParam(workers = round(parallel::detectCores() * 0.5))
register(multicoreParam)
data_dir <- file.path("data/")
```
We look at the data after importing it via a `load` command from an `RData`
file.
```{r load_inspect_qc_data}
load(file.path(data_dir, "sc_qc_stats.RData"))
sc_qc_stats
```
```{r import_gene_expression, echo=FALSE, eval=FALSE}
# Here we import the gene expression data using only the subset of highly
# variable genes, resave it
load(file.path(data_dir, "deGenesNone.RData"))
load(file.path(data_dir, "mTECdxd.RData"))
mtec_counts <- counts(dxd)[deGenesNone, ]
mtec_cell_anno <- colData(dxd)
save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
compress = "xz")
save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
compress = "xz")
```
# mTEC single cell-RNA-Seq data
We summarize single-cell transcriptomes in the form of count matrices in which each row
represents a gene and each column represents one cell. The matrix is filled
with the number of sequenced fragments whose genomic alignment overlaps with the
genomic coordinates of the genes. For this, we only use cells for which more
than 40% of the sequenced fragments could be uniquely assigned to the Mouse
reference genome. We also discarded 28 cells from the batch 4 (since they were
from another cell type that was sequenced together with some of the mTECs).
It is sensible to only use highly variable genes in the analysis of
single cell data (HVG), the original publication identified more than
9,000 as highly variable using a method published in Brennecke et. al. 2013.
We retain those genes for further analysis.
We also import a table with genes annotated as tissue restricted antigens (tras)
from Shinto et. al. 2013.
```{r import_data}
load(file.path(data_dir, "mtec_counts.RData"))
load(file.path(data_dir, "mtec_cell_anno.RData"))
load(file.path(data_dir, "tras.RData"))
```
# Graphics in R
# ggplot2
`r CRANpkg("ggplot2")` is a package by Hadley Wickham that implements the idea of
*grammar of graphics* -- a concept created by Leland Wilkinson in his book of the same name. Comprehensive documentation for the package can
be found on <http://docs.ggplot2.org>. The online documentation includes
example use cases for each of the graphic types that are introduced in this lab (and
many more) and is an invaluable resource when creating figures.
Single cell RNA-Seq data has many zero due to a limited capture efficiency
and biases in the reverse transcription. It is thus interesting to see
wether
```{r xxx}
# ggplot(DNase, aes(x = conc, y = density)) + geom_point()
```
# Session Info
```{r session_info, cache = FALSE}
sessionInfo()
```
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment