Commit 1d4b4371 authored by Bernd Klaus's avatar Bernd Klaus

added heatmap and data cleaning

parent 88125faa
......@@ -48,43 +48,43 @@ data_dir <- file.path("data/")
## ----import_gene_expression, echo=FALSE, eval=FALSE----------------------
# Here we import the gene expression data using only the subset of highly
# variable genes, resave it
load(file.path(data_dir, "deGenesNone.RData"))
load(file.path(data_dir, "mTECdxd.RData"))
mtec_counts <- counts(dxd)[deGenesNone, ]
mtec_counts <- as_tibble(rownames_to_column(as.data.frame(mtec_counts),
var = "ensembl_id"))
mtec_cell_anno <- as_tibble(as.data.frame(colData(dxd))) %>%
modify_if(.p = is.factor, as.character)
data("biotypes")
data("geneNames")
mtec_gene_anno <- tibble(biotype) %>%
add_column(ensembl_id = names(biotype),
gene_name = geneNames,
.before = "biotype")
mtec_cell_anno <- colData(dxd)
save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
compress = "xz")
save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
compress = "xz")
save(mtec_gene_anno, file = file.path(data_dir, "mtec_gene_anno.RData"),
compress = "xz")
tras <- as_tibble(tras)
save(tras, file = file.path(data_dir, "tras.RData"),
compress = "xz")
## # Here we import the gene expression data using only the subset of highly
## # variable genes, resave it
## load(file.path(data_dir, "deGenesNone.RData"))
## load(file.path(data_dir, "mTECdxd.RData"))
##
## mtec_counts <- counts(dxd)[deGenesNone, ]
##
## mtec_counts <- as_tibble(rownames_to_column(as.data.frame(mtec_counts),
## var = "ensembl_id"))
##
## mtec_cell_anno <- as_tibble(as.data.frame(colData(dxd))) %>%
## modify_if(.p = is.factor, as.character)
##
## data("biotypes")
## data("geneNames")
##
## mtec_gene_anno <- tibble(biotype) %>%
## add_column(ensembl_id = names(biotype),
## gene_name = geneNames,
## .before = "biotype")
##
## mtec_cell_anno <- colData(dxd)
##
## save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
## compress = "xz")
##
## save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
## compress = "xz")
##
##
## save(mtec_gene_anno, file = file.path(data_dir, "mtec_gene_anno.RData"),
## compress = "xz")
##
## tras <- as_tibble(tras)
##
## save(tras, file = file.path(data_dir, "tras.RData"),
## compress = "xz")
## ----import_data---------------------------------------------------------
load(file.path(data_dir, "mtec_counts.RData"))
......@@ -360,17 +360,39 @@ pca_plot
## ----pca23---------------------------------------------------------------
sd_ratio_2 <- sqrt(pca$perc_var[3] / pca$perc_var[2])
pca_plot_2 <- ggplot(pca$pca, aes(x = PC2, y = PC3,
color = patient,
shape = tissue)) +
geom_point(size = 4) +
ggtitle("PC2 vs PC3, top variable genes") +
labs(x = paste0("PC2, VarExp:", round(pca$perc_var[2],4)),
y = paste0("PC3, VarExp:", round(pca$perc_var[3],4))) +
coord_fixed(ratio = sd_ratio_2) +
scale_colour_brewer(palette="Dark2")
pca_plot_2
## ----heatmap, fig.height = 5, eval=TRUE----------------------------------
dists <- as.matrix(get_dist(t(counts_norm_vst_crc), method = "pearson"))
diag(dists) <- NA
if(!("cl_anno" %in% colnames(col_data_crc))){
col_data_crc <- unite(col_data_crc, cl_anno, patient, tissue, remove = FALSE)
}
rownames(dists) <- col_data_crc$cl_anno
colnames(dists) <- col_data_crc$cl_anno
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
pheatmap(dists, trace="none", col = rev(hmcol))
## ----heatmap_pc_regression-----------------------------------------------
lm_pc1 <- lm(t(counts_norm_vst_crc) ~ pca$pca$PC1)
## ----heatmap, fig.height = 5, eval=FALSE---------------------------------
dists <- as.matrix(dist(t(counts_norm_vst_crc)))
col_data_crc <- unite(col_data_crc, cl_anno, patient, tissue, remove = FALSE)
rownames(dists) <- col_data_crc$cl_anno
colnames(dists) <- col_data_crc$cl_anno
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
pheatmap(dists, trace="none", col = rev(hmcol))
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo()
......
......@@ -799,7 +799,9 @@ between the original observations.
If we have many variables (genes) as in our case, it is advisable
to perform a selection of variables since this leads to more stable PCA results.
Here, we simply use the 500 most variable genes.
Here, we simply use the 500 most variable genes. The function we define returns
a list of the computed PCA, the percentage of the variance and the top variable
genes.
```{r pcaCRCdata}
......@@ -858,7 +860,8 @@ pca_plot
We can see that the first PC separates the patients rather than the
tissues. This is not uncommon in studies that feature multiple
measurements from the same individual.
measurements from the same individual. The second PC separates the normal
from the tumor tissues.
It is also important to choose the
aspect ratio properly: PC1 explains about 5 times more variance than
......@@ -880,6 +883,20 @@ Create a PCA plot displaying PC2 and PC3. Interpret it.
```{r pca23}
sd_ratio_2 <- sqrt(pca$perc_var[3] / pca$perc_var[2])
pca_plot_2 <- ggplot(pca$pca, aes(x = PC2, y = PC3,
color = patient,
shape = tissue)) +
geom_point(size = 4) +
ggtitle("PC2 vs PC3, top variable genes") +
labs(x = paste0("PC2, VarExp:", round(pca$perc_var[2],4)),
y = paste0("PC3, VarExp:", round(pca$perc_var[3],4))) +
coord_fixed(ratio = sd_ratio_2) +
scale_colour_brewer(palette="Dark2")
pca_plot_2
```
......@@ -889,26 +906,81 @@ Another very common visualization technique is a heatmap. Analogous to a PCA
we can visualize the pairwise distances between the samples
a false color representation of the distance between two samples. Here
we use the euclidean distance of all the genes per sample. This can be conveniently
computed using the function `dist`.
computed using the function `dist`. As a sample always has a distance of zero
to itself, we remove the diagonal of the distance matrix in order to obtain
a more fine--grained color scale.
This heatmap is then ordered via hierarchical clustering. Hierarchical clustering
starts with as many clusters as there are samples and successively merges samples
that are close to each other. This merging process is commonly visualized as
a tree like graphic called a dendogramm.
a tree like graphic called a dendogramm.
```{r heatmap, fig.height = 5, eval=TRUE}
```{r heatmap, fig.height = 5, eval=FALSE}
dists <- as.matrix(dist(t(counts_norm_vst_crc)))
col_data_crc <- unite(col_data_crc, cl_anno, patient, tissue, remove = FALSE)
rownames(dists) <- col_data_crc$cl_anno
colnames(dists) <- col_data_crc$cl_anno
create_dist_mat <- function(data, anno, method = "pearson"){
dists <- as.matrix(get_dist(t(data), method = method))
diag(dists) <- NA
rownames(dists) <- anno
colnames(dists) <- anno
return(dists)
}
if(!("cl_anno" %in% colnames(col_data_crc))){
col_data_crc <- unite(col_data_crc, cl_anno, patient, tissue, remove = FALSE)
}
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
pheatmap(dists, trace="none", col = rev(hmcol))
pheatmap(create_dist_mat(counts_norm_vst_crc, anno = col_data_crc$cl_anno),
trace="none", col = rev(hmcol))
```
Again we see that samples cluster mainly by experimental condition, but also
by batch.
The PCA plot already indicated that there is not clear--cut clustering present,
this is confirmed by the heatmap. We predominantly see a clustering by patient.
As the first PC mainly captures patient to
patient variation, we now "regress out" PC1 and perform the clustering again.
### Clustering with cleaned data
In order to do this, we build multiple regression models, each of them
regresses the first PC against the gene expression data. We then take
the residuals and add the estimated intercept again to preserve the means
of the gene expression.
```{r pc_regression, warning=FALSE}
lm_pc1 <- lm(t(counts_norm_vst_crc) ~ pca$pca$PC1)
gene_means <- tidy(lm_pc1) %>%
filter(term == "(Intercept)") %>%
select(response, estimate)
cleaned_data <- t(residuals(lm_pc1)) + gene_means$estimate
# sanity check whether gene wise means are equal
all.equal(rowMeans(cleaned_data), rowMeans(counts_norm_vst_crc))
```
We can now create a heatmap of the cleaned data margin^[Note that the cleaned
data should be used with caution for downstream analysis. It is always best to
incorporate a batch directly into statistical modelling. See Jaffe and Langaas]
```{r heatmapCleanedData}
hmcol <- colorRampPalette(brewer.pal(9, "YlOrRd"))(256)
pheatmap(create_dist_mat(cleaned_data,
anno = col_data_crc$cl_anno,
method = "kendall"),
trace="none", col = rev(hmcol))
```
\section{Multidimensional scaling (MDS)}
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment