Commit 88125faa authored by Bernd Klaus's avatar Bernd Klaus

finished PCA part, started clustering, TODO: MDS and tsne

parent 249a86bd
......@@ -34,6 +34,7 @@ library("recount")
library("psych")
library("vsn")
library("matrixStats")
library("pheatmap")
theme_set(theme_gray(base_size = 18))
......@@ -47,43 +48,43 @@ data_dir <- file.path("data/")
## ----import_gene_expression, echo=FALSE, eval=FALSE----------------------
## # Here we import the gene expression data using only the subset of highly
## # variable genes, resave it
## load(file.path(data_dir, "deGenesNone.RData"))
## load(file.path(data_dir, "mTECdxd.RData"))
##
## mtec_counts <- counts(dxd)[deGenesNone, ]
##
## mtec_counts <- as_tibble(rownames_to_column(as.data.frame(mtec_counts),
## var = "ensembl_id"))
##
## mtec_cell_anno <- as_tibble(as.data.frame(colData(dxd))) %>%
## modify_if(.p = is.factor, as.character)
##
## data("biotypes")
## data("geneNames")
##
## mtec_gene_anno <- tibble(biotype) %>%
## add_column(ensembl_id = names(biotype),
## gene_name = geneNames,
## .before = "biotype")
##
## mtec_cell_anno <- colData(dxd)
##
## save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
## compress = "xz")
##
## save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
## compress = "xz")
##
##
## save(mtec_gene_anno, file = file.path(data_dir, "mtec_gene_anno.RData"),
## compress = "xz")
##
## tras <- as_tibble(tras)
##
## save(tras, file = file.path(data_dir, "tras.RData"),
## compress = "xz")
# Here we import the gene expression data using only the subset of highly
# variable genes, resave it
load(file.path(data_dir, "deGenesNone.RData"))
load(file.path(data_dir, "mTECdxd.RData"))
mtec_counts <- counts(dxd)[deGenesNone, ]
mtec_counts <- as_tibble(rownames_to_column(as.data.frame(mtec_counts),
var = "ensembl_id"))
mtec_cell_anno <- as_tibble(as.data.frame(colData(dxd))) %>%
modify_if(.p = is.factor, as.character)
data("biotypes")
data("geneNames")
mtec_gene_anno <- tibble(biotype) %>%
add_column(ensembl_id = names(biotype),
gene_name = geneNames,
.before = "biotype")
mtec_cell_anno <- colData(dxd)
save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
compress = "xz")
save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
compress = "xz")
save(mtec_gene_anno, file = file.path(data_dir, "mtec_gene_anno.RData"),
compress = "xz")
tras <- as_tibble(tras)
save(tras, file = file.path(data_dir, "tras.RData"),
compress = "xz")
## ----import_data---------------------------------------------------------
load(file.path(data_dir, "mtec_counts.RData"))
......@@ -212,7 +213,8 @@ col_data_crc <- select(as.data.frame(colData(crc_data)),
tidyr::extract(title, into = c("quantification", "patient", "tissue"),
regex = "([[:alnum:]]+)_([[:alnum:]]+)_([[:alnum:]]+)") %>%
dplyr::filter(quantification == "mRNA")
counts_crc <- counts_crc[, col_data_crc$sample_id]
## ----mrna_counts---------------------------------------------------------
counts_crc_tidy <- rownames_to_column(data.frame(counts_crc), var = "ensembl_id") %>%
......@@ -296,7 +298,7 @@ ggplot(mtec_cell_anno, aes(x = sizeFactor, y = scran_sf)) +
counts_norm_crc_mat <- select(counts_crc_tidy, sample_id, ensembl_id, count_norm) %>%
spread(key = sample_id, value = count_norm) %>%
{
tmp <-as.data.frame(.)
tmp <- as.data.frame(.)
rownames(tmp) <- tmp$ensembl_id
tmp
} %>%
......@@ -308,10 +310,68 @@ meanSdPlot(counts_norm_crc_mat)$gg +
## ----varStabCountData----------------------------------------------------
counts_norm_trans_crc <- varianceStabilizingTransformation(counts_crc)
meanSdPlot(counts_norm_trans_crc)$gg +
counts_norm_vst_crc <- varianceStabilizingTransformation(counts_crc)
meanSdPlot(counts_norm_vst_crc)$gg +
ylim(c(0, 5))
## ----pcaCRCdata----------------------------------------------------------
compute_pca <- function(data_mat, ntop = 500, ...){
pvars <- rowVars(data_mat)
select <- order(pvars, decreasing = TRUE)[seq_len(min(ntop,
length(pvars)))]
PCA <- prcomp(t(data_mat)[, select], center = TRUE, scale. = FALSE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
return(list(pca = data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
PC3 = PCA$x[,3], PC4 = PCA$x[,4], ...),
perc_var = percentVar))
}
pca <- compute_pca(counts_norm_vst_crc,
patient = col_data_crc$patient,
tissue = col_data_crc$tissue)
pca
sum(pca$perc_var[1:4])
## ----vizPca, fig.height = 3, fig.width = 10------------------------------
sd_ratio <- sqrt(pca$perc_var[2] / pca$perc_var[1])
pca_plot <- ggplot(pca$pca, aes(x = PC1, y = PC2,
color = patient,
shape = tissue)) +
geom_point(size = 4) +
ggtitle("PC1 vs PC2, top variable genes") +
labs(x = paste0("PC1, VarExp:", round(pca$perc_var[1],4)),
y = paste0("PC2, VarExp:", round(pca$perc_var[2],4))) +
coord_fixed(ratio = sd_ratio) +
scale_colour_brewer(palette="Dark2")
pca_plot
## ----pca23---------------------------------------------------------------
## ----heatmap, fig.height = 5, eval=FALSE---------------------------------
dists <- as.matrix(dist(t(counts_norm_vst_crc)))
col_data_crc <- unite(col_data_crc, cl_anno, patient, tissue, remove = FALSE)
rownames(dists) <- col_data_crc$cl_anno
colnames(dists) <- col_data_crc$cl_anno
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
pheatmap(dists, trace="none", col = rev(hmcol))
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo()
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment