Commit e09ec3da authored by Thomas Schwarzl
parents 01de3a55 d58138d6
.Rproj.user
*.Rproj
.Rhistory
.RData
.Ruserdata
# basic R usage
4 + 6
x <- 6
y <- 4
x + y
# list all variables in your environment
ls()
sqrt(16)
# to remove a variable
rm(x)
z <- c(5,9,1,0)
x <- c(5,9)
# combine two vectors
combinedVector <- c(z,x)
# another way to create a vector is to use seq()
seq(1,9, by=2)
seq(8,20, length=6)
# R handles vector arithmetic element-wise
x <- seq(1,5,by=1)
y <- 1:5
# adding two vectors will return a vector of sums
x + y
# add a vector and a numeric value
x + 3
x
shortVector <- c(1,3)
longVector <- 1:6
shortVector + longVector
longVector
midVector <- 1:5
midVector
midVector + longVector
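# R "recycles" the shorter vector to match the longer one; with lengths
# 5 and 6 the recycling is uneven, so R also prints a warning here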
# some useful functions that work with vectors
length(midVector)
mean(midVector)
summary(longVector)
min(longVector)
max(longVector,shortVector)
?seq # open the help page for seq()
# how to subset vectors
x <- c(7.5, 8.2, 3.1, 5.6, 8.2, 9.3, 6.5, 7.0, 9.3, 1.2, 14.5, 6.2)
mean(x)
x[1]
x[c(1,5,8)]
x[1:5]
head(x)
# arrange values in a vector
sort(x)
sort(x = x,decreasing = TRUE)
sort(x,TRUE)
sort(longVector, decreasing = TRUE)
sort(x, decreasing) # this will cause an error
sort(TRUE, x) # this will cause a confusing error
# this is why it's safer to use named parameters
sort(x=x,decreasing=TRUE ) # best way to call this function
sort(x=longVector,decreasing=TRUE)
# Data types
# numerics
9
a <- 9
# is.* functions test for a data type
is.numeric(a)
# character
myChar <- "t"
is.numeric(myChar)
is.character(myChar)
# logicals
TRUE
FALSE
myLgl <- TRUE
is.logical(myLgl)
# save numbers as characters
myCharNum <- "9"
is.numeric(myCharNum)
# change the type of an object
# called "coersion"
as.numeric(myCharNum)
# no quotes means that an object is a number
# quotes means that it is a character
myCoercedNum <- as.numeric(myCharNum)
# functions to find classes or data types
class(myChar)
typeof(myChar)
str(myChar)
# Matrix
myMatrix <- matrix(data = c(5,7,9,3,4,6),nrow = 3)
myVec1 <- 3:9
myVec2 <- 13:19
cbind(myVec1, myVec2)
myMatrix * 2
myMatrix
# Pull data out of matrix similarly to vectors
myMatrix[1,1]
myMatrix[1,c(1,2)]
# shortcut: leave the row index empty to select a whole column
myMatrix[,3] # error: myMatrix only has 2 columns
myMatrix[,2]
# to pull out values from a matrix
# use [rows,columns]
myMatrix[-1,]
# this will print everything except the first row
# vectors coerce everything to a single type: here all elements become character
myMixedVector <- c(1,2,4,"a")
# what if we want to hold different data types in the same object?
# we can use a list
myList <- list(1,2,4,"a")
# named list
myNamedList <- list(myFirstElem = 1,
                    mySecond = 3,
                    myCharElem = "a")
myNamedList
# $ sign notation can pull out named elements
myNamedList$mySecond
myNamedList[2]
# single [] will always return a list (from a list)
mySub1 <- myNamedList[2]
# double [[]] will return value at the position
myNamedList[[2]]
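# a quick check (sketch): single [ ] keeps the list wrapper, [[ ]] unwraps it
class(myNamedList[2])   # "list"
class(myNamedList[[2]]) # "numeric"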
# review of accessing elements in a vector
# accessing by name
namedVector <- c(Alice = 5.5, Bob = 6.4, Steve=5.9)
namedVector
namedVector["Alice"]
# access by position
namedVector[1]
# access using logicals
namedVector[c(TRUE,TRUE,FALSE)]
namedVector == "Alice"
namedVector == 5.5
namedVector[namedVector == 5.5]
# other comparison operators: > >= != <=
myNamedList$myFirstElem
namedVector["Alice"]
# coercion
# change the data type of an object
as.numeric("9")
as.numeric("a")
library(tidyverse)
load(url("http://www-huber.embl.de/users/klaus/BasicR/bodyfat.rda"))
bodyfatDF <- bodyfat
head(bodyfat)
str(bodyfat)
bodyfat
as_tibble(bodyfat)
# two ways to create a tibble
bodyfat <- as_tibble(bodyfat) # coerce
tibble(bodyfatDF) # careful: this does not convert; it nests the data frame as a single column
head(bodyfat)
bodyfat
# interact with this tibble
# filter() will let you pull out rows from a tibble
bodyfat
filter(.data = bodyfat, age < 40)
filter(.data = bodyfat, age > 40 & age < 60)
filter(bodyfat, age < 40 | age > 60)
# arrange rows in different orders
arrange(bodyfat, age)
arrange(bodyfat, age, weight)
# change the direction of the order
arrange(bodyfat, desc(age), weight)
# select columns
select(bodyfat, age, weight)
# use a minus sign to drop columns we don't want
select(bodyfat, -age, -weight)
# another way to pull out a column is by name
bodyfat$age
# another way, using base R bracket indexing [rows, columns]
bodyfat[1,5]
# create new columns with mutate()
mutate(bodyfat, weight_kg = weight*0.454)
bodyfatWithKG <- mutate(bodyfat,weight_kg = weight*0.454)
oneCol <- select(bodyfatWithKG, weight_kg)
# chaining aka piping
# avoids creating intermediate objects
oneCol <- mutate(bodyfat, weight_kg = weight*0.454) %>%
  select(weight_kg)
x
sort(x)
head(x,n=2)
x %>% sort()
x %>% sort(decreasing = TRUE)
x %>% sort() %>% head(n=2) %>% mean()
# the same computation written without pipes
mean(head(sort(x), n = 2)) # nested calls are harder to read
# Challenge
# create a new column in the bodyfat tibble
# called height_m
# conversion: inches to m = x * 0.0254
# save that as bodyfat
# show just the two height columns for age > 40
# yellow sticky when you're done
bodyfat_converted <- mutate(bodyfat, height_m = height*0.0254)
bodyfat_converted %>%
  filter(age > 40) %>%
  select(height, height_m)
# summarise
summarise(bodyfat, meanAge = mean(age))
summarise(bodyfat, meanAge = mean(age),
          medianAge = median(age))
# this only returns the columns that we calculated
bodyfat %>%
  mutate(olderThan40 = age > 40) %>%
  group_by(olderThan40) %>%
  summarise(meanAge = mean(age),
            meanWeight = mean(weight))
# load(url("http://www-huber.embl.de/users/klaus/BasicR/bodyfat.rda"))
# 1. create a new column in your data called "bmi" using this formula:
# bmi = (weight*0.454) / (height*.0254)^2
# 2. Is there a correlation between `bmi` & `wrist.circum` ?
# 3. create a new variable that indicates the category of each person's BMI.
# Here are the categories
# Underweight: BMI is 0 to 18.5.
# Normal weight: BMI is 18.5 to 24.9.
# Overweight: BMI is 25 to 29.9.
# Obese: BMI is 30 to 100
# Anything over 100 is very unusual
# To do this, check out the cut() function.
# If the built-in R help isn't clear, try googling: "R cut() examples"
# 4. what is the average wrist.circum for each bmi category?
# 5. do you have an NA in your data? can you figure out why?
#https://www.r-bloggers.com/from-continuous-to-categorical/
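# a minimal sketch of cut() on made-up values (demoBMI is hypothetical,
# and this is not the full challenge solution), using the breakpoints above:
demoBMI <- c(17, 22, 27, 35)
cut(demoBMI, breaks = c(0, 18.5, 24.9, 29.9, 100),
    labels = c("Underweight", "Normal weight", "Overweight", "Obese"))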
## ---- message = FALSE----------------------------------------------------
library(tidyverse)
library(psych)
library(factoextra)
library(plotly)
library(ComplexHeatmap)
library(viridis)
library(circlize)
## ------------------------------------------------------------------------
input_data <- readRDS("./data/pca_input_data.rds")
(input_data <- as_tibble(input_data))
## ------------------------------------------------------------------------
no_cells_per_well <-
  group_by(input_data, well) %>%
  dplyr::summarize(no_cells = sum(count))
no_cells_per_well
(data_with_sums <- left_join(input_data, no_cells_per_well))
data_for_PCA <- mutate(data_with_sums, perc = count / no_cells,
                       z_score = logit(perc))
data_for_PCA
## ------------------------------------------------------------------------
data_for_PCA_wide <- data_for_PCA %>%
  dplyr::select(class, well, z_score) %>%
  spread(key = class, value = z_score)
data_for_PCA_wide
## ----PCA-----------------------------------------------------------------
(PCA <- prcomp(data_for_PCA_wide[, -1], center = TRUE, scale. = TRUE))
summary(PCA)
genes <-
  group_by(input_data, well) %>%
  dplyr::summarize(gene = unique(Gene.Symbol))
genes <- ifelse(is.na(genes$gene), "empty", genes$gene)
(dataGG = tibble(PC1 = PCA$x[,1], PC2 = PCA$x[,2], PC3 = PCA$x[,3],
                 PC4 = PCA$x[,4], genes))
pl <- (ggplot(dataGG, aes(x = PC1, y = PC2, color = genes))
       + geom_text(aes(label = genes), size = I(2))
       + coord_fixed(ratio = (PCA$sdev^2)[2] / (PCA$sdev^2)[1])
       + ggtitle("Principal components plot")
)
pl
## ----var_imp-------------------------------------------------------------
(loadings <- PCA$rotation[, 1:2])
(loadings_gg <-
   as.data.frame(loadings) %>%
   rownames_to_column(var = "class") %>%
   dplyr::select(class, PC1, PC2))
(loadings_gg <- gather(loadings_gg, key = "comp", value = "loading", PC1:PC2))
ggplot(loadings_gg, aes(x = class, y = loading, fill = class)) +
  facet_wrap( ~ comp) +
  geom_bar(stat = "identity", position = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(type = "qual", palette = "Set3")
## ----co_circle-----------------------------------------------------------
fviz_pca_var(PCA, col.circle = "black", title = "Correlation Circle for the PCA") + coord_equal()
## ------------------------------------------------------------------------
# prepare the data set, don't go through all the steps
cluster_input_long <-
  data_for_PCA %>%
  select(class, well, Gene.Symbol, Group, count, no_cells, perc) %>%
  filter(Group == "target") %>%
  group_by(Gene.Symbol, class) %>%
  slice(1) %>%
  ungroup()
cluster_input_long
cluster_input_wide <-
  select(cluster_input_long, class, Gene.Symbol, perc) %>%
  spread(key = class, value = perc)
cluster_input_wide
cluster_input_wide_m <- as.matrix(cluster_input_wide[, -1])
rownames(cluster_input_wide_m) <- cluster_input_wide$Gene.Symbol
cluster_input_wide_m
## ------------------------------------------------------------------------
(m0 <- rbind(c(0, 0), c(0, 1), c(1, 1)))
dist(m0)
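# dist() computes pairwise Euclidean distances between the rows by default;
# e.g. rows 1 and 3 above differ by 1 in each coordinate, giving sqrt(2)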
## ------------------------------------------------------------------------
m <- cluster_input_wide_m
d <- dist(m) # alternatively: run as.dist() on a matrix of precomputed distances
d
h <- hclust(d)
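# hclust() defaults to complete linkage; see the method argument in ?hclust
# for alternatives such as "average" or "ward.D2"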
h
plot(h)
## ------------------------------------------------------------------------
# k to specify number of clusters
(clusts <- cutree(h, k = 2))
cutree(h, h = 0.4)
# tibble(gene = names(clusts), cluster = clusts) %>% group_by(cluster) %>% nest()
## ------------------------------------------------------------------------
Heatmap(m,
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_names_side = "top",
        row_names_side = "left")
quantile(m)
my_cols <- plasma(20)
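# colorRamp2() from circlize maps 20 evenly spaced breaks between 0 and 0.1
# to these 20 plasma colours, interpolating for values in between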
Heatmap(m,
        col = colorRamp2(
          seq(from = 0, to = 0.1, length.out = 20), my_cols),
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_names_side = "top",
        row_names_side = "left")
Heatmap(m,
        col = colorRamp2(
          seq(from = 0, to = 0.1, length.out = 20), my_cols),
        column_names_side = "top",
        row_names_side = "left")
Heatmap(m,
        col = colorRamp2(
          seq(from = 0, to = 0.1, length.out = 20), my_cols),
        column_names_side = "top",
        row_names_side = "left",
        clustering_distance_columns = "spearman",
        clustering_distance_rows = "spearman")
---
title: "Clustering and dimension reduction"
author: "Florian Huber, materials adapted from Bernd Klaus"
date: "`r Sys.Date()`"
output:
  bookdown::html_document2:
    toc: true
    toc_float: true
    highlight: tango
    code_folding: show
  BiocStyle::pdf_document:
    toc: true
    highlight: tango
---
Materials adapted from Bernd Klaus
(chapter 13 of [this document](https://www.huber.embl.de/users/klaus/graphics_and_data_handling/graphics_and_data_handling.html)).
```{r, message = FALSE}
library(tidyverse)
library(psych)
library(factoextra)
library(plotly)
library(ComplexHeatmap)
library(viridis)
library(circlize)
```
# Clustering and dimensionality reduction techniques
In the statistical learning field, we often want to group "similar observations"
together. Because the structure of the data is not known _a priori_ in this
case, we use so-called _unsupervised learning_ techniques.
One way of doing this is to transform the data space of the observations with
the aim of finding a low-dimensional representation that captures most of the
variation in the data. In this session we will cover one of the best-known
examples of dimensionality reduction, the so-called *principal components
analysis*, or *PCA* for short. Other commonly used dimensionality reduction
techniques are t-distributed stochastic neighbour embedding (*t-SNE*),
multidimensional scaling (*MDS*), and matrix factorisation techniques.
Related and often used in combination are clustering approaches.
When clustering data we use various
distance measures and aggregation techniques to group some observations
together. This can be done on either the original feature space or on a
feature space derived from a dimensionality reduction technique. In this session
we will cover *hierarchical clustering*, also known as agglomerative clustering,
which is widely used with omics data. Again, clustering is a wide field;
other notable techniques include k-means clustering, DBSCAN,
and spectral clustering.
In both cases the aim is usually to find some kind of structure in the data,
for example to understand what the sources of variation are or to formulate
hypotheses. Therefore, we usually want to have some kind of domain knowledge.
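As a minimal preview of both ideas, here is a sketch on a small made-up data
matrix (not the screen data analysed below), using `prcomp()` for PCA and
`hclust()` for hierarchical clustering:

```{r preview_sketch}
# toy data: 10 observations with 4 features (random numbers, for illustration)
set.seed(1)
toy <- matrix(rnorm(40), nrow = 10,
              dimnames = list(paste0("obs", 1:10), paste0("f", 1:4)))
toy_pca <- prcomp(toy, center = TRUE, scale. = TRUE)
summary(toy_pca)               # variance captured by each principal component
toy_clust <- hclust(dist(toy)) # Euclidean distances, complete linkage
plot(toy_clust)                # dendrogram of the clustering
```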
# Principal components analysis (PCA)
![From: James, Witten, Hastie, Tibshirani: "An Introduction to Statistical Learning" (2013)](PCA.png){ width=70% }
## Data inspection and restructuring
In this tutorial, we will import a single plate from a high-content screen performed
on 96-well plates. The input data that we are going to use are class labels for
each single cell. These classification results have been obtained using a machine
learning algorithm based on the original image features. Each cell is classified
into one of 10 mitotic phenotype classes.
```{r}
input_data <- readRDS("./data/pca_input_data.rds")
(input_data <- as_tibble(input_data))
```
<!--
ex: data exploration
-->
We will use these classes as the features that we want to summarise using PCA.
Later on, we will use the same classes to perform hierarchical clustering on
the data.
Before we can run the PCA, however, we need to run a number of data
transformation steps. As feature values we will use a transformed and scaled version of
the class percentages. This is necessary because PCA works best on data that
is normally distributed (z-scores). To turn percentages into z-scores, we will
run a [logit](https://en.wikipedia.org/wiki/Logit) transformation on the data.
This is similar in spirit to a log transformation on
intensity values.
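To see what the logit does, here is a small sketch on made-up percentages
(any values strictly between 0 and 1):

```{r logit_sketch}
p <- c(0.01, 0.25, 0.5, 0.75, 0.99) # hypothetical class percentages
logit(p) # psych::logit computes log(p / (1 - p)); p = 0.5 maps to 0
# base R equivalent: qlogis(p)
```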
In the code chunk below, we use the `group_by()` function
to split the dataset into groups according to the well ID.
We then apply the function `sum()` to the counts of each well and
use `summarize()` to obtain a table of counts per well. This is an example
of a __split--apply--combine__ strategy.
```{r}
no_cells_per_well <-
  group_by(input_data, well) %>%
  dplyr::summarize(no_cells = sum(count))
no_cells_per_well
(data_with_sums <- left_join(input_data, no_cells_per_well))
data_for_PCA <- mutate(data_with_sums, perc = count / no_cells,
                       z_score = logit(perc))
data_for_PCA
```