Commit e09ec3da authored by Thomas Schwarzl's avatar Thomas Schwarzl
parents 01de3a55 d58138d6
.Rproj.user
*.Rproj
.Rhistory
.RData
.Ruserdata
......
# basic R usage
4 + 6
x <- 6
y <- 4
x + y
# list all variables in your environment
ls()
sqrt(16)
# to remove a variable
rm(x)
z <- c(5,9,1,0)
x <- c(5,9)
# combine two vectors
combinedVector <- c(z,x)
# another to create a vector is to use seq()
seq(1,9, by=2)
seq(8,20, length=6)
# R will handle vectors for vector arithmatic
x <- seq(1,5,by=1)
y <- 1:5
# adding two vectors will return a vector of sums
x + y
# add a vector and a numeric value
x + 3
x
shortVector <- c(1,3)
longVector <- 1:6
shortVector + longVector
longVector
midVector <- 1:5
midVector
midVector + longVector
# some useful functions that work with vectors
length(midVector)
mean(midVector)
summary(longVector)
min(longVector)
max(longVector,shortVector)
?seq()
# how to subset vectors
x<-c(7.5, 8.2,3.1,5.6,8.2,9.3,6.5,7.0,9.3,1.2,14.5,6.2)
mean(x)
x[1]
x[c(1,5,8)]
x[1:5]
head(x)
# arrange values in a vector
sort(x)
sort(x = x,decreasing = TRUE)
sort(x,TRUE)
sort(longVector, decreasing = TRUE)
sort(x, decreasing) # this will cause error
sort(TRUE,x) # this will cause confusing error
# this is why it's safer to use named parameters
sort(x=x,decreasing=TRUE ) # best way to call this function
sort(x=longVector,decreasing=TRUE)
# Data types
# numerics
9
a <- 9
# is. functions to test for the data type
is.numeric(a)
# character
myChar <- "t"
is.numeric(myChar)
is.character(myChar)
# logicals
TRUE
FALSE
myLgl <- TRUE
is.logical(myLgl)
# save numbers as characters
myCharNum <- "9"
is.numeric(myCharNum)
# change the type of an object
# called "coersion"
as.numeric(myCharNum)
# no quotes means that an object is a number
# quotes means that it is a character
myCoercedNum <- as.numeric(myCharNum)
# function to find classes or data types
class(myChar)
typeof(myChar)
str(myChar)
# Matrix
myMatrix <- matrix(data = c(5,7,9,3,4,6),nrow = 3)
myVec1 <- 3:9
myVec2 <- 13:19
cbind(myVec1, myVec2)
myMatrix * 2
myMatrix
# Pull data out of matrix similarly to vectors
myMatrix[1,1]
myMatrix[1,c(1,2)]
# short cut
myMatrix[,3]
myMatrix[,2]
# to pull out values from a matrix
# use [rows,columns]
myMatrix[-1,]
# this will print everything except the first row
myMixedVector <- c(1,2,4,"a")
# what if we want to hold differen data types in the same
# object
# we can use a list
myList <- list(1,2,4,"a")
# named list
myNamedList <- list(myFirstElem=1,
mySecond=3,
myCharElem="a")
myNamedList
# $ sign notation can pull out named elements
myNamedList$mySecond
myNamedList[2]
# single [] will always return a list (from a list)
mySub1 <- myNamedList[2]
# double [[]] will return value at the position
myNamedList[[2]]
# review of accessing elements in a vector
# accessing by name
namedVector <- c(Alice = 5.5, Bob = 6.4, Steve=5.9)
namedVector
namedVector["Alice"]
# access by position
namedVector[1]
# access using logicals
namedVector[c(TRUE,TRUE,FALSE)]
namedVector == "Alice"
namedVector == 5.5
namedVector[namedVector == 5.5]
# > >= != <=
myNamedList$myFirstElem
namedVector["Alice"]
#coersion
# change the data type of an object
as.numeric("9")
as.numeric("a")
library(tidyverse)
load(url("http://www-huber.embl.de/users/klaus/BasicR/bodyfat.rda"))
bodyfatDF <- bodyfat
head(bodyfat)
str(bodyfat)
bodyfat
as_tibble(bodyfat)
# two ways to create a tibble
bodyfat <- as_tibble(bodyfat) # coerce
tibble(bodyfatDF) # create new from data
head(bodyfat)
bodyfat
# interact with this tibble
# filter() will let you pull out rows from a tibble
bodyfat
filter(.data = bodyfat, age < 40)
filter(.data = bodyfat, age > 40 & age < 60)
filter(bodyfat, age < 40 | age > 60)
# arrange rows in different orders
arrange(bodyfat, age)
arrange(bodyfat, age, weight)
# change the direction of the order
arrange(bodyfat, desc(age), weight)
# select columns
select(bodyfat, age, weight)
# select columns we don't want
select(bodyfat, -age, -weight)
# another way to pull out a column is by name
bodyfat$age
# another using baseR to pull out data []
bodyfat[1,5]
# create new data
mutate(bodyfat, weight_kg = weight*0.454)
bodyfatWithKG <- mutate(bodyfat,weight_kg = weight*0.454)
oneCol <- select(bodyfatWithKG, weight_kg)
# chaining aka piping
# avoids creating intermediate objects
oneCol <- mutate(bodyfat, weight_kg = weight*0.454) %>%
select(weight_kg)
x
sort(x)
head(x,n=2)
x %>% sort()
x %>% sort(decreasing = TRUE)
x %>% sort() %>% head(n=2) %>% mean()
# alternative to write this without pipes
mean(head(sort(x),n=2)) # this is hard
# Challenge
# create a new column in the bodyfat tibble
# called height_m
# conversion : in to m = x*0.0254
# save that as bodyfat
# show just the two height columns for age > 40
# yellow sticky when you're done
bodyfat_converted <- mutate(bodyfat, height_m = height*0.0254)
bodyfat_converted %>%
filter(age > 40) %>%
select(height, height_m)
# summarise
summarise(bodyfat, meanAge = mean(age))
summarise(bodyfat, meanAge = mean(age),
medianAge = median(age))
# this only returns the colums that we calculated
bodyfat %>%
mutate(olderThan40 = age > 40) %>%
group_by(olderThan40) %>%
summarise(meanAge = mean(age),
meanWeight = mean(weight))
# load(url("http://www-huber.embl.de/users/klaus/BasicR/bodyfat.rda"))
# 1. create a new column in your data called "bmi" using this formula:
# bmi = (weight*0.454) / (height*.0254)^2
# 2. Is there a correlation between `bmi` & `wrist.circum` ?
# 3. create a new variable that indicates the category of each person's BMI.
# Here are the categories
# Underweight: BMI is 0 to 18.5.
# Normal weight: BMI is 18.5 to 24.9.
# Overweight: BMI is 25 to 29.9.
# Obese: BMI is 30 to 100
# Anything over 100 is very unusual
# To do this, check out the cut() function.
# If the built-in R help isn't clear, try googling: "R cut() examples"
# 4. what is the average wrist.circum for each bmi category?
# 5. do you have an NA in your data? can you figure out why?
#https://www.r-bloggers.com/from-continuous-to-categorical/
## ---- message = FALSE----------------------------------------------------
library(tidyverse)
library(psych)
library(factoextra)
library(plotly)
library(ComplexHeatmap)
library(viridis)
library(circlize)
## ------------------------------------------------------------------------
input_data <- readRDS("./data/pca_input_data.rds")
(input_data <- as_tibble(input_data))
## ------------------------------------------------------------------------
no_cells_per_well <-
group_by(input_data, well) %>%
dplyr::summarize(no_cells = sum(count))
no_cells_per_well
(data_with_sums <- left_join(input_data, no_cells_per_well))
data_for_PCA <- mutate(data_with_sums, perc = count / no_cells,
z_score = logit(perc))
data_for_PCA
## ------------------------------------------------------------------------
data_for_PCA_wide <- data_for_PCA %>%
dplyr::select(class, well, z_score) %>%
spread(key = class, value = z_score)
data_for_PCA_wide
## ----PCA-----------------------------------------------------------------
(PCA <- prcomp(data_for_PCA_wide[, -1], center = TRUE, scale. = TRUE))
summary(PCA)
genes <-
group_by(input_data, well) %>%
dplyr::summarize(gene = unique(Gene.Symbol))
genes <- ifelse(is.na(genes$gene), "empty", genes$gene)
(dataGG = tibble(PC1 = PCA$x[,1], PC2 = PCA$x[,2], PC3 = PCA$x[,3],
PC4 = PCA$x[,4], genes))
pl <- (ggplot(dataGG, aes(x = PC1, y = PC2, color = genes))
+ geom_text(aes(label = genes), size = I(2))
+ coord_fixed(ratio = (PCA$sdev^2)[2] / (PCA$sdev^2)[1])
+ ggtitle("Principal components plot")
)
pl
## ----var_imp-------------------------------------------------------------
(loadings <- PCA$rotation[, 1:2])
(loadings_gg <-
as.data.frame(loadings) %>%
rownames_to_column(var = "class") %>%
dplyr::select(class, PC1, PC2))
(loadings_gg <- gather(loadings_gg, key = "comp", value = "loading", PC1:PC2))
ggplot(loadings_gg, aes(x = class, y = loading, fill = class)) +
facet_wrap( ~ comp) +
geom_bar(stat = "identity", position = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_brewer(type = "qual", palette = "Set3")
## ----co_circle-----------------------------------------------------------
fviz_pca_var(PCA, col.circle = "black", title = "Correlation Circle for the PCA") + coord_equal()
## ------------------------------------------------------------------------
# prepare the data set, don't go through all the steps
cluster_input_long <-
data_for_PCA %>%
select(class, Well, Gene.Symbol, Group, count, no_cells, perc) %>%
filter(Group == "target") %>%
group_by(Gene.Symbol, class) %>%
slice(1) %>%
ungroup()
cluster_input_long
cluster_input_wide <-
select(cluster_input_long, class, Gene.Symbol, perc) %>%
spread(key = class, value = perc)
cluster_input_wide
cluster_input_wide_m <- as.matrix(cluster_input_wide[, -1])
rownames(cluster_input_wide_m) <- cluster_input_wide$Gene.Symbol
cluster_input_wide_m
## ------------------------------------------------------------------------
(m0 <- rbind(c(0, 0), c(0, 1), c(1, 1)))
dist(m0)
## ------------------------------------------------------------------------
m <- cluster_input_wide_m
d <- dist(m) # alternatively: run as.dist() on a matrix of precomputed distances
d
h <- hclust(d)
h
plot(h)
## ------------------------------------------------------------------------
# k to specify number of clusters
(clusts <- cutree(h, k = 2))
cutree(h, h = 0.4)
# tibble(gene = names(clusts), cluster = clusts) %>% group_by(cluster) %>% nest()
## ------------------------------------------------------------------------
Heatmap(m,
cluster_rows = FALSE,
cluster_columns = FALSE,
column_names_side = "top",
row_names_side = "left")
quantile(m)
my_cols <- plasma(20)
Heatmap(m,
col = colorRamp2(
c(seq(from = 0, to = 0.1, length.out = 20)), my_cols),
cluster_rows = FALSE,
cluster_columns = FALSE,
column_names_side = "top",
row_names_side = "left")
Heatmap(m,
col = colorRamp2(
c(seq(from = 0, to = 0.1, length.out = 20)), my_cols),
column_names_side = "top",
row_names_side = "left")
Heatmap(m,
col = colorRamp2(
c(seq(from = 0, to = 0.1, length.out = 20)), my_cols),
column_names_side = "top",
row_names_side = "left",
clustering_distance_columns = "spearman",
clustering_distance_rows = "spearman")
This diff is collapsed.
This diff is collapsed.
## ---- message = FALSE----------------------------------------------------
library(tidyverse)
library(ggbeeswarm)
## ---- eval = FALSE, include = FALSE--------------------------------------
## data("x")
## x
## ?x
## dim(x)
## exprs(x)
## class(exprs(x))
## exprs(x)[1:5, 1:5]
##
## selectedProbes <- c( Fgf4 = "1420085_at", Gata4 = "1418863_at",
## Gata6 = "1425463_at", Sox2 = "1416967_at")
##
## genes <- as_tibble(exprs(x), rownames = "probe")
##
## genes <- filter(genes, probe %in% selectedProbes) %>%
## gather(key = sample, value = value, -probe)
## genes
##
## genes$gene <- names(selectedProbes)[ match(genes$probe, selectedProbes) ]
##
## ##
##
## head(pData(x), n = 2)
## class(pData(x))
##
## (metadata <- as_tibble(pData(x), rownames = "rowname"))
##
## groups <- group_by(metadata, sampleGroup) %>%
## dplyr::summarise(n = n(), colour = unique(sampleColour))
## groups
## ------------------------------------------------------------------------
# brackets around an expression will evaluate it so you don't have to type
# cells again after assignment to see its contents
(cells <- read_delim("../data/ImagedCells.csv", delim = ";"))
## ---- eval = FALSE-------------------------------------------------------
## # not run: run in the console to see the output
## str(cells)
## glimpse(cells)
## head(cells)
## summary(cells)
## # View(cells)
## ------------------------------------------------------------------------
range(cells$cell_area)
max(cells$gfp_T0)
which.max(cells$gfp_T0)
quantile(cells$gfp_T0)
quantile(cells$gfp_T0, probs = c(0.01, 0.25, 0.75, 0.99))
unique(cells$Well)
n_distinct(cells$Well)
## ------------------------------------------------------------------------
table(cells$Well)
table(cells$Well, cells$treated)
## ------------------------------------------------------------------------
mean(cells$roundness)
median(cells$roundness)
cor(cells$cell_area, cells$gfp_T1, method = "spearman")
cor.test(cells$cell_area, cells$gfp_T1, method = "spearman")
## ------------------------------------------------------------------------
# make sure that the ggplot2 package is actually loaded: library(ggplot) or
# library(tidyverse)
ggplot(data = cells, aes(x = gfp_T0, y = mCh_T0))
## ------------------------------------------------------------------------
ggplot(data = cells, aes(x = gfp_T0, y = mCh_T0)) +
geom_point()
## ------------------------------------------------------------------------
p <- ggplot(data = cells, aes(x = gfp_T0, y = mCh_T0)) +
geom_point()
p
## ------------------------------------------------------------------------
p <- p + labs(x = "GFP at time point T0", y = "mCherry at time point T0",
title = "mCherry vs. GFP intensity")
p
## ------------------------------------------------------------------------
# units are in inches
ggsave(filename = "A_plot_has_been_born.pdf", plot = p, width = 5, height = 5)
## ------------------------------------------------------------------------
ggplot(data = cells, aes(x = gfp_T0, y = mCh_T0)) +
geom_point(aes(colour = treated)) +
labs(x = "GFP at time point T0", y = "mCherry at time point T0",
title = "mCherry vs. GFP intensity")
# any part of the plotting command can be saved in a variable:
my_label <- labs(x = "GFP at time point T0", y = "mCherry at time point T0",
title = "mCherry vs. GFP intensity")
ggplot(data = cells, aes(x = gfp_T0, y = mCh_T0)) +
geom_point(aes(colour = treated)) +
my_label
## ------------------------------------------------------------------------
ggplot(data = cells, aes(x = gfp_T0, y = mCh_T0)) +
geom_point(aes(colour = treated)) +
geom_smooth(method = "lm", se = FALSE) +
my_label
## ------------------------------------------------------------------------
ggplot(data = cells, aes(x = gfp_T0, y = mCh_T0)) +
geom_point(aes(colour = treated)) +
geom_smooth(aes(colour = treated), method = "lm", se = FALSE) +
my_label
# using inheritance
ggplot(data = cells, aes(x = gfp_T0, y = mCh_T0, colour = treated)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
my_label
## ------------------------------------------------------------------------
cells %>%
filter(treated == "yes") %>%
lm(formula = mCh_T0 ~ gfp_T0)
group_by(cells, treated) %>%
summarise(my_mod = list(lm(mCh_T0 ~ gfp_T0)))
## ------------------------------------------------------------------------
ggplot(cells, aes(x = treated)) +
geom_bar()
cells[1:6, ] %>%
ggplot(aes(x = cellID, y = gfp_T0)) +
geom_bar(stat = "identity")
## ------------------------------------------------------------------------
ggplot(cells, aes(x = Well, y = gfp_T0)) +
geom_boxplot()
## ------------------------------------------------------------------------
ggplot(cells, aes(x = Well, y = gfp_T0)) +
geom_violin()
ggplot(cells, aes(x = Well, y = gfp_T0)) +
geom_violin() +
geom_point(alpha = 0.5)
## ------------------------------------------------------------------------
# set height to 0!
ggplot(cells, aes(x = Well, y = gfp_T0)) +
geom_violin() +
geom_point(alpha = 0.5, position = position_jitter(width = 0.1, height = 0))
## ------------------------------------------------------------------------
ggplot(cells, aes(x = Well, y = gfp_T0)) +
geom_violin() +
ggbeeswarm::geom_beeswarm(cex = 2)
## ------------------------------------------------------------------------
ggplot(cells, aes(x = gfp_T0)) +
geom_histogram(bins = 20)
## ------------------------------------------------------------------------
ggplot(cells, aes(x = gfp_T0)) +
geom_histogram(aes(y = ..density..), bins = 20)
## ------------------------------------------------------------------------
ggplot(cells, aes(x = gfp_T0)) +
geom_histogram(aes(y = ..density..), bins = 20) +
geom_density()
ggplot(cells, aes(x = gfp_T0)) +
geom_histogram(aes(y = ..density..), bins = 20) +
geom_density(adjust = 4)
## ------------------------------------------------------------------------
ggplot(cells, aes(x = gfp_T0)) +
geom_freqpoly(bins = 20)
## ------------------------------------------------------------------------
ggplot(cells, aes(x = gfp_T0, colour = Well)) +
stat_ecdf()
## ------------------------------------------------------------------------
p <- ggplot(data = cells, aes(x = gfp_T0, y = mCh_T0))
p + geom_point(alpha = 0.2)