Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
stat_methods_bioinf
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Bernd Klaus
stat_methods_bioinf
Commits
c2e9e1ac
Commit
c2e9e1ac
authored
Aug 18, 2017
by
Bernd Klaus
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
started section on var stabilization
parent
9386b0c8
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
517 additions
and
286 deletions
+517
-286
graphics_bioinf.R
graphics_bioinf.R
+156
-56
graphics_bioinf.Rmd
graphics_bioinf.Rmd
+73
-12
graphics_bioinf.html
graphics_bioinf.html
+288
-218
No files found.
graphics_bioinf.R
View file @
c2e9e1ac
...
...
@@ -11,6 +11,7 @@ print(date())
## ----required_packages_and_data, echo = TRUE, cache=FALSE----------------
library
(
"readxl"
)
library
(
"scran"
)
library
(
"BiocStyle"
)
library
(
"knitr"
)
library
(
"tidyverse"
)
...
...
@@ -19,27 +20,20 @@ library("stringr")
library
(
"pheatmap"
)
library
(
"matrixStats"
)
library
(
"purrr"
)
library
(
"fdrtool"
)
library
(
"readr"
)
library
(
"gtools"
)
library
(
"factoextra"
)
library
(
"magrittr"
)
library
(
"entropy"
)
library
(
"forcats"
)
library
(
"plotly"
)
library
(
"corrplot"
)
library
(
"car"
)
library
(
"forcats"
)
library
(
"openxlsx"
)
library
(
"readxl"
)
library
(
"limma"
)
library
(
"Single.mTEC.Transcriptomes"
)
library
(
"DESeq2"
)
library
(
"tibble"
)
library
(
"broom"
)
library
(
"scran"
)
library
(
"locfit"
)
library
(
"recount"
)
library
(
"psych"
)
library
(
"vsn"
)
library
(
"matrixStats"
)
theme_set
(
theme_gray
(
base_size
=
18
))
...
...
@@ -53,43 +47,43 @@ data_dir <- file.path("data/")
## ----import_gene_expression, echo=FALSE, eval=FALSE----------------------
# Here we import the gene expression data using only the subset of highly
# variable genes, resave it
load
(
file.path
(
data_dir
,
"deGenesNone.RData"
))
load
(
file.path
(
data_dir
,
"mTECdxd.RData"
))
mtec_counts
<-
counts
(
dxd
)[
deGenesNone
,
]
mtec_counts
<-
as_tibble
(
rownames_to_column
(
as.data.frame
(
mtec_counts
),
var
=
"ensembl_id"
))
mtec_cell_anno
<-
as_tibble
(
as.data.frame
(
colData
(
dxd
)))
%>%
modify_if
(
.p
=
is.factor
,
as.character
)
data
(
"biotypes"
)
data
(
"geneNames"
)
mtec_gene_anno
<-
tibble
(
biotype
)
%>%
add_column
(
ensembl_id
=
names
(
biotype
),
gene_name
=
geneNames
,
.before
=
"biotype"
)
mtec_cell_anno
<-
colData
(
dxd
)
save
(
mtec_counts
,
file
=
file.path
(
data_dir
,
"mtec_counts.RData"
),
compress
=
"xz"
)
save
(
mtec_cell_anno
,
file
=
file.path
(
data_dir
,
"mtec_cell_anno.RData"
),
compress
=
"xz"
)
save
(
mtec_gene_anno
,
file
=
file.path
(
data_dir
,
"mtec_gene_anno.RData"
),
compress
=
"xz"
)
tras
<-
as_tibble
(
tras
)
save
(
tras
,
file
=
file.path
(
data_dir
,
"tras.RData"
),
compress
=
"xz"
)
##
# Here we import the gene expression data using only the subset of highly
##
# variable genes, resave it
##
load(file.path(data_dir, "deGenesNone.RData"))
##
load(file.path(data_dir, "mTECdxd.RData"))
##
##
mtec_counts <- counts(dxd)[deGenesNone, ]
##
##
mtec_counts <- as_tibble(rownames_to_column(as.data.frame(mtec_counts),
##
var = "ensembl_id"))
##
##
mtec_cell_anno <- as_tibble(as.data.frame(colData(dxd))) %>%
##
modify_if(.p = is.factor, as.character)
##
##
data("biotypes")
##
data("geneNames")
##
##
mtec_gene_anno <- tibble(biotype) %>%
##
add_column(ensembl_id = names(biotype),
##
gene_name = geneNames,
##
.before = "biotype")
##
##
mtec_cell_anno <- colData(dxd)
##
##
save(mtec_counts, file = file.path(data_dir, "mtec_counts.RData"),
##
compress = "xz")
##
##
save(mtec_cell_anno, file = file.path(data_dir, "mtec_cell_anno.RData"),
##
compress = "xz")
##
##
##
save(mtec_gene_anno, file = file.path(data_dir, "mtec_gene_anno.RData"),
##
compress = "xz")
##
##
tras <- as_tibble(tras)
##
##
save(tras, file = file.path(data_dir, "tras.RData"),
##
compress = "xz")
## ----import_data---------------------------------------------------------
load
(
file.path
(
data_dir
,
"mtec_counts.RData"
))
...
...
@@ -103,20 +97,20 @@ mtec_counts
## ----tidy_count----------------------------------------------------------
mtec_counts_tidy
<-
gather
(
mtec_counts
,
key
=
"cell_id"
,
value
=
"count"
,
-
ensembl_id
)
%>%
mutate
(
is_tra
=
ensembl_id
%in%
tras
$
gene.ids
,
dplyr
::
mutate
(
is_tra
=
ensembl_id
%in%
tras
$
gene.ids
,
is_detected
=
count
>
0
)
%>%
left_join
(
mtec_cell_anno
,
by
=
c
(
"cell_id"
=
"cellID"
))
## ----tra_per_cell--------------------------------------------------------
tra_detected
<-
filter
(
mtec_counts_tidy
,
is_detected
==
TRUE
,
tra_detected
<-
dplyr
::
filter
(
mtec_counts_tidy
,
is_detected
==
TRUE
,
SurfaceMarker
==
"None"
)
%>%
mutate
(
is_tra
=
ifelse
(
is_tra
,
"tra"
,
"not_a_tra"
))
%>%
dplyr
::
mutate
(
is_tra
=
ifelse
(
is_tra
,
"tra"
,
"not_a_tra"
))
%>%
group_by
(
cell_id
,
is_tra
)
%>%
tally
()
%>%
spread
(
key
=
is_tra
,
value
=
n
)
%>%
mutate
(
total_detected
=
sum
(
tra
,
not_a_tra
))
dplyr
::
mutate
(
total_detected
=
sum
(
tra
,
not_a_tra
))
tra_detected
...
...
@@ -173,7 +167,10 @@ dataL$locFit <- predict(locfit(y~lp(a, nn=0.5, deg=1), data=dataL),
## ----import_crc----------------------------------------------------------
download_study
(
"SRP022054"
,
type
=
"rse-gene"
)
if
(
!
file.exists
(
"SRP022054/rse_gene.Rdata"
)){
download_study
(
"SRP022054"
,
type
=
"rse-gene"
)
}
load
(
file.path
(
"SRP022054"
,
"rse_gene.Rdata"
))
crc_data
<-
rse_gene
crc_data
...
...
@@ -197,14 +194,117 @@ text(62.5,82.5,"colData")
counts_crc
<-
assay
(
crc_data
)
counts_crc
[
1
:
5
,
1
:
5
]
counts_p4_meta
<-
counts_crc
[,
c
(
"SRR837829"
,
"SRR837847"
)]
View
(
counts_p4_meta
)
## ----crc_filtering-------------------------------------------------------
counts_crc
<-
counts_crc
[
rowMeans
(
counts_crc
)
>=
5
,
]
dim
(
counts_crc
)
## ----pre_crc_genes-------------------------------------------------------
colnames
(
colData
(
crc_data
))
colData
(
crc_data
)[
1
:
5
,
c
(
"title"
,
"
characteristics"
,
"
mapped_read_count"
)]
tail
(
colnames
(
colData
(
crc_data
)
))
colData
(
crc_data
)[
1
:
5
,
c
(
"title"
,
"mapped_read_count"
)]
nrow
(
colData
(
crc_data
))
## ----create_crc_col_data-------------------------------------------------
col_data_crc
<-
select
(
as.data.frame
(
colData
(
crc_data
)),
title
,
mapped_read_count
)
%>%
rownames_to_column
(
var
=
"sample_id"
)
%>%
as_tibble
()
%>%
tidyr
::
extract
(
title
,
into
=
c
(
"quantification"
,
"patient"
,
"tissue"
),
regex
=
"([[:alnum:]]+)_([[:alnum:]]+)_([[:alnum:]]+)"
)
%>%
dplyr
::
filter
(
quantification
==
"mRNA"
)
## ----mrna_counts---------------------------------------------------------
counts_crc_tidy
<-
rownames_to_column
(
data.frame
(
counts_crc
),
var
=
"ensembl_id"
)
%>%
as_tibble
()
%>%
gather
(
key
=
"sample_id"
,
value
=
"count"
,
-
ensembl_id
)
%>%
dplyr
::
filter
(
sample_id
%in%
col_data_crc
$
sample_id
)
sample_medians
<-
group_by
(
counts_crc_tidy
,
sample_id
)
%>%
dplyr
::
filter
(
count
>
0
)
%>%
summarize
(
sample_median
=
median
(
log2
(
count
)))
counts_crc_tidy
<-
left_join
(
counts_crc_tidy
,
sample_medians
,
by
=
c
(
"sample_id"
=
"sample_id"
))
%>%
dplyr
::
arrange
(
sample_median
)
%>%
dplyr
::
mutate
(
sample_id_by_median
=
as_factor
(
sample_id
))
%>%
left_join
(
col_data_crc
)
count_boxplot
<-
ggplot
(
counts_crc_tidy
,
aes
(
x
=
sample_id_by_median
,
y
=
log2
(
count
),
fill
=
sample_id
)
)
+
geom_boxplot
()
+
ylim
(
c
(
0
,
10
))
+
scale_fill_brewer
(
palette
=
"Paired"
)
+
theme
(
axis.text.x
=
element_text
(
angle
=
90
,
hjust
=
1
))
count_boxplot
## ----exp_boxplot---------------------------------------------------------
count_boxplot_tissue
<-
ggplot
(
counts_crc_tidy
,
aes
(
x
=
sample_id_by_median
,
y
=
log2
(
count
),
fill
=
tissue
))
+
geom_boxplot
()
+
ylim
(
c
(
0
,
10
))
+
theme
(
axis.text.x
=
element_text
(
angle
=
90
,
hjust
=
1
))
count_boxplot_tissue
count_boxplot_tissue
+
facet_grid
(
patient
~
.
)
## ----calcsf--------------------------------------------------------------
crc_sf
<-
estimateSizeFactorsForMatrix
(
counts_crc
[,
col_data_crc
$
sample_id
])
sample_medians_sf
<-
left_join
(
sample_medians
,
tibble
(
sample_id
=
names
(
crc_sf
),
sf
=
crc_sf
))
%>%
mutate
(
s_med_count_scaled
=
2
^
sample_median
/
geometric.mean
(
2
^
sample_median
))
%>%
dplyr
::
arrange
(
sf
)
ggplot
(
sample_medians_sf
,
aes
(
x
=
s_med_count_scaled
,
y
=
sf
))
+
ggtitle
(
"Size factors versus scaled median"
)
+
geom_point
()
counts_crc_tidy
<-
left_join
(
counts_crc_tidy
,
dplyr
::
select
(
sample_medians_sf
,
sample_id
,
sf
))
## ----norm_data-----------------------------------------------------------
counts_crc_tidy
<-
mutate
(
counts_crc_tidy
,
count_norm
=
count
/
sf
)
## ----data_norm_plot------------------------------------------------------
## ----scran_norm----------------------------------------------------------
mtec_scran_sf
<-
tibble
(
scran_sf
=
computeSumFactors
(
as.matrix
(
mtec_counts
[,
-1
])),
cell_id
=
colnames
(
mtec_counts
)[
-1
])
mtec_cell_anno
<-
left_join
(
mtec_cell_anno
,
mtec_scran_sf
,
by
=
c
(
"cellID"
=
"cell_id"
))
## ----compar_sf-----------------------------------------------------------
ggplot
(
mtec_cell_anno
,
aes
(
x
=
sizeFactor
,
y
=
scran_sf
))
+
ggtitle
(
"SCRAN vs. classical size factors"
)
+
geom_point
()
+
geom_smooth
()
## ----createNormCounts----------------------------------------------------
counts_norm_crc_mat
<-
select
(
counts_crc_tidy
,
sample_id
,
ensembl_id
,
count_norm
)
%>%
spread
(
key
=
sample_id
,
value
=
count_norm
)
%>%
{
tmp
<-
as.data.frame
(
.
)
rownames
(
tmp
)
<-
tmp
$
ensembl_id
tmp
}
%>%
select
(
-
ensembl_id
)
%>%
as.matrix
()
meanSdPlot
(
counts_norm_crc_mat
)
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo
()
graphics_bioinf.Rmd
View file @
c2e9e1ac
...
...
@@ -48,27 +48,20 @@ library("stringr")
library
(
"pheatmap"
)
library
(
"matrixStats"
)
library
(
"purrr"
)
library
(
"fdrtool"
)
library
(
"readr"
)
library
(
"gtools"
)
library
(
"factoextra"
)
library
(
"magrittr"
)
library
(
"entropy"
)
library
(
"forcats"
)
library
(
"plotly"
)
library
(
"corrplot"
)
library
(
"car"
)
library
(
"forcats"
)
library
(
"openxlsx"
)
library
(
"readxl"
)
library
(
"limma"
)
library
(
"Single.mTEC.Transcriptomes"
)
library
(
"DESeq2"
)
library
(
"tibble"
)
library
(
"broom"
)
library
(
"locfit"
)
library
(
"recount"
)
library
(
"psych"
)
library
(
"vsn"
)
library
(
"matrixStats"
)
theme_set
(
theme_gray
(
base_size
=
18
))
...
...
@@ -466,6 +459,12 @@ counts_crc <- assay(crc_data)
counts_crc
[
1
:
5
,
1
:
5
]
```
We
filter
all
the
features
that
have
less
than
5
counts
on
average
:
```{
r
crc_filtering
}
counts_crc
<-
counts_crc
[
rowMeans
(
counts_crc
)
>=
5
,
]
dim
(
counts_crc
)
```
The
annotation
for
the
genes
looks
like
this
:
...
...
@@ -554,7 +553,7 @@ will be sufficient for this data set.
##
Exercise
:
Adapt
the
boxplot
Experiment
with
the
color
variable
and
try
to
create
faceted
plots
using
`
r
facet_wrap
()`
or
`
r
facet_grid
()`
:
Do
you
see
a
pattern
if
you
color
/
wrap
the
`
facet_wrap
()`
or
`
facet_grid
()`
:
Do
you
see
a
pattern
if
you
color
/
wrap
the
boxplots
by
tissue
and
or
patient
?
...
...
@@ -616,7 +615,22 @@ We can see tha the size factors computed are the scaled medians
on
the
raw
count
scale
,
however
they
are
not
identical
.
margin
^[
Note
that
it
is
nor
recommended
to
scale
sequencing
libraries
by
the
total
library
size
:
it
is
not
robust
and
artificially
limits
the
range
of
the
data
to
0
--
1
]
of
the
data
to
0
--
1
].
We
can
now
normalize
the
data
:
```{
r
norm_data
}
counts_crc_tidy
<-
mutate
(
counts_crc_tidy
,
count_norm
=
count
/
sf
)
```
##
Exercise
:
Data
normalization
Normalize
the
data
using
the
computed
size
factors
and
create
boxplots
of
the
normalized
data
```{
r
data_norm_plot
}
```
##
Size
factors
for
single
cell
data
...
...
@@ -662,7 +676,54 @@ ggplot(mtec_cell_anno, aes(x = sizeFactor, y = scran_sf)) +
#
Variance
stabilization
##
Confounding
factors
The
variance
of
count
data
depends
on
the
its
mean
,
i
.
e
.
the
higher
the
counts
,
the
higher
their
variance
.
This
is
readily
seen
in
a
plot
of
variance
against
the
mean
.
We
use
the
function
`
meanSdPlot
`
from
the
`
r
Biocpkg
(
"vsn"
)
`
package
to
create
such
a
plot
for
the
normalized
RNA
--
Seq
data
.
This
function
accepts
an
expression
matrix
as
an
input
,
so
we
need
to
turn
our
tidy
data
table
into
an
expression
matrix
first
.
The
complicated
code
in
the
middle
is
need
to
assign
the
colum
`
ensembl_id
`
as
rownames
.
Here
,
the
dot
`.`
refers
to
the
current
data
.
```{
r
createNormCounts
}
counts_norm_crc_mat
<-
select
(
counts_crc_tidy
,
sample_id
,
ensembl_id
,
count_norm
)
%>%
spread
(
key
=
sample_id
,
value
=
count_norm
)
%>%
{
tmp
<-
as
.
data
.
frame
(.)
rownames
(
tmp
)
<-
tmp
$
ensembl_id
tmp
}
%>%
select
(-
ensembl_id
)
%>%
as
.
matrix
()
meanSdPlot
(
counts_norm_crc_mat
)
```
overdispersion
,
i
.
e
.
the
variance
is
greater
than
than
the
mean
.
In
fact
,
it
is
known
that
a
standard
Poisson
model
can
only
account
for
the
technical
noise
in
RNA
--
Seq
data
.
In
the
Poisson
model
the
variance
is
equal
to
the
mean
,
while
in
RNA
--
Seq
data
the
variance
is
greater
than
the
mean
.
A
popular
way
to
model
this
is
to
use
a
Negative
--
Binomial
-
distribution
(
NB
),
which
includes
an
additional
parameter
dispersion
parameter
$\
alpha
$
such
that
$
E
(
NB
)
=
\
mu
$
and
\
begin
{
gather
*}
\
text
{
Var
[
NB
($\
mu
$,
$\
alpha
$)]}
=
\
mu
+
\
alpha
\
cdot
\
mu
^
2
\
end
{
gather
*}
Hence
,
the
variance
is
greater
than
the
mean
.
\
Biocpkg
{
DESeq2
}
uses
the
NB
model
and
fits
dispersion
values
(
see
below
).
#
Confounding
factors
,
Testing
,
Machine
Learning
,
move
to
new
doc
!
*
ZINBA
Wave
...
...
graphics_bioinf.html
View file @
c2e9e1ac
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment