Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
10
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Bernd Klaus
stat_methods_bioinf
Commits
bc325de9
Commit
bc325de9
authored
Dec 06, 2017
by
Bernd Klaus
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
first final version, without slides
parent
80c7cb31
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
1883 additions
and
467 deletions
+1883
-467
.gitignore
.gitignore
+11
-0
factor_ana_testing_ml.R
factor_ana_testing_ml.R
+432
-36
factor_ana_testing_ml.Rmd
factor_ana_testing_ml.Rmd
+492
-156
factor_ana_testing_ml.html
factor_ana_testing_ml.html
+785
-253
get_citations_stat_methods_bioinf.R
get_citations_stat_methods_bioinf.R
+66
-15
stat_methods_bioinf.bib
stat_methods_bioinf.bib
+95
-6
stats_methods_packages.R
stats_methods_packages.R
+2
-1
No files found.
.gitignore
View file @
bc325de9
...
...
@@ -6,3 +6,14 @@ SRP022054/
rse_gene.Rdata
stockori_tmp.Rmd
factanal.R
compare_to_scLVM.R
scran_offset_zinbawave.R
clustering.pdf
clustering_org.pdf
clustering_cl_10.pdf
data/cl_mtec.RData
data/genes_for_clustering.RData
data/nomarkerCellsClustering.RData
data/zinb.RData
get_clustering_aljeandro.R
norm.pdf
factor_ana_testing_ml.R
View file @
bc325de9
...
...
@@ -4,14 +4,13 @@ options(digits=3, width=80)
golden_ratio
<-
(
1
+
sqrt
(
5
))
/
2
opts_chunk
$
set
(
echo
=
TRUE
,
tidy
=
FALSE
,
include
=
TRUE
,
dev
=
c
(
'png'
,
'pdf'
,
'svg'
),
fig.height
=
5
,
fig.width
=
4
*
golden_ratio
,
comment
=
' '
,
dpi
=
300
,
cache
=
TRUE
)
cache
=
TRUE
,
message
=
FALSE
)
## ---- echo=FALSE, cache=FALSE--------------------------------------------
print
(
date
())
## ----required_packages_and_data, echo = TRUE, cache=FALSE, message=FALSE----
library
(
"readxl"
)
library
(
"scran"
)
library
(
"BiocStyle"
)
library
(
"knitr"
)
library
(
"MASS"
)
...
...
@@ -21,29 +20,28 @@ library("pheatmap")
library
(
"matrixStats"
)
library
(
"purrr"
)
library
(
"readr"
)
library
(
"factoextra"
)
library
(
"magrittr"
)
library
(
"entropy"
)
library
(
"forcats"
)
library
(
"forcats"
)
library
(
"readxl"
)
library
(
"DESeq2"
)
library
(
"broom"
)
library
(
"locfit"
)
library
(
"recount"
)
library
(
"psych"
)
library
(
"vsn"
)
library
(
"matrixStats"
)
library
(
"pheatmap"
)
library
(
"tidyverse"
)
library
(
"Rtsne"
)
library
(
"limma"
)
library
(
"ggthemes"
)
library
(
"corpcor"
)
library
(
"sva"
)
library
(
"zinbwave"
)
library
(
"clusterExperiment"
)
library
(
"clue"
)
library
(
"sda"
)
library
(
"crossval"
)
library
(
"randomForest"
)
theme_set
(
theme_
gray
(
base_size
=
18
))
theme_set
(
theme_
solarized
(
base_size
=
18
))
data_dir
<-
file.path
(
"data
/
"
)
data_dir
<-
file.path
(
"data"
)
## ----import_data---------------------------------------------------------
...
...
@@ -52,13 +50,6 @@ load(file.path(data_dir, "mtec_cell_anno.RData"))
load
(
file.path
(
data_dir
,
"mtec_gene_anno.RData"
))
load
(
file.path
(
data_dir
,
"tras.RData"
))
## ----scran_norm----------------------------------------------------------
mtec_scran_sf
<-
tibble
(
scran_sf
=
computeSumFactors
(
as.matrix
(
mtec_counts
[,
-1
])),
cell_id
=
colnames
(
mtec_counts
)[
-1
])
mtec_cell_anno
<-
left_join
(
mtec_cell_anno
,
mtec_scran_sf
,
by
=
c
(
"cellID"
=
"cell_id"
))
## ----compute_pca---------------------------------------------------------
compute_pca
<-
function
(
data_mat
,
ntop
=
500
,
...
){
...
...
@@ -104,6 +95,17 @@ batch_pca_plot <- ggplot(batch_pca$pca, aes(x = PC1, y = PC2,
batch_pca_plot
ggplot
(
batch_pca
$
pca
,
aes
(
sex
,
PC2
,
color
=
sex
))
+
geom_jitter
(
height
=
0
,
width
=
0.1
)
+
ggtitle
(
"PC2 by sex"
)
+
geom_boxplot
(
alpha
=
0.1
)
+
scale_color_fivethirtyeight
()
## ----pc2ttest, results='hide', echo=FALSE--------------------------------
t.test
(
PC2
~
sex
,
data
=
(
batch_pca
$
pca
))
## ----rna_seq_pval--------------------------------------------------------
load
(
file.path
(
data_dir
,
"dds_batch.RData"
))
dds_batch
<-
DESeq
(
dds_batch
)
...
...
@@ -114,7 +116,8 @@ res_dds_batch
res_dds_batch
<-
as.data.frame
(
res_dds_batch
)
ggplot
(
res_dds_batch
,
aes
(
x
=
pvalue
))
+
geom_histogram
(
binwidth
=
0.05
)
geom_histogram
(
binwidth
=
0.025
,
boundary
=
0
)
+
ggtitle
(
"p-value histogram for simulated RNA-Seq data"
)
## ----simzscores----------------------------------------------------------
...
...
@@ -139,7 +142,8 @@ z <- get.random.zscore(200)
pv
<-
2
-
2
*
pnorm
(
abs
(
z
),
sd
=
sd.true
)
ggplot
(
tibble
(
pv
),
aes
(
x
=
pv
))
+
geom_histogram
(
aes
(
fill
=
I
(
"navyblue"
)))
+
geom_histogram
(
aes
(
fill
=
I
(
tableau_color_pal
(
'tableau10light'
)(
3
)[
1
])),
boundary
=
0
,
binwidth
=
0.025
)
+
xlab
(
"p-values"
)
+
ggtitle
(
"Histogram of p-values, correct null distribution"
)
...
...
@@ -147,16 +151,19 @@ ggplot(tibble(pv), aes(x = pv)) +
pv
<-
2
-
2
*
pnorm
(
abs
(
z
),
sd
=
2
)
ggplot
(
tibble
(
pv
),
aes
(
x
=
pv
))
+
geom_histogram
(
aes
(
fill
=
I
(
"coral2"
)))
+
geom_histogram
(
aes
(
fill
=
I
(
tableau_color_pal
(
'tableau10light'
)(
3
)[
2
])),
boundary
=
0
,
binwidth
=
0.025
)
+
xlab
(
"p-values"
)
+
ggtitle
(
"Histogram of p-values"
,
subtitle
=
"variance of null distribution too high"
)
## ----pvaluehistogramwrongnull2
---------------------------------
----------
## ----pvaluehistogramwrongnull2
, results="hide", fig.show="hide"
----------
pv
<-
2
-
2
*
pnorm
(
abs
(
z
),
sd
=
0.5
)
ggplot
(
tibble
(
pv
),
aes
(
x
=
pv
))
+
geom_histogram
(
aes
(
fill
=
I
(
"chartreuse4"
)))
+
geom_histogram
(
aes
(
fill
=
I
(
tableau_color_pal
(
'tableau10light'
)(
3
)[
3
])),
boundary
=
0
,
binwidth
=
0.025
)
+
xlab
(
"p-values"
)
+
ggtitle
(
"Histogram of p-values"
,
subtitle
=
"variance of null distribution too low"
)
...
...
@@ -200,13 +207,20 @@ clean_pca_plot <- ggplot(cleaned_data_pca$pca, aes(x = PC1, y = PC2,
color
=
condition
,
shape
=
sex
))
+
geom_point
(
size
=
4
)
+
ggtitle
(
"PC1 vs PC2, top variable genes"
)
+
ggtitle
(
"PC1 vs PC2,
cleaned data,
top variable genes"
)
+
labs
(
x
=
paste0
(
"PC1, VarExp:"
,
round
(
cleaned_data_pca
$
perc_var
[
1
],
4
)),
y
=
paste0
(
"PC2, VarExp:"
,
round
(
cleaned_data_pca
$
perc_var
[
2
],
4
)))
+
coord_fixed
(
ratio
=
sd_ratio
)
+
scale_colour_brewer
(
palette
=
"Paired"
)
clean_pca_plot
scale_colour_brewer
(
palette
=
"Set2"
)
clean_pca_plot
ggplot
(
cleaned_data_pca
$
pca
,
aes
(
sex
,
PC2
,
color
=
sex
))
+
geom_jitter
(
height
=
0
,
width
=
0.1
)
+
ggtitle
(
"PC2 by sex"
)
+
geom_boxplot
(
alpha
=
0.1
)
+
scale_color_fivethirtyeight
()
## ----chgdesign-----------------------------------------------------------
...
...
@@ -218,15 +232,397 @@ resultsNames(dds_batch_sex)
## ----pwrblocking---------------------------------------------------------
summary
(
res_sex
)
## ----runfactor-----------------------------------------------------------
factor_ana
<-
function
(
X
,
ntop
=
500
,
...
){
cen_scaled_input
<-
scale
(
t
(
X
))
pvars
<-
rowVars
(
cen_scaled_input
)
select
<-
order
(
pvars
,
decreasing
=
TRUE
)[
seq_len
(
min
(
ntop
,
length
(
pvars
)))]
cen_scaled_input
<-
cen_scaled_input
[
select
,
]
cor_matrix
<-
cor.shrink
(
cen_scaled_input
)
rld_batch_fac
<-
factanal
(
covmat
=
cor_matrix
,
factors
=
4
,
n.obs
=
ncol
(
cen_scaled_input
),
rotation
=
"none"
)
loadings
<-
rld_batch_fac
$
loadings
[,
1
:
4
]
factors
<-
as_data_frame
(
cen_scaled_input
%*%
solve
(
cor_matrix
,
loadings
))
%>%
add_column
(
...
)
return
(
factors
)
}
## ----runfactanal---------------------------------------------------------
factors_rld_batch
<-
factor_ana
(
rld_batch
,
ntop
=
500
,
sex
=
batch_pca
$
pca
$
sex
,
condition
=
batch_pca
$
pca
$
condition
,
gen
=
batch_pca
$
pca
$
gen
)
batch_fac_plot
<-
ggplot
(
factors_rld_batch
,
aes
(
x
=
Factor1
,
y
=
Factor2
,
color
=
condition
,
shape
=
sex
))
+
geom_point
(
size
=
4
)
+
ggtitle
(
"Factor 1 vs Factor 2, top 500 genes"
)
+
coord_fixed
(
ratio
=
sd_ratio
)
+
scale_colour_tableau
(
palette
=
"tableau10"
)
batch_fac_plot
## ----facvspca, results="hide", fig.show="hide"---------------------------
ggplot
(
factors_rld_batch
,
aes
(
sex
,
Factor2
,
color
=
sex
))
+
geom_jitter
(
height
=
0
,
width
=
0.1
)
+
ggtitle
(
"Factor2 by sex"
)
+
geom_boxplot
(
alpha
=
0.1
)
+
scale_color_fivethirtyeight
()
t.test
(
Factor2
~
sex
,
data
=
factors_rld_batch
)
ggplot
(
factors_rld_batch
,
aes
(
condition
,
Factor2
,
color
=
condition
))
+
geom_jitter
(
height
=
0
,
width
=
0.1
)
+
ggtitle
(
"Factor2 by condition"
)
+
geom_boxplot
(
alpha
=
0.1
)
+
scale_color_fivethirtyeight
()
t.test
(
Factor2
~
condition
,
data
=
factors_rld_batch
)
cor
(
as.numeric
(
batch_pca
$
pca
$
sex
),
factors_rld_batch
$
Factor2
)
cor
(
as.numeric
(
batch_pca
$
pca
$
condition
),
factors_rld_batch
$
Factor2
)
cor
(
as.numeric
(
factors_rld_batch
$
gen
),
factors_rld_batch
$
Factor4
)
cor
(
as.numeric
(
factors_rld_batch
$
gen
),
batch_pca
$
pca
$
PC1
)
## ----factordesign--------------------------------------------------------
colData
(
dds_batch
)
<-
cbind
(
colData
(
dds_batch
),
select
(
factors_rld_batch
,
1
:
4
))
design
(
dds_batch
)
<-
~
sex
+
Factor1
+
Factor3
+
Factor4
+
condition
dds_batch_f
<-
DESeq
(
dds_batch
)
res_f
<-
results
(
dds_batch_f
)
summary
(
res_f
)
summary
(
res_sex
)
## ----factorpcaoverlap, results='hide'------------------------------------
idx_joint
<-
(
res_f
$
padj
<
0.1
)
&
(
res_sex
$
padj
<
0.1
)
res_f
[
idx_joint
,
]
res_sex
[
idx_joint
,
]
res_f
[
setdiff
(
which
((
res_sex
$
padj
<
0.1
)),
which
(
idx_joint
)),
]
res_sex
[
setdiff
(
which
((
res_sex
$
padj
<
0.1
)),
which
(
idx_joint
))
,
]
res_f
[
setdiff
(
which
((
res_f
$
padj
<
0.1
)),
which
(
idx_joint
)),
]
res_sex
[
setdiff
(
which
((
res_f
$
padj
<
0.1
)),
which
(
idx_joint
))
,
]
## ----secondfactor--------------------------------------------------------
ggplot
(
factors_rld_batch
,
aes
(
condition
,
Factor2
,
color
=
condition
))
+
geom_jitter
(
height
=
0
,
width
=
0.1
)
+
ggtitle
(
"Factor2 by condition"
)
+
geom_boxplot
(
alpha
=
0.1
)
+
scale_color_fivethirtyeight
()
t.test
(
Factor2
~
condition
,
data
=
factors_rld_batch
)
## ----mult_reg_confounding------------------------------------------------
tidy
(
lm
(
cbind
(
as.numeric
(
colData
(
dds_batch
)
$
sex
),
as.numeric
(
colData
(
dds_batch
)
$
gen_back
))
~
as.numeric
(
colData
(
dds_batch
)
$
condition
)))
## ----sva_sim-------------------------------------------------------------
mod
<-
model.matrix
(
~
condition
,
data
=
colData
(
dds_batch
))
mod0
<-
model.matrix
(
~
1
,
data
=
colData
(
dds_batch
))
n.sv
<-
num.sv
(
rld_batch
,
mod
,
method
=
"be"
,
B
=
100
,
vfilter
=
500
)
sva_res
<-
sva
(
rld_batch
,
mod
,
mod0
,
n.sv
=
n.sv
,
vfilter
=
500
,
B
=
20
)
ggplot
(
data_frame
(
condition
=
factors_rld_batch
$
gen
,
sv
=
as.vector
(
sva_res
$
sv
)),
aes
(
condition
,
sv
,
color
=
condition
))
+
geom_jitter
(
height
=
0
,
width
=
0.1
)
+
ggtitle
(
"Surrogate variable by genetic background"
)
+
geom_boxplot
(
alpha
=
0.1
)
+
scale_color_fivethirtyeight
()
anova
(
lm
(
as.vector
(
sva_res
$
sv
)
~
factors_rld_batch
$
condition
))
## ----svdesign------------------------------------------------------------
colData
(
dds_batch
)
$
sv
<-
as.vector
(
sva_res
$
sv
)
design
(
dds_batch
)
<-
~
sv
+
condition
dds_batch_sva
<-
DESeq
(
dds_batch
)
res_sva
<-
results
(
dds_batch_sva
)
summary
(
res_sva
)
## ----runzinbawave--------------------------------------------------------
no_marker_cells
<-
filter
(
mtec_cell_anno
,
SurfaceMarker
==
"None"
)
count_matrix_nomarker
<-
as.matrix
(
mtec_counts
[,
no_marker_cells
$
cellID
])
rownames
(
count_matrix_nomarker
)
<-
mtec_counts
$
ensembl_id
#
# zinb <- zinbFit(count_matrix_nomarker,
# K=1, epsilon=1000)
# save(zinb,file = file.path(data_dir, "zinb.RData"))
## ----zinbanormcorrected--------------------------------------------------
load
(
file.path
(
data_dir
,
"zinb.RData"
))
mtec_norm
<-
zinbwave
(
SummarizedExperiment
(
count_matrix_nomarker
),
fitted_model
=
zinb
,
normalizedValues
=
TRUE
,
residuals
=
TRUE
)
# keep only the residuals in the summarized experiment
assay
(
mtec_norm
,
1
)
<-
NULL
assay
(
mtec_norm
,
1
)
<-
NULL
colData
(
mtec_norm
)
<-
DataFrame
(
no_marker_cells
)
rowData
(
mtec_norm
)
<-
DataFrame
(
filter
(
mtec_gene_anno
,
ensembl_id
%in%
rownames
(
mtec_norm
)))
mtec_norm
## ----checkscnorm---------------------------------------------------------
mtec_norm_tidy
<-
as.data.frame
(
assay
(
mtec_norm
))
%>%
{
colnames
(
.
)
=
colData
(
mtec_norm
)
$
cellID
.
[,
sample
(
ncol
(
mtec_norm
),
25
)]
}
%>%
gather
(
key
=
"cell"
,
value
=
"expression"
)
medians
<-
mtec_norm_tidy
%>%
group_by
(
cell
)
%>%
summarize
(
med
=
median
(
expression
))
%>%
arrange
(
med
)
mtec_norm_tidy
$
cell
%<>%
factor
(
levels
=
medians
$
cell
)
pl
<-
ggplot
(
mtec_norm_tidy
,
aes
(
cell
,
expression
,
color
=
cell
))
+
geom_boxplot
()
+
guides
(
color
=
FALSE
)
+
ggtitle
(
"zinbawave normalized/cleaned expression data [log]"
)
+
theme
(
axis.text.x
=
element_text
(
angle
=
45
,
hjust
=
1
))
pl
#ggsave("norm.pdf", pl, width = 28, height = 14)
## ----clusteringprep------------------------------------------------------
load
(
file.path
(
data_dir
,
"genes_for_clustering.RData"
))
load
(
file.path
(
data_dir
,
"nomarkerCellsClustering.RData"
))
mtec_norm_sub
<-
mtec_norm
[
genes_for_clustering
,
]
rowData
(
mtec_norm_sub
)
$
pub_cls
<-
cl_class_ids
(
nomarkerCellsClustering
[[
"consensus"
]])
leftover_genes
<-
rowData
(
mtec_norm_sub
)
$
pub_cls
==
12
data_for_cl
<-
assay
(
mtec_norm_sub
[
!
leftover_genes
,])
colnames
(
data_for_cl
)
<-
colData
(
mtec_norm_sub
)
$
cellID
## ----runscclustering, eval=FALSE-----------------------------------------
cl_mtec
<-
clusterSingle
(
t
(
data_for_cl
),
subsample
=
TRUE
,
isCount
=
FALSE
,
clusterLabel
=
"mtec_sub"
,
dimReduce
=
"PCA"
,
ndims
=
10
,
mainClusterArgs
=
list
(
minSize
=
10
,
clusterFunction
=
"pam"
,
clusterArgs
=
list
(
k
=
11
)),
subsampleArgs
=
list
(
clusterFunction
=
"pam"
,
clusterArgs
=
list
(
k
=
11
),
largeDataset
=
TRUE
,
random.seed
=
1234
,
ncores
=
1
,
resamp.num
=
10
))
# save(cl_mtec, file = file.path(data_dir, "cl_mtec.RData"))
## ----validateclustering--------------------------------------------------
load
(
file
=
file.path
(
data_dir
,
"cl_mtec.RData"
))
row_order
<-
tibble
(
genes
=
rownames
(
data_for_cl
),
clusters
=
as.numeric
(
primaryClusterNamed
(
cl_mtec
)),
clusters_org
=
rowData
(
mtec_norm_sub
)
$
pub_cls
[
rowData
(
mtec_norm_sub
)
$
pub_cls
!=
12
])
%>%
arrange
(
clusters
)
%>%
as.data.frame
(
.
)
%>%
{
rownames
(
.
)
=
.
$
genes
.
}
data_cor
<-
cor
(
t
(
data_for_cl
[
row_order
$
genes
,]))
br
<-
seq
(
-1
,
1
,
length.out
=
101
)
**
3
cols
<-
colorRampPalette
(
brewer.pal
(
9
,
"RdBu"
),
interpolate
=
"spline"
,
space
=
"Lab"
)(
100
)
ann_colors
=
list
(
clusters
=
tableau_color_pal
(
palette
=
"tableau20"
)(
11
),
clusters_org
=
tableau_color_pal
(
palette
=
"bluered12"
)(
11
)
)
pdf
(
"clustering.pdf"
,
width
=
14
,
height
=
14
,
title
=
"single cell clustering using zinbawave corrected data"
)
pheatmap
(
data_cor
,
color
=
cols
,
breaks
=
br
,
border_color
=
"grey60"
,
cellwidth
=
NA
,
cellheight
=
NA
,
scale
=
"none"
,
cluster_rows
=
FALSE
,
cluster_cols
=
FALSE
,
legend
=
TRUE
,
legend_labels
=
NA
,
annotation_row
=
row_order
[,
"clusters"
,
drop
=
FALSE
],
show_rownames
=
FALSE
,
show_colnames
=
FALSE
,
annotation_colors
=
ann_colors
)
dev.off
()
## ----heatorg, results='hide'---------------------------------------------
row_order
<-
arrange
(
row_order
,
clusters_org
)
%>%
{
rownames
(
.
)
=
.
$
genes
.
}
pdf
(
"clustering_org.pdf"
,
width
=
14
,
height
=
14
,
title
=
"original clustering using zinbawave corrected data"
)
pheatmap
(
data_cor
[
row_order
$
genes
[
1
:
1000
],
row_order
$
genes
[
1
:
1000
]],
color
=
cols
,
breaks
=
br
,
border_color
=
"grey60"
,
cellwidth
=
NA
,
cellheight
=
NA
,
scale
=
"none"
,
cluster_rows
=
FALSE
,
cluster_cols
=
FALSE
,
legend
=
TRUE
,
legend_labels
=
NA
,
annotation_row
=
row_order
[,
"clusters_org"
,
drop
=
FALSE
],
show_rownames
=
FALSE
,
show_colnames
=
FALSE
,
annotation_colors
=
ann_colors
)
dev.off
()
## ----cl10heat, results="hide"--------------------------------------------
idx_10
<-
which
(
row_order
$
clusters
==
10
)
pdf
(
"clustering_cl_10.pdf"
,
width
=
14
,
height
=
14
,
title
=
"single cell clustering using zinbawave corrected data"
)
pheatmap
(
data_cor
[
idx_10
,
idx_10
],
color
=
cols
,
breaks
=
br
,
border_color
=
"grey60"
,
cellwidth
=
NA
,
cellheight
=
NA
,
scale
=
"none"
,
cluster_rows
=
FALSE
,
cluster_cols
=
FALSE
,
legend
=
TRUE
,
legend_labels
=
NA
,
annotation_row
=
row_order
[,
"clusters"
,
drop
=
FALSE
],
show_rownames
=
FALSE
,
show_colnames
=
FALSE
,
annotation_colors
=
ann_colors
)
dev.off
()
## ----prepareDataMl-------------------------------------------------------
genes_clusters
<-
row_order
[
rownames
(
data_for_cl
),
]
%>%
mutate
(
labs
=
factor
(
ifelse
(
clusters
==
10
,
"cl10"
,
"other"
)))
class_priors
<-
prop.table
(
table
(
genes_clusters
$
labs
))
class_priors
## ----testRF--------------------------------------------------------------
rf_fit
<-
randomForest
(
x
=
data_for_cl
,
y
=
genes_clusters
$
labs
,
ntree
=
500
,
nodesize
=
5
,
mtry
=
floor
(
sqrt
(
ncol
(
data_for_cl
))),
classwt
=
class_priors
,
do.trace
=
100
)
rf_fit
$
confusion
# acc <- sum(rf_fit$confusion[, "class.error"] * class_priors)
# acc
## ----compareToRandom-----------------------------------------------------
random_cf
<-
ifelse
(
rbernoulli
(
nrow
(
data_for_cl
),
class_priors
[
1
]),
"cl10"
,
"other"
)
random_confusion
<-
table
(
random_cf
,
genes_clusters
$
labs
)
random_confusion
<-
cbind
(
random_confusion
,
c
(
random_confusion
[
"cl10"
,
"other"
]
/
sum
(
random_confusion
[
"cl10"
,
]),
random_confusion
[
"other"
,
"cl10"
]
/
sum
(
random_confusion
[
"other"
,
])))
colnames
(
random_confusion
)[
3
]
<-
"class.error"
random_confusion
## ----predfunForRF--------------------------------------------------------
predfun_rf
<-
function
(
train.x
,
train.y
,
test.x
,
test.y
,
negative
){
rf_fit
<-
randomForest
(
x
=
train.x
,
y
=
as.factor
(
train.y
),
ntree
=
500
,
nodesize
=
5
,
mtry
=
floor
(
sqrt
(
ncol
(
train.x
))),
classwt
=
class_priors
,
do.trace
=
FALSE
)
#browser()
ynew
<-
predict
(
rf_fit
,
test.x
)
conf
<-
table
(
ynew
,
test.y
)
err_rates
<-
c
(
conf
[
"cl10"
,
"other"
]
/
sum
(
conf
[
"cl10"
,
]),
conf
[
"other"
,
"cl10"
]
/
sum
(
conf
[
"other"
,
]))
names
(
err_rates
)
<-
c
(
"cl10"
,
"other"
)
return
(
err_rates
)
}
set.seed
(
7891
)
train_idx
<-
sample
(
nrow
(
data_for_cl
),
700
)
test_idx
<-
setdiff
(
seq_len
(
nrow
(
data_for_cl
)),
train_idx
)
predfun_rf
(
data_for_cl
[
train_idx
,],
genes_clusters
$
labs
[
train_idx
],
data_for_cl
[
test_idx
,
],
genes_clusters
$
labs
[
test_idx
])
## ----doCrossValForRF-----------------------------------------------------
set.seed
(
789
)
rf_out
<-
crossval
(
predfun_rf
,
X
=
data_for_cl
,
Y
=
genes_clusters
$
labs
,
K
=
5
,
B
=
10
,
negative
=
"other"
,
verbose
=
FALSE
)
cv_res
<-
as.data.frame
(
rf_out
$
stat.cv
)
%>%
rownames_to_column
(
var
=
"BF"
)
%>%
extract
(
col
=
BF
,
into
=
c
(
"rep"
,
"fold"
),
regex
=
"([[:alnum:]]+).([[:alnum:]]+)"
)
%>%
mutate_if
(
is.character
,
as_factor
)
%>%
gather
(
key
=
"class"
,
value
=
"pred_error"
,
cl10
,
other
)
cv_plot
<-
ggplot
(
cv_res
,
aes
(
x
=
rep
,
y
=
pred_error
,
color
=
class
))
+
geom_jitter
(
height
=
0
,
width
=
0.2
)
+
ggtitle
(
"CV prediction error across folds"
)
+
scale_color_tableau
()
cv_plot
## ----session_info, cache = FALSE-----------------------------------------
sessionInfo
()
## ----unloaAll, echo=FALSE, message=FALSE, eval = FALSE-------------------
##
##
pkgs <- loaded_packages() %>%
##
filter(package != "devtools") %>%
##
{.$path}
##
##
walk(pkgs, unload)
pkgs
<-
loaded_packages
()
%>%
filter
(
package
!=
"devtools"
)
%>%
{
.
$
path
}
walk
(
pkgs
,
unload
)
factor_ana_testing_ml.Rmd
View file @
bc325de9
This diff is collapsed.
Click to expand it.
factor_ana_testing_ml.html
View file @
bc325de9
This diff is collapsed.
Click to expand it.
get_citations_stat_methods_bioinf.R
View file @
bc325de9
...
...
@@ -2,6 +2,8 @@ library(knitcitations)
library
(
bibtex
)
# Kyewski and Klein, 2006
citep
(
"10.1146/annurev.immunol.23.021704.115601"
)
...
...
@@ -41,8 +43,6 @@ citep("10.1101/125112")
# Perraudeau et. al. , 2017
citep
(
"10.12688/f1000research.12122.1"
)
write.bibtex
(
file
=
"stat_methods_bioinf.bib"
)
knitcitations
::
cleanbib
()
# Nygaard et. al, 2015
citep
(
"10.1093/biostatistics/kxv027"
)
...
...
@@ -50,10 +50,41 @@ citep("10.1093/biostatistics/kxv027")
# Jaffe et. al., 2015