Commit 57dbc021 authored by Jakob Wirbel's avatar Jakob Wirbel
Browse files

enforce tab limit for bioconductor.

parent c1cd12c5
......@@ -118,7 +118,7 @@ check.assoc <- function(object){
# mult.corr
if (!object$assoc.param$mult.corr %in%
c("holm", "hochberg", "hommel", "bonferroni",
"BH", "BY", "fdr", "none")){
"BH", "BY", "fdr", "none")){
msg<-'Multiple testing correction method not valid!'
errors <- c(errors, msg)
}
......
......@@ -65,8 +65,8 @@ library("curatedMetagenomciData")
The data are part of the `combined_metadata`
```{r get_metadata, eval=FALSE}
meta.nielsen.full <- combined_metadata %>%
filter(dataset_name=='NielsenHB_2014')
meta.nielsen.full <- combined_metadata %>%
filter(dataset_name=='NielsenHB_2014')
```
One thing we have to keep in mind are repeated samples per subject (see also
......@@ -87,25 +87,25 @@ first visit for each subject.
```{r clean_metadata_1, eval=FALSE}
meta.nielsen <- meta.nielsen.full %>%
select(sampleID, subjectID, study_condition, disease_subtype,
disease, age, country, number_reads, median_read_length, BMI) %>%
mutate(visit=str_extract(sampleID, '_[0-9]+$')) %>%
mutate(visit=str_remove(visit, '_')) %>%
mutate(visit=as.numeric(visit)) %>%
mutate(visit=case_when(is.na(visit)~0, TRUE~visit)) %>%
group_by(subjectID) %>%
filter(visit==min(visit)) %>%
ungroup() %>%
mutate(Sample_ID=sampleID) %>%
mutate(Group=case_when(disease=='healthy'~'CTR',
TRUE~disease_subtype))
select(sampleID, subjectID, study_condition, disease_subtype,
disease, age, country, number_reads, median_read_length, BMI) %>%
mutate(visit=str_extract(sampleID, '_[0-9]+$')) %>%
mutate(visit=str_remove(visit, '_')) %>%
mutate(visit=as.numeric(visit)) %>%
mutate(visit=case_when(is.na(visit)~0, TRUE~visit)) %>%
group_by(subjectID) %>%
filter(visit==min(visit)) %>%
ungroup() %>%
mutate(Sample_ID=sampleID) %>%
mutate(Group=case_when(disease=='healthy'~'CTR',
TRUE~disease_subtype))
```
Now, we can restrict our metadata to samples with `UC` and healthy control
samples:
```{r clean_metadata_4, eval=FALSE}
meta.nielsen <- meta.nielsen %>%
filter(Group %in% c('UC', 'CTR'))
filter(Group %in% c('UC', 'CTR'))
```
As a last step, we can adjust the column names for the metadata so that they
......@@ -115,7 +115,7 @@ and features.
```{r clean_metadata_3, eval=FALSE}
meta.nielsen <- meta.nielsen %>%
mutate(Country=country)
mutate(Country=country)
meta.nielsen <- as.data.frame(meta.nielsen)
rownames(meta.nielsen) <- meta.nielsen$sampleID
```
......@@ -163,17 +163,17 @@ data.location <- 'https://www.embl.de/download/zeller/metaHIT/'
meta.nielsen <- read_tsv(paste0(data.location, 'meta_Nielsen.tsv'))
# also here, we have to remove repeated samplings and CD samples
meta.nielsen <- meta.nielsen %>%
filter(Group %in% c('CTR', 'UC')) %>%
group_by(Individual_ID) %>%
filter(Sampling_day==min(Sampling_day)) %>%
ungroup() %>%
as.data.frame()
filter(Group %in% c('CTR', 'UC')) %>%
group_by(Individual_ID) %>%
filter(Sampling_day==min(Sampling_day)) %>%
ungroup() %>%
as.data.frame()
rownames(meta.nielsen) <- meta.nielsen$Sample_ID
## features
feat <- read.table(paste0(data.location, 'metaHIT_motus.tsv'),
stringsAsFactors = FALSE, sep='\t',
check.names = FALSE, quote = '', comment.char = '')
stringsAsFactors = FALSE, sep='\t',
check.names = FALSE, quote = '', comment.char = '')
feat <- feat[,colSums(feat) > 0]
feat <- prop.table(as.matrix(feat), 2)
```
......@@ -202,10 +202,10 @@ Now, we can filter feature with low overall abundance and prevalence.
```{r feature_filtering}
sc.obj <- filter.features(sc.obj, cutoff=1e-04,
filter.method = 'abundance')
filter.method = 'abundance')
sc.obj <- filter.features(sc.obj, cutoff=0.05,
filter.method='prevalence',
feature.type = 'filtered')
filter.method='prevalence',
feature.type = 'filtered')
```
## Association Plot
......@@ -215,11 +215,10 @@ metrics of association (such as generalized fold change and single-feautre
AUROC).
```{r assoc_plot, message=FALSE, warning=FALSE}
sc.obj <- check.associations(sc.obj, detect.lim = 1e-06,
alpha=0.1, max.show = 20,
plot.type = 'quantile.rect',
panels = c('fc'),
fn.plot = './association_plot_nielsen.pdf')
sc.obj <- check.associations(sc.obj, detect.lim = 1e-06, alpha=0.1,
max.show = 20,plot.type = 'quantile.rect',
panels = c('fc'),
fn.plot = './association_plot_nielsen.pdf')
```
![](./association_plot_nielsen.png)
......@@ -253,7 +252,7 @@ contains the following steps:
```{r ml_workflow, eval=FALSE}
sc.obj <- normalize.features(sc.obj, norm.method = 'log.std',
norm.param = list(log.n0=1e-06, sd.min.q=0))
norm.param = list(log.n0=1e-06, sd.min.q=0))
## Features normalized successfully.
sc.obj <- create.data.split(sc.obj, num.folds = 5, num.resample = 5)
## Features splitted for cross-validation successfully.
......@@ -292,8 +291,8 @@ the selected features.
```{r model_interpretation_plot, eval=FALSE}
model.interpretation.plot(sc.obj, consens.thres = 0.8,
fn.plot = './interpretation_nielsen.pdf')
## Successfully plotted model interpretation plot to: ./interpretation_nielsen.pdf
fn.plot = './interpret_nielsen.pdf')
## Successfully plotted model interpretation plot to: ./interpret_nielsen.pdf
```
![](./interpretation_nielsen.png)
......@@ -315,12 +314,12 @@ Danish controls:
```{r conf_start}
sc.obj.full <- siamcat(feat=feat, meta=meta.nielsen,
label='Group', case='UC')
label='Group', case='UC')
sc.obj.full <- filter.features(sc.obj.full, cutoff=1e-04,
filter.method = 'abundance')
filter.method = 'abundance')
sc.obj.full <- filter.features(sc.obj.full, cutoff=0.05,
filter.method='prevalence',
feature.type = 'filtered')
filter.method='prevalence',
feature.type = 'filtered')
```
The confounder plot would show us that the meta-variable "country" might be
......@@ -337,10 +336,10 @@ First, we can use `SIAMCAT` to test for associations including the Danish
samples.
```{r assoc_plot_2, warning=FALSE, message=FALSE}
sc.obj.full <- check.associations(sc.obj.full, detect.lim = 1e-06,
alpha=0.1, max.show = 20,
plot.type = 'quantile.rect',
fn.plot = './association_plot_dnk.pdf')
sc.obj.full <- check.associations(sc.obj.full, detect.lim = 1e-06, alpha=0.1,
max.show = 20,
plot.type = 'quantile.rect',
fn.plot = './association_plot_dnk.pdf')
```
Confounders can lead to biases in association testing. After using `SIAMCAT` to
......@@ -357,16 +356,16 @@ assoc.sp_dnk$species <- rownames(assoc.sp_dnk)
df.plot <- full_join(assoc.sp, assoc.sp_dnk, by='species')
df.plot %>%
mutate(highlight=str_detect(species, 'formicigenerans')) %>%
ggplot(aes(x=-log10(p.adj.x), y=-log10(p.adj.y), col=highlight)) +
mutate(highlight=str_detect(species, 'formicigenerans')) %>%
ggplot(aes(x=-log10(p.adj.x), y=-log10(p.adj.y), col=highlight)) +
geom_point(alpha=0.3) +
xlab('Spanish samples only\n-log10(q)') +
ylab('Spanish and Danish samples only\n-log10(q)') +
theme_classic() +
theme(panel.grid.major = element_line(colour='lightgrey'),
aspect.ratio = 1.3) +
scale_colour_manual(values=c('darkgrey', '#D41645'), guide=FALSE) +
annotate('text', x=0.7, y=8, label='Dorea formicigenerans')
xlab('Spanish samples only\n-log10(q)') +
ylab('Spanish and Danish samples only\n-log10(q)') +
theme_classic() +
theme(panel.grid.major = element_line(colour='lightgrey'),
aspect.ratio = 1.3) +
scale_colour_manual(values=c('darkgrey', '#D41645'), guide=FALSE) +
annotate('text', x=0.7, y=8, label='Dorea formicigenerans')
```
This result shows that several species are only signficant if the Danish
......@@ -383,23 +382,23 @@ label.dnk <- label(sc.obj.full)$label
country <- meta(sc.obj.full)$Country
names(country) <- rownames(meta(sc.obj.full))
df.plot <- tibble(dorea=log10(feat.dnk[str_detect(rownames(feat.dnk),
'formicigenerans'),
names(label.dnk)] + 1e-05),
label=label.dnk, country=country) %>%
mutate(label=case_when(label=='-1'~'CTR', TRUE~"UC")) %>%
mutate(x_value=paste0(country, '_', label))
df.plot <- tibble(dorea=log10(feat.dnk[
str_detect(rownames(feat.dnk),'formicigenerans'),
names(label.dnk)] + 1e-05),
label=label.dnk, country=country) %>%
mutate(label=case_when(label=='-1'~'CTR', TRUE~"UC")) %>%
mutate(x_value=paste0(country, '_', label))
df.plot %>%
ggplot(aes(x=x_value, y=dorea)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.08, stroke=0, alpha=0.2) +
theme_classic() +
xlab('') +
ylab("log10(Dorea formicigenerans)") +
stat_compare_means(comparisons = list(c('DNK_CTR', 'ESP_CTR'),
c('DNK_CTR', 'ESP_UC'),
c('ESP_CTR', 'ESP_UC')))
ggplot(aes(x=x_value, y=dorea)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.08, stroke=0, alpha=0.2) +
theme_classic() +
xlab('') +
ylab("log10(Dorea formicigenerans)") +
stat_compare_means(comparisons = list(c('DNK_CTR', 'ESP_CTR'),
c('DNK_CTR', 'ESP_UC'),
c('ESP_CTR', 'ESP_UC')))
```
......@@ -410,7 +409,7 @@ differences between countries, leading to exaggerated performance estimates.
```{r ml_workflow_dnk, eval=FALSE}
sc.obj.full <- normalize.features(sc.obj.full, norm.method = 'log.std',
norm.param = list(log.n0=1e-06, sd.min.q=0))
norm.param = list(log.n0=1e-06, sd.min.q=0))
## Features normalized successfully.
sc.obj.full <- create.data.split(sc.obj.full, num.folds = 5, num.resample = 5)
## Features splitted for cross-validation successfully.
......@@ -430,8 +429,8 @@ samples can be very large.
```{r eval_plot_comp, eval=FALSE}
model.evaluation.plot("Spanish samples only"=sc.obj,
"Danish and Spanish samples"=sc.obj.full,
fn.plot = './eval_plot_dnk.pdf')
"Danish and Spanish samples"=sc.obj.full,
fn.plot = './eval_plot_dnk.pdf')
## Plotted evaluation of predictions successfully to: ./eval_plot_dnk.pdf
```
......@@ -447,16 +446,16 @@ almost perfect accuracy.
meta.nielsen.country <- meta.nielsen[meta.nielsen$Group=='CTR',]
sc.obj.country <- siamcat(feat=feat, meta=meta.nielsen.country,
label='Country', case='ESP')
label='Country', case='ESP')
sc.obj.country <- filter.features(sc.obj.country, cutoff=1e-04,
filter.method = 'abundance')
filter.method = 'abundance')
sc.obj.country <- filter.features(sc.obj.country, cutoff=0.05,
filter.method='prevalence',
feature.type = 'filtered')
filter.method='prevalence',
feature.type = 'filtered')
sc.obj.country <- normalize.features(sc.obj.country, norm.method = 'log.std',
norm.param = list(log.n0=1e-06,
sd.min.q=0))
sc.obj.country <- create.data.split(sc.obj.country,
norm.param = list(log.n0=1e-06,
sd.min.q=0))
sc.obj.country <- create.data.split(sc.obj.country,
num.folds = 5, num.resample = 5)
sc.obj.country <- train.model(sc.obj.country, method='lasso')
sc.obj.country <- make.predictions(sc.obj.country)
......
......@@ -53,8 +53,8 @@ datasets <- c('metaHIT', 'Lewis_2015', 'He_2017', 'Franzosa_2019', 'HMP2')
meta.all <- read_tsv(paste0(data.location, 'CD_meta/meta_all.tsv'))
# features
feat <- read.table(paste0(data.location, 'CD_meta/feat_genus.tsv'),
check.names = FALSE, stringsAsFactors = FALSE, quote = '',
sep='\t')
check.names = FALSE, stringsAsFactors = FALSE, quote = '',
sep='\t')
feat <- as.matrix(feat)
# check that metadata and features agree
stopifnot(all(colnames(feat) == meta.all$Sample_ID))
......@@ -75,9 +75,9 @@ each individual.
```{r meta_dereplicate}
meta.ind <- meta.all %>%
group_by(Individual_ID) %>%
filter(Timepoint==min(Timepoint)) %>%
ungroup()
group_by(Individual_ID) %>%
filter(Timepoint==min(Timepoint)) %>%
ungroup()
```
# Compare Associations
......@@ -90,26 +90,23 @@ To test for associations, we can encapsulate each dataset into a different
```{r compute_associations, message=FALSE, warning=FALSE}
assoc.list <- list()
for (d in datasets){
# filter metadata and convert to dataframe
meta.train <- meta.ind %>%
filter(Study==d) %>%
as.data.frame()
rownames(meta.train) <- meta.train$Sample_ID
# create SIAMCAT object
sc.obj <- siamcat(feat=feat, meta=meta.train, label='Group', case='CD')
# test for associations
sc.obj <- check.associations(sc.obj, detect.lim = 1e-05,
feature.type = 'original',
fn.plot = paste0('./assoc_plot_',
d, '.pdf'))
# extract the associations and save them in the assoc.list
temp <- associations(sc.obj)
temp$genus <- rownames(temp)
assoc.list[[d]] <- temp %>%
select(genus, fc, auc, p.adj) %>%
mutate(Study=d)
# filter metadata and convert to dataframe
meta.train <- meta.ind %>%
filter(Study==d) %>%
as.data.frame()
rownames(meta.train) <- meta.train$Sample_ID
# create SIAMCAT object
sc.obj <- siamcat(feat=feat, meta=meta.train, label='Group', case='CD')
# test for associations
sc.obj <- check.associations(sc.obj, detect.lim = 1e-05,
feature.type = 'original',fn.plot = paste0('./assoc_plot_', d, '.pdf'))
# extract the associations and save them in the assoc.list
temp <- associations(sc.obj)
temp$genus <- rownames(temp)
assoc.list[[d]] <- temp %>%
select(genus, fc, auc, p.adj) %>%
mutate(Study=d)
}
# combine all associations
df.assoc <- bind_rows(assoc.list)
......@@ -126,35 +123,34 @@ and plot the generalized fold change as heatmap.
```{r genera_of_interest}
genera.of.interest <- df.assoc %>%
group_by(genus) %>%
summarise(m=mean(auc), n.filt=any(auc < 0.25 | auc > 0.75),
.groups='keep') %>%
filter(n.filt) %>%
arrange(m)
group_by(genus) %>%
summarise(m=mean(auc), n.filt=any(auc < 0.25 | auc > 0.75),
.groups='keep') %>%
filter(n.filt) %>%
arrange(m)
```
After we extracted the genera, we plot them:
```{r heatmap}
df.assoc %>%
# take only genera of interest
filter(genus %in% genera.of.interest$genus) %>%
# convert to factor to enforce an ordering by mean AUC
mutate(genus=factor(genus, levels = rev(genera.of.interest$genus))) %>%
# convert to factor to enforce ordering again
mutate(Study=factor(Study, levels = datasets)) %>%
# annotate the cells in the heatmap with stars
mutate(l=case_when(p.adj < 0.01~'*', TRUE~'')) %>%
ggplot(aes(y=genus, x=Study, fill=fc)) +
geom_tile() +
scale_fill_gradient2(low = '#3B6FB6', high='#D41645', mid = 'white',
limits=c(-2.7, 2.7),
name='Generalized\nfold change') +
theme_minimal() +
geom_text(aes(label=l)) +
theme(panel.grid = element_blank()) +
xlab('') + ylab('') +
theme(axis.text = element_text(size=6))
# take only genera of interest
filter(genus %in% genera.of.interest$genus) %>%
# convert to factor to enforce an ordering by mean AUC
mutate(genus=factor(genus, levels = rev(genera.of.interest$genus))) %>%
# convert to factor to enforce ordering again
mutate(Study=factor(Study, levels = datasets)) %>%
# annotate the cells in the heatmap with stars
mutate(l=case_when(p.adj < 0.01~'*', TRUE~'')) %>%
ggplot(aes(y=genus, x=Study, fill=fc)) +
geom_tile() +
scale_fill_gradient2(low = '#3B6FB6', high='#D41645', mid = 'white',
limits=c(-2.7, 2.7), name='Generalized\nfold change') +
theme_minimal() +
geom_text(aes(label=l)) +
theme(panel.grid = element_blank()) +
xlab('') + ylab('') +
theme(axis.text = element_text(size=6))
```
......@@ -167,11 +163,11 @@ function.
```{r check_confounders, warning=FALSE}
df.meta <- meta.ind %>%
as.data.frame()
as.data.frame()
rownames(df.meta) <- df.meta$Sample_ID
sc.obj <- siamcat(feat=feat, meta=df.meta, label='Group', case='CD')
check.confounders(sc.obj, fn.plot = './confounder_plot_cd_meta.pdf',
feature.type='original')
feature.type='original')
```
......@@ -197,85 +193,82 @@ pitfalls**).
```{r ml_meta_analysis, message=FALSE, warning=FALSE, eval=FALSE}
# create tibble to store all the predictions
auroc.all <- tibble(study.train=character(0),
auroc.all <- tibble(study.train=character(0),
study.test=character(0),
AUC=double(0))
# and a list to save the trained SIAMCAT objects
sc.list <- list()
for (i in datasets){
# restrict to a single study
meta.train <- meta.all %>%
filter(Study==i) %>%
as.data.frame()
rownames(meta.train) <- meta.train$Sample_ID
## take into account repeated sampling by including a parameters
## in the create.data.split function
## For studies with repeated samples, we want to block the
## cross validation by the column 'Individual_ID'
block <- NULL
if (i %in% c('metaHIT', 'Lewis_2015', 'HMP2')){
block <- 'Individual_ID'
if (i == 'HMP2'){
# for the HMP2 dataset, the number of repeated sample per subject need
# to be reduced, because some subjects have been sampled 20 times,
# other only 5 times
meta.train <- meta.all %>%
filter(Study=='HMP2') %>%
group_by(Individual_ID) %>%
sample_n(5, replace = TRUE) %>%
distinct() %>%
# restrict to a single study
meta.train <- meta.all %>%
filter(Study==i) %>%
as.data.frame()
rownames(meta.train) <- meta.train$Sample_ID
rownames(meta.train) <- meta.train$Sample_ID
## take into account repeated sampling by including a parameters
## in the create.data.split function
## For studies with repeated samples, we want to block the
## cross validation by the column 'Individual_ID'
block <- NULL
if (i %in% c('metaHIT', 'Lewis_2015', 'HMP2')){
block <- 'Individual_ID'
if (i == 'HMP2'){
# for the HMP2 dataset, the number of repeated sample per subject
# need to be reduced, because some subjects have been sampled
# 20 times, other only 5 times
meta.train <- meta.all %>%
filter(Study=='HMP2') %>%
group_by(Individual_ID) %>%
sample_n(5, replace = TRUE) %>%
distinct() %>%
as.data.frame()
rownames(meta.train) <- meta.train$Sample_ID
}
}
}
# create SIAMCAT object
sc.obj.train <- siamcat(feat=feat, meta=meta.train,
label='Group', case='CD')
# normalize features
sc.obj.train <- normalize.features(sc.obj.train, norm.method = 'log.std',
norm.param=list(log.n0=1e-05,
sd.min.q=0),
feature.type = 'original')
# Create data split
sc.obj.train <- create.data.split(sc.obj.train,
num.folds = 10, num.resample = 10,
inseparable = block)
# train LASSO model
sc.obj.train <- train.model(sc.obj.train, method='lasso')
## apply trained models to other datasets
# loop through datasets again
for (i2 in datasets){
if (i == i2){
# make and evaluate cross-validation predictions (on the same dataset)
sc.obj.train <- make.predictions(sc.obj.train)
sc.obj.train <- evaluate.predictions(sc.obj.train)
auroc.all <- auroc.all %>%
add_row(study.train=i, study.test=i,
AUC=eval_data(sc.obj.train)$auroc %>% as.double())
} else {
# make and evaluate on the external datasets
# use meta.ind here, since we want only one sample per subject!
meta.test <- meta.ind %>%
filter(Study==i2) %>%
as.data.frame()
rownames(meta.test) <- meta.test$Sample_ID
sc.obj.test <- siamcat(feat=feat, meta=meta.test,
label='Group', case='CD')
# make holdout predictions
sc.obj.test <- make.predictions(sc.obj.train,
siamcat.holdout = sc.obj.test)
sc.obj.test <- evaluate.predictions(sc.obj.test)
auroc.all <- auroc.all %>%
add_row(study.train=i, study.test=i2,
AUC=eval_data(sc.obj.test)$auroc %>% as.double())
# create SIAMCAT object
sc.obj.train <- siamcat(feat=feat, meta=meta.train,
label='Group', case='CD')
# normalize features
sc.obj.train <- normalize.features(sc.obj.train, norm.method = 'log.std',
norm.param=list(log.n0=1e-05, sd.min.q=0),feature.type = 'original')
# Create data split
sc.obj.train <- create.data.split(sc.obj.train,
num.folds = 10, num.resample = 10, inseparable = block)
# train LASSO model
sc.obj.train <- train.model(sc.obj.train, method='lasso')
## apply trained models to other datasets
# loop through datasets again
for (i2 in datasets){
if (i == i2){
# make and evaluate cross-validation predictions (same dataset)
sc.obj.train <- make.predictions(sc.obj.train)
sc.obj.train <- evaluate.predictions(sc.obj.train)
auroc.all <- auroc.all %>%
add_row(study.train=i, study.test=i,
AUC=eval_data(sc.obj.train)$auroc %>% as.double())
} else {
# make and evaluate on the external datasets
# use meta.ind here, since we want only one sample per subject!
meta.test <- meta.ind %>%
filter(Study==i2) %>%
as.data.frame()
rownames(meta.test) <- meta.test$Sample_ID
sc.obj.test <- siamcat(feat=feat, meta=meta.test,
label='Group', case='CD')
# make holdout predictions
sc.obj.test <- make.predictions(sc.obj.train,
siamcat.holdout = sc.obj.test)
sc.obj.test <- evaluate.predictions(sc.obj.test)
auroc.all <- auroc.all %>%
add_row(study.train=i, study.test=i2,
AUC=eval_data(sc.obj.test)$auroc %>% as.double())
}
}
}
# save the trained model
sc.list[[i]] <- sc.obj.train
# save the trained model
sc.list[[i]] <- sc.obj.train
}
```
```{r load_aurocs, echo=FALSE, message=FALSE}
......@@ -292,10 +285,10 @@ each dataset:
```{r get_test_average}
test.average <- auroc.all %>%
filter(study.train!=study.test) %>%
group_by(study.test) %>%
summarise(AUC=mean(AUC), .groups='drop') %>%
mutate(study.train="Average")
filter(study.train!=study.test) %>%
group_by(study.test) %>%
summarise(AUC=mean(AUC), .groups='drop') %>%
mutate(study.train="Average")
```
Now that we have the AUROC values, we can plot them into a nice heatmap:
......@@ -303,37 +296,36 @@ Now that we have the AUROC values, we can plot them into a nice heatmap:
```{r plot_auroc}
# combine AUROC values with test average
bind_rows(auroc.all, test.average) %>%
# highlight cross validation versus transfer results
mutate(CV=study.train == study.test) %>%
# for facetting later
mutate(split=case_when(study.train=='Average'~'Average',
TRUE~'none')) %>%
mutate(split=factor(split, levels = c('none', 'Average'))) %>%
# convert to factor to enforce ordering
mutate(study.train=factor(study.train, levels = c(datasets, 'Average'))) %>%
mutate(study.test=factor(study.test, levels=c(rev(datasets),'Average'))) %>%
ggplot(aes(y=study.test, x=study.train, fill=AUC, size=CV, color=CV)) +
geom_tile() + theme_minimal() +
# text in tiles
geom_text(aes_string(label="format(AUC, digits=2)"),
col='white', size=2)+
# color scheme
scale_fill_gradientn(colours=rev(c('darkgreen','forestgreen',
'chartreuse3','lawngreen',
'yellow')), limits=c(0.5, 1)) +
# axis position/remove boxes/ticks/facet background/etc.
scale_x_discrete(position='top') +
theme(axis.line=element_blank(),
axis.ticks = element_blank(),
axis.text.x.top = element_text(angle=45, hjust=.1),
panel.grid=element_blank(),
panel.border=element_blank(),
strip.background = element_blank(),
strip.text = element_blank()) +
xlab('Training Set') + ylab('Test Set') +
scale_color_manual(values=c('#FFFFFF00', 'grey'), guide=FALSE) +
scale_size_manual(values=c(0, 1), guide=FALSE) +
facet_grid(~split, scales = 'free', space = 'free')
# highlight cross validation versus transfer results
mutate(CV=study.train == study.test) %>%
# for facetting later
mutate(split=case_when(study.train=='Average'~'Average', TRUE~'none')) %>%
mutate(split=factor(split, levels = c('none', 'Average'))) %>%
# convert to factor to enforce ordering
mutate(study.train=factor(study.train, levels=c(datasets, 'Average'))) %>%
mutate(study.test=factor(study.test, levels=c(rev(datasets),'Average'))) %>%
ggplot(aes(y=study.test, x=study.train, fill=AUC, size=CV, color=CV)) +
geom_tile() + theme_minimal() +
# text in tiles
geom_text(aes_string(label="format(AUC, digits=2)"),
col='white', size=2)+
# color scheme
scale_fill_gradientn(colours=rev(c('darkgreen','forestgreen',
'chartreuse3','lawngreen',
'yellow')), limits=c(0.5, 1)) +
# axis position/remove boxes/ticks/facet background/etc.
scale_x_discrete(position='top') +
theme(axis.line=element_blank(),
axis.ticks = element_blank(),
axis.text.x.top = element_text(angle=45, hjust=.1),