...
 
Commits (304)
.Rbuildignore
SIAMCAT.RProj
.Rproj.user
.Rhistory
.RData
*.DS_Store*
.history
.Rapp.history
*.Rproj
*.tar.gz
*.Rcheck*/
inst/doc
image: r-base
before_script:
- cd Rscript_flavor
- Rscript 00_setup.r
- cd ../
- R CMD build ./
- mv SIAMCAT*.tar.gz Rscript_flavor/.
- cd Rscript_flavor
- wget http://congo.embl.de/downloads/siamcat_test_data.tar.gz
- tar -zxf siamcat_test_data.tar.gz && rm siamcat_test_data.tar.gz
test:
stage: test
script:
- Rscript 00_setup.r
- Rscript 01_validate_data.r --metadata_in num_metadata_cancer-vs-healthy_study-pop-I_N141_tax_profile_mocat_bn_genus.tsv --metadata_out valMetaData.tsv --label_in label_cancer-vs-healthy_study-pop-I_N141_tax_profile_mocat_bn_genus.tsv --label_out valLabel.tsv --feat_in feat_cancer-vs-healthy_study-pop-I_N141_tax_profile_mocat_bn_genus.tsv --feat_out valFeat.tsv
- Rscript 02_select_samples.r --metadata_in valMetaData.tsv --metadata_out valMetaData_sel.tsv --label_in valLabel.tsv --label_out valLabel_sel.tsv --feat_in valFeat.tsv --feat_out valFeat_sel.tsv --filter_var="age" --allowed_range="[0,120]"
- Rscript 03_check_for_confounders.r --metadata_in valMetaData_sel.tsv --plot metaCheck.pdf --label_in valLabel_sel.tsv
- Rscript 04_filter_features.r --feat_in valFeat_sel.tsv --feat_out valFeat_sel_filtered.tsv --method="abundance" --cutoff="0.001" --recomp_prop="FALSE" --rm_unmapped="TRUE"
- Rscript 05_check_associations.r --feat_in valFeat_sel_filtered.tsv --label_in valLabel_sel.tsv --plot assoc.pdf --col_scheme="RdYlBu" --alpha="0.7" --min_fc="0" --mult_test="fdr" --max_show="50" --detect_limit="1e-06" --plot_type="bean"
- Rscript 06_normalize_features.r --feat_in valFeat_sel_filtered.tsv --feat_out valFeat_sel_norm.tsv --param_out param_out.tsv --method="log.unit" --log_n0="1e-08" --sd_min_quantile="0.2" --vector_norm="2" --norm_margin="1"
- Rscript 07_add_metadata_as_predictor.r --feat_in valFeat_sel_norm.tsv --feat_out valFeat_sel_norm.tsv --metadata_in valMetaData_sel.tsv --pred_names="fobt" --std_meta="TRUE"
- Rscript 08_split_data.r --label_in valLabel_sel.tsv --train_sets trainSets.tsv --test_sets testSets.tsv --num_folds="10" --resample="5" --stratify="TRUE" --inseparable="NULL" --metadata_in valMetaData_sel.tsv
- Rscript 09_train_models.r --feat_in valFeat_sel_norm.tsv --label_in valLabel_sel.tsv --method="lasso" --train_sets trainSets.tsv --mlr_models_list models.RData --stratify="TRUE" --sel_criterion="auprc" --min_nonzero_coeff="2"
- Rscript 10_make_predictions.r --label_in valLabel_sel.tsv --feat_in valFeat_sel_norm.tsv --test_sets testSets.tsv --pred pred.tsv --mlr_models_list models.RData
- Rscript 11_evaluate_predictions.r --label_in valLabel_sel.tsv --plot evalPlot.pdf --pred pred.tsv
- Rscript 12_interpret_model.r --label_in valLabel_sel.tsv --feat_in valFeat_sel_norm.tsv --origin_feat valFeat_sel.tsv --metadata_in valMetaData_sel.tsv --mlr_models_list models.RData --plot interpretation.pdf --pred pred.tsv --col_scheme="BrBG" --heatmap_type="zscore" --consens_thres="0.5"
only:
- master
- development
after_script:
- rm -r *.tsv
- echo "cleaned up"
Package: SIAMCAT
Type: Package
Title: Statistical Inference of Associations between Microbial Communities And host phenoTypes
Version: 0.3.0
Author: Georg Zeller [aut,cre], Nicolai Karcher [aut], Jakob Wirbel [aut], Konrad Zych [aut]
Authors@R: c(person("Georg", "Zeller", role = c("aut", "cre"), email = "zeller@embl.de"),
person("Konrad", "Zych", role = "aut", email = "konrad.zych@embl.de", comment = c(ORCID = "0000-0001-7426-0516")),
person("Nicolai", "Karcher", role = "aut"),
person("Jakob", "Wirbel", role = "aut", email = "jakob.wirbel@embl.de", comment = c(ORCID = "0000-0002-4073-3562"))
)
Maintainer: Konrad Zych <konrad.zych@embl.de>
Description: SIAMCAT is a pipeline for Statistical Inference of Associations between Microbial Communities And host phenoTypes. A primary goal of analyzing microbiome data is to determine changes in community composition that are associated with environmental factors. In particular, linking human microbiome composition to host phenotypes such as diseases has become an area of intense research. For this, robust statistical modeling and biomarker extraction toolkits are crucially needed. SIAMCAT provides a full pipeline supporting data preprocessing, statistical association testing, statistical modeling (LASSO logistic regression) including tools for evaluation and interpretation of these models (such as cross validation, parameter selection, ROC analysis and diagnostic model plots).
Depends:
    R (>= 3.1.0),
    RColorBrewer,
    beanplot,
    glmnet,
    LiblineaR,
    pROC
Imports:
    mlr,
    PRROC,
    optparse,
    colorRamps,
    gelnet
License: GNU GPL-3.0
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.0.1
Suggests: testthat,
knitr,
rmarkdown
VignetteBuilder: knitr
# Generated by roxygen2: do not edit by hand
S3method(plot,data.range)
S3method(predict,plm)
export(add.meta.pred)
export(appendDirName)
export(assign.fold)
export(check.associations)
export(confounder.check)
export(create.tints.rgb)
export(data.splitter)
export(draw.error.bar)
export(eval.result)
export(eval.predictions)
export(evaluation.model.plot)
export(filter.feat)
export(get.foldList)
export(interpretor.model.plot)
export(label.plot.horizontal)
export(make_evaler_options)
export(make_filter_options)
export(make_frozen_normalizer_options)
export(make_meta_adder_options)
export(make_model_interpretor_options)
export(make_normalizer_options)
export(make.predictions)
export(normalize.feat)
export(parse.label.header)
export(parse.model.header)
export(plm.predictor)
export(plm.trainer)
export(read.features)
export(read.labels)
export(read.meta)
export(sample.strat)
export(select.model)
export(select.samples)
export(train.model)
export(train.plm)
export(validate.data)
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Add metadata as predictors
#' @description This function adds metadata variables to the feature matrix so
#'   that they can later be used as predictors by the models.
#' @param feat features object (features in rows, samples in columns)
#' @param meta metadata object (samples in rows, metadata variables in columns);
#'   assumed to be in the same sample order as the columns of \code{feat} --
#'   this is not verified here
#' @param pred.names vector of names of the metavariables to be added to the
#'   feature matrix as predictors; \code{NULL} or empty strings add nothing
#' @param std.meta boolean, should added metadata features be standardized
#'   (z-scored)?
#' @keywords SIAMCAT add.meta.pred
#' @export
#' @return features object with added metadata rows (named \code{META_<NAME>})
add.meta.pred <- function(feat, meta, pred.names=NULL, std.meta){
  ### add metadata as predictors to the feature matrix
  cnt <- 0
  # BUG FIX: check is.null() first -- the original evaluated pred.names != ''
  # before the NULL check, so pred.names=NULL produced a zero-length logical
  # and if() errored instead of taking the else branch.
  if (!is.null(pred.names) && !all(pred.names == '')) {
    for (p in pred.names) {
      if (!p %in% colnames(meta)) stop("There is no meta variable called ", p, "\n")
      idx <- which(colnames(meta) == p)
      if (length(idx) != 1) stop(p, " matches multiple columns in the metadata\n")
      cat('adding ', p, '\n', sep='')
      m <- meta[, idx]
      # mean-impute missing/non-finite values so downstream models see complete data
      if (!all(is.finite(m))) {
        na.cnt <- sum(!is.finite(m))
        cat('filling in', na.cnt, 'missing values by mean imputation\n')
        mn <- mean(m, na.rm=TRUE)
        m[!is.finite(m)] <- mn
      }
      if (std.meta) {
        # z-score the metadata feature; a constant column cannot be standardized
        cat('standardize metadata feature', p, '\n')
        m.mean <- mean(m, na.rm = TRUE)
        m.sd <- sd(m, na.rm = TRUE)
        stopifnot(m.sd != 0)
        m <- (m - m.mean) / m.sd
      }
      # append as a new feature row, named META_<VARIABLE>
      feat <- rbind(feat, m)
      rownames(feat)[nrow(feat)] <- paste('META_', toupper(p), sep='')
      cnt <- cnt + 1
    }
    cat('added', cnt, 'meta-variables as predictors to the feature matrix\n')
  } else {
    cat('Not adding any of the meta-variables as predictor to the feature matrix\n')
  }
  stopifnot(all(!is.na(feat)))
  invisible(feat)
}
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Check for potential confounders in the metadata
#' @description This function checks for associations between class labels and
#'   potential confounders (e.g. age, sex, or BMI) that are present in the metadata.
#'   Statistical testing is performed with Fisher's exact test or Wilcoxon test,
#'   while associations are visualized either as barplot or Q-Q plot, depending
#'   on the type of metadata.
#' @param meta metadata object (samples in rows, metadata variables in columns)
#' @param label labels object providing \code{$n.idx}/\code{$p.idx} logical
#'   sample indices and \code{$n.lab}/\code{$p.lab} display names
#' @param fn.plot string, filename for the pdf-plot
#' @keywords SIAMCAT confounder.check
#' @export
#' @return Does not return anything, but produces a single plot for each metadata category
confounder.check <- function(meta, label, fn.plot){
  # TODO: implement color.scheme selection as function parameter
  pdf(fn.plot, onefile=TRUE)  # one page per metadata variable
  par(xpd=FALSE)
  for (m in 1:ncol(meta)) {
    # prettify the variable name for titles: separators -> spaces, capitalize
    mname <- gsub('[_.-]', ' ', colnames(meta)[m])
    mname <- paste(toupper(substring(mname, 1, 1)), substring(mname, 2), sep="")
    cat('checking', mname, 'as a potential confounder...\n')
    # NOTE(review): assumes metadata was numerically encoded upstream -- confirm
    mvar <- as.numeric(meta[,m])
    u.val <- unique(mvar)
    u.val <- u.val[!is.na(u.val)]
    colors <- brewer.pal(5,"Dark2")
    if (length(u.val) == 1) {
      # a constant variable carries no information about the label
      cat(' skipped because all subjects have the same value\n')
    } else if (length(u.val) <= 5) {
      # few unique values: treat as categorical -> stacked barplot + Fisher test
      cat(' using a bar plot\n')
      par(mar=c(6.1,4.1,4.1,4.1))
      # 2 x k contingency table: row 1 = negative class, row 2 = positive class
      ct <- matrix(NA, nrow=2, ncol=length(u.val))
      for (i in 1:length(u.val)) {
        ct[1,i] = sum(mvar[label$n.idx] == u.val[i], na.rm=TRUE)
        ct[2,i] = sum(mvar[label$p.idx] == u.val[i], na.rm=TRUE)
      }
      # convert counts to within-class frequencies for plotting
      freq <- t(ct)
      for (i in 1:dim(freq)[2]) {
        freq[,i] <- freq[,i] / sum(freq[,i])
      }
      barplot(freq, ylim=c(0,1), main=mname, names.arg=c(label$n.lab, label$p.lab), col=colors)
      #legend("right", legend=u.val, col=colors)
      p.val <- fisher.test(ct)$p.value
      mtext(paste('Fisher test p-value:', format(p.val, digits=4)), side=1, line=3, at=1, adj=0)
    } else {
      # many unique values: treat as continuous -> Q-Q plot + Wilcoxon (MWW) test
      cat(' using a Q-Q plot\n')
      par(mar=c(5.1,4.1,4.1,4.1))
      # common axis range so the identity line is meaningful
      ax.int <- c(min(mvar, na.rm=TRUE), max(mvar, na.rm=TRUE))
      qqplot(mvar[label$n.idx], mvar[label$p.idx], xlim=ax.int, ylim=ax.int, pch=16, cex=0.6,
             xlab=label$n.lab, ylab=label$p.lab, main=paste('Q-Q plot for', mname))
      abline(0, 1, lty=3)
      p.val <- wilcox.test(mvar[label$n.idx], mvar[label$p.idx], exact=FALSE)$p.value
      text(ax.int[1]+0.9*(ax.int[2]-ax.int[1]), ax.int[1]+0.1*(ax.int[2]-ax.int[1]),
           paste('MWW test p-value:', format(p.val, digits=4)), pos=2)
    }
  }
  tmp <- dev.off()
}
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Evaluate prediction results
#' @description This function takes the correct labels and predictions for all samples and evaluates the results using the \itemize{
#' \item Area Under the Receiver Operating Characteristic (ROC) Curve (AU-ROC)
#' \item and the Precision-recall Curve (PR)
#' }
#' as metric. Predictions can be supplied either for a single case or as matrix after resampling of the dataset.
#'
#' Prediction results are usually produced with the function \link{plm.predictor}.
#' @param label label object
#' @param pred prediction for each sample by the model, should be a matrix with dimensions \code{length(label) x 1} or \code{length(label) x num.resample}
#' @keywords SIAMCAT eval.result
#' @export
#' @return list containing \itemize{
#'  \item \code{$roc.average} average ROC-curve across repeats or a single ROC-curve on complete dataset;
#'  \item \code{$auc.average} AUC value for the average ROC-curve;
#'  \item \code{$ev.list} list of \code{length(num.folds)}, containing for different decision thresholds the number of false positives, false negatives, true negatives, and true positives;
#'  \item \code{$pr.list} list of \code{length(num.folds)}, containing the positive predictive value (precision) and true positive rate (recall) values used to plot the PR curves;
#' }. If \code{pred} had more than one column, i.e. if the models have been trained with several repeats, the function will additionally return \itemize{
#'  \item \code{$roc.all} list of roc objects (see \link[pROC]{roc}) for every repeat;
#'  \item \code{$aucspr} vector of AUC values for the PR curves for every repeat;
#'  \item \code{$auc.all} vector of AUC values for the ROC curves for every repeat
#'}
eval.predictions <- function(label, pred){
  # TODO compare header to label
  ### reorder predictions so they match the order of the label object
  m <- match(names(label$label), rownames(pred))
  pred <- pred[m, , drop=FALSE]
  stopifnot(all(names(label$label) == rownames(pred)))
  ### ROC curves: one per resampled prediction column (only with repeats)
  if (ncol(pred) > 1) {
    rocc = list(NULL)
    aucs = vector('numeric', ncol(pred))
    for (c in 1:ncol(pred)) {
      rocc[c] = list(roc(response=label$label, predictor=pred[,c], ci=FALSE))
      aucs[c] = rocc[[c]]$auc
    }
  }
  ### average predictions across repeats for one mean ROC curve with a CI band
  summ.stat = 'mean'
  rocsumm = list(roc(response=label$label, predictor=apply(pred, 1, summ.stat),
                     ci=TRUE, of="se", sp=seq(0, 1, 0.05)))
  auroc = list(rocsumm[[1]]$auc)
  ### precision-recall curves
  pr = list(NULL)
  ev = list(NULL)
  if (ncol(pred) > 1) {
    aucspr = vector('numeric', dim(pred)[2])
    for (c in 1:ncol(pred)) {
      ev[c] = list(eval.classifier(pred[,c], label$label, label))
      pr[c] = list(get.pr(ev[[c]]))
      aucspr[c] = calc.aupr(ev[[c]])
    }
    # last element of ev: evaluation of the averaged prediction
    ev = append(ev, list(eval.classifier(apply(pred, 1, summ.stat), label$label, label)))
  } else {
    ev[1] = list(eval.classifier(as.vector(pred), label$label, label))
    pr[1] = list(get.pr(ev[[1]]))
  }
  if (ncol(pred) > 1) {
    return(list("roc.all" = rocc,
                "auc.all"=aucs,
                "roc.average"=rocsumm,
                "auc.average"=auroc,
                "ev.list"=ev,
                "pr.list"=pr,
                "aucspr"=aucspr))
  } else {
    return(list("roc.average"=rocsumm,
                "auc.average"=auroc,
                "ev.list"=ev,
                "pr.list"=pr))
  }
}
# File mode changed from 100755 to 100644
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Perform unsupervised feature filtering.
#' @description This function may convert absolute abundances into relative abundances and then performs unsupervised feature filtering. Features can be filtered based on abundance or prevalence. Additionally, unmapped reads may be removed.
#' @param feat feature object (features in rows, samples in columns)
#' @param filter.method method used for filtering the features, can be one of these: \code{c("abundance", "cum.abundance", "prevalence")}; defaults to \code{"abundance"}
#' @param cutoff float, abundance or prevalence cutoff
#' @param recomp.prop boolean, should absolute abundances be converted into relative abundances?
#' @param rm.unmapped boolean, should unmapped reads be discarded?
#' @details Currently, there are three filtering methods implemented:
#' \itemize{
#'  \item \code{"abundance"} remove features for which the abundance is never above the threshold value (e.g. 0.5\%) in any of the samples
#'  \item \code{"cum.abundance"} remove features with very low abundance in all samples, i.e. features that are never among the most abundant entities that collectively make up \code{(1-cutoff)} of the reads in any samples
#'  \item \code{"prevalence"} remove features with low prevalence across samples, i.e. features that are undetected in more than \code{(1-cutoff)} proportion of samples
#' }
#' @keywords SIAMCAT filter.feat
#' @export
#' @return Returns the filtered feature matrix
# BUG FIX: the default for filter.method was the bare symbol `abundance`
# (object-not-found error whenever the default was relied upon); it must be a string.
filter.feat <- function(feat, filter.method='abundance', cutoff=0.001, recomp.prop=FALSE, rm.unmapped=TRUE){
  ### this statement does not have the purpose to calculate relative abundances on the fly and return them.
  ### Instead, its purpose is to be able to calculate f.idx (specifying the indices of features which are to be kept)
  ### when the feature list has already been transformed to relative abundances, but e.g. certain features have been removed manually.
  if (recomp.prop) {
    # recompute relative abundance values (proportions) per sample (column)
    ra.feat <- prop.table(feat, 2)
  } else {
    ra.feat <- feat
  }
  ### apply filters
  if (filter.method == 'abundance') {
    # keep features whose abundance reaches the threshold in at least one sample
    f.max <- apply(ra.feat, 1, max)
    f.idx <- which(f.max >= cutoff)
  } else if (filter.method == 'cum.abundance') {
    # remove features with very low abundance in all samples i.e. ones that are never among the most abundant
    # entities that collectively make up (1-cutoff) of the reads in any sample
    f.idx <- vector('numeric', 0)
    # sort features per sample and apply cumsum to identify how many collectively have weight below cutoff
    # NOTE(review): assumes at least one cumulative value is below cutoff per sample -- verify for extreme cutoffs
    for (s in 1:ncol(ra.feat)) {
      srt <- sort(ra.feat[,s], index.return=TRUE)
      cs <- cumsum(srt$x)
      m <- max(which(cs < cutoff))
      f.idx <- union(f.idx, srt$ix[-(1:m)])
    }
    # an index of those features that collectively make up more than 1-cutoff of the read mass in any sample
    f.idx <- sort(f.idx)
  } else if (filter.method == 'prevalence') {
    # keep features detected (non-zero) in more than cutoff proportion of samples
    f.idx <- which(rowSums(ra.feat > 0) / ncol(ra.feat) > cutoff)
  } else {
    stop('unrecognized filter.method, exiting!\n')
  }
  cat('Removed ', nrow(feat)-length(f.idx), ' features whose values did not exceed ', cutoff,
      ' in any sample (retaining ', length(f.idx), ')\n', sep='')
  # BUG FIX: drop=FALSE keeps a matrix even when only one feature is retained
  feat <- feat[f.idx, , drop=FALSE]
  ### postprocessing and output generation
  if (rm.unmapped) {
    # remove the 'unmapped' pseudo-feature under any of its common spellings
    unm.idx <- rownames(feat) %in% c('UNMAPPED', 'unmapped', '-1', 'UNCLASSIFIED', 'unclassified', 'UNASSIGNED', 'unassigned')
    if (any(unm.idx)) {
      feat <- feat[!unm.idx, , drop=FALSE]
      cat('Removed ', sum(unm.idx), ' features corresponding to UNMAPPED reads',
          ' (retaining ', nrow(feat), ')\n', sep='')
    } else {
      cat('tried to remove unmapped reads, but could not find them. Continue anyway.')
    }
  }
  return(feat)
}
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# R package flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 12.06.2017
# GNU GPL-3.0
###
#' @title Read feature file
#' @description This function reads in the tsv file with features and converts it into a matrix.
#'
#' The file should be organized as follows:
#' features (in rows) x samples (in columns).
#'
#' First row should contain sample labels (consistent with label data), while the first column should contain feature labels (e.g. taxonomic identifiers). The remaining entries are expected to be real values \code{>= 0} that quantify the abundance of each feature in each sample.
#' @param fn.in.feat name of the tsv file containing features
#' @export
#' @return matrix containing features from the file
read.features <- function(fn.in.feat){
  if (is.null(fn.in.feat)) stop("Filename for features file not provided!\n")
  if (!file.exists(fn.in.feat)) stop("Feature file ", fn.in.feat, " does not exist!\n")
  feat <- read.table(file=fn.in.feat, sep='\t', header=TRUE, stringsAsFactors=FALSE, check.names=FALSE, quote='')
  feat <- as.matrix(feat)
  featNames <- make.names(rownames(feat)) ### making the names syntactically valid for R
  # BUG FIX: the condition was inverted (== instead of !=), so the row names
  # were only "corrected" when they were already valid, and never when all of
  # them actually needed fixing.
  if (any(rownames(feat) != featNames)) {
    cat("The provided feature names were not semantically correct for use in R, they were updated.\n")
    rownames(feat) <- featNames
  }
  invisible(feat)
}
#' @title Read labels file
#' @description This function reads in the tsv file with labels and converts it into a label object (a list).
#'
#' First row is expected to be \code{#BINARY:1=[label for cases];-1=[label for controls]}. Second row should contain the sample identifiers as tab-separated list (consistent with feature and metadata).
#'
#' Third row is expected to contain the actual class labels (tab-separated): \code{1} for each case and \code{-1} for each control.
#'
#' Note: Labels can take other numeric values (but not characters or strings); importantly, the label for cases has to be greater than the one for controls.
#' @param fn.in.label name of the tsv file containing labels
#' @param feat optional feature matrix; if supplied, sample names are checked for agreement with the label file
#' @export
#' @return list with nine values:\itemize{
#' \item \code{$label} named vector containing the numerical labels from the file;
#' \item \code{$header} first row of the label file;
#' \item \code{$info} information about the type of label (e.g. \code{BINARY});
#' \item \code{$positive.lab} numerical label for cases, e.g. \code{1};
#' \item \code{$negative.lab} numerical label for controls, e.g. \code{-1};
#' \item \code{$n.idx} logical vector of labels (\code{TRUE} for controls, \code{FALSE} otherwise);
#' \item \code{$n.lab} label for controls, e.g. \code{healthy};
#' \item \code{$p.idx} logical vector of labels (\code{TRUE} for cases, \code{FALSE} otherwise);
#' \item \code{$p.lab} label for cases, e.g. \code{cancer}
#'}
read.labels <- function(fn.in.label,feat=NULL){
  # TODO move feature/label agreement check to validate data?
  if (is.null(fn.in.label)) stop("Filename for labels file not provided!\n")
  if(!file.exists(fn.in.label)) stop("Label file ", fn.in.label, " does not exist!\n")
  # the '#...' type header row is skipped here (comment.char) and re-read below
  label <- read.table(file=fn.in.label, sep='\t', header=TRUE, row.names=NULL, stringsAsFactors=FALSE,
                      check.names=FALSE, quote='', comment.char="#")
  label <- as.matrix(label)
  if (dim(label)[1] > dim(label)[2]){
    # NOTE(review): this branch apparently reorients a column-oriented label
    # file; `label[,1] <- NULL` on a matrix looks suspect (matrix columns cannot
    # be removed by NULL assignment) -- verify with a column-oriented input file
    temp <- names(label)
    names(label) <- NULL
    label <- rbind(temp,label)
    rownames(label) <- label[,1]
    label[,1] <- NULL
    label <- t(label)
  }
  # coerce to a named numeric vector (names = sample identifiers)
  namesL <- colnames(label)
  label <- as.numeric(label)
  names(label) <- namesL
  if(!all(is.null(feat))){
    if(any(names(label) != colnames(feat))) stop("read.labels: Names in label object and feat object do not match!\n")
  }
  # Check general suitability of the supplied dataset: each class needs enough examples
  classes <- unique(label)
  for (i in classes){
    if(sum(label==i) <= 5) stop("Data set has only",sum(label==i), "training examples of class",i," This is not enough for SIAMCAT to proceed")
    if (sum(label==i) < 10){
      cat("Data set has only",sum(label==i), "training examples of class",i," . Note that a dataset this small/skewed is not necessarily suitable for analysis in this pipe line." )
    }
  }
  # Re-read the first line and check the label header (e.g. '#BINARY:1=cancer;-1=healthy')
  con <- file(fn.in.label, 'rt')
  header <- readLines(con, 1)
  if (substring(header,1,1) != "#"){
    stop("Label header seems to be missing or broken.")
  }
  close(con)
  label <- list("label" = label, "header" = header)
  label$info <- parse.label.header(label$header)
  stopifnot(label$info$type == 'BINARY')
  # by convention the larger numeric value denotes cases, the smaller controls
  label$positive.lab <- max(label$info$class.descr)
  label$negative.lab <- min(label$info$class.descr)
  label$n.idx <- label$label==label$negative.lab
  label$n.lab <- gsub('[_.-]', ' ', names(label$info$class.descr)[label$info$class.descr==label$negative.lab])
  label$p.idx <- label$label==label$positive.lab
  label$p.lab <- gsub('[_.-]', ' ', names(label$info$class.descr)[label$info$class.descr==label$positive.lab])
  invisible(label)
}
#' @title Read metadata file
#' @description This function reads in the tsv file with numerical metadata and converts it into a matrix.
#'
#' The file should be organized as follows:
#' samples (in rows) x metadata (in columns). Metadata needs to be converted to numerical values by the user.
#'
#' Metadata may be optional for the SIAMCAT workflow, but are necessary for heatmap displays, see \link{interpretor.model.plot}
#' @param fn.in.meta name of the tsv file containing metadata; \code{NULL} or the
#'   placeholder strings \code{'NULL'}, \code{'NONE'}, \code{'UNKNOWN'} are
#'   treated as "no metadata"
#' @export
#' @return matrix containing metadata from the file, or \code{NULL} when no file was given
read.meta <- function(fn.in.meta){
  if (is.null(fn.in.meta) || toupper(fn.in.meta)=='NULL' || toupper(fn.in.meta)=='NONE' || toupper(fn.in.meta)=='UNKNOWN') {
    cat("Filename for metadata file not provided, continuing without it.\n")
    # BUG FIX: 'meta' was left undefined on this branch, so the function errored
    # ("object 'meta' not found") instead of returning; return NULL explicitly.
    meta <- NULL
  } else {
    if(!file.exists(fn.in.meta)) stop("Metadata file ", fn.in.meta, " does not exist!\n")
    meta <- read.table(file=fn.in.meta, sep='\t', header=TRUE, row.names=1, check.names=FALSE, quote='')
    meta <- as.matrix(meta)
  }
  invisible(meta)
}
##### auxiliary function to trim whitespace from string
# returns string without leading or trailing whitespace (vectorized over x)
trim <- function (x) {
  # strip the leading whitespace run, then the trailing one
  sub("\\s+$", "", sub("^\\s+", "", x))
}
##### function to parse the header of a label file
### label.header - string in the format: #<TYPE>:<L1>=<class1>;<L2>=<class2>[;<L3>=<class3>]
### where <TYPE> is a string specifying the type of label variable such as
### BINARY (for binary classification), CATEGORICAL (for multi-class classification), or CONTINUOUS (for regression)
### <L1> is a short numeric label for the first class with description <class1> (similarly for the other classes)
#' @export
parse.label.header <- function(label.header) {
  # split '#TYPE:1=cancer;-1=healthy' into the type part and the class part
  parts <- strsplit(label.header, ':')[[1]]
  type <- gsub("^\\s+|\\s+$", "", parts[1])
  # drop a leading '#' from the type, if present
  if (substr(type, 1, 1) == '#') {
    type <- gsub("^\\s+|\\s+$", "", substr(type, 2, nchar(type)))
  }
  # flatten 'num=name' pairs into an alternating vector: num, name, num, name, ...
  pairs <- unlist(strsplit(strsplit(gsub("^\\s+|\\s+$", "", parts[2]), ';')[[1]], '='))
  descr.names <- pairs[seq(2, length(pairs), 2)]
  class.descr <- as.numeric(pairs[seq(1, length(pairs) - 1, 2)])
  names(class.descr) <- descr.names
  # assemble the label-info list: $type and named numeric $class.descr
  label.info <- list()
  label.info$type <- type
  label.info$class.descr <- class.descr
  return(label.info)
}
# ##### function to parse the header of a model file
# format: #<TYPE>[ (<properties>)]:<label.header>  where <label.header> may be
# wrapped in square brackets
#' @export
parse.model.header <- function(model.header) {
  s <- strsplit(model.header, ':')[[1]]
  type <- gsub("^\\s+|\\s+$", "", s[1])
  # drop a leading '#' from the type, if present
  if (substr(type, 1, 1) == '#')
    type <- gsub("^\\s+|\\s+$", "", substr(type, 2, nchar(type)))
  # everything after the first ':' is the embedded label header (re-join since
  # the label header itself contains ':')
  label.header <- gsub("^\\s+|\\s+$", "", paste(s[2:length(s)], collapse=':'))
  if (substr(label.header, 1, 1) == '[') {
    stopifnot(substr(label.header, nchar(label.header), nchar(label.header)) == ']')
    label.header <- substr(label.header, 2, nchar(label.header)-1)
  }
  # BUG FIX: grep() returns the index of the matching *element* (always 1 for a
  # single string), not the character position of the '(' -- the extracted
  # properties/type substrings were garbage. regexpr() gives the match position.
  p <- regexpr('\\(.*\\)', type)
  properties <- NULL
  if (p > 0) {
    stopifnot(substr(type, nchar(type), nchar(type)) == ')')
    properties <- substr(type, p+1, nchar(type)-1)
    type <- gsub("^\\s+|\\s+$", "", substr(type, 1, p-1))
  }
  model.info <- list()
  model.info$type <- type
  model.info$properties <- properties
  model.info$label.header <- label.header
  return(model.info)
}
# (scrape artifact removed: "This diff is collapsed. Click to expand it.")
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# R package flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL-3.0
###
#' @title Model Evaluation Plot
#' @description Produces two plots for model evaluation. The first plot shows the Receiver Operating Characteristic (ROC)-curves, the other the Precision-recall (PR)-curves for the different CV repetitions.
#' @param label a label object
#' @param pred matrix containing the model predictions for every CV repetition
#' @param eval.data list of evaluation results produced by \link{eval.predictions}
#' @param fn.plot string, filename for the pdf-plot
#' @keywords SIAMCAT evaluation.model.plot
#' @export
#' @return Does not return anything, but produces the model evaluation plot.
evaluation.model.plot <- function(label, pred, eval.data, fn.plot){
  pdf(fn.plot, onefile=TRUE)
  ### page 1: ROC curves
  plot(NULL, xlim=c(0,1), ylim=c(0,1), xlab='False positive rate', ylab='True positive rate', type='n')
  title(paste('ROC curve for the model', sep=' '))
  abline(a=0, b=1, lty=3)  # diagonal = random classifier
  if (dim(pred)[2] > 1) {
    # one grey curve per CV repetition
    aucs = vector('numeric', dim(pred)[2])
    for (c in 1:dim(pred)[2]) {
      roc.c = eval.data$roc.all[[c]]
      lines(1-roc.c$specificities, roc.c$sensitivities, col=gray(runif(1,0.2,0.8)))
      aucs[c] = eval.data$auc.all[c]
      cat('AU-ROC (resampled run ', c, '): ', format(aucs[c], digits=3), '\n', sep='')
    }
  }
  # averaged ROC curve (bold) with confidence band
  roc.summ = eval.data$roc.average[[1]]
  lines(1-roc.summ$specificities, roc.summ$sensitivities, col='black', lwd=2)
  auroc = eval.data$auc.average[1]
  # plot CI
  x = as.numeric(rownames(roc.summ$ci))
  yl = roc.summ$ci[,1]
  yu = roc.summ$ci[,3]
  polygon(1-c(x, rev(x)), c(yl, rev(yu)), col='#88888844' , border=NA)
  if (dim(pred)[2] > 1) {
    cat('Mean-pred. AU-ROC:', format(auroc, digits=3), '\n')
    cat('Averaged AU-ROC: ', format(mean(aucs), digits=3), ' (sd=', format(sd(aucs), digits=4), ')\n', sep='')
    text(0.7, 0.1, paste('Mean-prediction AUC:', format(auroc, digits=3)))
  } else {
    cat('AU-ROC:', format(auroc, digits=3), '\n')
    text(0.7, 0.1, paste('AUC:', format(auroc, digits=3)))
  }
  ### page 2: precision-recall curves
  plot(NULL, xlim=c(0,1), ylim=c(0,1), xlab='Recall', ylab='Precision', type='n')
  title(paste('Precision-recall curve for the model', sep=' '))
  abline(h=mean(label$label==label$positive.lab), lty=3)  # baseline = class prevalence
  if (dim(pred)[2] > 1) {
    aucspr = vector('numeric', dim(pred)[2])
    for (c in 1:dim(pred)[2]) {
      ev = eval.data$ev.list[[c]]
      pr = eval.data$pr.list[[c]]
      lines(pr$x, pr$y, col=gray(runif(1,0.2,0.8)))
      aucspr[c] = eval.data$aucspr[c]
      cat('AU-PRC (resampled run ', c, '): ', format(aucspr[c], digits=3), '\n', sep='')
    }
    # last ev.list element holds the evaluation of the averaged prediction
    ev = eval.data$ev.list[[length(eval.data$ev.list)]]
  } else {
    ev = eval.data$ev.list[[1]]
  }
  pr = get.pr(ev)
  lines(pr$x, pr$y, col='black', lwd=2)
  aupr = calc.aupr(ev)
  if (dim(pred)[2] > 1) {
    cat('Mean-pred. AU-PRC:', format(aupr, digits=3), '\n')
    # BUG FIX: this line previously averaged the ROC AUCs (aucs) instead of the
    # per-repetition PR AUCs (aucspr).
    cat('Averaged AU-PRC: ', format(mean(aucspr), digits=3), ' (sd=', format(sd(aucspr), digits=4), ')\n', sep='')
    text(0.7, 0.1, paste('Mean-prediction AUC:', format(aupr, digits=3)))
  } else {
    cat('AU-PRC:', format(aupr, digits=3), '\n')
    text(0.7, 0.1, paste('AUC:', format(aupr, digits=3)))
  }
  tmp <- dev.off()
}
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Prediction on the test set
#' @description This function takes the test set instances and the models trained by \link{plm.trainer} in order to predict the classes.
#' @param feat features object (features x samples; transposed internally to samples x features)
#' @param label label object
#' @param data.split filename containing the test samples or list of test instances produced by \link{data.splitter}, defaults to \code{NULL} leading to testing on the complete dataset
#' @param models.list list of models trained by \link{plm.trainer}
#' @export
#' @keywords SIAMCAT plm.predictor
#' @return list containing the predictions \itemize{
#'  \item \code{$pred} named vector with one entry per prediction made;
#'  \item \code{$mat} prediction matrix (samples x CV repetitions/models)
#'}
make.predictions <- function(feat, label, data.split=NULL, models.list){
  feat <- t(feat)  # work on samples x features from here on
  ### subselect training examples as specified in fn.train.sample (if given)
  foldList <- get.foldList(data.split, label, mode="test", model=models.list)
  fold.name <- foldList$fold.name
  fold.exm.idx <- foldList$fold.exm.idx
  num.runs <- foldList$num.runs
  num.folds <- foldList$num.folds
  ### apply one LASSO model per test sample (i.e. CV fold)
  # predictions are made on a concatenation of test examples from all test samples
  pred = NULL
  predList = list()       # NOTE(review): assigned but never used below
  fold.pred.idx = list()  # per-run index ranges into the growing 'pred' vector
  # Init hyperpar list
  opt.hp <- list(lambda = NULL, C = NULL, alpha = NULL, ntree = NULL)  # NOTE(review): never used below
  for (r in 1:num.runs) {
    label.fac <- factor(label$label, levels=c(label$negative.lab, label$positive.lab))
    test.label <- label.fac  # NOTE(review): immediately overwritten by the next line
    test.label <- label.fac[fold.exm.idx[[r]]]
    data <- as.data.frame(feat[fold.exm.idx[[r]],])
    stopifnot(nrow(data) == length(test.label))
    stopifnot(all(rownames(data) == names(test.label)))
    data$label <- test.label
    model <- models.list[[r]]
    cat('Applying ', colnames(model$W)[r], ' on ', fold.name[r], ' (', r, ' of ', num.runs, ')...\n', sep='')
    # subselect appropriate model (weight column r)
    model$W = model$W[,r]
    # subselect test examples
    test.feat = feat[fold.exm.idx[[r]],,drop=FALSE]
    task <- makeClassifTask(data = data, target = "label")
    pdata <- predict(model, task = task)
    # save(pdata,file="pdata.RData") # not very good, saves the data somewhere, depending on working directory
    # rescale the predicted probability (assumed to be column 4 of the mlr
    # prediction data -- TODO confirm against the mlr Prediction layout) onto
    # the numeric label range [negative.lab, positive.lab]
    p <- label$negative.lab+abs(label$positive.lab-label$negative.lab)*pdata$data[,4]
    names(p) <- rownames(pdata$data)
    pred <- c(pred, p)
    fold.pred.idx[[r]] = (length(pred)-length(p)+1):length(pred)
  }
  cat('\nTotal number of predictions made:', length(pred), '\n')
  if (!is.null(data.split)) {
    ### if test labels are given do some evaluation as well
    # get the appropriate labels for all test sets
    test.label = NULL
    aucs = vector('numeric', num.runs)
    for (r in 1:num.runs) {
      lab <- label$label[fold.exm.idx[[r]]]
      test.label <- c(test.label, lab)
      lab.p.idx <- (length(test.label)-length(lab)+1):length(test.label)
      # accuracy of individual test sets (AUC only computed when both classes occur)
      if (length(unique(test.label[lab.p.idx])) == 2) {
        ev = eval.classifier(pred[lab.p.idx], as.vector(test.label[lab.p.idx]), label)
        aucs[r] = calc.auroc(ev)
      }
    }
    stopifnot(length(test.label) == length(pred))
    stopifnot(names(test.label) == names(pred))
    # in case of cross-validation there should be exactly one prediction per labeled example,
    # so we reorder them according to the order of label
    if (length(label$label) == length(pred) && all(names(label$label) %in% names(pred)) && all(names(pred) %in% names(label$label))) {
      m = match(names(label$label), names(pred))
      pred = pred[m]
      test.label = test.label[m]
      stopifnot(all(names(label$label) == names(pred)))
    }
    # test accuracy of combined test set
    c.auc = NA
    if (length(unique(test.label)) == 2) {
      ev = eval.classifier(pred, as.vector(test.label), label)
      c.auc = calc.auroc(ev)
    }
    cat('Combined test AUC = ', format(c.auc, digits=3),
        ' (m=', format(mean(aucs, na.rm=TRUE), digits=3),
        ', s.d.=', format(sd(aucs, na.rm=TRUE), digits=3), ')\n', sep='')
  }
  ### reformat predictions in case models were trained in repeated cross-validation
  # (duplicate names in 'pred' indicate multiple predictions per sample)
  if (length(unique(names(pred))) < length(pred)) {
    ref.names = NULL
    if (any(substr(fold.name,1,14) == 'whole data set')) {
      # models applied to the whole data set: one matrix column per model
      r.idx = c(1:num.runs) #as.numeric(sapply(strsplit(fold.name, 'predicted by model '), '[[', 2))
      runs = sort(unique(r.idx))
      stopifnot(all(runs == 1:length(runs)))
      if (!is.null(data.split)) {
        ref.names = names(label$label)
      } else {
        ref.names = unique(names(pred))
      }
    } else {
      # repeated CV: the repetition number is parsed from fold names containing 'rep'
      r.idx = as.numeric(sapply(strsplit(fold.name, 'rep'), '[[', 2))
      runs = sort(unique(r.idx))
      stopifnot(all(runs == 1:length(runs)))
      if (!is.null(data.split)) {
        ref.names = names(label$label)
      } else {
        ref.names = names(pred)[unlist(fold.pred.idx[r.idx==1])]
      }
    }
    # cat(ref.names, '\n\n')
    # cat(names(label), '\n\n')
    # cat(names(pred), '\n\n')
    # assemble the samples x repetitions prediction matrix
    pred.mat = matrix(data=NA, nrow=length(ref.names), ncol=length(runs))
    rownames(pred.mat) = ref.names
    if (any(substr(fold.name,1,14) == 'whole data set')) {
      colnames(pred.mat) = paste('Model', runs, sep='')
    } else {
      colnames(pred.mat) = paste('CV_rep', runs, sep='')
    }
    for (r in runs) {
      idx = which(r.idx == r)
      p = unlist(fold.pred.idx[idx])
      m = match(names(pred)[p], ref.names)
      # cat(sort(m), '\n\n')
      # cat(length(m), '\n\n')
      # cat(length(label), '\n\n')
      #if (!is.null(data.split)) {
      # stopifnot(all(sort(m) == 1:length(label$label)))
      #}
      pred.mat[m,r] = pred[p]
      stopifnot(all(names(pred)[p] == rownames(pred.mat)[m]))
    }
    correlation <- cor(pred.mat, method='spearman')
    cat('\nCorrelation between predictions from repeated CV:\n')
    cat('Min: ', min(correlation), ', Median: ', median(correlation), ', Mean: ', mean(correlation), '\n', sep='')
  }else{
    # single prediction per sample: return a one-column matrix
    pred.mat = as.matrix(pred,byrow=TRUE)
  }
  #print(pred.mat[1:3,1:3])
  invisible(list(pred = pred, mat = pred.mat))
}
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Select samples based on metadata
#' @description This function filters features, labels, and metadata based on a specific column in the metadata.
#' Provided with a column in the metadata and a range or a set of allowed values, the function will return the labels, metadata, and features for the samples matching the allowed range or set.
#' @param meta metadata object (data.frame with one row per sample)
#' @param feat features object (matrix with one column per sample)
#' @param label labels object (named vector; see TODO below)
#' @param filter name of the column from metadata on which the selection should be done
#' @param allowed.set a vector of allowed values, e.g. \code{"{1,2}"}
#' @param allowed.range a range of allowed values, e.g. \code{"[0,120]"}
#' @keywords SIAMCAT select.samples
#' @export
#' @return list containing values after selection: \itemize{
#'  \item \code{$feat} = features;
#'  \item \code{$label} = labels;
#'  \item \code{$meta} = metadata
#' }
select.samples <- function(meta, feat, label, filter, allowed.set = NULL, allowed.range = NULL){
  # TODO at the moment, this function takes label$label, not a label-object. Should be corrected
  # parse the interval notation, e.g. "[0,120]" -> c(0, 120)
  if (!is.null(allowed.range)) {
    allowed.range <- gsub('\\[|\\]','', allowed.range)
    allowed.range <- as.numeric(unlist(strsplit(allowed.range,',')))
    stopifnot(length(allowed.range) == 2)
    stopifnot(allowed.range[1] <= allowed.range[2])
  }
  if (!is.null(allowed.set)) {
    # parse the set notation, e.g. "{1,2}" -> c(1, 2), deduplicated and sorted
    allowed.set <- gsub('\\{|\\}','', allowed.set)
    allowed.set <- as.numeric(unlist(strsplit(allowed.set,',')))
    allowed.set <- sort(unique(allowed.set))
  }
  # exactly one of allowed.range / allowed.set must be provided
  if (!xor(is.null(allowed.range), is.null(allowed.set))) {
    stop('Neither allowed.range nor allowed.set (or both at the same time) have been provided, exiting!\n')
  } else {
    if (!is.null(allowed.range)) {
      cat('allowed.range = [', paste(allowed.range, collapse=','), ']\n', sep='')
    } else {
      cat('allowed.set = {', paste(allowed.set, collapse=','), '}\n', sep='')
    }
  }
  cat('\n')
  if (!filter %in% colnames(meta)) stop("The filter name is not present in colnames of the metadata. Stopping.\n")
  filter.var <- meta[, filter]
  # build the logical selection index; NA metadata values are always excluded
  if (!is.null(allowed.range)) {
    s.idx <- !is.na(filter.var) & filter.var >= allowed.range[1] & filter.var <= allowed.range[2]
    cat('Removed ', sum(!s.idx), ' samples with ', filter,
        ' not in [', paste(allowed.range, collapse=', '), '] (retaining ', sum(s.idx), ')\n', sep='')
  } else {
    s.idx <- !is.na(filter.var) & filter.var %in% allowed.set
    cat('Removed ', sum(!s.idx), ' samples with ', filter,
        ' not in {', paste(allowed.set, collapse=', '), '} (retaining ', sum(s.idx), ')\n', sep='')
  }
  # fixed: was list(NULL), which left a stray unnamed NULL as the first element
  results <- list()
  results$feat <- feat[, s.idx]
  results$label <- label[s.idx]
  # fixed: drop=FALSE so a single-column metadata table stays a data.frame
  results$meta <- meta[s.idx, , drop=FALSE]
  # invisible so interactive calls don't print the whole result list
  invisible(results)
}
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Split a dataset into training and a test sets.
#' @description This function prepares the cross-validation by splitting the data into \code{num.folds} training and test folds for \code{num.resample} times.
#' @param label label object
#' @param num.folds number of cross-validation folds (needs to be \code{>=2}), defaults to \code{2}
#' @param num.resample resampling rounds (values \code{<= 1} deactivate resampling), defaults to \code{1}
#' @param stratify boolean, should the splits be stratified s. t. an equal proportion of classes are present in each fold?, defaults to \code{TRUE}
#' @param inseparable column index or column name of metadata variable, defaults to \code{NULL}
#' @param meta metadata object, only needed when \code{inseparable} is given, defaults to \code{NULL}
#' @keywords SIAMCAT data.splitter
#' @export
#' @return list containing the indices of the training and test folds and the parameters of the splits: \itemize{
#'  \item \code{$training.folds} nested list, containing for \code{length(num.folds)} the sample names of the \code{length(num.resample)} training folds;
#'  \item \code{$test.folds} nested list, containing for \code{length(num.folds)} the sample names of the \code{length(num.resample)} test folds;
#'  \item \code{$num.resample} = number of repeated samplings;
#'  \item \code{$num.folds} = number of folds
#'}
# TODO add detail section for this function
data.splitter <- function(label, num.folds=2, num.resample=1, stratify=TRUE, inseparable=NULL, meta=NULL){
  # treat empty strings and textual placeholders as "no inseparable variable given"
  if (is.null(inseparable) || inseparable=='' || toupper(inseparable)=='NULL' || toupper(inseparable)=='NONE' || toupper(inseparable)=='UNKNOWN') {
    inseparable <- NULL
  }
  # work on a named numeric copy of the labels
  labelNum <- as.numeric(label$label)
  names(labelNum) <- names(label$label)
  exm.ids <- names(labelNum)
  # parse label description
  classes <- sort(label$info$class.descr)
  ### check arguments, silently repairing invalid fold/resample counts
  if (num.resample < 1) {
    cat('+++ Resetting num.resample = 1 (', num.resample, ' is an invalid number of resampling rounds)\n', sep='')
    num.resample <- 1
  }
  if (num.folds < 2) {
    cat('+++ Resetting num.folds = 2 (', num.folds, ' is an invalid number of folds)\n', sep='')
    num.folds <- 2
  }
  if (!is.null(inseparable) && stratify) {
    cat('+++ Resetting stratify to FALSE (Stratification is not supported when inseparable is given)\n')
    stratify <- FALSE
  }
  if (num.folds >= length(labelNum)) {
    cat('+++ Performing un-stratified leave-one-out (LOO) cross-validation\n')
    stratify <- FALSE
    num.folds <- length(labelNum)-1
  }
  if (!is.null(inseparable) && is.null(meta)){
    stop('Meta-data must be provided if the inseparable parameter is not NULL')
  }
  if (!is.null(inseparable)){
    if (is.numeric(inseparable) && length(inseparable) == 1){
      stopifnot(inseparable <= ncol(meta))
    } else if (is.character(inseparable) && length(inseparable) == 1){
      # fixed: condition was 'length(inseparable == 1)', which is the length of a
      # logical vector and therefore TRUE for ANY non-empty input, so
      # multi-element character vectors were never rejected
      stopifnot(inseparable %in% colnames(meta))
    } else {
      stop('Inseparable parameter must be either a single column index or a single column name of metadata matrix')
    }
  }
  train.list <- list(NULL)
  test.list <- list(NULL)
  for (r in 1:num.resample) {
    # reshuffle the examples for every resampling round
    labelNum <- sample(labelNum)
    foldid <- assign.fold(label = labelNum, num.folds=num.folds, stratified = stratify, inseparable = inseparable, meta=meta)
    names(foldid) <- names(labelNum)
    stopifnot(length(labelNum) == length(foldid))
    stopifnot(length(unique(foldid)) == num.folds)
    train.temp <- list(NULL)
    test.temp <- list(NULL)
    cat('\n+++ Splitting the dataset:\n')
    for (f in 1:num.folds) {
      # make sure each fold contains examples from all classes;
      # for stratify==TRUE this should be tested before assignment of test/training set
      if (stratify){
        stopifnot(all(sort(unique(labelNum[foldid==f])) == classes))
      }
      # select test examples (fold f) vs training examples (all other folds)
      test.idx <- which(foldid == f)
      train.idx <- which(foldid != f)
      train.temp[f] <- list(names(foldid)[train.idx])
      test.temp[f] <- list(names(foldid)[test.idx])
      # for stratify==FALSE, all classes must at least be present in the training set
      # e.g. in leave-one-out CV, the test fold cannot contain all classes
      if (!stratify){
        stopifnot(all(sort(unique(labelNum[foldid != f])) == classes))
      }
      stopifnot(length(intersect(train.idx, test.idx)) == 0)
      cat(' + Fold ', f, ' contains ', sum(foldid==f), ' examples\n', sep='')
    }
    train.list[[r]] <- train.temp
    test.list[[r]] <- test.temp
  }
  return(list("training.folds" = train.list, "test.folds" = test.list, "num.resample" = num.resample, "num.folds" = num.folds))
}
##### function to partition a set of examples into cross-validation folds
### Works analogously to the splitting performed by data.splitter
#' @title Assign examples to cross-validation folds
#' @description Assigns each example to one of \code{num.folds} folds, optionally
#' stratified by class label, or keeping all samples that share a value of the
#' \code{inseparable} metadata variable within the same fold.
#' @param label numeric vector of class labels (one entry per example)
#' @param num.folds number of folds to create
#' @param stratified boolean, should each fold contain an equal proportion of every class?
#' @param inseparable column index or name of a metadata variable whose strata must not be split across folds, defaults to \code{NULL}
#' @param meta metadata object, only needed when \code{inseparable} is given, defaults to \code{NULL}
#' @return numeric vector of fold assignments (values in \code{1:num.folds}), same length as \code{label}
#' @export
assign.fold <- function(label, num.folds, stratified, inseparable = NULL, meta=NULL) {
  foldid <- rep(0, length(label))
  classes <- sort(unique(label))
  # stratify positive examples
  if (stratified) {
    # stratification requires at least num.folds examples of every class
    if (any(as.data.frame(table(label))[,2] < num.folds)) {
      stop("+++ Number of CV folds is too large for this data set to maintain stratification. Reduce num.folds or turn stratification off. Exiting.\n")
    }
    # assign folds separately within each class so class proportions are preserved
    for (c in seq_along(classes)) {
      idx <- which(label==classes[c])
      foldid[idx] <- sample(rep(1:num.folds, length.out=length(idx)))
    }
  } else {
    # make sure num.folds does not exceed the number of examples
    if (length(label) <= num.folds){
      # fixed: was print(), which shows quotes and a literal \n instead of a newline
      cat("+++ num.samples is exceeding number of folds, setting CV to (k-1) unstratified CV\n")
      num.folds <- length(label)-1
    }
    if (!is.null(inseparable)) {
      # all samples belonging to one stratum receive the same fold id
      strata <- unique(meta[,inseparable])
      sid <- sample(rep(1:num.folds, length.out=length(strata)))
      for (s in seq_along(strata)) {
        idx <- which(meta[,inseparable] == strata[s])
        foldid[idx] <- sid[s]
      }
      stopifnot(all(!is.na(foldid)))
    } else {
      foldid <- sample(rep(1:num.folds, length.out=length(label)))
    }
  }
  # sanity check: for each test fold the training fold (i.e. all other folds
  # together) must contain examples from all classes; for stratified CV the
  # stronger condition holds that every single fold contains all classes
  if (!stratified){
    for (f in 1:num.folds) {
      stopifnot(all(sort(unique(label[foldid!=f])) == classes))
    }
  } else {
    for (f in 1:num.folds) {
      stopifnot(all(sort(unique(label[foldid==f])) == classes))
    }
  }
  stopifnot(length(label) == length(foldid))
  return(foldid)
}
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Model training
#' @description This function trains a machine learning model on the training data
#' @param feat feature object
#' @param label label object
#' @param method string, specifies the type of model to be trained, may be one of these: \code{c("lasso", "enet", "ridge", "lasso_ll", "ridge_ll", "randomForest")}
#' @param data.split filename containing the training samples or list of training instances produced by \link{data.splitter}, defaults to \code{NULL} leading to training on the complete dataset
#' @param stratify boolean, should the folds in the internal cross-validation be stratified?
#' @param modsel.crit list, specifies the model selection criterion during internal cross-validation, may contain these: \code{c("auc", "f1", "acc", "pr")}
#' @param min.nonzero.coeff integer number of minimum nonzero coefficients that should be present in the model (only for \code{"lasso"}, \code{"ridge"}, and \code{"enet"}
#' @param param.set a list of extra parameters for the mlr run, may contain: \code{cost} - for lasso_ll and ridge_ll; \code{alpha} for enet and \code{ntree, mtry} for randomForest
#' @export
#' @keywords SIAMCAT plm.trainer
#' @return list containing \itemize{
#' \item \code{models.list} the list of trained models for each cross-validation fold
#' \item \code{out.matrix} ?
#' \item \code{W.mat} ?
#'}
# TODO add details section for this function
train.model <- function(feat, label, method = c("lasso", "enet", "ridge", "lasso_ll", "ridge_ll", "randomForest"),
data.split=NULL, stratify = TRUE,
modsel.crit=list("auc"), min.nonzero.coeff = 1, param.set=NULL){
# TODO 1: modsel.criterion should be implemented
# check modsel.crit
if (!all(modsel.crit %in% c("auc", "f1", "acc", "pr", "auprc"))){
cat("Unkown model selection criterion... Defaulting to AU-ROC!\n")
measure <- list(mlr::auc)
} else {
measure <- list()
}
for (m in modsel.crit){
if (m == 'auc'){
measure[[length(measure)+1]] <- mlr::auc
} else if (m == 'acc'){
measure[[length(measure)+1]] <- mlr::acc
} else if (m == 'f1'){
measure[[length(measure)+1]] <- mlr::f1
} else if (m == 'pr' || m == 'auprc'){
auprc <- makeMeasure(id = "auprc", minimize = FALSE, best = 1, worst = 0,
properties = c("classif", "req.pred", "req.truth", "req.prob"),
name = "Area under the Precision Recall Curve",
fun = function(task, model, pred, feats, extra.args) {
#if (anyMissing(pred$data$response) || length(unique(pred$data$truth)) == 1L)
# return(NA_real_)
measureAUPRC(getPredictionProbabilities(pred), pred$data$truth, pred$task.desc$negative, pred$task.desc$positive)
})
measure[[length(measure)+1]] <- auprc