Commit 40b68d17 authored by Konrad Zych's avatar Konrad Zych

adding clean code

parent cb22da5e
Package: SIAMCAT
Type: Package
Title: Statistical Inference of Associations between Microbial Communities And host phenoTypes
Version: 0.2.0
Author: Georg Zeller, Nicolai Karcher, Konrad Zych
Maintainer: Konrad Zych <konrad.zych@embl.de>
Description: SIAMCAT is a pipeline for Statistical Inference of Associations between Microbial Communities And host phenoTypes. A primary goal of analyzing microbiome data is to determine changes in community composition that are associated with environmental factors. In particular, linking human microbiome composition to host phenotypes such as diseases has become an area of intense research. For this, robust statistical modeling and biomarker extraction toolkits are crucially needed. SIAMCAT provides a full pipeline supporting data preprocessing, statistical association testing, statistical modeling (LASSO logistic regression) including tools for evaluation and interpretation of these models (such as cross validation, parameter selection, ROC analysis and diagnostic model plots).
Depends:
R (>= 3.1.0),
RColorBrewer,
beanplot,
glmnet,
LiblineaR,
pROC
Imports:
optparse,
colorRamps,
gelnet
License: GPL-3
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.0.1
# Generated by roxygen2: do not edit by hand
S3method(plot,data.range)
S3method(predict,plm)
export(add.meta.pred)
export(appendDirName)
export(assign.fold)
export(check.associations)
export(confounder.check)
export(create.tints.rgb)
export(data.splitter)
export(draw.error.bar)
export(eval.result)
export(evaluation.model.plot)
export(filter.feat)
export(interpretor.model.plot)
export(label.plot.horizontal)
export(make_evaler_options)
export(make_filter_options)
export(make_frozen_normalizer_options)
export(make_meta_adder_options)
export(make_model_interpretor_options)
export(make_normalizer_options)
export(normalize.feat)
export(parse.label.header)
export(parse.model.header)
export(plm.predictor)
export(plm.trainer)
export(read.features)
export(read.labels)
export(read.meta)
export(sample.strat)
export(select.model)
export(select.samples)
export(train.plm)
export(validate.data)
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 01.06.2017
# GNU GPL 3.0
###
# evaluates the predictive performance of a classifier on a labeled data set
# returns a list with vectors containing TP, FP, TN, FN for each threshold value on the predictions
# (where TP = true positives, FP = false positives, TN = true negatives, FN = false negatives)
eval.classifier <- function(predictions, test.label, label) {
stopifnot(is.null(dim(test.label))) # test.label is expected to be a plain vector
stopifnot(length(unique(test.label)) == 2)
stopifnot(all(is.finite(predictions)))
# calculate thresholds, one between each subsequent pair of sorted prediction values
# this is agnostic to whether predictions are in matrix or vector format (see below)
thr <- predictions
dim(thr) <- NULL
thr <- sort(unique(thr))
thr <- rev(c(min(thr)-1, (thr[-1]+thr[-length(thr)])/2, max(thr)+1))
if (is.null(dim(predictions))) {
# assuming that a single model was applied to predict the data set
stopifnot(length(test.label) == length(predictions))
# actual evaluations per threshold value
tp = vector('numeric', length(thr))
fp = vector('numeric', length(thr))
tn = vector('numeric', length(thr))
fn = vector('numeric', length(thr))
for (i in 1:length(thr)) {
tp[i] = sum(test.label==label$positive.lab & predictions>thr[i])
fp[i] = sum(test.label==label$negative.lab & predictions>thr[i])
tn[i] = sum(test.label==label$negative.lab & predictions<thr[i])
fn[i] = sum(test.label==label$positive.lab & predictions<thr[i])
}
} else {
# assuming that several models were applied to predict the same data and predictions of each model occupy one column
stopifnot(length(test.label) == nrow(predictions))
tp = matrix(0, nrow=length(thr), ncol=ncol(predictions))
fp = matrix(0, nrow=length(thr), ncol=ncol(predictions))
tn = matrix(0, nrow=length(thr), ncol=ncol(predictions))
fn = matrix(0, nrow=length(thr), ncol=ncol(predictions))
for (c in 1:ncol(predictions)) {
for (r in 1:length(thr)) {
tp[r,c] = sum(test.label==label$positive.lab & predictions[,c] > thr[r])
fp[r,c] = sum(test.label==label$negative.lab & predictions[,c] > thr[r])
tn[r,c] = sum(test.label==label$negative.lab & predictions[,c] < thr[r])
fn[r,c] = sum(test.label==label$positive.lab & predictions[,c] < thr[r])
}
}
}
return(list(tp=tp, tn=tn, fp=fp, fn=fn, thresholds=thr))
}
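# usage sketch (illustrative only; assumes the functions in this file have been sourced):
# the label list with $label, $positive.lab and $negative.lab mirrors the structure
# assumed throughout this file
test.label <- c(1, 1, 1, -1, -1, -1)
predictions <- c(0.9, 0.8, 0.4, 0.6, 0.2, 0.1)
label <- list(label=test.label, positive.lab=1, negative.lab=-1)
ev <- eval.classifier(predictions, test.label, label)
ev$thresholds # one threshold between each pair of sorted prediction values, plus both extremes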
# returns a list with x (false positive rate) and y (true positive rate) values for plotting a receiver operating characteristic curve
# eval is a list produced by eval.classifier
get.roc = function(eval) {
if (is.null(dim(eval$tp))) {
stopifnot(!is.null(eval$fp))
stopifnot(!is.null(eval$tn))
stopifnot(!is.null(eval$fn))
fpr = eval$fp / (eval$tn + eval$fp)
tpr = eval$tp / (eval$tp + eval$fn)
} else {
stopifnot(ncol(eval$tp) == ncol(eval$fp))
stopifnot(ncol(eval$tp) == ncol(eval$tn))
stopifnot(ncol(eval$tp) == ncol(eval$fn))
fpr = matrix(NA, nrow=nrow(eval$tp), ncol=ncol(eval$tp))
tpr = matrix(NA, nrow=nrow(eval$tp), ncol=ncol(eval$tp))
for (c in 1:ncol(eval$tp)) {
fpr[,c] = eval$fp[,c] / (eval$tn[,c] + eval$fp[,c])
tpr[,c] = eval$tp[,c] / (eval$tp[,c] + eval$fn[,c])
}
}
return(list(x=fpr, y=tpr))
}
# plots a receiver operating characteristic curve
plot.roc.curve = function(eval) {
roc = get.roc(eval)
plot(roc$x, roc$y,
xlim=c(0,1), ylim=c(0,1), type='l', # type='o' pch=16, cex=0.4,
xlab='False positive rate', ylab='True positive rate')
}
# returns a list with x (recall) and y (precision) values for plotting a precision-recall curve
get.pr = function(eval) {
tpr = eval$tp / (eval$tp + eval$fn)
ppv = eval$tp / (eval$tp + eval$fp)
# at thresholds where the classifier makes no positive predictions at all,
# we (somewhat arbitrarily) set its precision to 1
ppv[is.na(ppv)] = 1.0
return(list(x=tpr, y=ppv))
}
# plots a precision-recall curve
plot.pr.curve = function(eval) {
pr = get.pr(eval)
plot(pr$x, pr$y,
xlim=c(0,1), ylim=c(0,1), type='l', # type='o' pch=16, cex=0.4,
xlab='Recall (TPR)', ylab='Precision (PPV)')
}
# calculates the area under a curve using a trapezoid approximation
area.trapez = function(x, y) {
if (x[1] > x[length(x)]) {
x = rev(x)
y = rev(y)
}
xd = x[-1] - x[-length(x)]
ym = 0.5 * (y[-1] + y[-length(y)])
return(as.numeric(xd %*% ym))
}
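# sanity check of the trapezoid rule on a known integral:
# the area under y = x^2 over [0, 1] is 1/3
x.grid <- seq(0, 1, by=0.01)
area.trapez(x.grid, x.grid^2) # ~0.3333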
# calculates the area under the ROC curve (over the interval [0, max.fpr], if specified)
# get.roc takes the list returned by eval.classifier and returns a list (x, y) containing FPR and TPR values for each
# threshold set in eval.classifier
calc.auroc = function(eval, max.fpr=1.0) {
roc = get.roc(eval)
idx = roc$x <= max.fpr
return(area.trapez(roc$x[idx], roc$y[idx]))
}
# calculates the area under the precision-recall curve (over the interval [0, max.tpr], if specified)
calc.aupr = function(eval, max.tpr=1.0) {
pr = get.pr(eval)
idx = pr$x <= max.tpr
return(area.trapez(pr$x[idx], pr$y[idx]))
}
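# continuing the toy example from above: areas under both curves
calc.auroc(ev) # full AU-ROC
calc.auroc(ev, max.fpr=0.5) # partial AU-ROC over the FPR interval [0, 0.5]
calc.aupr(ev) # AU-PRC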
# returns the positions of local minima in a given (ordered) vector
# local minima are defined as being smaller than all ext adjacent values in both directions
local.min = function(y, ext) {
m = list()
for (i in 1:length(y)) {
env = c((i-ext):(i-1), (i+1):(i+ext))
env = env[env > 0 & env <= length(y)]
if (y[i] < min(y[env])) {
m = c(m,i)
}
}
return(unlist(m))
}
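# quick check: with a window of ext=1, positions 2 and 5 lie below both of their neighbors
local.min(c(5, 3, 4, 6, 2, 7, 8), 1) # returns c(2, 5)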
plot.model.eval <- function(fn.plot, pred, eval.data, model.type, label){
pdf(fn.plot)
plot(NULL, xlim=c(0,1), ylim=c(0,1), xlab='False positive rate', ylab='True positive rate', type='n')
title(paste('ROC curve for', model.type, 'model', sep=' '))
abline(a=0, b=1, lty=3)
if (dim(pred)[2] > 1) {
aucs = vector('numeric', dim(pred)[2])
for (c in 1:dim(pred)[2]) {
roc.c = eval.data$roc.all[[c]]
lines(1-roc.c$specificities, roc.c$sensitivities, col=gray(runif(1,0.2,0.8)))
aucs[c] = eval.data$auc.all[c]
cat('AU-ROC (resampled run ', c, '): ', format(aucs[c], digits=3), '\n', sep='')
}
}
roc.summ = eval.data$roc.average[[1]]
lines(1-roc.summ$specificities, roc.summ$sensitivities, col='black', lwd=2)
auroc = eval.data$auc.average[1]
# plot CI
x = as.numeric(rownames(roc.summ$ci))
yl = roc.summ$ci[,1]
yu = roc.summ$ci[,3]
polygon(1-c(x, rev(x)), c(yl, rev(yu)), col='#88888844', border=NA)
if (dim(pred)[2] > 1) {
cat('Mean-pred. AU-ROC:', format(auroc, digits=3), '\n')
cat('Averaged AU-ROC: ', format(mean(aucs), digits=3), ' (sd=', format(sd(aucs), digits=4), ')\n', sep='')
text(0.7, 0.1, paste('Mean-prediction AUC:', format(auroc, digits=3)))
} else {
cat('AU-ROC:', format(auroc, digits=3), '\n')
text(0.7, 0.1, paste('AUC:', format(auroc, digits=3)))
}
# precision recall curve
plot(NULL, xlim=c(0,1), ylim=c(0,1), xlab='Recall', ylab='Precision', type='n')
title(paste('Precision-recall curve for', model.type, 'model', sep=' '))
abline(h=mean(label$label==label$positive.lab), lty=3) # baseline precision equals the prevalence of the positive class
if (dim(pred)[2] > 1) {
aucspr = vector('numeric', dim(pred)[2])
for (c in 1:dim(pred)[2]) {
pr = eval.data$pr.list[[c]]
lines(pr$x, pr$y, col=gray(runif(1,0.2,0.8)))
aucspr[c] = eval.data$aucspr[c]
cat('AU-PRC (resampled run ', c, '): ', format(aucspr[c], digits=3), '\n', sep='')
}
ev = eval.data$ev.list[[length(eval.data$ev.list)]]
} else {
ev = eval.data$ev.list[[1]]
}
pr = get.pr(ev)
lines(pr$x, pr$y, col='black', lwd=2)
aupr = calc.aupr(ev)
if (dim(pred)[2] > 1) {
cat('Mean-pred. AU-PRC:', format(aupr, digits=3), '\n')
cat('Averaged AU-PRC: ', format(mean(aucspr), digits=3), ' (sd=', format(sd(aucspr), digits=4), ')\n', sep='')
text(0.7, 0.1, paste('Mean-prediction AUC:', format(aupr, digits=3)))
} else {
cat('AU-PRC:', format(aupr, digits=3), '\n')
text(0.7, 0.1, paste('AUC:', format(aupr, digits=3)))
}
tmp = dev.off()
}
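# sketch of the eval.data structure that plot.model.eval expects, assembled here for a
# single prediction column; the field names (roc.average, auc.average, ev.list, pr.list)
# are inferred from the accesses above, and pROC (a package dependency) provides roc(),
# with of='se' requesting the sensitivity confidence band drawn as a polygon
roc.obj <- roc(response=test.label, predictor=predictions, ci=TRUE, of='se')
eval.data <- list(roc.average=list(roc.obj), auc.average=roc.obj$auc,
ev.list=list(ev), pr.list=list(get.pr(ev)))
plot.model.eval('toy_eval.pdf', as.matrix(predictions), eval.data, 'toy', label)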
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 25.06.2017
# GNU GPL 3.0
###
##### function to draw a stratified sample from the label vector
#' @export
sample.strat = function(label, frac) {
classes = unique(label)
s = NULL
for (c in classes) {
idx = which(label == c)
s = c(s, sample(idx, round(frac * length(idx))))
}
return(s)
}
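# usage sketch: draw an 80% subsample that preserves class proportions
label.vec <- c(rep(-1, 10), rep(1, 6))
train.idx <- sample.strat(label.vec, 0.8)
table(label.vec[train.idx]) # 8 negative and 5 positive examples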
##### function to train a penalized linear model (LASSO, Elastic Net, ridge, ...) for given hyperparameters
#' @export
train.plm <- function(feat, label, method, hyper.par) {
method <- tolower(method)
model <- list(original.model=NULL, feat.weights=NULL)
# note that some of the logit models are set up inversely to each other,
# requiring the coefficient signs to be swapped
if (method == 'lasso') {
# here we will ignore any hyper.par$alpha to ensure a LASSO model (and not an Elastic Net) is trained
lambda <- hyper.par$lambda
label.fac <- factor(label$label, levels=c(label$negative.lab, label$positive.lab))
# note that the ordering of the label factor levels is important!
model$original.model <- glmnet(x=feat, y=label.fac, family='binomial', standardize=FALSE, alpha=1, lambda=lambda)
coef <- coefficients(model$original.model)
bias.idx <- which(rownames(coef) == '(Intercept)')
# remove the bias/intercept term and flip the coefficient sign for consistency with the other models
model$feat.weights <- (-1) * as.numeric(coef[-bias.idx,])
} else if (method == 'lasso_ll' || method == 'ridge_ll') {
# LiblineaR type 6 is L1-regularized logistic regression, type 0 is L2-regularized
liblin.type <- ifelse(method == 'lasso_ll', 6, 0)
model$original.model <- LiblineaR(feat, label$label,
type=liblin.type, bias=TRUE, epsilon=1e-6,
cost=hyper.par$C)
stopifnot((is.numeric(label$positive.lab) && is.numeric(label$negative.lab)) || label$positive.lab < label$negative.lab)
# Apparently, LiblineaR orders class labels differently depending on "type".
# This needs to be checked more thoroughly, but works for now.
if(model$original.model$ClassNames[2] != label$positive.lab){
# Since the prediction function is mirrored on the y-axis, we also need to generate "swapped" models.
# This means model$ClassNames has to have the structure c(negative.lab, positive.lab) and the model
# weights have to be mirrored as well (negative numbers become positive and vice versa).
model$original.model$ClassNames <- rev(model$original.model$ClassNames)
model$original.model$W = model$original.model$W * -1
}
bias.idx <- which(colnames(model$original.model$W) == 'Bias')
model$feat.weights <- as.numeric(model$original.model$W[,-bias.idx])
} else if (method == 'enet') {
stopifnot(hyper.par$alpha < 1) # alpha == 1 would yield a LASSO model instead of an Elastic Net
model$original.model <- glmnet(feat, factor(label$label, levels=c(label$negative.lab, label$positive.lab)),
family='binomial', standardize=FALSE,
alpha=hyper.par$alpha, lambda=hyper.par$lambda)
coef <- coefficients(model$original.model)
coef <- coef[,ncol(coef)]
bias.idx <- which(names(coef) == '(Intercept)')
# remove the bias/intercept term and flip the coefficient sign for consistency with the other models
model$feat.weights <- -1 * as.numeric(coef[-bias.idx])
} else if (method == 'gelnet') {
stopifnot(hyper.par$alpha < 1) # alpha == 1 would yield a LASSO model instead of an Elastic Net
model$original.model <- gelnet(feat, factor(label$label, levels=c(label$negative.lab, label$positive.lab)),
l2=hyper.par$alpha, l1=hyper.par$lambda,
silent=TRUE)
# for consistency with the other models the coefficient sign needs to be flipped
model$feat.weights <- -1 * as.numeric(model$original.model$w)
} else {
stop('unknown method')
}
return(model)
}
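# usage sketch (illustrative only; glmnet is loaded via the package dependencies):
# train a LASSO model on simulated data in which only feature 1 carries signal
set.seed(42)
feat <- matrix(rnorm(100*20), nrow=100,
dimnames=list(paste0('sample', 1:100), paste0('feat', 1:20)))
label <- list(label=ifelse(feat[,1] + rnorm(100) > 0, 1, -1),
positive.lab=1, negative.lab=-1)
model <- train.plm(feat, label, 'lasso', list(lambda=0.05))
sum(model$feat.weights != 0) # number of features retained by the L1 penalty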
#' @export
predict.plm <- function(feat, model, method, opt.hyper.par) {
method <- tolower(method)
# note that some of the logit models are set up inversely to each other,
# requiring the coefficient/prediction columns to be selected accordingly via col.idx
if (method == 'lasso') {
col.idx <- 1
# glmnet's predict function needs to be given a lambda value
pred <- predict(model$original.model, feat,
alpha=1, s=opt.hyper.par$lambda,
type="response")[,col.idx]
} else if (method == 'lasso_ll' || method == 'ridge_ll') {
col.idx <- 2 # this is a bit counter-intuitive given the column names of the predict data frame
pred <- predict(model$original.model, feat,
proba=TRUE)$probabilities[,col.idx]
# Adding rownames here is important.
names(pred) <- rownames(feat)
} else if (method == 'enet') {
col.idx <- 1
# glmnet's predict function needs to be given a lambda value
pred <- predict(model$original.model, feat,
alpha=opt.hyper.par$alpha, s=opt.hyper.par$lambda,
type="response")[,col.idx]
} else if (method == 'gelnet') {
# gelnet's predictions need to be calculated "by hand"
m = model$original.model
pred <- 1.0 / (1.0 + exp(feat %*% m$w + m$b))
names(pred) <- rownames(feat)
} else {
stop('unknown method')
}
return(pred)
}
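# continuing the sketch above: for 'lasso', the lambda of interest has to be passed again
# so that glmnet's predict function evaluates the right point of the regularization path
pred <- predict.plm(feat[1:10,], model, 'lasso', list(lambda=0.05))
head(pred) # predicted probabilities, named by sample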
#' @export
select.model <- function(feat, label, method, hyper.par, min.nonzero=1,
num.folds=5, stratified=FALSE, foldid) {
method <- tolower(method)
opt.hyper.par <- NULL
nonzero.coeff <- matrix(0, num.folds, length(hyper.par$lambda))
# here we use the area under the ROC curve as a model selection criterion,
# but there are alternatives (e.g. accuracy, partial AU-ROC or the area under the precision-recall curve)
fold.id <- foldid
p <- rep(NA, length(label$label))
if (method == 'lasso') {
aucs <- rep(0, length(hyper.par$lambda))
for (i in 1:length(hyper.par$lambda)) {
hp <- NULL
hp$lambda <- hyper.par$lambda[i]
for (f in 1:num.folds) {
test.idx <- which(fold.id == f)
train.idx <- which(fold.id != f)
label.train <- label
label.train$label <- label.train$label[train.idx]
model <- train.plm(feat[train.idx,], label.train, method, hp)
p[test.idx] <- predict.plm(feat[test.idx,], model, method, hp)
nonzero.coeff[f,i] = sum(model$feat.weights[1:(ncol(feat)-1)] != 0)
}
aucs[i] <- roc(response=label$label, predictor=p)$auc
cat(' ', method, ' model selection: (lambda=', hyper.par$lambda[i],
') AU-ROC=', format(aucs[i], digits=3), '\n', sep='')
}
# discard lambda values that did not retain enough nonzero coefficients in every fold
suff.nonzero <- apply(nonzero.coeff, 2, min) > min.nonzero
nonzero.coeff <- nonzero.coeff[,suff.nonzero]
aucs <- aucs[suff.nonzero]
# subset the lambda grid accordingly, so that opt.idx indexes the filtered grid
lambda.cand <- hyper.par$lambda[suff.nonzero]
opt.idx <- which.max(aucs)
opt.hyper.par$lambda <- lambda.cand[opt.idx]
cat(' optimal lambda =', lambda.cand[opt.idx], '\n')
} else if (method == 'lasso_ll' || method == 'ridge_ll' || method == 'l1_svm' || method == 'l2_svm') {
aucs <- rep(0, length(hyper.par$C))
for (i in 1:length(hyper.par$C)) {
hp <- NULL
hp$C <- hyper.par$C[i]
for (f in 1:num.folds) {
test.idx <- which(fold.id == f)
train.idx <- which(fold.id != f)
label.train <- label
label.train$label <- label.train$label[train.idx]
model <- train.plm(feat[train.idx,], label.train, method, hp)
p[test.idx] <- predict.plm(feat[test.idx,], model, method, hp)
}
aucs[i] <- roc(response=label$label, predictor=p)$auc
cat(' ', method, ' model selection: (C=', hyper.par$C[i],
') AU-ROC=', format(aucs[i], digits=3), '\n', sep='')
}
opt.idx <- which.max(aucs)
opt.hyper.par$C <- hyper.par$C[opt.idx]
cat(' optimal C =', hyper.par$C[opt.idx], '\n')
} else if (method == 'enet' || method == 'gelnet') {
aucs <- matrix(0, nrow=length(hyper.par$alpha),
ncol=length(hyper.par$lambda))
for (i in 1:length(hyper.par$alpha)) {
for (j in 1:length(hyper.par$lambda)) {
hp <- NULL
hp$alpha <- hyper.par$alpha[i]
hp$lambda <- hyper.par$lambda[j]
for (f in 1:num.folds) {
test.idx <- which(fold.id == f)
train.idx <- which(fold.id != f)
label.train <- label
label.train$label <- label.train$label[train.idx]
model <- train.plm(feat[train.idx,], label.train, method, hp)
p[test.idx] <- predict.plm(feat[test.idx,], model, method, hp)
}
aucs[i,j] <- roc(response=label$label, predictor=p)$auc
cat(' ', method, ' model selection: (alpha=', hyper.par$alpha[i],
', lambda=', hyper.par$lambda[j], ') AU-ROC=',
format(aucs[i,j], digits=3), '\n', sep='')
}
}
opt.idx <- arrayInd(which.max(aucs), dim(aucs))
opt.hyper.par$alpha <- hyper.par$alpha[opt.idx[1]]
opt.hyper.par$lambda <- hyper.par$lambda[opt.idx[2]]
cat(' optimal alpha =' , hyper.par$alpha[opt.idx[1]],
', lambda =', hyper.par$lambda[opt.idx[2]], '\n')
} else {
stop('unknown method')
}
return(opt.hyper.par)
}
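# continuing the sketch above: pick lambda by cross-validated AU-ROC over a small grid
# (foldid has to be supplied; here a simple unstratified assignment)
foldid.toy <- sample(rep(1:5, length.out=length(label$label)))
opt <- select.model(feat, label, 'lasso', list(lambda=10^seq(-3, -1, length.out=4)),
min.nonzero=1, num.folds=5, foldid=foldid.toy)
opt$lambda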
##### function to partition training set into cross-validation folds for model selection
### works analogously to the function used in data_splitter.r
#' @export
assign.fold <- function(label, num.folds, stratified, inseparable = NULL, foldid, meta.data = NULL) {
classes <- sort(unique(label))
# stratified cross-validation: distribute the examples of each class evenly across folds
if (stratified) {
# if stratified, num.folds must not exceed the number of examples in the smallest class
if (any(as.data.frame(table(label))[,2] < num.folds)) {
stop("+++ Number of CV folds is too large for this data set to maintain stratification. Reduce num.folds or turn stratification off.")
}
for (c in 1:length(classes)) {
idx <- which(label==classes[c])
foldid[idx] <- sample(rep(1:num.folds, length.out=length(idx)))
}
} else {
# if unstratified, make sure the number of folds does not exceed the number of samples
if (length(label) <= num.folds){
print("+++ Number of folds exceeds the number of samples, reducing to (n-1)-fold unstratified CV\n")
num.folds <- length(label)-1
}
if (!is.null(inseparable)) {
strata <- unique(meta.data[,inseparable])
sid <- sample(rep(1:num.folds, length.out=length(strata)))
foldid <- rep(NA, length(label))
for (s in 1:length(strata)) {
idx <- which(meta.data[,inseparable] == strata[s])
foldid[idx] <- sid[s]