Commit 297bae13 authored by Jakob Wirbel

switch to mlr3 from mlr.

parent 70495558
#' @title LiblineaR Classification Learner
#' @name LearnerClassifLiblineaR
#'
#' @details Type of SVC depends on \code{type} argument:
#' \itemize{
#' \item \code{0} L2-regularized logistic regression (primal)
#' \item \code{1} L2-regularized L2-loss support vector classification (dual)
#' \item \code{2} L2-regularized L2-loss support vector classification (primal)
#' \item \code{3} L2-regularized L1-loss support vector classification (dual)
#' \item \code{4} Support vector classification by Crammer and Singer
#' \item \code{5} L1-regularized L2-loss support vector classification
#' \item \code{6} L1-regularized logistic regression
#' \item \code{7} L2-regularized logistic regression (dual)
#' }
#' If number of records > number of features, \code{type = 2} is faster
#' than \code{type = 1}
#' (Hsu et al. 2003).
#'
#' Note that probabilistic predictions are only available for
#' types \code{0}, \code{6}, and \code{7}.
#' The default \code{epsilon} value depends on the \code{type} parameter,
#' see [LiblineaR::LiblineaR].
#'
#' @encoding UTF-8
# taken from https://github.com/mlr-org/mlr3extralearners
LearnerClassifLiblineaR <- R6::R6Class("LearnerClassifLiblineaR",
inherit = LearnerClassif, public = list(
#' @description
#' Creates a new instance of this [R6][R6::R6Class] class.
initialize = function() {
ps = ps(
type = p_int(default = 0, lower = 0, upper = 7, tags = "train"),
cost = p_dbl(default = 1, lower = 0, tags = "train"),
epsilon = p_dbl(lower = 0, tags = "train"),
bias = p_dbl(default = 1, tags = "train"),
cross = p_int(default = 0L, lower = 0L, tags = "train"),
verbose = p_lgl(default = FALSE, tags = "train"),
wi = p_uty(default = NULL, tags = "train"),
findC = p_lgl(default = FALSE, tags = "train"),
useInitC = p_lgl(default = TRUE, tags = "train")
)
# 50 is an arbitrary choice here
ps$add_dep("findC", "cross", CondAnyOf$new(2:50))
ps$add_dep("useInitC", "findC", CondEqual$new(TRUE))
super$initialize(
id = "classif.liblinear",
packages = "LiblineaR",
feature_types = "numeric",
predict_types = c("response", "prob"),
param_set = ps,
properties = c("twoclass", "multiclass")
)
}
),
private = list(
.train = function(task) {
pars = self$param_set$get_values(tags = "train")
data = task$data()
train = task$data(cols = task$feature_names)
target = task$truth()
type = ifelse(is.null(pars$type), 0, pars$type)
pars = pars[names(pars) != "type"]
mlr3misc::invoke(LiblineaR::LiblineaR, data = train,
target = target, type = type, .args = pars)
},
.predict = function(task) {
newdata = task$data(cols = task$feature_names)
type = ifelse(is.null(self$param_set$values$type), 0,
self$param_set$values$type)
if (!type %in% c(0, 6, 7) && self$predict_type == "prob") {
stop("'prob' predict_type only possible if",
" `type` is `0`, `6`, or `7`.")
}
if (self$predict_type == "prob") {
return(list(
prob = mlr3misc::invoke(predict, self$model,
newx = newdata, proba = TRUE)$probabilities))
} else {
return(list(
response = mlr3misc::invoke(predict, self$model,
newx = newdata)$predictions))
}
}
)
)
mlr_learners$add("classif.liblinear", LearnerClassifLiblineaR)
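# A minimal usage sketch (not run at package load; assumes mlr3 is attached and
# the registration above has been executed). The 'sonar' task is just a
# convenient built-in mlr3 toy task with numeric features:
#
#   task <- mlr3::tsk("sonar")
#   learner <- mlr3::lrn("classif.liblinear", type = 0, cost = 1)
#   learner$predict_type <- "prob"   # probabilities only for types 0, 6, and 7
#   learner$train(task, row_ids = 1:150)
#   pred <- learner$predict(task, row_ids = 151:208)
#   pred$score(mlr3::msr("classif.acc"))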
@@ -3,256 +3,275 @@
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0

##### ---- old mlr-based implementation removed by this commit ----
##### Internal function to train a model for a single CV fold
#' @keywords internal
train.plm <-
function(data,
method = c("lasso",
"enet",
"ridge",
"lasso_ll",
"ridge_ll",
"randomForest"),
measure = list("acc"),
min.nonzero.coeff = 5,
param.set = NULL,
neg.lab,
verbose = 1) {
## 1) Define the task Specify the type of analysis (e.g. classification)
## and provide data and response variable assert that the label for the
## first patient is always the same in order for lasso_ll to work
## correctly
if (data$label[1] != neg.lab) {
data <- data[c(which(data$label == neg.lab)[1],
c(seq_len(nrow(data)))[-which(data$label == neg.lab)[1]]), ]
}
task <- makeClassifTask(data = data, target = "label")
        ## 2) Define the learner Choose a specific algorithm (e.g. linear
        # discriminant analysis)
        cost <- 10 ^ seq(-2, 3, length = 6 + 5 + 10)
        ### the most common learner defined here to remove redundancy
        cl <- "classif.cvglmnet"
        parameters <-
            get.parameters.from.param.set(param.set = param.set,
                method = method, sqrt(nrow(data)))
        if (method == "lasso") {
            lrn <-
                makeLearner(
                    cl,
                    predict.type = "prob",
                    nlambda = 100,
                    alpha = 1
                )
        } else if (method == "ridge") {
            lrn <-
                makeLearner(
                    cl,
                    predict.type = "prob",
                    nlambda = 100,
                    alpha = 0
                )
        } else if (method == "enet") {
            if ('alpha' %in% names(parameters)){
                lrn <- makeLearner(cl, predict.type = 'prob',
                    nlambda=10, alpha=parameters$alpha)
                parameters <- NULL
            } else if ('pars' %in% names(parameters)){
                lrn <- makeLearner(cl, predict.type = "prob",
                    nlambda = 10, alpha=parameters$pars$alpha)
                parameters <- NULL
            } else {
                lrn <- makeLearner(cl, predict.type = "prob", nlambda = 10)
            }
        } else if (method == "lasso_ll") {
            cl <- "classif.LiblineaRL1LogReg"
            lrn <-
                makeLearner(cl,
                    predict.type = "prob",
                    epsilon = 1e-08,
                    wi = parameters$class.weights)
            parameters <- parameters$cost
        } else if (method == "ridge_ll") {
            cl <- "classif.LiblineaRL2LogReg"
            lrn <-
                makeLearner(
                    cl,
                    predict.type = "prob",
                    epsilon = 1e-08,
                    type = 0,
                    wi = parameters$class.weights)
            parameters <- parameters$cost
        } else if (method == "randomForest") {
            cl <- "classif.randomForest"
            lrn <- makeLearner(cl,
                predict.type = "prob",
                fix.factors.prediction = TRUE)
        } else {
            stop(
                method,
                " is not a valid method, currently supported: lasso,
                enet, ridge, lasso_ll, ridge_ll, randomForest.\n"
            )
        }
        show.info <- FALSE
        if (verbose > 2)
            show.info <- TRUE

        ## 3) Fit the model Train the learner on the task using a random subset
        ## of the data as training set
        if (!all(is.null(parameters))) {
            hyperPars <- tuneParams(
                learner = lrn,
                task = task,
                resampling = makeResampleDesc("CV", iters = 5L,
                    stratify = TRUE),
                par.set = parameters,
                control = makeTuneControlGrid(resolution = 10L),
                measures = measure,
                show.info = show.info
            )
            lrn <- setHyperPars(lrn, par.vals = hyperPars$x)
        }
        model <- train(lrn, task)
        if (cl == "classif.cvglmnet") {
            opt.lambda <- get.optimal.lambda.for.glmnet(model, task, measure,
                min.nonzero.coeff)
            # transform model
            if (is.null(model$learner$par.vals$s)) {
                model$learner.model$lambda.1se <- opt.lambda
            } else {
                model$learner.model[[model$learner$par.vals$s]] <- opt.lambda
            }
            coef <- coefficients(model$learner.model)
            bias.idx <- which(rownames(coef) == "(Intercept)")
            coef <- coef[-bias.idx, ]
            model$feat.weights <-
                (-1) * as.numeric(coef) ### check!!!
            model$learner.model$call <- NULL
        } else if (cl == "classif.LiblineaRL1LogReg") {
            model$feat.weights <-
                model$learner.model$W[
                    -which(colnames(model$learner.model$W) == "Bias")]
        } else if (cl == "classif.randomForest") {
            model$feat.weights <- model$learner.model$importance
        }
        model$task <- task

        return(model)
    }

#' @keywords internal
get.optimal.lambda.for.glmnet <-
    function(trained.model,
        training.task,
        perf.measure,
        min.nonzero.coeff) {
        # get lambdas that fullfill the minimum nonzero coefficients criterion
        lambdas <-
            trained.model$learner.model$glmnet.fit$lambda[
                which(trained.model$learner.model$nzero >= min.nonzero.coeff)]
        # get performance on training set for all those lambdas in trace
        performances <-
            vapply(
                lambdas,
                FUN = function(lambda, model, task,
                    measure) {
                    model.transformed <- model
                    if (is.null(model.transformed$learner$par.vals$s)) {
                        model.transformed$learner.model$lambda.1se <- lambda
                    } else {
                        model.transformed$learner.model[[
                            model.transformed$learner$par.vals$s]] <-
                            lambda
                    }
                    pred.temp <- predict(model.transformed, task)
                    performance(pred.temp, measures = measure)
                },
                model = trained.model,
                task = training.task,
                measure = perf.measure,
                USE.NAMES = FALSE,
                FUN.VALUE = double(1)
            )
        # get optimal lambda in depence of the performance measure
        if (length(perf.measure) == 1) {
            if (perf.measure[[1]]$minimize == TRUE) {
                opt.lambda <- lambdas[which(performances ==
                    min(performances))[1]]
            } else {
                opt.lambda <- lambdas[which(performances ==
                    max(performances))[1]]
            }
        } else {
            opt.idx <- c()
            for (m in seq_along(perf.measure)) {
                if (perf.measure[[m]]$minimize == TRUE) {
                    opt.idx <- c(opt.idx, which(performances[m, ] ==
                        min(performances[m, ]))[1])
                } else {
                    opt.idx <- c(opt.idx, which(performances[m, ] ==
                        max(performances[m, ]))[1])
                }
            }
            opt.lambda <- lambdas[floor(mean(opt.idx))]
        }
        return(opt.lambda)
    }

#' @keywords internal
get.parameters.from.param.set <-
    function(param.set, method, sqrt.mdim) {
        cost <- 10 ^ seq(-2, 3, length = 6 + 5 + 10)
        ntree <- c(100, 1000)
        mtry <-
            c(round(sqrt.mdim / 2), round(sqrt.mdim), round(sqrt.mdim * 2))
        alpha <- c(0, 1)
        class.weights <- c(5, 1)
        names(class.weights) <- c(-1, 1)
        parameters <- NULL
        if (method == "lasso_ll") {
            if (!all(is.null(param.set))) {
                if ("cost" %in% names(param.set))
                    cost <- param.set$cost
                if ("class.weights" %in% names(param.set)){
                    class.weights <- param.set$class.weights
                    names(class.weights) <- c(-1, 1)
                }
            }
            parameters <- list("class.weights"=class.weights,
                'cost'=makeParamSet(makeDiscreteParam("cost", values = cost)))
        } else if (method == "randomForest") {
            if (!all(is.null(param.set))) {
                if ("ntree" %in% names(param.set))
                    ntree <- param.set$ntree
                if ("mtry" %in% names(param.set))
                    mtry <- param.set$mtry
            }
            parameters <-
                makeParamSet(
                    makeNumericParam("ntree", lower = ntree[1],
                        upper = ntree[2]),
                    makeDiscreteParam("mtry",
                        values = mtry)
                )
        } else if (method == "enet") {
            if (!all(is.null(param.set))) {
                if ("alpha" %in% names(param.set))
                    alpha <- param.set$alpha
            }
            if (length(alpha)==1){
                parameters <- list(alpha=alpha)
            } else if (length(alpha) == 2){
                parameters <-
                    makeParamSet(makeNumericParam("alpha", lower = alpha[1],
                        upper = alpha[2]))
            } else {
                stop("'alpha' parameter can not have more than two entries!")
            }
        }
        return(parameters)
    }

##### ---- new mlr3-based implementation added by this commit ----

##### Internal function to create the mlr3 learner with all kinds of parameters
#' @keywords internal
create.mlr.learner <- function(method, nrow.data, param.set=NULL,
    type='BINARY'){
    if (!method %in% c("lasso", "enet", "ridge", "lasso_ll",
        "ridge_ll", "randomForest")){
        stop("Unsupported method!")
    }
    standard.param.set <- list(
        "cost" = c(-2, 3),
        "epsilon" = 1e-08,
        "ntree" = c(100, 1000),
        "mtry" = c(round(sqrt(nrow.data) / 2),
            round(sqrt(nrow.data)),
            round(sqrt(nrow.data) * 2)),
        "alpha" = c(0, 1),
        "class.weights" = c("-1"=5, "1"=1))
    if (is.null(param.set)){
        use.param.set <- standard.param.set
    } else {
        use.param.set <- param.set
        for (i in names(standard.param.set)){
            if (!i %in% names(param.set)){
                use.param.set[[i]] <- standard.param.set[[i]]
            }
        }
    }

    # lasso/enet/ridge
    if (method %in% c('lasso', 'enet', 'ridge')){
        if (type == 'BINARY'){
            learner <- lrn("classif.cv_glmnet")
            learner$predict_type <- 'prob'
        } else if (type == 'CONTINUOUS'){
            learner <- lrn("regr.cv_glmnet")
        }
        if (method == 'lasso'){
            learner$param_set$values$alpha <- 1
            if ('alpha' %in% names(param.set)){
                warning("Parameter 'alpha' will be ignored and set to 1!")
            }
        } else if (method == 'ridge'){
            learner$param_set$values$alpha <- 0
            if ('alpha' %in% names(param.set)){
                warning("Parameter 'alpha' will be ignored and set to 0!")
            }
        } else if (method == 'enet'){
            if (length(use.param.set$alpha)==1){
                learner$param_set$values$alpha <- use.param.set$alpha
            } else {
                learner$param_set$values$alpha <- to_tune(
                    lower=use.param.set$alpha[1],
                    upper=use.param.set$alpha[2])
            }
        }
        use.param.set$alpha <- NULL
    } else if (method=='randomForest'){
        if (type == 'BINARY'){
            learner <- lrn("classif.ranger", importance='permutation')
            learner$predict_type <- 'prob'
        } else if (type == 'CONTINUOUS'){
            learner <- lrn("regr.ranger", importance='impurity')
        }
        # number of trees
        if (length(use.param.set$ntree) == 1){
            learner$param_set$values$num.trees <- use.param.set$ntree
        } else if (length(use.param.set$ntree) == 2) {
            learner$param_set$values$num.trees <- to_tune(
                lower=use.param.set$ntree[1], upper=use.param.set$ntree[2])
        } else {
            learner$param_set$values$num.trees <- to_tune(use.param.set$ntree)
        }
        # mtry
        if (length(use.param.set$mtry) == 1){
            learner$param_set$values$mtry <- use.param.set$mtry
        } else if (length(use.param.set$mtry) == 2){
            learner$param_set$values$mtry <- to_tune(
                lower=use.param.set$mtry[1], upper=use.param.set$mtry[2])
        } else {
            learner$param_set$values$mtry <- to_tune(use.param.set$mtry)
        }
        use.param.set$ntree <- NULL
        use.param.set$mtry <- NULL
        if (!'alpha' %in% names(param.set)){
            use.param.set$alpha <- NULL
        }
        if (!'class.weights' %in% names(param.set)){
            use.param.set$class.weights <- NULL
        }
    } else if (method %in% c('lasso_ll', 'ridge_ll')){
        if (type == 'BINARY'){
            if (method == 'lasso_ll'){
                type <- 6
            } else {
                type <- 0
            }
            learner <- lrn('classif.liblinear', type=type,
                wi=use.param.set$class.weights,
                epsilon=use.param.set$epsilon)
            learner$predict_type <- 'prob'
        } else if (type == 'CONTINUOUS'){
            stop("Methods not usable for regression tasks!")
        }
        if (length(use.param.set$cost) == 1){
            learner$param_set$values$cost <- use.param.set$cost
        } else if (length(use.param.set$cost) == 2){
            learner$param_set$values$cost <- to_tune(p_dbl(
                lower=use.param.set$cost[1],
                upper=use.param.set$cost[2], trafo=.f_exp))
        } else {
            learner$param_set$values$cost <- to_tune(use.param.set$cost)
        }
        use.param.set$class.weights <- NULL
        use.param.set$epsilon <- NULL
        use.param.set$cost <- NULL
    }
    # try to set additional parameters, i hope that mlr catches errors here
    param.settable <- learner$param_set$ids()
    for (x in intersect(names(use.param.set), param.settable)){
        if (length(use.param.set[[x]])==1){
            learner$param_set$values[[x]] <- use.param.set[[x]]
        } else {
            learner$param_set$values[[x]] <- to_tune(use.param.set[[x]])
        }
    }
    return(learner)
}

# function to perform feature selection
#' @keywords internal
perform.feature.selection <- function(data, train.label, param.fs, verbose){

    stopifnot(all(c('method', 'no_features', 'direction') %in% names(param.fs)))

    # test method.fs
    if (is.factor(train.label)){
        allowed.methods <- c('Wilcoxon', 'AUC', 'gFC')
        if (!param.fs$method %in% allowed.methods) {
            stop('Unrecognised feature selection method. ',
                'Must be one of those: {"',
                paste(allowed.methods, collapse = '", "'), '"}')
        }
    } else if (is.numeric(train.label)){
        allowed.methods <- c('spearman', 'pearson', 'MI')
        if (!param.fs$method %in% allowed.methods) {
            stop('Unrecognised feature selection method. ',
                'Must be one of those: {"',
                paste(allowed.methods, collapse = '", "'), '"}')
        }
    }

    # assert the threshold
    stopifnot(param.fs$no_features > 10)
    stopifnot(param.fs$no_features < ncol(data))

    if (param.fs$method == 'Wilcoxon') {
        assoc <- vapply(data,
            FUN=function(x, label){
                d <- data.frame(x=x, y=label);
                t <- wilcox.test(x~y, data=d)
                return(t$p.val)
            }, FUN.VALUE=double(1),
            label=train.label)
        assoc <- sort(assoc)
    } else if (param.fs$method == 'AUC') {
        assoc <- vapply(data,
            FUN=get.single.feat.AUC,
            FUN.VALUE = double(1),
            label=train.label,
            pos=max(levels(train.label)),
            neg=min(levels(train.label)))
        if (param.fs$direction == 'absolute'){
            assoc[assoc < 0.5] <- 1 - assoc[assoc < 0.5]
        } else if (param.fs$direction == 'negative'){
            assoc <- 1 - assoc
        }
        assoc <- assoc[assoc > 0.5]
assoc <- sort(-assoc)
} else if (param.fs$method == 'gFC') {
assoc <- vapply(data,
FUN=get.quantile.FC,
FUN.VALUE = double(1),
label=train.label,
pos=max(levels(train.label)),
neg=min(levels(train.label)))
if (param.fs$direction == 'absolute'){
assoc <- abs(assoc)
} else if (param.fs$direction == 'negative'){
assoc <- -assoc
}
assoc <- assoc[assoc > 0]
assoc <- sort(-assoc)
} else if (param.fs$method %in% c('spearman', 'pearson')){
assoc <- vapply(data, FUN=cor, FUN.VALUE = double(1), y=train.label,
method=param.fs$method)
if (param.fs$direction == 'absolute'){
assoc <- abs(assoc)
} else if (param.fs$direction == 'negative'){
assoc <- -assoc
}
assoc <- assoc[assoc > 0]
assoc <- sort(-assoc)
} else if (param.fs$method == 'MI'){
assoc <- vapply(data, FUN=function(x){
mutinformation(discretize(x, disc='equalwidth'),
discretize(train.label, disc='equalwidth'))
}, FUN.VALUE = double(1))
assoc <- sort(-assoc)
}
data <- data[,names(assoc)[seq_len(param.fs$no_features)]]
stopifnot(ncol(data) > 0)
if (verbose > 2) {
message(paste0('++ retaining ', ncol(data),
' features after selection based on ',
param.fs$method, '; target number of features ',
param.fs$no_features))
}
return(data)
}
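# A small illustration of the selection interface above (a sketch with made-up
# data, not run at package load): 30 random features, a two-class label, and
# Wilcoxon-based selection down to 15 features.
#
#   feat <- as.data.frame(matrix(rnorm(100 * 30), nrow = 100,
#       dimnames = list(NULL, paste0("feat", seq_len(30)))))
#   label <- factor(rep(c("-1", "1"), each = 50))
#   param.fs <- list(method = 'Wilcoxon', no_features = 15,
#       direction = 'positive')
#   feat.red <- perform.feature.selection(feat, label, param.fs, verbose = 3)
#   ncol(feat.red)   # 15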
#' @keywords internal
measureAUPRC <- function(probs, truth, negative, positive) {
pr <- pr.curve(scores.class0 = probs[which(truth == positive)],
scores.class1 = probs[which(truth == negative)])
return(pr$auc.integral)
}
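# Worked example for the AUPRC helper above (a sketch, not run; PRROC::pr.curve
# does the actual computation). With perfectly separated scores the area under
# the precision-recall curve is (close to) 1:
#
#   probs <- c(0.9, 0.8, 0.3, 0.2)
#   truth <- factor(c("1", "1", "-1", "-1"))
#   measureAUPRC(probs, truth, negative = "-1", positive = "1")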
#' @keywords internal
get.single.feat.AUC <- function(x, label, pos, neg) {
x.p <- x[label == pos]
x.n <- x[label == neg]
temp.auc <- roc(cases=x.p, controls=x.n, direction='<')$auc
return(temp.auc)
}
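# Quick sanity check for the single-feature AUC (a sketch, not run; pROC::roc
# is called with the positive class as 'cases'): a feature whose values are
# uniformly higher in the positive class gives an AUC of 1.
#
#   x <- c(0.2, 0.3, 0.8, 0.9)
#   label <- factor(c("-1", "-1", "1", "1"))
#   get.single.feat.AUC(x, label, pos = "1", neg = "-1")   # 1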
#' @keywords internal
get.quantile.FC <- function(x, label, pos, neg){
x.p <- x[label == pos]
x.n <- x[label == neg]
q.p <- quantile(x.p, probs=seq(.1, .9, length.out=9))
q.n <- quantile(x.n, probs=seq(.1, .9, length.out=9))
return(sum(q.p - q.n)/length(q.p))
}
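# Worked example of the generalized fold change (a sketch, not run): it averages
# the differences between the class-wise 10%-90% quantiles, so a constant shift
# of +10 between the two groups gives a gFC of 10.
#
#   x <- c(1:10, 11:20)
#   label <- factor(rep(c("-1", "1"), each = 10))
#   get.quantile.FC(x, label, pos = "1", neg = "-1")   # 10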
#' @keywords internal
.f_exp <- function(x){10^x}
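# .f_exp is the 'trafo' used for the cost parameter in create.mlr.learner():
# the default bounds c(-2, 3) are on the log10 scale, so a tuner would search
# costs between 10^-2 = 0.01 and 10^3 = 1000. A minimal sketch (not run;
# the actual tuning would happen later, e.g. via mlr3tuning):
#
#   lrn.ll <- create.mlr.learner("lasso_ll", nrow.data = 100, type = "BINARY")
#   lrn.ll$param_set$values$cost   # a TuneToken over 10^[-2, 3]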
#' @keywords internal
get.best.glmnet.lambda <- function(model, measure, min.nonzero, task){
idx <- which(model$model$nzero >= min.nonzero)
new.model <- model$clone(deep = TRUE)
perf <- vapply(idx, FUN=function(x){
new.model$param_set$values$s <- new.model$model$lambda[x]
pred <- new.model$predict(task)
pred$score(msr(measure))
}, FUN.VALUE = double(1))
if (msr(measure)$minimize){
f.idx <- which(perf==min(perf))[1]