Commit 297bae13 authored by Jakob Wirbel

switch to mlr3 from mlr.

parent 70495558
#' @title LiblineaR Classification Learner
#' @name LearnerClassifLiblineaR
#'
#' @details Type of SVC depends on \code{type} argument:
#' \itemize{
#' \item \code{0} L2-regularized logistic regression (primal)
#' \item \code{1} L2-regularized L2-loss support vector classification (dual)
#' \item \code{2} L2-regularized L2-loss support vector classification (primal)
#' \item \code{3} L2-regularized L1-loss support vector classification (dual)
#' \item \code{4} Support vector classification by Crammer and Singer
#' \item \code{5} L1-regularized L2-loss support vector classification
#' \item \code{6} L1-regularized logistic regression
#' \item \code{7} L2-regularized logistic regression (dual)
#' }
#' If number of records > number of features, \code{type = 2} is faster
#' than \code{type = 1}
#' (Hsu et al. 2003).
#'
#' Note that probabilistic predictions are only available for
#' types \code{0}, \code{6}, and \code{7}.
#' The default \code{epsilon} value depends on the \code{type} parameter,
#' see [LiblineaR::LiblineaR].
#'
#' @encoding UTF-8
# taken from https://github.com/mlr-org/mlr3extralearners
LearnerClassifLiblineaR <- R6::R6Class("LearnerClassifLiblineaR",
inherit = LearnerClassif, public = list(
#' @description
#' Creates a new instance of this [R6][R6::R6Class] class.
initialize = function() {
ps = ps(
type = p_int(default = 0, lower = 0, upper = 7, tags = "train"),
cost = p_dbl(default = 1, lower = 0, tags = "train"),
epsilon = p_dbl(lower = 0, tags = "train"),
bias = p_dbl(default = 1, tags = "train"),
cross = p_int(default = 0L, lower = 0L, tags = "train"),
verbose = p_lgl(default = FALSE, tags = "train"),
wi = p_uty(default = NULL, tags = "train"),
findC = p_lgl(default = FALSE, tags = "train"),
useInitC = p_lgl(default = TRUE, tags = "train")
)
# 50 is an arbitrary choice here
ps$add_dep("findC", "cross", CondAnyOf$new(seq(2, 50)))
ps$add_dep("useInitC", "findC", CondEqual$new(TRUE))
super$initialize(
id = "classif.liblinear",
packages = "LiblineaR",
feature_types = "numeric",
predict_types = c("response", "prob"),
param_set = ps,
properties = c("twoclass", "multiclass")
)
}
),
private = list(
.train = function(task) {
pars = self$param_set$get_values(tags = "train")
data = task$data()
train = task$data(cols = task$feature_names)
target = task$truth()
type = ifelse(is.null(pars$type), 0, pars$type)
pars = pars[names(pars) != "type"]
mlr3misc::invoke(LiblineaR::LiblineaR, data = train,
target = target, type = type, .args = pars)
},
.predict = function(task) {
newdata = task$data(cols = task$feature_names)
type = ifelse(is.null(self$param_set$values$type), 0,
self$param_set$values$type)
if (!type %in% c(0, 6, 7) && self$predict_type == "prob") {
stop("'prob' predict_type only possible if",
" `type` is `0`, `6`, or `7`.")
}
if (self$predict_type == "prob") {
return(list(
prob = mlr3misc::invoke(predict, self$model,
newx = newdata, proba = TRUE)$probabilities))
} else {
return(list(
response = mlr3misc::invoke(predict, self$model,
newx = newdata)$predictions))
}
}
)
)
mlr_learners$add("classif.liblinear", LearnerClassifLiblineaR)
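# Minimal usage sketch (not part of the original file): assumes this script has
# been sourced so that "classif.liblinear" is registered with mlr3, and that the
# LiblineaR package is installed. The "sonar" task is only an illustrative
# binary classification task with all-numeric features.
# library(mlr3)
# task <- tsk("sonar")
# learner <- lrn("classif.liblinear", type = 0, cost = 1)
# learner$predict_type <- "prob"   # probabilities only available for types 0, 6, 7
# learner$train(task, row_ids = 1:150)
# pred <- learner$predict(task, row_ids = 151:208)
# pred$score(msr("classif.acc"))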
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
##### Internal function to create the mlr3 learner with all kinds of parameters
#' @keywords internal
create.mlr.learner <- function(method, nrow.data, param.set=NULL,
type='BINARY'){
if (!method %in% c("lasso", "enet", "ridge", "lasso_ll",
"ridge_ll", "randomForest")){
stop("Unsupported method!")
}
standard.param.set <- list(
"cost" = c(-2, 3),
"epsilon" = 1e-08,
"ntree" = c(100, 1000),
"mtry" = c(round(sqrt(nrow.data) / 2),
round(sqrt(nrow.data)),
round(sqrt(nrow.data) * 2)),
"alpha" = c(0, 1),
"class.weights" = c("-1"=5, "1"=1))
if (is.null(param.set)){
use.param.set <- standard.param.set
} else {
use.param.set <- param.set
for (i in names(standard.param.set)){
if (!i %in% names(param.set)){
use.param.set[[i]] <- standard.param.set[[i]]
}
}
}
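# For example (hypothetical input): param.set = list("cost" = c(-1, 2)) keeps the
# user-supplied range for 'cost' and fills the remaining entries (epsilon, ntree,
# mtry, alpha, class.weights) from standard.param.set.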
# lasso/enet/ridge
if (method %in% c('lasso', 'enet', 'ridge')){
if (type == 'BINARY'){
learner <- lrn("classif.cv_glmnet")
learner$predict_type <- 'prob'
} else if (type == 'CONTINUOUS'){
learner <- lrn("regr.cv_glmnet")
}
if (method == 'lasso'){
learner$param_set$values$alpha <- 1
if ('alpha' %in% names(param.set)){
warning("Parameter 'alpha' will be ignored and set to 1!")
}
} else if (method == 'ridge'){
learner$param_set$values$alpha <- 0
if ('alpha' %in% names(param.set)){
warning("Parameter 'alpha' will be ignored and set to 0!")
}
} else if (method == 'enet'){
if (length(use.param.set$alpha)==1){
learner$param_set$values$alpha <- use.param.set$alpha
} else {
learner$param_set$values$alpha <- to_tune(
lower=use.param.set$alpha[1],
upper=use.param.set$alpha[2])
}
}
use.param.set$alpha <- NULL
} else if (method=='randomForest'){
if (type == 'BINARY'){
learner <- lrn("classif.ranger", importance='permutation')
learner$predict_type <- 'prob'
} else if (type == 'CONTINUOUS'){
learner <- lrn("regr.ranger", importance='impurity')
}
# number of trees
if (length(use.param.set$ntree) == 1){
learner$param_set$values$num.trees <- use.param.set$ntree
} else if (length(use.param.set$ntree) == 2) {
learner$param_set$values$num.trees <- to_tune(
lower=use.param.set$ntree[1], upper=use.param.set$ntree[2])
} else {
learner$param_set$values$num.trees <- to_tune(use.param.set$ntree)
}
# mtry
if (length(use.param.set$mtry) == 1){
learner$param_set$values$mtry <- use.param.set$mtry
} else if (length(use.param.set$mtry) == 2){
learner$param_set$values$mtry <- to_tune(
lower=use.param.set$mtry[1], upper=use.param.set$mtry[2])
} else {
learner$param_set$values$mtry <- to_tune(use.param.set$mtry)
}
use.param.set$ntree <- NULL
use.param.set$mtry <- NULL
if (!'alpha' %in% names(param.set)){
use.param.set$alpha <- NULL
}
if (!'class.weights' %in% names(param.set)){
use.param.set$class.weights <- NULL
}
} else if (method %in% c('lasso_ll', 'ridge_ll')){
if (type == 'BINARY'){
if (method == 'lasso_ll'){
type <- 6
} else {
type <- 0
}
learner <- lrn('classif.liblinear', type=type,
wi=use.param.set$class.weights,
epsilon=use.param.set$epsilon)
learner$predict_type <- 'prob'
} else if (type == 'CONTINUOUS'){
stop("Methods not usable for regression tasks!")
}
if (length(use.param.set$cost) == 1){
learner$param_set$values$cost <- use.param.set$cost
} else if (length(use.param.set$cost) == 2){
learner$param_set$values$cost <- to_tune(p_dbl(
lower=use.param.set$cost[1],
upper=use.param.set$cost[2], trafo=.f_exp))
} else {
learner$param_set$values$cost <- to_tune(use.param.set$cost)
}
use.param.set$class.weights <- NULL
use.param.set$epsilon <- NULL
use.param.set$cost <- NULL
}
# try to set any additional user-supplied parameters; mlr3 should catch errors here
param.settable <- learner$param_set$ids()
for (x in intersect(names(use.param.set), param.settable)){
if (length(use.param.set[[x]])==1){
learner$param_set$values[[x]] <- use.param.set[[x]]
} else {
learner$param_set$values[[x]] <- to_tune(use.param.set[[x]])
}
}
return(learner)
}
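# Illustrative call (values are hypothetical, not from the original code): build a
# tunable elastic net learner for a binary task with 200 samples, with the alpha
# range taken from param.set and exposed as a tuning token.
# learner <- create.mlr.learner(method = "enet", nrow.data = 200,
#     param.set = list("alpha" = c(0.2, 0.8)), type = "BINARY")
# learner$param_set$values$alpha   # TuneToken over [0.2, 0.8]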
# function to perform feature selection
#' @keywords internal
perform.feature.selection <- function(data, train.label, param.fs, verbose){
stopifnot(all(c('method', 'no_features', 'direction') %in% names(param.fs)))
# test method.fs
if (is.factor(train.label)){
allowed.methods <- c('Wilcoxon', 'AUC', 'gFC')
if (!param.fs$method %in% allowed.methods) {
stop('Unrecognised feature selection method. ',
'Must be one of those: {"',
paste(allowed.methods, collapse = '", "'), '"}')
}
} else if (is.numeric(train.label)){
allowed.methods <- c('spearman', 'pearson', 'MI')
if (!param.fs$method %in% allowed.methods) {
stop('Unrecognised feature selection method. ',
'Must be one of those: {"',
paste(allowed.methods, collapse = '", "'), '"}')
}
}
# assert the threshold
stopifnot(param.fs$no_features > 10)
stopifnot(param.fs$no_features < ncol(data))
if (param.fs$method == 'Wilcoxon') {
assoc <- vapply(data,
FUN=function(x, label){
d <- data.frame(x=x, y=label);
t <- wilcox.test(x~y, data=d)
return(t$p.val)
}, FUN.VALUE=double(1),
label=train.label)
assoc <- sort(assoc)
} else if (param.fs$method == 'AUC') {
assoc <- vapply(data,
FUN=get.single.feat.AUC,
FUN.VALUE = double(1),
label=train.label,
pos=max(levels(train.label)),
neg=min(levels(train.label)))
if (param.fs$direction == 'absolute'){
assoc[assoc < 0.5] <- 1 - assoc[assoc < 0.5]
} else if (param.fs$direction == 'negative'){
assoc <- 1 - assoc
}
assoc <- assoc[assoc > 0.5]
assoc <- sort(-assoc)
} else if (param.fs$method == 'gFC') {
assoc <- vapply(data,
FUN=get.quantile.FC,
FUN.VALUE = double(1),
label=train.label,
pos=max(levels(train.label)),
neg=min(levels(train.label)))
if (param.fs$direction == 'absolute'){
assoc <- abs(assoc)
} else if (param.fs$direction == 'negative'){
assoc <- -assoc
}
assoc <- assoc[assoc > 0]
assoc <- sort(-assoc)
} else if (param.fs$method %in% c('spearman', 'pearson')){
assoc <- vapply(data, FUN=cor, FUN.VALUE = double(1), y=train.label,
method=param.fs$method)
if (param.fs$direction == 'absolute'){
assoc <- abs(assoc)
} else if (param.fs$direction == 'negative'){
assoc <- -assoc
}
assoc <- assoc[assoc > 0]
assoc <- sort(-assoc)
} else if (param.fs$method == 'MI'){
assoc <- vapply(data, FUN=function(x){
mutinformation(discretize(x, disc='equalwidth'),
discretize(train.label, disc='equalwidth'))
}, FUN.VALUE = double(1))
assoc <- sort(-assoc)
}
data <- data[,names(assoc)[seq_len(param.fs$no_features)]]
stopifnot(ncol(data) > 0)
if (verbose > 2) {
message(paste0('++ retaining ', ncol(data),
' features after selection based on ',
param.fs$method, '; target number of features ',
param.fs$no_features))
}
return(data)
}
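# Hedged usage sketch (object names are illustrative): 'feat' is a samples-by-features
# data.frame and 'label' a factor, so that the binary filters apply; param.fs must
# contain 'method', 'no_features' (> 10 and < ncol(feat)), and 'direction'.
# param.fs <- list(method = "AUC", no_features = 50, direction = "absolute")
# feat.red <- perform.feature.selection(feat, label, param.fs, verbose = 1)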
#' @keywords internal
measureAUPRC <- function(probs, truth, negative, positive) {
pr <- pr.curve(scores.class0 = probs[which(truth == positive)],
scores.class1 = probs[which(truth == negative)])
return(pr$auc.integral)
}
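# Note: pr.curve() is assumed to come from the PRROC package. Sketch (hypothetical
# inputs): measureAUPRC(probs = pred.probs, truth = labels, negative = "-1",
# positive = "1") returns the integrated precision-recall AUC.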
#' @keywords internal
get.single.feat.AUC <- function(x, label, pos, neg) {
x.p <- x[label == pos]
x.n <- x[label == neg]
temp.auc <- roc(cases=x.p, controls=x.n, direction='<')$auc
return(temp.auc)
}
#' @keywords internal
get.quantile.FC <- function(x, label, pos, neg){
x.p <- x[label == pos]
x.n <- x[label == neg]
q.p <- quantile(x.p, probs=seq(.1, .9, length.out=9))
q.n <- quantile(x.n, probs=seq(.1, .9, length.out=9))
return(sum(q.p - q.n)/length(q.p))
}
#' @keywords internal
.f_exp <- function(x){10^x}
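# .f_exp is used as the trafo for the liblinear 'cost' parameter above, so the default
# range c(-2, 3) is searched on a log10 scale, i.e. costs between 0.01 and 1000:
# vapply(c(-2, 0, 3), .f_exp, double(1))   # 0.01  1.00  1000.00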