Commit 2f8ddb68 authored by Jakob Wirbel's avatar Jakob Wirbel
Browse files

mostly documentation changes.

parent d08ee5f9
...@@ -3,38 +3,37 @@ ...@@ -3,38 +3,37 @@
### Microbial Communities And host phenoTypes R flavor EMBL ### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0 ### Heidelberg 2012-2018 GNU GPL 3.0
#' @title Add metadata as predictors #' @title Add metadata as predictors
#' #'
#' @description This function adds metadata to the feature matrix to be later #' @description This function adds metadata to the feature matrix to be later
#' used as predictors #' used as predictors
#' #'
#' @usage add.meta.pred(siamcat, pred.names, #' @usage add.meta.pred(siamcat, pred.names, std.meta = TRUE,
#' std.meta = TRUE, #' feature.type='normalized', verbose = 1)
#' feature.type='normalized',
#' verbose = 1)
#' #'
#' @param siamcat object of class \link{siamcat-class} #' @param siamcat object of class \link{siamcat-class}
#' #'
#' @param pred.names vector of names of the variables within the metadata to be #' @param pred.names vector of names of the variables within the metadata to
#' added to the feature matrix as predictors #' be added to the feature matrix as predictors
#' #'
#' @param std.meta boolean, should added metadata features be standardized?, #' @param std.meta boolean, should added metadata features be standardized?,
#' defaults to \code{TRUE} #' defaults to \code{TRUE}
#' #'
#' @param feature.type string, on which type of features should the function #' @param feature.type string, on which type of features should the function
#' work? Can be either \code{"original"}, \code{"filtered"}, or #' work? Can be either \code{"original"}, \code{"filtered"}, or
#' \code{"normalized"}. Please only change this paramter if you know what #' \code{"normalized"}. Please only change this paramter if you know what
#' you are doing! #' you are doing!
#' #'
#' @param verbose integer, control output: \code{0} for no output at all, #' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for #' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information, #' normal level of information and \code{3} for full debug information,
#' defaults to \code{1} #' defaults to \code{1}
#' #'
#' @keywords SIAMCAT add.meta.pred #' @keywords SIAMCAT add.meta.pred
#' #'
#' @export #' @export
#'
#' @encoding UTF-8
#' #'
#' @details This functions adds one or several metadata variables to the set #' @details This functions adds one or several metadata variables to the set
#' of features, so that they can be included for model training. #' of features, so that they can be included for model training.
...@@ -50,7 +49,7 @@ ...@@ -50,7 +49,7 @@
#' numerically before you start the SIAMCAT workflow. #' numerically before you start the SIAMCAT workflow.
#' #'
#' @return an object of class \link{siamcat-class} with metadata added to the #' @return an object of class \link{siamcat-class} with metadata added to the
#' features #' features
#' #'
#' @examples #' @examples
#' data(siamcat_example) #' data(siamcat_example)
...@@ -60,9 +59,8 @@ ...@@ -60,9 +59,8 @@
#' #'
#' # Add Age and BMI as potential predictors #' # Add Age and BMI as potential predictors
#' # Additionally, prevent standardization of the added features #' # Additionally, prevent standardization of the added features
#' siamcat_meta_added <- add.meta.pred(siamcat_example, #' siamcat_meta_added <- add.meta.pred(siamcat_example,
#' pred.names=c('Age', 'BMI'), #' pred.names=c('Age', 'BMI'), std.meta=FALSE)
#' std.meta=FALSE)
add.meta.pred <- function(siamcat, pred.names, std.meta = TRUE, add.meta.pred <- function(siamcat, pred.names, std.meta = TRUE,
feature.type = 'normalized', verbose = 1) { feature.type = 'normalized', verbose = 1) {
......
...@@ -4,41 +4,57 @@ ...@@ -4,41 +4,57 @@
### Heidelberg 2012-2018 GNU GPL 3.0 ### Heidelberg 2012-2018 GNU GPL 3.0
#' @title Check for potential confounders in the metadata #' @title Check for potential confounders in the metadata
#' @description Checks potential confounders in the metadata and produces #'
#' some visualizations #' @description Checks potential confounders in the metadata and visualize the
#' results
#'
#' @usage check.confounders(siamcat, fn.plot, meta.in = NULL, #' @usage check.confounders(siamcat, fn.plot, meta.in = NULL,
#' feature.type='filtered', verbose = 1) #' feature.type='filtered', verbose = 1)
#'
#' @param siamcat an object of class \link{siamcat-class} #' @param siamcat an object of class \link{siamcat-class}
#'
#' @param fn.plot string, filename for the pdf-plot #' @param fn.plot string, filename for the pdf-plot
#' @param meta.in vector, specific metadata variable names to analyze, #'
#' defaults to NULL (all metadata variables will be analyzed) #' @param meta.in vector, specific metadata variable names to analyze,
#'@param feature.type string, on which type of features should the function #' defaults to NULL (all metadata variables will be analyzed)
#'
#' @param feature.type string, on which type of features should the function
#' work? Can be either \code{c()"original", "filtered", or "normalized")}. #' work? Can be either \code{c()"original", "filtered", or "normalized")}.
#' Please only change this paramter if you know what you are doing! #' Please only change this paramter if you know what you are doing!
#' @param verbose integer, control output: \code{0} for no output at all, #'
#' \code{1} for only information about progress and success, \code{2} for #' @param verbose integer, control output: \code{0} for no output at all,
#' normal level of information and \code{3} for full debug information, #' \code{1} for only information about progress and success, \code{2} for
#' defaults to \code{1} #' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @keywords SIAMCAT check.confounders #' @keywords SIAMCAT check.confounders
#' @details This function checks for associations between class labels and #'
#' potential confounders (e.g. Age, Sex, or BMI) that are present in the #' @details This function checks for associations between class labels and
#' metadata. Statistical testing is performed with Fisher's exact test or #' potential confounders (e.g. Age, Sex, or BMI) that are present in the
#' Wilcoxon test, while associations are visualized either as barplot or #' metadata. Statistical testing is performed with Fisher's exact test or
#' Q-Q plot, depending on the type of metadata. #' Wilcoxon test, while associations are visualized either as barplot or
#' #' Q-Q plot, depending on the type of metadata.
#' Additionally, it evaluates associations among metadata variables using #'
#' conditional entropy and associations with the label using generalized #' Additionally, it evaluates associations among metadata variables using
#' linear models, producing a correlation heatmap and appropriate #' conditional entropy and associations with the label using generalized
#' quantitative barplots, respectively. #' linear models, producing a correlation heatmap and appropriate
#' quantitative barplots, respectively.
#'
#' Please note that the confounder check is currently only available for binary
#' classification problems!
#'
#' @export #' @export
#'
#' @encoding UTF-8
#'
#' @return Does not return anything, but outputs plots to specified pdf file #' @return Does not return anything, but outputs plots to specified pdf file
#'
#' @examples #' @examples
#' # Example data #' # Example data
#' data(siamcat_example) #' data(siamcat_example)
#' #'
#' # Simple working example #' # Simple working example
#' check.confounders(siamcat_example, './conf_plot.pdf') #' check.confounders(siamcat_example, './conf_plot.pdf')
check.confounders <- function(siamcat, fn.plot, meta.in = NULL, check.confounders <- function(siamcat, fn.plot, meta.in = NULL,
feature.type='filtered', verbose = 1) { feature.type='filtered', verbose = 1) {
...@@ -46,6 +62,10 @@ check.confounders <- function(siamcat, fn.plot, meta.in = NULL, ...@@ -46,6 +62,10 @@ check.confounders <- function(siamcat, fn.plot, meta.in = NULL,
if (verbose > 1) message("+ starting check.confounders") if (verbose > 1) message("+ starting check.confounders")
s.time <- proc.time()[3] s.time <- proc.time()[3]
label <- label(siamcat) label <- label(siamcat)
if (label$type!='BINARY'){
stop("Confounder check is currently only possible for",
" classification tasks")
}
meta <- meta(siamcat) meta <- meta(siamcat)
# get features # get features
if (feature.type == 'original'){ if (feature.type == 'original'){
......
...@@ -7,71 +7,72 @@ ...@@ -7,71 +7,72 @@
#' #'
#' @name create.data.split #' @name create.data.split
#' #'
#' @description This function prepares the cross-validation by splitting the #' @description This function prepares the cross-validation by splitting the
#' data into \code{num.folds} training and test folds for #' data into \code{num.folds} training and test folds for
#' \code{num.resample} times. #' \code{num.resample} times.
#' #'
#' @usage create.data.split(siamcat, num.folds = 2, num.resample = 1, #' @usage create.data.split(siamcat, num.folds = 2, num.resample = 1,
#' stratify = TRUE, inseparable = NULL, verbose = 1) #' stratify = TRUE, inseparable = NULL, verbose = 1)
#' #'
#' @param siamcat object of class \link{siamcat-class} #' @param siamcat object of class \link{siamcat-class}
#' #'
#' @param num.folds integer number of cross-validation folds (needs to be #' @param num.folds integer number of cross-validation folds (needs to be
#' \code{>=2}), defaults to \code{2} #' \code{>=2}), defaults to \code{2}
#' #'
#' @param num.resample integer, resampling rounds (values \code{<= 1} #' @param num.resample integer, resampling rounds (values \code{<= 1}
#' deactivate resampling), defaults to \code{1} #' deactivate resampling), defaults to \code{1}
#' #'
#' @param stratify boolean, should the splits be stratified so that an equal #' @param stratify boolean, should the splits be stratified so that an equal
#' proportion of classes are present in each fold?, defaults to \code{TRUE} #' proportion of classes are present in each fold?, will be ignored for
#' regression tasks, defaults to \code{TRUE}
#' #'
#' @param inseparable string, name of metadata variable to be inseparable, #' @param inseparable string, name of metadata variable to be inseparable,
#' defaults to \code{NULL}, see Details below #' defaults to \code{NULL}, see Details below
#' #'
#' @param verbose integer, control output: \code{0} for no output at all, #' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for #' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information, #' normal level of information and \code{3} for full debug information,
#' defaults to \code{1} #' defaults to \code{1}
#' #'
#' @keywords SIAMCAT create.data.split #' @keywords SIAMCAT create.data.split
#' #'
#' @return object of class \link{siamcat-class} with the \code{data_split}-slot #' @return object of class \link{siamcat-class} with the \code{data_split}-slot
#' filled #' filled
#' #'
#' @details This function splits the labels within a \link{siamcat-class} object #' @details This function splits the labels within a \link{siamcat-class}
#' and prepares the internal cross-validation for the model training (see #' object and prepares the internal cross-validation for the model training
#' \link{train.model}). #' (see \link{train.model}).
#'
#' The function saves the training and test instances for the different
#' cross-validation folds within a list in the \code{data_split}-slot of the
#' \link{siamcat-class} object, which is a list with four entries: \itemize{
#' \item \code{num.folds} - the number of cross-validation folds
#' \item \code{num.resample} - the number of repetitions for the
#' cross-validation
#' \item \code{training.folds} - a list containing the indices for the
#' training instances
#' \item \code{test.folds} - a list containing the indices for the
#' test instances }
#' #'
#' The function saves the training and test instances for the different #' If provided, the data split will take into account a metadata variable
#' cross-validation folds within a list in the \code{data_split}-slot of the #' for the data split (by providing the \code{inseparable} argument). For
#' \link{siamcat-class} object, which is a list with four entries: #' example, if the data contains several samples for the same individual,
#' \itemize{ #' it makes sense to keep data from the same individual within the
#' \item \code{num.folds} - the number of cross-validation folds #' same fold.
#' \item \code{num.resample} - the number of repetitions for the #'
#' cross-validation #' If \code{inseparable} is given, the \code{stratify} argument will be
#' \item \code{training.folds} - a list containing the indices for the #' ignored.
#' training instances #'
#' \item \code{test.folds} - a list containing the indices for the
#' test instances }
#'
#' If provided, the data split will take into account a metadata variable
#' for the data split (by providing the \code{inseparable} argument). For
#' example, if the data contains several samples for the same individual,
#' it would make sense to keep data from the same individual within the
#' same fold.
#' If \code{inseparable} is given, the \code{stratify} argument will be
#' ignored.
#' @export #' @export
#'
#' @encoding UTF-8
#' #'
#' @examples #' @examples
#' data(siamcat_example) #' data(siamcat_example)
#' #'
#' # simple working example #' # simple working example
#' siamcat_split <- create.data.split(siamcat_example, #' siamcat_split <- create.data.split(siamcat_example, num.folds=10,
#' num.folds=10, #' num.resample=5, stratify=TRUE)
#' num.resample=5,
#' stratify=TRUE)
create.data.split <- function(siamcat, num.folds = 2, num.resample = 1, create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
stratify = TRUE, inseparable = NULL, verbose = 1) { stratify = TRUE, inseparable = NULL, verbose = 1) {
...@@ -80,9 +81,9 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1, ...@@ -80,9 +81,9 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
s.time <- proc.time()[3] s.time <- proc.time()[3]
label <- label(siamcat) label <- label(siamcat)
if (label$type != 'BINARY'){ if (label$type == 'CONTINUOUS'){
stop("SIAMCAT only works with binary labels at the moment!") stratify <- FALSE
} else { } else if (label$type=='BINARY') {
group.numbers <- vapply(label$info, group.numbers <- vapply(label$info,
FUN = function(x){ FUN = function(x){
sum(label$label == x)}, sum(label$label == x)},
...@@ -95,6 +96,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1, ...@@ -95,6 +96,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
"\nThis is not enough for SIAMCAT to proceed!" "\nThis is not enough for SIAMCAT to proceed!"
) )
} }
} else if (label$type == 'TEST'){
stop("Cannot create data split for TEST object!")
} }
...@@ -116,32 +119,23 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1, ...@@ -116,32 +119,23 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
### check arguments ### check arguments
if (num.resample < 1) { if (num.resample < 1) {
if (verbose > 1) if (verbose > 1)
message( message(paste0("+++ Resetting num.resample = 1 (",
paste0(
"+++ Resetting num.resample = 1 (",
num.resample, num.resample,
" is an invalid number of resampling rounds)" " is an invalid number of resampling rounds)"))
)
)
num.resample <- 1 num.resample <- 1
} }
if (num.folds < 2) { if (num.folds < 2) {
if (verbose > 1) if (verbose > 1)
message( message(paste0("+++ Resetting num.folds = 2 (",
paste0(
"+++ Resetting num.folds = 2 (",
num.folds, num.folds,
" is an invalid number of folds)" " is an invalid number of folds)"))
)
)
num.folds <- 2 num.folds <- 2
} }
if (!is.null(inseparable) && stratify) { if (!is.null(inseparable) && stratify) {
if (verbose > 1) if (verbose > 1)
message( message(paste0("+++ Resetting stratify to FALSE ",
"+++ Resetting stratify to FALSE (Stratification is not "(Stratification is not supported when ",
supported when inseparable is given" "inseparable is given"))
)
stratify <- FALSE stratify <- FALSE
} }
if (num.folds >= length(labelNum)) { if (num.folds >= length(labelNum)) {
...@@ -175,15 +169,24 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1, ...@@ -175,15 +169,24 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
for (r in seq_len(num.resample)) { for (r in seq_len(num.resample)) {
labelNum <- sample(labelNum) labelNum <- sample(labelNum)
foldid <- if (label$type == 'BINARY'){
assign.fold( foldid <-
label = labelNum, assign.fold.binary(
num.folds = num.folds, label = labelNum,
stratified = stratify, num.folds = num.folds,
inseparable = inseparable, stratified = stratify,
meta = meta(siamcat)[names(labelNum),], inseparable = inseparable,
verbose = verbose meta = meta(siamcat)[names(labelNum),],
) verbose = verbose)
} else if (label$type == 'CONTINUOUS'){
foldid <-
assign.fold.regr(
label = labelNum,
num.folds = num.folds,
inseparable = inseparable,
meta = meta(siamcat)[names(labelNum),],
verbose = verbose)
}
names(foldid) <- names(labelNum) names(foldid) <- names(labelNum)
stopifnot(length(labelNum) == length(foldid)) stopifnot(length(labelNum) == length(foldid))
stopifnot(length(unique(foldid)) == num.folds) stopifnot(length(unique(foldid)) == num.folds)
...@@ -198,9 +201,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1, ...@@ -198,9 +201,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
# stratify==TRUE should be tested before assignment of # stratify==TRUE should be tested before assignment of
# test/training set # test/training set
if (stratify) { if (stratify) {
stopifnot(all(sort(unique( stopifnot(all(sort(unique(labelNum[foldid == f])) ==
labelNum[foldid == f] classes))
)) == classes))
} }
# select test examples # select test examples
test.idx <- which(foldid == f) test.idx <- which(foldid == f)
...@@ -210,20 +212,14 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1, ...@@ -210,20 +212,14 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
# for startify==FALSE, all classes must only be present in the # for startify==FALSE, all classes must only be present in the
# training set e.g. in leave-one-out CV, the test fold # training set e.g. in leave-one-out CV, the test fold
# cannot contain all classes # cannot contain all classes
if (!stratify) { if (!stratify && label$type == 'BINARY') {
stopifnot(all(sort(unique( stopifnot(all(sort(unique(labelNum[foldid != f]))
labelNum[foldid != f] == classes))
)) == classes))
} }
stopifnot(length(intersect(train.idx, test.idx)) == 0) stopifnot(length(intersect(train.idx, test.idx)) == 0)
if (verbose > 2) if (verbose > 2)
message(paste( message(paste("+++ fold ", f, " contains ",
"+++ fold ", sum(foldid == f), " samples"))
f,
" contains ",
sum(foldid == f),
" samples"
))
} }
train.list[[r]] <- train.temp train.list[[r]] <- train.temp
test.list[[r]] <- test.temp test.list[[r]] <- test.temp
...@@ -237,11 +233,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1, ...@@ -237,11 +233,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
) )
e.time <- proc.time()[3] e.time <- proc.time()[3]
if (verbose > 1) if (verbose > 1)
message(paste( message(paste("+ finished create.data.split in",
"+ finished create.data.split in", formatC(e.time - s.time, digits = 3),"s"))
formatC(e.time - s.time, digits = 3),
"s"
))
if (verbose == 1) if (verbose == 1)
message("Features splitted for cross-validation successfully.") message("Features splitted for cross-validation successfully.")
return(siamcat) return(siamcat)
...@@ -249,15 +242,10 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1, ...@@ -249,15 +242,10 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
#' @keywords internal #' @keywords internal
assign.fold <- assign.fold.binary <- function(label, num.folds, stratified,
function(label, inseparable = NULL, meta = NULL, verbose = 1) {
num.folds,
stratified,
inseparable = NULL,
meta = NULL,
verbose = 1) {
if (verbose > 2) if (verbose > 2)
message("+++ starting assign.fold") message("+++ starting assign.fold.binary")
foldid <- rep(0, length(label)) foldid <- rep(0, length(label))
classes <- sort(unique(label)) classes <- sort(unique(label))
# Transform number of classes into vector of 1 to x for looping over. # Transform number of classes into vector of 1 to x for looping over.
...@@ -318,6 +306,35 @@ assign.fold <- ...@@ -318,6 +306,35 @@ assign.fold <-
stopifnot(length(label) == length(foldid)) stopifnot(length(label) == length(foldid))
if (verbose > 2) if (verbose > 2)
message("+++ finished assign.fold") message("+++ finished assign.fold.binary")
return(foldid) return(foldid)
} }
#' @keywords internal
assign.fold.regr <- function(label, num.folds, inseparable = NULL,
meta = NULL, verbose = 1) {
if (verbose > 2)
message("+++ starting assign.fold.regr")
foldid <- rep(0, length(label))
# If stratify is not TRUE, make sure that num.sample is not
# bigger than number.folds
if (!is.null(inseparable)) {
strata <- unique(meta[[inseparable]])
sid <- sample(rep(seq_len(num.folds), length.out = length(strata)))
for (s in seq_along(strata)) {
idx <- which(meta[[inseparable]] == strata[s])
foldid[idx] <- sid[s]
}
stopifnot(all(!is.na(foldid)))
} else {
foldid <- sample(rep(seq_len(num.folds), length.out = length(label)))
}
stopifnot(length(label) == length(foldid))
if (verbose > 2)
message("+++ finished assign.fold.regr")
return(foldid)
}
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
#' #'
#' Mainly used for running the examples in the function documentation. #' Mainly used for running the examples in the function documentation.
#' #'
#' @encoding UTF-8
#'
#' @name siamcat_example #' @name siamcat_example
#' #'
#' @source \url{http://msb.embopress.org/content/10/11/766} #' @source \url{http://msb.embopress.org/content/10/11/766}
...@@ -27,6 +29,8 @@ NULL ...@@ -27,6 +29,8 @@ NULL
#' et al. MSB 2014 (see \url{http://msb.embopress.org/content/10/11/766}), #' et al. MSB 2014 (see \url{http://msb.embopress.org/content/10/11/766}),
#' containing 141 samples and 1754 bacterial species (features). #' containing 141 samples and 1754 bacterial species (features).
#' #'
#' @encoding UTF-8
#'
#' @name feat.crc.zeller #' @name feat.crc.zeller
#' #'
#' @source \url{http://msb.embopress.org/content/10/11/766} #' @source \url{http://msb.embopress.org/content/10/11/766}
...@@ -42,6 +46,8 @@ NULL ...@@ -42,6 +46,8 @@ NULL
#' al. MSB 2014 (see \url{http://msb.embopress.org/content/10/11/766}), #' al. MSB 2014 (see \url{http://msb.embopress.org/content/10/11/766}),
#' containing 6 metadata variables variables (e.g. Age or BMI) for 141 samples. #' containing 6 metadata variables variables (e.g. Age or BMI) for 141 samples.
#' #'
#' @encoding UTF-8
#'
#' @name meta.crc.zeller #' @name meta.crc.zeller
#' #'