Commit 2f8ddb68 authored by Jakob Wirbel's avatar Jakob Wirbel
Browse files

mostly documentation changes.

parent d08ee5f9
......@@ -3,38 +3,37 @@
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title Add metadata as predictors
#'
#' @description This function adds metadata to the feature matrix to be later
#' used as predictors
#' used as predictors
#'
#' @usage add.meta.pred(siamcat, pred.names,
#' std.meta = TRUE,
#' feature.type='normalized',
#' verbose = 1)
#' @usage add.meta.pred(siamcat, pred.names, std.meta = TRUE,
#' feature.type='normalized', verbose = 1)
#'
#' @param siamcat object of class \link{siamcat-class}
#'
#' @param pred.names vector of names of the variables within the metadata to be
#' added to the feature matrix as predictors
#' @param pred.names vector of names of the variables within the metadata to
#' be added to the feature matrix as predictors
#'
#' @param std.meta boolean, should added metadata features be standardized?,
#' defaults to \code{TRUE}
#' defaults to \code{TRUE}
#'
#' @param feature.type string, on which type of features should the function
#' work? Can be either \code{"original"}, \code{"filtered"}, or
#' \code{"normalized"}. Please only change this paramter if you know what
#' you are doing!
#' @param feature.type string, on which type of features should the function
#' work? Can be either \code{"original"}, \code{"filtered"}, or
#' \code{"normalized"}. Please only change this paramter if you know what
#' you are doing!
#'
#' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @keywords SIAMCAT add.meta.pred
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @details This functions adds one or several metadata variables to the set
#' of features, so that they can be included for model training.
......@@ -50,7 +49,7 @@
#' numerically before you start the SIAMCAT workflow.
#'
#' @return an object of class \link{siamcat-class} with metadata added to the
#' features
#' features
#'
#' @examples
#' data(siamcat_example)
......@@ -60,9 +59,8 @@
#'
#' # Add Age and BMI as potential predictors
#' # Additionally, prevent standardization of the added features
#' siamcat_meta_added <- add.meta.pred(siamcat_example,
#' pred.names=c('Age', 'BMI'),
#' std.meta=FALSE)
#' siamcat_meta_added <- add.meta.pred(siamcat_example,
#' pred.names=c('Age', 'BMI'), std.meta=FALSE)
add.meta.pred <- function(siamcat, pred.names, std.meta = TRUE,
feature.type = 'normalized', verbose = 1) {
......
......@@ -4,41 +4,57 @@
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title Check for potential confounders in the metadata
#' @description Checks potential confounders in the metadata and produces
#' some visualizations
#'
#' @description Checks potential confounders in the metadata and visualize the
#' results
#'
#' @usage check.confounders(siamcat, fn.plot, meta.in = NULL,
#' feature.type='filtered', verbose = 1)
#'
#' @param siamcat an object of class \link{siamcat-class}
#'
#' @param fn.plot string, filename for the pdf-plot
#' @param meta.in vector, specific metadata variable names to analyze,
#' defaults to NULL (all metadata variables will be analyzed)
#'@param feature.type string, on which type of features should the function
#'
#' @param meta.in vector, specific metadata variable names to analyze,
#' defaults to NULL (all metadata variables will be analyzed)
#'
#' @param feature.type string, on which type of features should the function
#' work? Can be either \code{c()"original", "filtered", or "normalized")}.
#' Please only change this paramter if you know what you are doing!
#' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @keywords SIAMCAT check.confounders
#' @details This function checks for associations between class labels and
#' potential confounders (e.g. Age, Sex, or BMI) that are present in the
#' metadata. Statistical testing is performed with Fisher's exact test or
#' Wilcoxon test, while associations are visualized either as barplot or
#' Q-Q plot, depending on the type of metadata.
#'
#' Additionally, it evaluates associations among metadata variables using
#' conditional entropy and associations with the label using generalized
#' linear models, producing a correlation heatmap and appropriate
#' quantitative barplots, respectively.
#'
#' @details This function checks for associations between class labels and
#' potential confounders (e.g. Age, Sex, or BMI) that are present in the
#' metadata. Statistical testing is performed with Fisher's exact test or
#' Wilcoxon test, while associations are visualized either as barplot or
#' Q-Q plot, depending on the type of metadata.
#'
#' Additionally, it evaluates associations among metadata variables using
#' conditional entropy and associations with the label using generalized
#' linear models, producing a correlation heatmap and appropriate
#' quantitative barplots, respectively.
#'
#' Please note that the confounder check is currently only available for binary
#' classification problems!
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @return Does not return anything, but outputs plots to specified pdf file
#'
#' @examples
#' # Example data
#' data(siamcat_example)
#'
#' # Simple working example
#' check.confounders(siamcat_example, './conf_plot.pdf')
check.confounders <- function(siamcat, fn.plot, meta.in = NULL,
feature.type='filtered', verbose = 1) {
......@@ -46,6 +62,10 @@ check.confounders <- function(siamcat, fn.plot, meta.in = NULL,
if (verbose > 1) message("+ starting check.confounders")
s.time <- proc.time()[3]
label <- label(siamcat)
if (label$type!='BINARY'){
stop("Confounder check is currently only possible for",
" classification tasks")
}
meta <- meta(siamcat)
# get features
if (feature.type == 'original'){
......
......@@ -7,71 +7,72 @@
#'
#' @name create.data.split
#'
#' @description This function prepares the cross-validation by splitting the
#' data into \code{num.folds} training and test folds for
#' \code{num.resample} times.
#' @description This function prepares the cross-validation by splitting the
#' data into \code{num.folds} training and test folds for
#' \code{num.resample} times.
#'
#' @usage create.data.split(siamcat, num.folds = 2, num.resample = 1,
#' stratify = TRUE, inseparable = NULL, verbose = 1)
#' @usage create.data.split(siamcat, num.folds = 2, num.resample = 1,
#' stratify = TRUE, inseparable = NULL, verbose = 1)
#'
#' @param siamcat object of class \link{siamcat-class}
#'
#' @param num.folds integer number of cross-validation folds (needs to be
#' \code{>=2}), defaults to \code{2}
#' @param num.folds integer number of cross-validation folds (needs to be
#' \code{>=2}), defaults to \code{2}
#'
#' @param num.resample integer, resampling rounds (values \code{<= 1}
#' deactivate resampling), defaults to \code{1}
#' @param num.resample integer, resampling rounds (values \code{<= 1}
#' deactivate resampling), defaults to \code{1}
#'
#' @param stratify boolean, should the splits be stratified so that an equal
#' proportion of classes are present in each fold?, defaults to \code{TRUE}
#' @param stratify boolean, should the splits be stratified so that an equal
#' proportion of classes are present in each fold?, will be ignored for
#' regression tasks, defaults to \code{TRUE}
#'
#' @param inseparable string, name of metadata variable to be inseparable,
#' defaults to \code{NULL}, see Details below
#' defaults to \code{NULL}, see Details below
#'
#' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @keywords SIAMCAT create.data.split
#'
#' @return object of class \link{siamcat-class} with the \code{data_split}-slot
#' filled
#' filled
#'
#' @details This function splits the labels within a \link{siamcat-class} object
#' and prepares the internal cross-validation for the model training (see
#' \link{train.model}).
#' @details This function splits the labels within a \link{siamcat-class}
#' object and prepares the internal cross-validation for the model training
#' (see \link{train.model}).
#'
#' The function saves the training and test instances for the different
#' cross-validation folds within a list in the \code{data_split}-slot of the
#' \link{siamcat-class} object, which is a list with four entries: \itemize{
#' \item \code{num.folds} - the number of cross-validation folds
#' \item \code{num.resample} - the number of repetitions for the
#' cross-validation
#' \item \code{training.folds} - a list containing the indices for the
#' training instances
#' \item \code{test.folds} - a list containing the indices for the
#' test instances }
#'
#' The function saves the training and test instances for the different
#' cross-validation folds within a list in the \code{data_split}-slot of the
#' \link{siamcat-class} object, which is a list with four entries:
#' \itemize{
#' \item \code{num.folds} - the number of cross-validation folds
#' \item \code{num.resample} - the number of repetitions for the
#' cross-validation
#' \item \code{training.folds} - a list containing the indices for the
#' training instances
#' \item \code{test.folds} - a list containing the indices for the
#' test instances }
#'
#' If provided, the data split will take into account a metadata variable
#' for the data split (by providing the \code{inseparable} argument). For
#' example, if the data contains several samples for the same individual,
#' it would make sense to keep data from the same individual within the
#' same fold.
#' If \code{inseparable} is given, the \code{stratify} argument will be
#' ignored.
#' If provided, the data split will take into account a metadata variable
#' for the data split (by providing the \code{inseparable} argument). For
#' example, if the data contains several samples for the same individual,
#' it makes sense to keep data from the same individual within the
#' same fold.
#'
#' If \code{inseparable} is given, the \code{stratify} argument will be
#' ignored.
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @examples
#' data(siamcat_example)
#'
#' # simple working example
#' siamcat_split <- create.data.split(siamcat_example,
#' num.folds=10,
#' num.resample=5,
#' stratify=TRUE)
#' siamcat_split <- create.data.split(siamcat_example, num.folds=10,
#' num.resample=5, stratify=TRUE)
create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
stratify = TRUE, inseparable = NULL, verbose = 1) {
......@@ -80,9 +81,9 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
s.time <- proc.time()[3]
label <- label(siamcat)
if (label$type != 'BINARY'){
stop("SIAMCAT only works with binary labels at the moment!")
} else {
if (label$type == 'CONTINUOUS'){
stratify <- FALSE
} else if (label$type=='BINARY') {
group.numbers <- vapply(label$info,
FUN = function(x){
sum(label$label == x)},
......@@ -95,6 +96,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
"\nThis is not enough for SIAMCAT to proceed!"
)
}
} else if (label$type == 'TEST'){
stop("Cannot create data split for TEST object!")
}
......@@ -116,32 +119,23 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
### check arguments
if (num.resample < 1) {
if (verbose > 1)
message(
paste0(
"+++ Resetting num.resample = 1 (",
message(paste0("+++ Resetting num.resample = 1 (",
num.resample,
" is an invalid number of resampling rounds)"
)
)
" is an invalid number of resampling rounds)"))
num.resample <- 1
}
if (num.folds < 2) {
if (verbose > 1)
message(
paste0(
"+++ Resetting num.folds = 2 (",
message(paste0("+++ Resetting num.folds = 2 (",
num.folds,
" is an invalid number of folds)"
)
)
" is an invalid number of folds)"))
num.folds <- 2
}
if (!is.null(inseparable) && stratify) {
if (verbose > 1)
message(
"+++ Resetting stratify to FALSE (Stratification is not
supported when inseparable is given"
)
message(paste0("+++ Resetting stratify to FALSE ",
"(Stratification is not supported when ",
"inseparable is given"))
stratify <- FALSE
}
if (num.folds >= length(labelNum)) {
......@@ -175,15 +169,24 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
for (r in seq_len(num.resample)) {
labelNum <- sample(labelNum)
foldid <-
assign.fold(
label = labelNum,
num.folds = num.folds,
stratified = stratify,
inseparable = inseparable,
meta = meta(siamcat)[names(labelNum),],
verbose = verbose
)
if (label$type == 'BINARY'){
foldid <-
assign.fold.binary(
label = labelNum,
num.folds = num.folds,
stratified = stratify,
inseparable = inseparable,
meta = meta(siamcat)[names(labelNum),],
verbose = verbose)
} else if (label$type == 'CONTINUOUS'){
foldid <-
assign.fold.regr(
label = labelNum,
num.folds = num.folds,
inseparable = inseparable,
meta = meta(siamcat)[names(labelNum),],
verbose = verbose)
}
names(foldid) <- names(labelNum)
stopifnot(length(labelNum) == length(foldid))
stopifnot(length(unique(foldid)) == num.folds)
......@@ -198,9 +201,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
# stratify==TRUE should be tested before assignment of
# test/training set
if (stratify) {
stopifnot(all(sort(unique(
labelNum[foldid == f]
)) == classes))
stopifnot(all(sort(unique(labelNum[foldid == f])) ==
classes))
}
# select test examples
test.idx <- which(foldid == f)
......@@ -210,20 +212,14 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
# for startify==FALSE, all classes must only be present in the
# training set e.g. in leave-one-out CV, the test fold
# cannot contain all classes
if (!stratify) {
stopifnot(all(sort(unique(
labelNum[foldid != f]
)) == classes))
if (!stratify && label$type == 'BINARY') {
stopifnot(all(sort(unique(labelNum[foldid != f]))
== classes))
}
stopifnot(length(intersect(train.idx, test.idx)) == 0)
if (verbose > 2)
message(paste(
"+++ fold ",
f,
" contains ",
sum(foldid == f),
" samples"
))
message(paste("+++ fold ", f, " contains ",
sum(foldid == f), " samples"))
}
train.list[[r]] <- train.temp
test.list[[r]] <- test.temp
......@@ -237,11 +233,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
)
e.time <- proc.time()[3]
if (verbose > 1)
message(paste(
"+ finished create.data.split in",
formatC(e.time - s.time, digits = 3),
"s"
))
message(paste("+ finished create.data.split in",
formatC(e.time - s.time, digits = 3),"s"))
if (verbose == 1)
message("Features splitted for cross-validation successfully.")
return(siamcat)
......@@ -249,15 +242,10 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
#' @keywords internal
assign.fold <-
function(label,
num.folds,
stratified,
inseparable = NULL,
meta = NULL,
verbose = 1) {
assign.fold.binary <- function(label, num.folds, stratified,
inseparable = NULL, meta = NULL, verbose = 1) {
if (verbose > 2)
message("+++ starting assign.fold")
message("+++ starting assign.fold.binary")
foldid <- rep(0, length(label))
classes <- sort(unique(label))
# Transform number of classes into vector of 1 to x for looping over.
......@@ -318,6 +306,35 @@ assign.fold <-
stopifnot(length(label) == length(foldid))
if (verbose > 2)
message("+++ finished assign.fold")
message("+++ finished assign.fold.binary")
return(foldid)
}
#' @keywords internal
assign.fold.regr <- function(label, num.folds, inseparable = NULL,
meta = NULL, verbose = 1) {
if (verbose > 2)
message("+++ starting assign.fold.regr")
foldid <- rep(0, length(label))
# If stratify is not TRUE, make sure that num.sample is not
# bigger than number.folds
if (!is.null(inseparable)) {
strata <- unique(meta[[inseparable]])
sid <- sample(rep(seq_len(num.folds), length.out = length(strata)))
for (s in seq_along(strata)) {
idx <- which(meta[[inseparable]] == strata[s])
foldid[idx] <- sid[s]
}
stopifnot(all(!is.na(foldid)))
} else {
foldid <- sample(rep(seq_len(num.folds), length.out = length(label)))
}
stopifnot(length(label) == length(foldid))
if (verbose > 2)
message("+++ finished assign.fold.regr")
return(foldid)
}
......@@ -12,6 +12,8 @@
#'
#' Mainly used for running the examples in the function documentation.
#'
#' @encoding UTF-8
#'
#' @name siamcat_example
#'
#' @source \url{http://msb.embopress.org/content/10/11/766}
......@@ -27,6 +29,8 @@ NULL
#' et al. MSB 2014 (see \url{http://msb.embopress.org/content/10/11/766}),
#' containing 141 samples and 1754 bacterial species (features).
#'
#' @encoding UTF-8
#'
#' @name feat.crc.zeller
#'
#' @source \url{http://msb.embopress.org/content/10/11/766}
......@@ -42,6 +46,8 @@ NULL
#' al. MSB 2014 (see \url{http://msb.embopress.org/content/10/11/766}),
#' containing 6 metadata variables variables (e.g. Age or BMI) for 141 samples.
#'
#' @encoding UTF-8
#'
#' @name meta.crc.zeller
#'
#' @source \url{http://msb.embopress.org/content/10/11/766}
......
......@@ -5,64 +5,64 @@
#' @title Perform unsupervised feature filtering.
#'
#' @description This function performs unsupervised feature filtering. Features
#' can be filtered based on abundance, prevalence, or on variance.
#' Additionally, unmapped reads may be removed.
#' @description This function performs unsupervised feature filtering.
#'
#' @usage filter.features(siamcat, filter.method = "abundance",
#' cutoff = 0.001, rm.unmapped = TRUE,
#' feature.type='original', verbose = 1)
#' @usage filter.features(siamcat, filter.method = "abundance",
#' cutoff = 0.001, rm.unmapped = TRUE, feature.type='original', verbose = 1)
#'
#' @param siamcat an object of class \link{siamcat-class}
#'
#' @param filter.method string, method used for filtering the features, can be
#' one of these: \code{c('abundance', 'cum.abundance', 'prevalence',
#' 'variance', 'pass')}, defaults to \code{'abundance'}
#' one of these: \code{c('abundance', 'cum.abundance', 'prevalence',
#' 'variance', 'pass')}, defaults to \code{'abundance'}
#'
#' @param cutoff float, abundace, prevalence, or variance cutoff, defaults
#' to \code{0.001} (see Details below)
#' to \code{0.001} (see Details below)
#'
#' @param rm.unmapped boolean, should unmapped reads be discarded?, defaults to
#' \code{TRUE}
#' \code{TRUE}
#'
#' @param feature.type string, on which type of features should the function
#' work? Can be either \code{"original"}, \code{"filtered"}, or
#' \code{"normalized"}. Please only change this paramter if you know what
#' you are doing!
#' work? Can be either \code{"original"}, \code{"filtered"}, or
#' \code{"normalized"}. Please only change this paramter if you know what
#' you are doing!
#'
#' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @keywords SIAMCAT filter.features
#'
#' @details This function filters the features in a \link{siamcat-class}
#' object in a unsupervised manner.
#'
#' The different filter methods work in the following way: \itemize{
#' \item \code{'abundace'} - remove features whose maximum abundance is
#' never above the threshold value in any of the samples
#' \item \code{'cum.abundance'} - remove features with very low abundance
#' in all samples, i.e. those that are never among the most abundant
#' entities that collectively make up (1-cutoff) of the reads in
#' any sample
#' \item \code{'prevalence'} - remove features with low prevalence across
#' samples, i.e. those that are undetected (relative abundance of 0)
#' in more than \code{1 - cutoff} percent of samples.
#' \item \code{'variance'} - remove features with low variance across
#' samples, i.e. those that have a variance lower than \code{cutoff}
#' \item \code{'pass'} - pass-through filtering will not change the
#' features
#' }
#'
#' Features can also be filtered repeatedly with different methods, e.g.
#' first using the maximum abundance filtering and then using prevalence
#' filtering.
#' However, if a filtering method has already been applied to the dataset,
#' SIAMCAT will default back on the original features for filtering.
#' object in a unsupervised manner.
#'
#' The different filter methods work in the following way: \itemize{
#' \item \code{'abundace'} - remove features whose maximum abundance is
#' never above the threshold value in any of the samples
#' \item \code{'cum.abundance'} - remove features with very low abundance
#' in all samples, i.e. those that are never among the most abundant
#' entities that collectively make up (1-cutoff) of the reads in
#' any sample
#' \item \code{'prevalence'} - remove features with low prevalence across
#' samples, i.e. those that are undetected (relative abundance of 0)
#' in more than \code{1 - cutoff} percent of samples.
#' \item \code{'variance'} - remove features with low variance across
#' samples, i.e. those that have a variance lower than \code{cutoff}
#' \item \code{'pass'} - pass-through filtering will not change the
#' features
#' }
#'
#' Features can also be filtered repeatedly with different methods, e.g.
#' first using the maximum abundance filtering and then using prevalence
#' filtering.
#' However, if a filtering method has already been applied to the dataset,
#' SIAMCAT will default back on the original features for filtering.
#'
#' @export filter.features
#'
#' @encoding UTF-8
#'
#' @return siamcat an object of class \link{siamcat-class}
#'
#' @examples
......@@ -78,7 +78,12 @@
#' siamcat_filtered <- filter.features(siamcat_example,
#' filter.method='prevalence',
#' cutoff=0.05)
#'
#' # filter first for abundance and then for prevalence
#' siamcat_filt <- filter.features(siamcat_example,
#' filter.method='abundance', cutoff=1e-03)
#' siamcat_filt <- filter.features(siamcat_filt, filter.method='prevalence',
#' cutoff=0.05, feature.type='filtered')
filter.features <- function(siamcat,
filter.method = "abundance",
cutoff = 0.001,
......
......@@ -3,9 +3,9 @@
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title create a lefse input file from siamcat object
#' @title create a LEfSe input file from SIAMCAT object