Commit 2f8ddb68 authored by Jakob Wirbel's avatar Jakob Wirbel
Browse files

mostly documentation changes.

parent d08ee5f9
......@@ -3,21 +3,18 @@
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title Add metadata as predictors
#'
#' @description This function adds metadata to the feature matrix to be later
#' used as predictors
#'
#' @usage add.meta.pred(siamcat, pred.names,
#' std.meta = TRUE,
#' feature.type='normalized',
#' verbose = 1)
#' @usage add.meta.pred(siamcat, pred.names, std.meta = TRUE,
#' feature.type='normalized', verbose = 1)
#'
#' @param siamcat object of class \link{siamcat-class}
#'
#' @param pred.names vector of names of the variables within the metadata to be
#' added to the feature matrix as predictors
#' @param pred.names vector of names of the variables within the metadata to
#' be added to the feature matrix as predictors
#'
#' @param std.meta boolean, should added metadata features be standardized?,
#' defaults to \code{TRUE}
......@@ -36,6 +33,8 @@
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @details This functions adds one or several metadata variables to the set
#' of features, so that they can be included for model training.
#'
......@@ -61,8 +60,7 @@
#' # Add Age and BMI as potential predictors
#' # Additionally, prevent standardization of the added features
#' siamcat_meta_added <- add.meta.pred(siamcat_example,
#' pred.names=c('Age', 'BMI'),
#' std.meta=FALSE)
#' pred.names=c('Age', 'BMI'), std.meta=FALSE)
add.meta.pred <- function(siamcat, pred.names, std.meta = TRUE,
feature.type = 'normalized', verbose = 1) {
......
......@@ -4,22 +4,31 @@
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title Check for potential confounders in the metadata
#' @description Checks potential confounders in the metadata and produces
#' some visualizations
#'
#' @description Checks potential confounders in the metadata and visualize the
#' results
#'
#' @usage check.confounders(siamcat, fn.plot, meta.in = NULL,
#' feature.type='filtered', verbose = 1)
#'
#' @param siamcat an object of class \link{siamcat-class}
#'
#' @param fn.plot string, filename for the pdf-plot
#'
#' @param meta.in vector, specific metadata variable names to analyze,
#' defaults to NULL (all metadata variables will be analyzed)
#'@param feature.type string, on which type of features should the function
#'
#' @param feature.type string, on which type of features should the function
#' work? Can be either \code{c()"original", "filtered", or "normalized")}.
#' Please only change this paramter if you know what you are doing!
#'
#' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @keywords SIAMCAT check.confounders
#'
#' @details This function checks for associations between class labels and
#' potential confounders (e.g. Age, Sex, or BMI) that are present in the
#' metadata. Statistical testing is performed with Fisher's exact test or
......@@ -30,15 +39,22 @@
#' conditional entropy and associations with the label using generalized
#' linear models, producing a correlation heatmap and appropriate
#' quantitative barplots, respectively.
#'
#' Please note that the confounder check is currently only available for binary
#' classification problems!
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @return Does not return anything, but outputs plots to specified pdf file
#'
#' @examples
#' # Example data
#' data(siamcat_example)
#'
#' # Simple working example
#' check.confounders(siamcat_example, './conf_plot.pdf')
check.confounders <- function(siamcat, fn.plot, meta.in = NULL,
feature.type='filtered', verbose = 1) {
......@@ -46,6 +62,10 @@ check.confounders <- function(siamcat, fn.plot, meta.in = NULL,
if (verbose > 1) message("+ starting check.confounders")
s.time <- proc.time()[3]
label <- label(siamcat)
if (label$type!='BINARY'){
stop("Confounder check is currently only possible for",
" classification tasks")
}
meta <- meta(siamcat)
# get features
if (feature.type == 'original'){
......
......@@ -23,7 +23,8 @@
#' deactivate resampling), defaults to \code{1}
#'
#' @param stratify boolean, should the splits be stratified so that an equal
#' proportion of classes are present in each fold?, defaults to \code{TRUE}
#' proportion of classes are present in each fold?, will be ignored for
#' regression tasks, defaults to \code{TRUE}
#'
#' @param inseparable string, name of metadata variable to be inseparable,
#' defaults to \code{NULL}, see Details below
......@@ -38,14 +39,13 @@
#' @return object of class \link{siamcat-class} with the \code{data_split}-slot
#' filled
#'
#' @details This function splits the labels within a \link{siamcat-class} object
#' and prepares the internal cross-validation for the model training (see
#' \link{train.model}).
#' @details This function splits the labels within a \link{siamcat-class}
#' object and prepares the internal cross-validation for the model training
#' (see \link{train.model}).
#'
#' The function saves the training and test instances for the different
#' cross-validation folds within a list in the \code{data_split}-slot of the
#' \link{siamcat-class} object, which is a list with four entries:
#' \itemize{
#' \link{siamcat-class} object, which is a list with four entries: \itemize{
#' \item \code{num.folds} - the number of cross-validation folds
#' \item \code{num.resample} - the number of repetitions for the
#' cross-validation
......@@ -57,21 +57,22 @@
#' If provided, the data split will take into account a metadata variable
#' for the data split (by providing the \code{inseparable} argument). For
#' example, if the data contains several samples for the same individual,
#' it would make sense to keep data from the same individual within the
#' it makes sense to keep data from the same individual within the
#' same fold.
#'
#' If \code{inseparable} is given, the \code{stratify} argument will be
#' ignored.
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @examples
#' data(siamcat_example)
#'
#' # simple working example
#' siamcat_split <- create.data.split(siamcat_example,
#' num.folds=10,
#' num.resample=5,
#' stratify=TRUE)
#' siamcat_split <- create.data.split(siamcat_example, num.folds=10,
#' num.resample=5, stratify=TRUE)
create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
stratify = TRUE, inseparable = NULL, verbose = 1) {
......@@ -80,9 +81,9 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
s.time <- proc.time()[3]
label <- label(siamcat)
if (label$type != 'BINARY'){
stop("SIAMCAT only works with binary labels at the moment!")
} else {
if (label$type == 'CONTINUOUS'){
stratify <- FALSE
} else if (label$type=='BINARY') {
group.numbers <- vapply(label$info,
FUN = function(x){
sum(label$label == x)},
......@@ -95,6 +96,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
"\nThis is not enough for SIAMCAT to proceed!"
)
}
} else if (label$type == 'TEST'){
stop("Cannot create data split for TEST object!")
}
......@@ -116,32 +119,23 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
### check arguments
if (num.resample < 1) {
if (verbose > 1)
message(
paste0(
"+++ Resetting num.resample = 1 (",
message(paste0("+++ Resetting num.resample = 1 (",
num.resample,
" is an invalid number of resampling rounds)"
)
)
" is an invalid number of resampling rounds)"))
num.resample <- 1
}
if (num.folds < 2) {
if (verbose > 1)
message(
paste0(
"+++ Resetting num.folds = 2 (",
message(paste0("+++ Resetting num.folds = 2 (",
num.folds,
" is an invalid number of folds)"
)
)
" is an invalid number of folds)"))
num.folds <- 2
}
if (!is.null(inseparable) && stratify) {
if (verbose > 1)
message(
"+++ Resetting stratify to FALSE (Stratification is not
supported when inseparable is given"
)
message(paste0("+++ Resetting stratify to FALSE ",
"(Stratification is not supported when ",
"inseparable is given"))
stratify <- FALSE
}
if (num.folds >= length(labelNum)) {
......@@ -175,15 +169,24 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
for (r in seq_len(num.resample)) {
labelNum <- sample(labelNum)
if (label$type == 'BINARY'){
foldid <-
assign.fold(
assign.fold.binary(
label = labelNum,
num.folds = num.folds,
stratified = stratify,
inseparable = inseparable,
meta = meta(siamcat)[names(labelNum),],
verbose = verbose
)
verbose = verbose)
} else if (label$type == 'CONTINUOUS'){
foldid <-
assign.fold.regr(
label = labelNum,
num.folds = num.folds,
inseparable = inseparable,
meta = meta(siamcat)[names(labelNum),],
verbose = verbose)
}
names(foldid) <- names(labelNum)
stopifnot(length(labelNum) == length(foldid))
stopifnot(length(unique(foldid)) == num.folds)
......@@ -198,9 +201,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
# stratify==TRUE should be tested before assignment of
# test/training set
if (stratify) {
stopifnot(all(sort(unique(
labelNum[foldid == f]
)) == classes))
stopifnot(all(sort(unique(labelNum[foldid == f])) ==
classes))
}
# select test examples
test.idx <- which(foldid == f)
......@@ -210,20 +212,14 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
# for startify==FALSE, all classes must only be present in the
# training set e.g. in leave-one-out CV, the test fold
# cannot contain all classes
if (!stratify) {
stopifnot(all(sort(unique(
labelNum[foldid != f]
)) == classes))
if (!stratify && label$type == 'BINARY') {
stopifnot(all(sort(unique(labelNum[foldid != f]))
== classes))
}
stopifnot(length(intersect(train.idx, test.idx)) == 0)
if (verbose > 2)
message(paste(
"+++ fold ",
f,
" contains ",
sum(foldid == f),
" samples"
))
message(paste("+++ fold ", f, " contains ",
sum(foldid == f), " samples"))
}
train.list[[r]] <- train.temp
test.list[[r]] <- test.temp
......@@ -237,11 +233,8 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
)
e.time <- proc.time()[3]
if (verbose > 1)
message(paste(
"+ finished create.data.split in",
formatC(e.time - s.time, digits = 3),
"s"
))
message(paste("+ finished create.data.split in",
formatC(e.time - s.time, digits = 3),"s"))
if (verbose == 1)
message("Features splitted for cross-validation successfully.")
return(siamcat)
......@@ -249,15 +242,10 @@ create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
#' @keywords internal
assign.fold <-
function(label,
num.folds,
stratified,
inseparable = NULL,
meta = NULL,
verbose = 1) {
assign.fold.binary <- function(label, num.folds, stratified,
inseparable = NULL, meta = NULL, verbose = 1) {
if (verbose > 2)
message("+++ starting assign.fold")
message("+++ starting assign.fold.binary")
foldid <- rep(0, length(label))
classes <- sort(unique(label))
# Transform number of classes into vector of 1 to x for looping over.
......@@ -318,6 +306,35 @@ assign.fold <-
stopifnot(length(label) == length(foldid))
if (verbose > 2)
message("+++ finished assign.fold")
message("+++ finished assign.fold.binary")
return(foldid)
}
#' @keywords internal
assign.fold.regr <- function(label, num.folds, inseparable = NULL,
meta = NULL, verbose = 1) {
if (verbose > 2)
message("+++ starting assign.fold.regr")
foldid <- rep(0, length(label))
# If stratify is not TRUE, make sure that num.sample is not
# bigger than number.folds
if (!is.null(inseparable)) {
strata <- unique(meta[[inseparable]])
sid <- sample(rep(seq_len(num.folds), length.out = length(strata)))
for (s in seq_along(strata)) {
idx <- which(meta[[inseparable]] == strata[s])
foldid[idx] <- sid[s]
}
stopifnot(all(!is.na(foldid)))
} else {
foldid <- sample(rep(seq_len(num.folds), length.out = length(label)))
}
stopifnot(length(label) == length(foldid))
if (verbose > 2)
message("+++ finished assign.fold.regr")
return(foldid)
}
......@@ -12,6 +12,8 @@
#'
#' Mainly used for running the examples in the function documentation.
#'
#' @encoding UTF-8
#'
#' @name siamcat_example
#'
#' @source \url{http://msb.embopress.org/content/10/11/766}
......@@ -27,6 +29,8 @@ NULL
#' et al. MSB 2014 (see \url{http://msb.embopress.org/content/10/11/766}),
#' containing 141 samples and 1754 bacterial species (features).
#'
#' @encoding UTF-8
#'
#' @name feat.crc.zeller
#'
#' @source \url{http://msb.embopress.org/content/10/11/766}
......@@ -42,6 +46,8 @@ NULL
#' al. MSB 2014 (see \url{http://msb.embopress.org/content/10/11/766}),
#' containing 6 metadata variables variables (e.g. Age or BMI) for 141 samples.
#'
#' @encoding UTF-8
#'
#' @name meta.crc.zeller
#'
#' @source \url{http://msb.embopress.org/content/10/11/766}
......
......@@ -5,13 +5,10 @@
#' @title Perform unsupervised feature filtering.
#'
#' @description This function performs unsupervised feature filtering. Features
#' can be filtered based on abundance, prevalence, or on variance.
#' Additionally, unmapped reads may be removed.
#' @description This function performs unsupervised feature filtering.
#'
#' @usage filter.features(siamcat, filter.method = "abundance",
#' cutoff = 0.001, rm.unmapped = TRUE,
#' feature.type='original', verbose = 1)
#' cutoff = 0.001, rm.unmapped = TRUE, feature.type='original', verbose = 1)
#'
#' @param siamcat an object of class \link{siamcat-class}
#'
......@@ -61,8 +58,11 @@
#' filtering.
#' However, if a filtering method has already been applied to the dataset,
#' SIAMCAT will default back on the original features for filtering.
#'
#' @export filter.features
#'
#' @encoding UTF-8
#'
#' @return siamcat an object of class \link{siamcat-class}
#'
#' @examples
......@@ -78,7 +78,12 @@
#' siamcat_filtered <- filter.features(siamcat_example,
#' filter.method='prevalence',
#' cutoff=0.05)
#'
#' # filter first for abundance and then for prevalence
#' siamcat_filt <- filter.features(siamcat_example,
#' filter.method='abundance', cutoff=1e-03)
#' siamcat_filt <- filter.features(siamcat_filt, filter.method='prevalence',
#' cutoff=0.05, feature.type='filtered')
filter.features <- function(siamcat,
filter.method = "abundance",
cutoff = 0.001,
......
......@@ -3,9 +3,9 @@
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title create a lefse input file from siamcat object
#' @title create a LEfSe input file from SIAMCAT object
#'
#' @description This function creates a lefse input file from siamcat object
#' @description This function creates a LEfSe input file from SIAMCAT object
#'
#' @param siamcat object of class \link{siamcat-class}
#'
......@@ -15,13 +15,14 @@
#'
#' @return nothing but data is written to a file
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @examples
#'
#' data(siamcat_example)
#' siamcat.to.lefse(siamcat_example)
#'
#' @export
#'
siamcat.to.lefse <- function(siamcat, filename="siamcat_output.txt") {
feat <- get.orig_feat.matrix(siamcat)
label <- label(siamcat)
......
......@@ -3,10 +3,10 @@
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title create a MaAsLin input file from siamcat object
#' @title create a MaAsLin input file from SIAMCAT object
#'
#' @description This function creates a MaAsLin merged PCL single input file
#' from siamcat object
#' from SIAMCAT object
#'
#' @param siamcat object of class \link{siamcat-class}
#'
......@@ -16,13 +16,14 @@
#'
#' @return nothing but data is written to a file
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @examples
#'
#' data(siamcat_example)
#' siamcat.to.maaslin(siamcat_example)
#'
#' @export
#'
siamcat.to.maaslin <- function(siamcat, filename="siamcat_output.pcl") {
feat <- get.orig_feat.matrix(siamcat)
label <- label(siamcat)
......
......@@ -5,25 +5,22 @@
#' @title Perform feature normalization
#'
#' @description This function performs feature normalization according to user-
#' specified parameters.
#' @description This function performs feature normalization according to
#' user-specified parameters.
#'
#' @usage normalize.features(siamcat,
#' norm.method = c("rank.unit", "rank.std",
#' @usage normalize.features(siamcat, norm.method = c("rank.unit", "rank.std",
#' "log.std", "log.unit", "log.clr", "std", "pass"),
#' norm.param = list(log.n0 = 1e-06, sd.min.q = 0.1,
#' n.p = 2, norm.margin = 1),
#' feature.type='filtered',
#' verbose = 1)
#' norm.param = list(log.n0 = 1e-06, sd.min.q = 0.1, n.p = 2, norm.margin = 1),
#' feature.type='filtered', verbose = 1)
#'
#' @param siamcat an object of class \link{siamcat-class}
#'
#' @param norm.method string, normalization method, can be one of these:
#' '\code{c('rank.unit', 'rank.std', 'log.std', 'log.unit', 'log.clr',
#' 'std', 'pass')}
#' \code{c('rank.unit', 'rank.std', 'log.std', 'log.unit', 'log.clr','std',
#' 'pass')}
#'
#' @param norm.param list, specifying the parameters of the different
#' normalization methods, see details for more information
#' normalization methods, see Details for more information
#'
#' @param feature.type string, on which type of features should the function
#' work? Can be either \code{"original"}, \code{"filtered"}, or
......@@ -35,49 +32,44 @@
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @details There are seven different normalization methods available:
#' @section Implemented methods:
#' There are seven different normalization methods available, which
#' might need additional parameters, which are passed via the \code{norm.param}
#' list:
#' \itemize{
#' \item \code{'rank.unit'} - converts features to ranks and normalizes
#' each column (=sample) by the square root of the sum of ranks
#' This method does not require additional parameters.
#' \item \code{'rank.std'} - converts features to ranks and applies z-score
#' standardization
#' \item \code{'log.clr'} - centered log-ratio transformation (with the
#' addition of pseudocounts)
#' \item \code{'log.std'} - log-transforms features (after addition of
#' pseudocounts) and applies z-score standardization
#' \item \code{'log.unit'} - log-transforms features (after addition of
#' pseudocounts) and normalizes by features or samples with different norms
#' standardization.
#' This method requires \code{sd.min.q} (minimum quantile of the standard
#' deviation to be added to all features in order to avoid underestimation of
#' standard deviation) as additional parameter.
#' \item \code{'log.clr'} - centered log-ratio transformation.
#' This methods requires a pseudocount (\code{log.n0}) before
#' log-transformation.
#' \item \code{'log.std'} - log-transforms features and applies z-score
#' standardization.
#' This method requires both a pseudocount (\code{log.n0}) and \code{sd.min.q}
#' \item \code{'log.unit'} - log-transforms features and normalizes by
#' features or samples with different norms.
#' This method requires a pseudocount (\code{log.n0}) and then additionally the
#' parameters \code{norm.maring} (margin over which to normalize, similarly to
#' the \code{apply}-syntax: Allowed values are \code{1} for normalization
#' over features, \code{2} over samples, and \code{3} for normalization
#' by the global maximum) and the parameter \code{n.p} (vector norm to be
#' used, can be either \code{1} for \code{x/sum(x)} or \code{2} for
#' \code{x/sqrt(sum(x^2))}).
#' \item \code{'std'} - z-score standardization without any other
#' transformation
#' \item \code{'pass'} - pass-through normalization will not change the
#' features}
#'
#' The list entries in \code{'norm.param'} specify the normalzation parameters,
#' which are dependant on the normalization method of choice:
#' \itemize{
#' \item \code{'rank.unit' or 'pass'} does not require any other parameters
#' \item \code{'rank.std' and 'std'} requires \code{sd.min.q}, quantile
#' of the distribution of standard deviations of all features that will
#' be added to the denominator during standardization in order to avoid
#' underestimation of the standard deviation, defaults to 0.1
#' \item \code{'log.clr'} requires \code{log.n0}, which is the pseudocount
#' to be added before log-transformation, defaults to \code{NULL} leading
#' to the estimation of \code{log.n0} from the data
#' \item \code{'log.std'} requires both \code{log.n0} and \code{sd.min.q},
#' using the same default values
#' \item \code{'log.unit'} requires next to \code{log.n0} also the
#' parameters \code{n.p} and \code{norm.margin}. \code{n.p} specifies the
#' vector norm to be used, can be either \code{1} for \code{x/sum(x)} or
#' \code{2} for \code{x/sqrt(sum(x^2))}. The parameter \code{norm.margin}
#' specifies the margin over which to normalize, similarly to the
#' \code{apply}-syntax: Allowed values are \code{1} for normalization
#' over features, \code{2} over samples, and \code{3} for normalization
#' by the global maximum.
#'}
#' This method only requires the \code{sd.min.q} parameter
#' \item \code{'pass'} - pass-through normalization will not change
#' the features}
#'
#' @section Frozen normalization:
#' The function additionally allows to perform a frozen normalization on a
#' different dataset. After normalizing the first dataset, the \code{norm_feat}
#' slot in the siamcat object contains all parameters of the normalization,
#' slot in the SIAMCAT object contains all parameters of the normalization,
#' which you can access via the \link{norm_params} accessor.
#'
#' In order to perform a frozen normalization of a new dataset, you can run the
......@@ -89,6 +81,8 @@
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @return an object of class \link{siamcat-class} with normalized features
#'
#' @examples
......@@ -112,7 +106,6 @@
#' # Frozen normalization
#' \donttest{siamcat_norm <- normalize.features(siamcat,
#' norm.param=norm_params(siamcat_reference))}
normalize.features <- function(siamcat,
norm.method = c("rank.unit", "rank.std",
"log.std", "log.unit",
......
......@@ -9,8 +9,6 @@
#'
#' @param fn.in.label name of the tsv file containing labels
#'
#' @export
#'
#' @return label object containing several entries:\itemize{
#' \item \code{$label} named vector containing the numerical labels from the
#' file;
......@@ -34,6 +32,12 @@
#'
#' Note: Labels can take other numeric values (but not characters or strings);
#' importantly, the label for cases has to be greater than the one for controls