...
 
Commits (123)
......@@ -29,7 +29,7 @@ preprocess_data:
- Rscript 01_validate_data.r --metadata_in num_metadata_cancer-vs-healthy_study-pop-I_N141_tax_profile_mocat_bn_genus.tsv --metadata_out valMetaData.tsv --label_in label_cancer-vs-healthy_study-pop-I_N141_tax_profile_mocat_bn_genus.tsv --label_out valLabel.tsv --feat_in feat_cancer-vs-healthy_study-pop-I_N141_tax_profile_mocat_bn_genus.tsv --feat_out valFeat.tsv
- Rscript 02_select_samples.r --metadata_in valMetaData.tsv --metadata_out valMetaData_sel.tsv --label_in valLabel.tsv --label_out valLabel_sel.tsv --feat_in valFeat.tsv --feat_out valFeat_sel.tsv --filter_var="age" --allowed_range="[0,120]"
- Rscript 03_check_for_confounders.r --metadata_in valMetaData_sel.tsv --feat_in valFeat.tsv --plot metaCheck.pdf --label_in valLabel_sel.tsv
- Rscript 04_filter_features.r --feat_in valFeat_sel.tsv --feat_out valFeat_sel_filtered.tsv --method="abundance" --cutoff="0.001" --recomp_prop="FALSE" --rm_unmapped="TRUE"
- Rscript 04_filter_features.r --feat_in valFeat_sel.tsv --feat_out valFeat_sel_filtered.tsv --method="abundance" --cutoff="0.001" --rm_unmapped="TRUE"
- Rscript 05_check_associations.r --feat_in valFeat_sel_filtered.tsv --label_in valLabel_sel.tsv --plot assoc.pdf --col_scheme="RdYlBu" --alpha="0.7" --min_fc="0" --mult_test="fdr" --max_show="50" --detect_limit="1e-06" --plot_type="bean"
- Rscript 06_normalize_features.r --feat_in valFeat_sel_filtered.tsv --feat_out valFeat_sel_norm.tsv --param_out param_out.tsv --method="log.unit" --log_n0="1e-08" --sd_min_quantile="0.2" --vector_norm="2" --norm_margin="1"
- Rscript 07_add_metadata_as_predictor.r --feat_in valFeat_sel_norm.tsv --feat_out valFeat_sel_norm.tsv --metadata_in valMetaData_sel.tsv --pred_names="fobt" --std_meta="TRUE"
......@@ -41,7 +41,7 @@ preprocess_data:
only:
- master
- development
- dev
train_model:
stage: train_model
......@@ -58,7 +58,7 @@ train_model:
only:
- master
- development
- dev
make_predictions:
stage: make_predictions
......@@ -75,7 +75,7 @@ make_predictions:
only:
- master
- development
- dev
evaluate_interpret:
stage: evaluate_interpret
......@@ -83,7 +83,7 @@ evaluate_interpret:
- cd Rscript_flavor
- R CMD INSTALL SIAMCAT_1.0.1.tar.gz
script:
- Rscript 11_evaluate_predictions.r --feat_in valFeat_sel_norm.tsv --label_in valLabel_sel.tsv --plot evalPlot.pdf --pred pred.tsv
- Rscript 11_evaluate_predictions.r --feat_in valFeat_sel_norm.tsv --label_in valLabel_sel.tsv --data_split dataSplit.Rdata --plot evalPlot.pdf --pred pred.tsv
- Rscript 12_interpret_model.r --label_in valLabel_sel.tsv --feat_in valFeat_sel_norm.tsv --origin_feat valFeat_sel.tsv --metadata_in valMetaData_sel.tsv --mlr_models_list models.RData --plot interpretation.pdf --pred pred.tsv --col_scheme="BrBG" --heatmap_type="zscore" --consens_thres="0.5"
artifacts:
......@@ -93,4 +93,4 @@ evaluate_interpret:
only:
- master
- development
- dev
......@@ -49,10 +49,11 @@ Imports:
RColorBrewer,
scales,
stats,
stringr,
utils
License: GPL-3
LazyData: true
RoxygenNote: 6.0.1.9000
RoxygenNote: 6.1.0
biocViews: Metagenomics, Classification, Microbiome, Sequencing, Preprocessing,
Clustering, FeatureExtraction, GeneticVariability, MultipleComparison,
Regression
......
# Generated by roxygen2: do not edit by hand
export("associations<-")
export("data_split<-")
export("eval_data<-")
export("features<-")
export("filt_feat<-")
export("label<-")
export("meta<-")
export("model_list<-")
export("norm_param<-")
export("norm_feat<-")
export("orig_feat<-")
export("physeq<-")
export("pred_matrix<-")
export(accessSlot)
export(add.meta.pred)
export(assoc_param)
export(associations)
export(check.associations)
export(check.confounders)
export(create.data.split)
export(create.label.from.metadata)
export(create.label)
export(data_split)
export(eval_data)
export(evaluate.predictions)
export(features)
export(feature_type)
export(feature_weights)
export(filt_feat)
export(filt_params)
export(filter.features)
export(filter.label)
export(get.features.matrix)
export(get.filt_feat.matrix)
export(get.norm_feat.matrix)
export(get.orig_feat.matrix)
export(label)
export(make.predictions)
......@@ -32,16 +39,14 @@ export(model.interpretation.plot)
export(model_list)
export(model_type)
export(models)
export(norm_param)
export(norm_feat)
export(norm_params)
export(normalize.features)
export(orig_feat)
export(physeq)
export(pred_matrix)
export(read.features)
export(read.labels)
export(read.label)
export(read.lefse)
export(read.meta)
export(reset.features)
export(select.samples)
export(siamcat)
export(siamcat.to.lefse)
......@@ -49,12 +54,14 @@ export(siamcat.to.maaslin)
export(summarize.features)
export(train.model)
export(validate.data)
export(weight_matrix)
exportClasses(associations)
exportClasses(data_split)
exportClasses(eval_data)
exportClasses(filt_feat)
exportClasses(label)
exportClasses(model_list)
exportClasses(norm_param)
exportClasses(orig_feat)
exportClasses(norm_feat)
exportClasses(pred_matrix)
exportClasses(siamcat)
import(LiblineaR)
......@@ -127,6 +134,8 @@ importFrom(stats,rnorm)
importFrom(stats,runif)
importFrom(stats,sd)
importFrom(stats,wilcox.test)
importFrom(stringr,str_extract_all)
importFrom(utils,askYesNo)
importFrom(utils,head)
importFrom(utils,read.csv)
importFrom(utils,read.table)
......
......@@ -7,13 +7,18 @@
#' @title Add metadata as predictors
#' @description This function adds metadata to the feature matrix to be later
#' used as predictors
#' @usage add.meta.pred(siamcat, pred.names = NULL, std.meta =
#' TRUE, verbose = 1)
#' @usage add.meta.pred(siamcat, pred.names,
#' std.meta = TRUE,
#' feature.type='normalized',
#' verbose = 1)
#' @param siamcat object of class \link{siamcat-class}
#' @param pred.names vector of names of the variables within the metadata to be
#' added to the feature matrix as predictors
#' @param std.meta boolean, should added metadata features be standardized?,
#' defaults to \code{TRUE}
#' @param feature.type On which type of features should the function work? Can
#' be either "original", "filtered", or "normalized". Please only change this
#' paramter if you know what you are doing!
#' @param verbose control output: \code{0} for no output at all, \code{1} for
#' only information about progress and success, \code{2} for normal level of
#' information and \code{3} for full debug information, defaults to \code{1}
......@@ -24,17 +29,46 @@
#' @examples
#' data(siamcat_example)
#' # Add the Age of the patients as potential predictor
#' siamcat_age_added <- add.meta.pred(siamcat_example, pred.names=c('age'))
#' siamcat_age_added <- add.meta.pred(siamcat_example, pred.names=c('Age'))
#'
#' # Add Age, BMI, and Gender as potential predictors
#' # Additionally, prevent standardization of the added features
#' siamcat_meta_added <- add.meta.pred(siamcat_example, pred.names=c('age',
#' 'bmi', 'gender'), std.meta=FALSE)
add.meta.pred <- function(siamcat, pred.names = NULL, std.meta = TRUE,
#' siamcat_meta_added <- add.meta.pred(siamcat_example, pred.names=c('Age',
#' 'BMI', 'Gender'), std.meta=FALSE)
add.meta.pred <- function(siamcat, pred.names,
std.meta = TRUE,
feature.type = 'normalized',
verbose = 1) {
if (verbose > 1)
message("+ starting add.meta.pred")
s.time <- proc.time()[3]
if (is.null(meta(siamcat))) {
stop('SIAMCAT object has no metadata. Exiting...')
}
if (!feature.type %in% c('original', 'filtered', 'normalized')){
stop("Unrecognised feature type, exiting...\n")
}
if (feature.type != 'normalized'){
warning('It is recommended to add a meta-predictor only',
' after feature normalization.')
}
# get the right features
if (feature.type == 'original'){
feat <- get.orig_feat.matrix(siamcat)
} else if (feature.type == 'filtered'){
if (is.null(filt_feat(siamcat, verbose=0))){
stop('Features have not yet been filtered, exiting...\n')
}
feat <- get.filt_feat.matrix(siamcat)
} else if (feature.type == 'normalized'){
if (is.null(norm_feat(siamcat, verbose=0))){
stop('Features have not yet been normalized, exiting...\n')
}
feat <- get.norm_feat.matrix(siamcat)
}
### add metadata as predictors to the feature matrix
cnt <- 0
......@@ -47,6 +81,9 @@ add.meta.pred <- function(siamcat, pred.names = NULL, std.meta = TRUE,
if (!p %in% colnames(meta(siamcat))) {
stop("There is no metadata variable called ", p)
}
if (paste0('META_', toupper(p)) %in% rownames(feat)){
stop("This meta-variable has already been added. Exiting...")
}
idx <- which(colnames(meta(siamcat)) == p)
if (length(idx) != 1)
stop(p, "matches multiple columns in the metada")
......@@ -55,7 +92,7 @@ add.meta.pred <- function(siamcat, pred.names = NULL, std.meta = TRUE,
# check if the meta-variable is a factor or numeric
if (class(m) == 'factor'){
message(paste0('WARNING: the meta-variable is a factor and',
warning(paste0('WARNING: meta-variable ', p,' is a factor and',
' not numeric...\n The values will be converted to numerical',
' values with "as.numeric()"\n'))
m <- as.numeric(m)
......@@ -79,12 +116,21 @@ add.meta.pred <- function(siamcat, pred.names = NULL, std.meta = TRUE,
stopifnot(!m.sd == 0)
m <- (m - m.mean)/m.sd
}
features.with.meta <- otu_table(rbind(features(siamcat),m),
features.with.meta <- otu_table(rbind(feat,m),
taxa_are_rows = TRUE)
rownames(features.with.meta)[nrow(features.with.meta)] <- paste(
"META_", toupper(p), sep = "")
features(siamcat) <- features.with.meta
if (feature.type == 'original'){
orig_feat(siamcat) <- features.with.meta
} else if (feature.type == 'filtered'){
filt_feat(siamcat) <- new('filt_feat',
filt.feat=features.with.meta,
filt.param=filt_params(siamcat))
} else if (feature.type == 'normalized'){
norm_feat(siamcat) <- new("norm_feat",
norm.feat=features.with.meta,
norm.param=norm_params(siamcat))
}
cnt <- cnt + 1
}
if (verbose > 1)
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -12,7 +12,7 @@
#' \code{num.resample} times.
#'
#' @usage create.data.split(siamcat, num.folds = 2, num.resample = 1,
#' stratify = TRUE,inseparable = NULL, verbose = 1)
#' stratify = TRUE, inseparable = NULL, verbose = 1)
#'
#' @param siamcat object of class \link{siamcat-class}
#'
......@@ -64,13 +64,9 @@
#' ## # example with a variable which is to be inseparable
#' ## siamcat_split <- create.data.split(siamcat_example, num.folds=10,
#' ## num.resample=5, stratify=FALSE, inseparable='Gender')
create.data.split <-
function(siamcat,
num.folds = 2,
num.resample = 1,
stratify = TRUE,
inseparable = NULL,
verbose = 1) {
create.data.split <- function(siamcat, num.folds = 2, num.resample = 1,
stratify = TRUE, inseparable = NULL, verbose = 1) {
if (verbose > 1)
message("+ starting create.data.split")
s.time <- proc.time()[3]
......@@ -88,7 +84,7 @@ create.data.split <-
}
# parse label description
classes <- sort(c(label$negative.lab, label$positive.lab))
classes <- sort(label$info)
### check arguments
if (num.resample < 1) {
......
#!/usr/bin/Rscript
### SIAMCAT - Statistical Inference of Associations between
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title create a label object from metadata or an atomic vector
#'
#' @description This function creates a label object from metadata
#' or an atomic vector
#'
#' @usage create.label(label, case,
#' meta=NULL, control=NULL,
#' p.lab = NULL, n.lab = NULL,
#' remove.meta.column=FALSE,
#' verbose=1)
#'
#' @param label named vector to create the label or the name of the metadata
#' column that will be used to create the label
#'
#' @param case name of the group that will be used as a positive label. If the
#' variable is binary, the other label will be used as a negative one. If the
#' variable has multiple values, all the other values will be used a negative
#' label (testing one vs rest).
#'
#' @param meta metadata dataframe object or an object of class
#' \link[phyloseq]{sample_data-class}
#'
#' @param control name of a label or vector with names that will be used as a
#' negative label. All values that are nor equal to case and control will be
#' dropped. Default to NULL in which case: If the variable is binary, the value
#' not equal to case will be used as negative. If the variable has multiple
#' values, all the values not equal to cases will be used a negative label
#' (testing one vs rest).
#'
#' @param p.lab name of the positive group (useful mostly for visualizations).
#' Default to NULL in which case the value of the positive group will be used.
#'
#' @param n.lab name of the negative group (useful mostly for visualizations).
#' Default to NULL in which case the value of the negative group will be used
#' for binary variables and "rest" will be used for variables with multiple
#' values.
#'
#' @param remove.meta.column boolean indicating if the label column in the
#' metadata should be retained. Please note that if this is set to
#' \code{TRUE}, the function will return a list as result. Defaults to
#' \code{FALSE}
#'
#' @param verbose control output: \code{0} for no output at all, \code{1}
#' for only information about progress and success, \code{2} for normal
#' level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @keywords create.label
#'
#' @return an object of class \link{label-class} OR a list with entries
#' \code{meta} and \code{label}, if \code{remove.meta.column} is set to
#' \code{TRUE}
#'
#' @examples
#' data('meta_crc_zeller')
#' label <- create.label(label='Group', case='CRC', meta=meta.crc.zeller)
#'
#' @export
create.label <- function(label, case, meta=NULL, control = NULL,
p.lab=NULL, n.lab=NULL, remove.meta.column=FALSE, verbose=1) {
if (verbose > 1)
message("+ starting create.label")
s.time <- proc.time()[3]
#if metadata has been supplied and the label is of length 1
if (!is.null(meta) & length(label) == 1){
if (!label %in% colnames(meta))
stop("ERROR: Column ", label, " not found in the metadata\n")
if (class(meta) == 'sample_data'){
label.vec <- vapply(meta[, label], as.character,
FUN.VALUE = character(nrow(meta)))
} else if (class(meta) == 'data.frame'){
label.vec <- vapply(meta[, label], as.character,
FUN.VALUE = character(1))
} else {
stop(paste0('Please provide the metadata either as a data.frame',
' or a sample_data object!'))
}
names(label.vec) <- rownames(meta)
#if the label is a vector
} else if (is.atomic(label) & length(label) > 1){
label.vec <- label
if (is.null(names(label.vec))){
stop('The label vectors needs to be named!')
}
if (is.factor(label.vec)){
names.old <- names(label.vec)
label.vec <- as.character(label.vec)
names(label.vec) <- names.old
}
} else {
stop(paste0('Could not interpret your input.\n Please provide ',
'either a column name and metadata or a label vector.\nExiting...!'))
}
# remove NAs in the label
if (any(is.na(label.vec))){
if (verbose > 1) message(paste0('+ removing ', sum(is.na(label.vec)),
' instances of NA in the label'))
label.vec <- label.vec[!is.na(label.vec)]
}
# get different groups
groups <- unique(label.vec)
### checking case
if(!all(case %in% groups)){
stop("The chosen label does not contain values: ",
paste(case,collapse=","),"\nInstead, contains: ",
paste(groups, collapse=','))
}
### checking control
if (is.null(control)) {
if((length(groups)-length(case))>1){
control <- "rest"
}else{
control <- setdiff(groups, case)
}
}else{
if(!control%in%groups){
stop("The chose label does not contain value:",control,
"\nInstead, contains: ", paste(groups, collapse=','))
}
### dropping unused values
if(any(!groups%in%c(case, control))){
label.vec <- label.vec[which(label.vec%in%c(case, control))]
warning("Dropping values: ",
groups[which(!groups%in%c(case, control))])
}
}
# message status
if (verbose > 0)
message("Label used as case:\n ",paste(case,collapse=","),
"\nLabel used as control:\n ",paste(control,collapse=","))
# create new label.object
label.new <- list(label = rep(-1, length(label.vec)))
n.lab <- ifelse(is.null(n.lab), gsub("[_.-]", " ", control), n.lab)
p.lab <- ifelse(is.null(p.lab),
ifelse(length(case) > 1, 'Case', gsub("[_.-]", " ", case)), p.lab)
info <- c(-1, 1)
names(info) <- c(n.lab, p.lab)
names(label.new$label) <- names(label.vec)
if (length(case) > 1){
label.new$label[which(label.vec %in% case)] <- 1
} else {
label.new$label[which(label.vec == case)] <- 1
}
label.new$info <- info
label.new$type <- "BINARY"
label.new <- label(label.new)
e.time <- proc.time()[3]
if (verbose > 0)
message(paste(
"+ finished create.label.from.metadata in",
formatC(e.time - s.time, digits = 3),
"s"
))
if (!remove.meta.column){
return(label.new)
} else {
meta.new <- meta[,-which(colnames(meta) == label)]
return(list(label=label.new,
meta=meta.new))
}
}
#!/usr/bin/Rscript
### SIAMCAT - Statistical Inference of Associations between
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title create a label object from metadata
#'
#' @description This function creates a label object from metadata
#'
#' @usage create.label.from.metadata(meta, column, case,
#' control=NULL, p.lab = NULL, n.lab = NULL, verbose=1)
#'
#' @param meta metadata as read by \link{read.meta}
#' of \link[phyloseq]{sample_data-class}
#'
#' @param column name of column that will be used
#' to create the label
#'
#' @param case name of a label that will be used as a positive label. If the
#' variable is binary, the other label will be used as a negative one. If the
#' variable has multiple values, all the other values will be used a negative
#' label (testing one vs rest).
#'
#' @param control name of a label or vector with names that will be used as a
#' negative label. All values that are nor equal to case and control will be
#' dropped. Default to NULL in which case: If the variable is binary, the value
#' not equal to case will be used as negative. If the variable has multiple
#' values, all the values not equal to cases will be used a negative label
#' (testing one vs rest).
#'
#' @param p.lab name of the positive label (useful mostly for visualizations).
#' Default to NULL in which case the value of the positive label will be used.
#'
#' @param n.lab name of the negative label (useful mostly for visualizations).
#' Default to NULL in which case the value of the negative label will be used
#' for binary variables and "rest" will be used for variables with multiple
#' values.
#'
#' @param verbose control output: \code{0} for no output at all, \code{1}
#' for only information about progress and success, \code{2} for normal
#' level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @keywords create.label.from.metadata
#'
#' @return an object of class \link{label-class}
#'
#' @examples
#' data(siamcat_example)
#' label <- create.label.from.metadata(meta(siamcat_example),"fobt",
#' case = 1, control = 0)
#'
#' @export
create.label.from.metadata <- function(meta, column, case, control = NULL,
p.lab = NULL, n.lab = NULL, verbose=1) {
if (verbose > 1)
message("+ starting create.label.from.metadata")
s.time <- proc.time()[3]
if (!column %in% colnames(meta))
stop("ERROR: Column ", column, " not found in the metadata\n")
metaColumn <- vapply(meta[, column], as.character,
FUN.VALUE = character(nrow(meta)))
names(metaColumn) <- rownames(meta)
labels <- unique(metaColumn)
if (length(labels) == 2){
if (verbose > 0) message("Column ", column, " contains binary label\n")
if(!case%in%labels){
stop("Column ", column, " does not contain value:",case,"\n")
}
if (is.null(control)) {
control <- setdiff(unique(labels), case)
} else {
if(!control%in%labels){
stop("Column ", column, " does not contain value:",control,"\n")
}
}
}else if(length(labels) > 2){
if(!case%in%labels){
stop("Column ", column, " does not contain value:",case,"\n")
}
if (is.null(control)) {
control <- "rest"
} else {
if(!control%in%labels){
stop("Column ", column, " does not contain value:",control,"\n")
}
if(any(!labels%in%c(case, control))){
metaColumn <- metaColumn[which(metaColumn%in%c(case, control))]
warning("Dropping values: ",
labels[which(!labels%in%c(case, control))])
}
}
}
if (verbose > 0)
message("Label used as case:\n ",case,
"\nLabel used as control:\n ",
paste(labels[which(labels!=case)], collapse = ","))
label <-
list(
label = rep(-1, length(metaColumn)),
positive.lab = 1,
negative.lab = (-1)
)
label$n.lab <- gsub("[_.-]", " ", control)
label$p.lab <- gsub("[_.-]", " ", case)
class.descr <- c(-1, 1)
names(class.descr) <- c(label$n.lab, label$p.lab)
names(label$label) <- names(metaColumn)
label$header <-
paste0("#BINARY:1=", label$p.lab, ";-1=", label$n.lab)
label$label[which(metaColumn == case)] <- 1
label$n.idx <- label$label == label$negative.lab
label$p.idx <- label$label == label$positive.lab
label$info <- list(type = "BINARY", class.descr = class.descr)
labelRes <-
label(
list(
label = label$label,
header = label$header,
info = label$info,
positive.lab = label$positive.lab,
negative.lab = label$negative.lab,
n.idx = label$n.idx,
p.idx = label$p.idx,
n.lab = label$n.lab,
p.lab = label$p.lab
)
)
e.time <- proc.time()[3]
if (verbose > 0)
message(paste(
"+ finished create.label.from.metadata in",
formatC(e.time - s.time, digits = 3),
"s"
))
return(labelRes)
}
......@@ -2,15 +2,39 @@
#'
#' Reduced version of the CRC dataset in inst/extdata, containing 100 features
#' (15 associated features at 5\% FDR in the original dataset and 85 random
#' other features) and 141 samples, saved after the complete SIAMCAT
#' other features) and 141 samples, saved after the complete SIAMCAT
#' pipelinehas been run.
#' Therefore, contains entries in every siamcat-object slot,
#' e.g, \code{eval_data} or \code{data_split}. Mainly used for running
#' the examples in the function documentation
#'
#'
#' @name siamcat_example
#'
#'
#' @docType data
#'
#' @keywords data
NULL
#' Documentation for the example feature object in the data folder
#'
#' Feature matrix (as data.frame) for the CRC dataset, containing 141 samples
#' and 1754 bacterial species (features).
#'
#' @name feat.crc.zeller
#'
#' @docType data
#'
#'
#' @keywords data
NULL
#' Documentation for the example metadata object in the data folder
#'
#' Metadata (as data.frame) for the CRC dataset, containing 6 variables (e.g.
#' Age or BMI) for 141 samples.
#'
#' @name meta.crc.zeller
#'
#' @docType data
#'
#' @keywords data
NULL
This diff is collapsed.
This diff is collapsed.
......@@ -19,6 +19,7 @@
#' @importFrom gridExtra ttheme_minimal grid.table
#' @importFrom matrixStats rowQuantiles rowMaxs rowSds colRanks rowMedians
#' @importFrom scales alpha
#' @importFrom utils write.table read.csv
#' @importFrom utils write.table read.csv askYesNo
#' @importFrom stringr str_extract_all
NULL
......@@ -4,26 +4,26 @@
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title create a lefse input file from siamcat object
#'
#'
#' @description This function creates a lefse input file from siamcat object
#'
#'
#' @param siamcat object of class \link{siamcat-class}
#'
#'
#' @param filename name of the input file to which data will be save
#'
#'
#' @keywords siamcat.to.lefse
#'
#'
#' @return nothing but data is written to a file
#'
#'
#' @examples
#'
#'
#' data(siamcat_example)
#' siamcat.to.lefse(siamcat_example)
#'
#'
#' @export
#'
#'
siamcat.to.lefse <- function(siamcat, filename="siamcat_output.txt") {
feat <- get.features.matrix(siamcat)
feat <- get.orig_feat.matrix(siamcat)
label <- label(siamcat)
labelD <- label$label
labelD[label$p.idx] <- label$p.lab
......@@ -38,62 +38,60 @@ siamcat.to.lefse <- function(siamcat, filename="siamcat_output.txt") {
rownames(feat))
write.table(results,
file = filename,
quote = FALSE,
sep = '\t',
file = filename,
quote = FALSE,
sep = '\t',
row.names = TRUE,
col.names = FALSE)
}
#' @title read an input file in a LEfSe input format
#'
#'
#' @description This reads an input file in a LEfSe input format
#'
#'
#' @param filename name of the input file in a LEfSe input format
#'
#'
#' @param rows.meta specifies in which rows medata variables are stored
#'
#' @param row.samples specifies in which row sample names are stored
#'
#' @keywords read.lefse
#'
#'
#' @return a list with two elements: \itemize{
#' \item \code{feat} a features matrix (just as returned by
#' \link{read.features} function
#' \item \code{meta} a metadate matrix (just as returned by
#' \link{read.meta} function}
#' \item \code{feat} a features matrix
#' \item \code{meta} a metadate matrix}
#' @examples
#'
#'
#' fn.in.lefse<- system.file("extdata",
#' "LEfSe_crc_zeller_msb_mocat_specI.tsv",package = "SIAMCAT")
#' meta.and.features <- read.lefse(fn.in.lefse, rows.meta = 1:6,
#' meta.and.features <- read.lefse(fn.in.lefse, rows.meta = 1:6,
#' row.samples = 7)
#' meta <- meta.and.features$meta
#' feat <- meta.and.features$feat
#' label <- create.label.from.metadata(meta, "label", case = "cancer")
#' siamcat <- siamcat(feat, label, meta)
#'
#' label <- create.label(meta=meta, label="label", case = "cancer")
#' siamcat <- siamcat(feat=feat, label=label, meta=meta)
#'
#' @export
#'
#'
read.lefse <- function(filename="data.txt", rows.meta = 1, row.samples = 2) {
lefse <- read.csv(filename, sep = "\t", header = FALSE,
lefse <- read.csv(filename, sep = "\t", header = FALSE,
stringsAsFactors = FALSE)
meta <- lefse[rows.meta,]
samples.names <- lefse[row.samples,]
feat <- lefse[(max(c(rows.meta,row.samples))+1):nrow(lefse),]
rownames(meta) <- meta[,1]
meta <- meta[,-1]
colnames(meta) <- samples.names[,-1]
meta <- sample_data(as.data.frame(t(meta)))
rownames(feat) <- feat[,1]
feat <- feat[,-1]
colnames(feat) <- samples.names[,-1]
feat <- apply(feat,c(1,2),as.numeric)
feat <- otu_table(feat,taxa_are_rows = TRUE)
return(list(feat = feat, meta = meta))
}
\ No newline at end of file
}
......@@ -4,27 +4,27 @@
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title create a MaAsLin input file from siamcat object
#'
#' @description This function creates a MaAsLin merged PCL single input file
#'
#' @description This function creates a MaAsLin merged PCL single input file
#' from siamcat object
#'
#'
#' @param siamcat object of class \link{siamcat-class}
#'
#'
#' @param filename name of the input file to which data will be save
#'
#'
#' @keywords internal
#'
#'
#' @return nothing but data is written to a file
#'
#'
#' @examples
#'
#'
#' data(siamcat_example)
#' siamcat.to.maaslin(siamcat_example)
#'
#'
#' @export
#'
#'
siamcat.to.maaslin <- function(siamcat, filename="siamcat_output.pcl") {
feat <- get.features.matrix(siamcat)
feat <- get.orig_feat.matrix(siamcat)
label <- label(siamcat)
meta <- NULL
if(!is.null(meta(siamcat))) meta <- t(meta(siamcat))
......@@ -43,9 +43,9 @@ siamcat.to.maaslin <- function(siamcat, filename="siamcat_output.pcl") {
rownames(feat))
write.table(results,
file = filename,
quote = FALSE,
sep = '\t',
file = filename,
quote = FALSE,
sep = '\t',
row.names = TRUE,
col.names = FALSE)
}
\ No newline at end of file
}
This diff is collapsed.
......@@ -8,6 +8,8 @@
#' the Receiver Operating Characteristic (ROC)-curves, the other the
#' Precision-recall (PR)-curves for the different cross-validation
#' repetitions.
#' @usage model.evaluation.plot(..., fn.plot = NULL,
#' colours=NULL, verbose = 1)
#' @param ... one or more object of class \link{siamcat-class}, can be named
#' @param fn.plot string, filename for the pdf-plot
#' @param colours colour specification for the different \link{siamcat-class}-
......@@ -26,15 +28,18 @@
#' # simple working example
#' model.evaluation.plot(siamcat_example, fn.plot='./eval,pdf')
#'
model.evaluation.plot <- function(..., fn.plot, colours = NULL, verbose = 1) {
model.evaluation.plot <- function(..., fn.plot=NULL, colours = NULL,
verbose = 1) {
if (verbose > 1)
message("+ starting model.evaluation.plot")
s.time <- proc.time()[3]
pdf(fn.plot, onefile = TRUE)
if(!is.null(fn.plot)) pdf(fn.plot, onefile = TRUE)
if (verbose > 2)
message("+ plotting ROC")
if (is.null(fn.plot)) par(ask=TRUE)
par(mar=c(5.1, 4.1, 4.1, 2.1))
plot(
NULL,
xlim = c(0, 1),
......@@ -50,10 +55,15 @@ model.evaluation.plot <- function(..., fn.plot, colours = NULL, verbose = 1) {
if (length(args) > 1) {
# checks
stopifnot(all(vapply(args, class,
FUN.VALUE = character(1)) == 'siamcat'))
stopifnot(all(vapply(args, FUN=function(x){length(eval_data(x)) != 0},
FUN.VALUE = logical(1))))
if (!all(vapply(args, class,
FUN.VALUE = character(1)) == 'siamcat')){
stop("Please supply only SIAMCAT objects. Exiting...")
}
if (any(vapply(args,
FUN=function(x){is.null(eval_data(x, verbose=0))},
FUN.VALUE = logical(1)))){
stop("Not all SIAMCAT objects have evaluation data. Exiting...")
}
n <- length(args)
if (is.null(colours)) {
......@@ -68,6 +78,7 @@ model.evaluation.plot <- function(..., fn.plot, colours = NULL, verbose = 1) {
}
}
stopifnot(length(colours) == n)
# ROC
legend.val <- c()
# plot each roc curve for each eval data object
......@@ -89,6 +100,7 @@ model.evaluation.plot <- function(..., fn.plot, colours = NULL, verbose = 1) {
format(legend.val, digits=3)),
col=colours, lty=1, lwd=2, cex=0.8, y.intersp=1.5)
}
# PR
# precision recall curve
if (verbose > 2)
......@@ -125,18 +137,22 @@ model.evaluation.plot <- function(..., fn.plot, colours = NULL, verbose = 1) {
} else if (length(args) == 1) {
# checks
stopifnot(all(class(args[[1]]) == 'siamcat'))
stopifnot(length(eval_data(args[[1]])) != 0)
if (!all(class(args[[1]]) == 'siamcat'))
stop('Please supply a SIAMCAT object. Exiting...')
if(is.null(eval_data(args[[1]]))){
stop('SIAMCAT object has no evaluation data. Exiting...')
}
# ROC
if (is.null(colours)) colours <- 'black'
auroc <- single.roc.plot(args[[1]], colours, verbose=verbose)
if (is.null(eval_data(args[[1]])$roc.all)) {
if (data_split(args[[1]])$num.resample == 1) {
text(0.7, 0.1, paste("AUC:", format(auroc, digits = 3)))
} else {
text(0.7, 0.1, paste("Mean-prediction AUC:",
format(auroc, digits = 3)))
}
# PR
if (verbose > 2)
message("+ plotting PRC")
......@@ -150,19 +166,20 @@ model.evaluation.plot <- function(..., fn.plot, colours = NULL, verbose = 1) {
)
title(paste("Precision-recall curve for the model", sep = " "))
label <- label(args[[1]])
abline(h = mean(label$label == label$positive.lab),
abline(h = mean(label$label == max(label$info)),
lty = 3)
aupr <- single.pr.plot(args[[1]], colours, verbose=verbose)
if (is.null(eval_data(args[[1]])$roc.all)) {
text(0.7, 0.1, paste("AUC:", format(aupr, digits = 3)))
auprc <- single.pr.plot(args[[1]], colours, verbose=verbose)
if (data_split(args[[1]])$num.resample == 1) {
text(0.7, 0.1, paste("AUC:", format(auprc, digits = 3)))
} else {
text(0.7, 0.1, paste("Mean AUC:", format(aupr, digits = 3)))
text(0.7, 0.1, paste("Mean AUC:", format(auprc, digits = 3)))
}
} else {
stop('No SIAMCAT object supplied. Exiting...')
}
tmp <- dev.off()
if(!is.null(fn.plot)) tmp <- dev.off()
if (is.null(fn.plot)) par(ask=FALSE)
e.time <- proc.time()[3]
if (verbose > 1)
message(paste(
......@@ -170,7 +187,7 @@ model.evaluation.plot <- function(..., fn.plot, colours = NULL, verbose = 1) {
formatC(e.time - s.time, digits = 3),
"s"
))
if (verbose == 1)
if (verbose == 1 & !is.null(fn.plot))
message(paste(
"Plotted evaluation of predictions successfully to:",
fn.plot
......@@ -178,35 +195,29 @@ model.evaluation.plot <- function(..., fn.plot, colours = NULL, verbose = 1) {
}
single.pr.plot <- function(siamcat, colour, verbose) {
eval.data <- eval_data(siamcat)
# pr curves for resampling
if (!is.null(eval.data$roc.all)) {
aucspr = vector("numeric", ncol(pred_matrix(siamcat)))
for (c in seq_len(ncol(pred_matrix(siamcat)))) {
ev = eval.data$ev.list[[c]]
pr = eval.data$pr.list[[c]]
lines(pr$x, pr$y, col = alpha(colour, alpha=0.5))
aucspr[c] = eval.data$aucspr[c]
if (!is.null(eval.data$prc.all)) {
aucspr.all = eval.data$auprc.all
for (c in seq_len(length(eval.data$prc.all))) {
pr = eval.data$prc.all[[c]]
lines(pr$recall, pr$precision, col = alpha(colour, alpha=0.5))
if (verbose > 2)
message(paste(
"+++ AU-PRC (resampled run ",
c,
"): ",
format(aucspr[c], digits = 3)
format(aucspr.all[c], digits = 3)
))
}
ev = eval_data(siamcat)$ev.list[[length(eval_data(siamcat)$ev.list)]]
} else {
ev = eval_data(siamcat)$ev.list[[1]]
}
pr = evaluate.get.pr(ev, verbose = verbose)
lines(pr$x, pr$y, col = colour, lwd = 2)
aupr = evaluate.calc.aupr(ev, verbose = verbose)
pr = eval.data$prc
lines(pr$recall, pr$precision, col = colour, lwd = 2)
auprc = eval.data$auprc
if (!is.null(eval.data$roc.all)) {
......@@ -214,20 +225,20 @@ single.pr.plot <- function(siamcat, colour, verbose) {
message(
paste(
"+ AU-PRC:\n+++ mean-prediction:",
format(aupr, digits = 3),
format(auprc, digits = 3),
"\n+++ averaged :",
format(mean(aucspr), digits = 3),
format(mean(aucspr.all), digits = 3),
"\n+++ sd :",
format(sd(aucspr), digits = 4)
format(sd(aucspr.all), digits = 4)
)
)
} else {
if (verbose > 1)
message("+ AU-PRC:", format(aupr, digits = 3), "\n")
message("+ AU-PRC:", format(auprc, digits = 3), "\n")
}
return(aupr)
return(auprc)
}
single.roc.plot <- function(siamcat, colour, verbose) {
......@@ -235,12 +246,11 @@ single.roc.plot <- function(siamcat, colour, verbose) {
eval.data <- eval_data(siamcat)
if (!is.null(eval.data$roc.all)){
aucs = vector("numeric", length(eval.data$roc.all))
aucs = eval.data$auroc.all
for (c in seq_along(eval.data$roc.all)) {
roc.c = eval.data$roc.all[[c]]
lines(1 - roc.c$specificities, roc.c$sensitivities,
col = alpha(colour, alpha=0.5))
aucs[c] = eval.data$auc.all[c]
if (verbose > 2) {
message(paste('+++ AU-ROC (resampled run ',
c, "): ", format(aucs[c], digits=3)))
......@@ -248,10 +258,10 @@ single.roc.plot <- function(siamcat, colour, verbose) {
}
}
roc.summ = eval.data$roc.average[[1]]
roc.summ = eval.data$roc
lines(1 - roc.summ$specificities, roc.summ$sensitivities,
col = colour, lwd = 2)
auroc = eval.data$auc.average[1]
auroc = eval.data$auroc
# plot CI
x = as.numeric(rownames(roc.summ$ci))
......@@ -278,5 +288,5 @@ single.roc.plot <- function(siamcat, colour, verbose) {
message(paste("+ AU-ROC:", format(auroc, digits = 3)))
}
return(as.numeric(auroc[[1]]))
return(as.numeric(auroc))
}
This diff is collapsed.
......@@ -9,9 +9,12 @@
#' specified parameters.
#'
#' @usage normalize.features(siamcat,
#' norm.method = c("rank.unit", "rank.std", "log.std", "log.unit", "log.clr"),
#' norm.param = list(log.n0 = 1e-06, sd.min.q = 0.1, n.p = 2, norm.margin = 1),
#' verbose = 1)
#' norm.method = c("rank.unit", "rank.std",
#' "log.std", "log.unit", "log.clr"),
#' norm.param = list(log.n0 = 1e-06, sd.min.q = 0.1,
#' n.p = 2, norm.margin = 1),
#' feature.type='filtered',
#' verbose = 1)
#'
#' @param siamcat an object of class \link{siamcat-class}
#'
......@@ -21,6 +24,10 @@
#' @param norm.param list, specifying the parameters of the different
#' normalization methods, see details for more information
#'
#' @param feature.type On which type of features should the function work? Can
#' be either "original", "filtered", or "normalized". Please only change this
#' paramter if you know what you are doing!
#'
#' @param verbose control output: \code{0} for no output at all, \code{1}
#' for only information about progress and success, \code{2} for normal
#' level of information and \code{3} for full debug information,
......@@ -48,9 +55,9 @@
#' distribution of standard deviations of all features that will be added
#' to the denominator during standardization in order to avoid
#' underestimation of the standard deviation, defaults to 0.1
#' \item \code{'clr'} requires \code{log.n0}, which is the pseudocount to be
#' added before log-transformation, defaults to \code{NULL} leading to
#' the estimation of \code{log.n0} from the data
#' \item \code{'log.clr'} requires \code{log.n0}, which is the pseudocount
#' to be added before log-transformation, defaults to \code{NULL} leading
#' to the estimation of \code{log.n0} from the data
#' \item \code{'log.std'} requires both \code{log.n0} and \code{sd.min.q},
#' using the same default values
#' \item \code{'log.unit'} requires next to \code{log.n0} also the
......@@ -79,9 +86,6 @@
#' @examples
#' # Example data
#' data(siamcat_example)
#' # since the whole pipeline has been run in the example data, exchange the
#' # normalized features with the original features
#' siamcat_example <- reset.features(siamcat_example)
#'
#' # Simple example
#' siamcat_norm <- normalize.features(siamcat_example,
......@@ -107,34 +111,48 @@ normalize.features <- function(siamcat,
n.p = 2,
norm.margin = 1
),
feature.type='filtered',
verbose = 1) {
if (verbose > 1)
message("+ starting normalize.features")
s.time <- proc.time()[3]
feat <- get.features.matrix(siamcat)
if (!feature.type %in% c('original', 'filtered', 'normalized')){
stop("Unrecognised feature type, exiting...\n")
}
# get the right features
if (feature.type == 'original'){
feat <- get.orig_feat.matrix(siamcat)
if (verbose > 1) message('+ normalizing original features')
} else if (feature.type == 'filtered'){
if (is.null(filt_feat(siamcat, verbose=0))){
stop('Features have not yet been filtered, exiting...\n')
}
feat <- get.filt_feat.matrix(siamcat)
} else if (feature.type == 'normalized'){
if (is.null(norm_feat(siamcat, verbose=0))){
stop('Features have not yet been normalized, exiting...\n')
}
warning(
paste('Normalizing features that have already been normalized!',
'\nPlease note that some functionalities may not work properly'))
feat <- get.norm_feat.matrix(siamcat)
}
if (is.null(norm.param$norm.method)) {
# de novo normalization
if (verbose > 1)
message(paste(
"+++ performing de novo normalization using the ",
norm.method,
" method"
))
message(paste( "+++ performing de novo normalization using the ",
norm.method, " method"))
### remove features with missing values
keep.idx <- rowSums(is.na(feat)) == 0
if (any(!keep.idx)) {
feat.red.na <- feat[keep.idx,]
if (verbose > 1) {
message(
paste0(
"+++ removed ",
nrow(feat.red.na) - nrow(feat),
message(paste0("+++ removed ", nrow(feat.red.na) - nrow(feat),
" features with missing values (retaining ",
nrow(feat.red.na),
")"
)
)
nrow(feat.red.na), ")"))
}
} else {
feat.red.na <- feat
......@@ -144,27 +162,19 @@ normalize.features <- function(siamcat,
if (any(keep.idx.sd)) {
feat.red <- feat.red.na[!keep.idx.sd,]
if (verbose > 1) {
message(
paste0(
"+++ removed ",
nrow(feat.red.na) -
nrow(feat.red),
message(paste0("+++ removed ", nrow(feat.red.na)-nrow(feat.red),
" features with no variation across samples
(retaining ",
nrow(feat.red),
")"
)
)
(retaining ", nrow(feat.red), ")"))
}
} else {
feat.red <- feat.red.na
}
## check if one of the allowed norm.methods have been supplied
if (!norm.method %in% c("rank.unit",
"rank.std",
"log.std",
"log.unit",
"log.clr")) {
if (length(norm.method) != 1){
stop('Please supply only a single normalization method! Exiting...')
}
if (!norm.method %in%
c("rank.unit", "rank.std", "log.std", "log.unit", "log.clr")) {
stop("Unknown normalization method! Exiting...")
}
......@@ -184,14 +194,19 @@ normalize.features <- function(siamcat,
not supplied. Exiting ..."
)
}
if (norm.method != "rank.std" &&
is.null(norm.param$log.n0)) {
warning(
"Pseudo-count before log-transformation not supplied!
Estimating it as 5% percentile...\n"
)
norm.param$log.n0 <-
quantile(feat.red[feat.red != 0], 0.05)
if (startsWith(norm.method, "log")){
if (any(feat.red < 0)){
stop('Can not perform log-transform on ',
'negative data. Exiting...')
}
if (is.null(norm.param$log.n0)){
warning(
"Pseudo-count before log-transformation not supplied!
Estimating it as 5% percentile...\n"
)
norm.param$log.n0 <-
quantile(feat.red[feat.red != 0], 0.05)
}
}
if (norm.method == "log.std" &&
(is.null(norm.param$sd.min.q))) {
......@@ -296,7 +311,6 @@ normalize.features <- function(siamcat,
"%"
))
stopifnot(!any(is.na(feat.norm)))
norm_param(siamcat) <- norm_param(par)
} else {
# frozen normalization
if (verbose > 1)
......@@ -413,9 +427,13 @@ normalize.features <- function(siamcat,
"%"
))
stopifnot(!any(is.na(feat.norm)))
par <- norm.param
}
features(siamcat) <- otu_table(feat.norm, taxa_are_rows = TRUE)
norm_feat(siamcat) <- new('norm_feat',
norm.feat=otu_table(feat.norm, taxa_are_rows=TRUE),
norm.param=par)
e.time <- proc.time()[3]
if (verbose > 1)
message(paste(
......
......@@ -3,77 +3,7 @@
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title Read feature file
#'
#' @description This file reads in the tsv file with features and
#' converts it into a matrix.
#'
#' The file should be oragnized as follows:
#' features (in rows) x samples (in columns).
#'
#' First row should contain sample labels (consistent with label data), while
#' the first column should contain feature labels (e.g. taxonomic identifiers).
#' The remaining entries are expected to be real values \code{>= 0} that
#' quantify the abundance of each feature in each sample.
#'
#' @param fn.in.feat name of the tsv file containing features
#'