Commit c317f8d5 authored by Konrad Zych's avatar Konrad Zych
Browse files

merging in the development branch

parents e62e4000 660be04a
Pipeline #3072 passed with stage
in 5 minutes and 44 seconds
<<<<<<< HEAD
.Rproj.user
.Rhistory
.RData
......@@ -8,3 +9,16 @@
*.Rproj
*.tar.gz
*.Rcheck*/
=======
.Rbuildignore
SIAMCAT.RProj
.Rproj.user
.Rhistory
.RData
*.DS_Store*
.history
.Rapp.history
*.tar.gz
*.Rcheck*/
inst/doc
>>>>>>> development
image: r-base
before_script:
- cd Rscript_flavor
- Rscript 00_setup.r
- cd ../
- R CMD build ./
- mv SIAMCAT*.tar.gz Rscript_flavor/.
- cd Rscript_flavor
- wget http://congo.embl.de/downloads/siamcat_test_data.tar.gz
- tar -zxf siamcat_test_data.tar.gz && rm siamcat_test_data.tar.gz
test:
stage: test
script:
- Rscript 00_setup.r
- Rscript 01_validate_data.r --metadata_in num_metadata_cancer-vs-healthy_study-pop-I_N141_tax_profile_mocat_bn_genus.tsv --metadata_out valMetaData.tsv --label_in label_cancer-vs-healthy_study-pop-I_N141_tax_profile_mocat_bn_genus.tsv --label_out valLabel.tsv --feat_in feat_cancer-vs-healthy_study-pop-I_N141_tax_profile_mocat_bn_genus.tsv --feat_out valFeat.tsv
- Rscript 02_select_samples.r --metadata_in valMetaData.tsv --metadata_out valMetaData_sel.tsv --label_in valLabel.tsv --label_out valLabel_sel.tsv --feat_in valFeat.tsv --feat_out valFeat_sel.tsv --filter_var="age" --allowed_range="[0,120]"
- Rscript 03_check_for_confounders.r --metadata_in valMetaData_sel.tsv --plot metaCheck.pdf --label_in valLabel_sel.tsv
- Rscript 04_filter_features.r --feat_in valFeat_sel.tsv --feat_out valFeat_sel_filtered.tsv --method="abundance" --cutoff="0.001" --recomp_prop="FALSE" --rm_unmapped="TRUE"
- Rscript 05_check_associations.r --feat_in valFeat_sel_filtered.tsv --label_in valLabel_sel.tsv --plot assoc.pdf --col_scheme="RdYlBu" --alpha="0.7" --min_fc="0" --mult_test="fdr" --max_show="50" --detect_limit="1e-06" --plot_type="bean"
- Rscript 06_normalize_features.r --feat_in valFeat_sel_filtered.tsv --feat_out valFeat_sel_norm.tsv --param_out param_out.tsv --method="log.unit" --log_n0="1e-08" --sd_min_quantile="0.2" --vector_norm="2" --norm_margin="1"
- Rscript 07_add_metadata_as_predictor.r --feat_in valFeat_sel_norm.tsv --feat_out valFeat_sel_norm.tsv --metadata_in valMetaData_sel.tsv --pred_names="fobt" --std_meta="TRUE"
- Rscript 08_split_data.r --label_in valLabel_sel.tsv --train_sets trainSets.tsv --test_sets testSets.tsv --num_folds="10" --resample="5" --stratify="TRUE" --inseparable="NULL" --metadata_in valMetaData_sel.tsv
- Rscript 09_train_models.r --feat_in valFeat_sel_norm.tsv --label_in valLabel_sel.tsv --method="lasso" --train_sets trainSets.tsv --mlr_models_list models.RData --stratify="TRUE" --sel_criterion="auprc" --min_nonzero_coeff="2"
- Rscript 10_make_predictions.r --label_in valLabel_sel.tsv --feat_in valFeat_sel_norm.tsv --test_sets testSets.tsv --pred pred.tsv --mlr_models_list models.RData
- Rscript 11_evaluate_predictions.r --label_in valLabel_sel.tsv --plot evalPlot.pdf --pred pred.tsv
- Rscript 12_interpret_model.r --label_in valLabel_sel.tsv --feat_in valFeat_sel_norm.tsv --origin_feat valFeat_sel.tsv --metadata_in valMetaData_sel.tsv --mlr_models_list models.RData --plot interpretation.pdf --pred pred.tsv --col_scheme="BrBG" --heatmap_type="zscore" --consens_thres="0.5"
only:
- master
- development
after_script:
- rm -r *.tsv
- echo "cleaned up"
Package: SIAMCAT
Type: Package
Title: Statistical Inference of Associations between Microbial Communities And host phenoTypes
Version: 0.2.0
Author: Georg Zeller, Nicolai Karcher, Konrad Zych
Version: 0.3.0
Author: Georg Zeller [aut,cre], Nicolai Karcher [aut], Jakob Wirbel[aut], Konrad Zych [aut]
Authors@R: c(person("Georg", "Zeller", role = c("aut", "cre"), email = "zeller@embl.de"),
person("Konrad", "Zych", role = "aut", email = "konrad.zych@embl.de", comment = c(ORCID = "0000-0001-7426-0516")),
person("Nicolai", "Karcher", role = "aut"),
person("Jakob", "Wirbel", role = "aut", email = "jakob.wirbel@embl.de", comment = c(ORCID = "0000-0002-4073-3562"))
)
Maintainer: Konrad Zych <konrad.zych@embl.de>
Description: SIAMCAT is a pipeline for Statistical Inference of Associations between Microbial Communities And host phenoTypes. A primary goal of analyzing microbiome data is to determine changes in community composition that are associated with environmental factors. In particular, linking human microbiome composition to host phenotypes such as diseases has become an area of intense research. For this, robust statistical modeling and biomarker extraction toolkits are crucially needed. SIAMCAT provides a full pipeline supporting data preprocessing, statistical association testing, statistical modeling (LASSO logistic regression) including tools for evaluation and interpretation of these models (such as cross validation, parameter selection, ROC analysis and diagnostic model plots).
Depends:
Depends:
R (>= 3.1.0),
RColorBrewer,
beanplot,
glmnet,
LiblineaR,
pROC
Imports:
pROC,
mlr,
PRROC
Imports:
optparse,
colorRamps,
gelnet
......@@ -20,3 +27,7 @@ License: GNU GPL-3.0
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.0.1
Suggests: testthat,
knitr,
rmarkdown
VignetteBuilder: knitr
# Generated by roxygen2: do not edit by hand
S3method(plot,data.range)
S3method(predict,plm)
export(add.meta.pred)
export(appendDirName)
export(assign.fold)
export(check.associations)
export(confounder.check)
export(create.tints.rgb)
export(data.splitter)
export(draw.error.bar)
export(eval.result)
export(eval.predictions)
export(evaluation.model.plot)
export(filter.feat)
export(get.foldList)
export(interpretor.model.plot)
export(label.plot.horizontal)
export(make_evaler_options)
export(make_filter_options)
export(make_frozen_normalizer_options)
export(make_meta_adder_options)
export(make_model_interpretor_options)
export(make_normalizer_options)
export(make.predictions)
export(normalize.feat)
export(parse.label.header)
export(parse.model.header)
export(plm.predictor)
export(plm.trainer)
export(read.features)
export(read.labels)
export(read.meta)
export(sample.strat)
export(select.model)
export(select.samples)
export(train.model)
export(train.plm)
export(validate.data)
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Add metadata as predictors
#' @description This function adds metadata to the feature matrix to be later used as predictors
#' @param feat features object
#' @param meta metadata object
#' @param pred.names vector of names of the metavariables to be added to the feature matrix as predictors
#' @param std.meta boolean, should added metadata features be standardized?
#' @keywords SIAMCAT add.meta.pred
#' @export
#' @return features object with added metadata
add.meta.pred <- function(feat, meta, pred.names=NULL, std.meta){
### add metadata as predictors to the feature matrix
cnt <- 0
if (pred.names != '' && !is.null(pred.names)) {
for (p in pred.names) {
if(!p%in%colnames(meta)) stop("There is no meta variable called ",p,"\n")
idx <- which(colnames(meta) == p)
if(length(idx) != 1) stop(p, "matches multiple columns in the metada\n")
cat('adding ', p, '\n', sep='')
m <- meta[,idx]
if (!all(is.finite(m))) {
na.cnt <- sum(!is.finite(m))
cat('filling in', na.cnt, 'missing values by mean imputation\n')
mn <- mean(m, na.rm=TRUE)
m[!is.finite(m)] <- mn
}
if (std.meta) {
cat('standardize metadata feature', p, '\n')
m.mean <- mean(m, na.rm = TRUE)
m.sd <- sd(m, na.rm = TRUE)
stopifnot(!m.sd == 0)
m <- (m - m.mean)/m.sd
}
feat <- rbind(feat, m)
rownames(feat)[nrow(feat)] <- paste('META_', toupper(p), sep='')
cnt <- cnt + 1
}
cat('added', cnt, 'meta-variables as predictors to the feature matrix\n')
} else {
cat('Not adding any of the meta-variables as predictor to the feature matrix\n')
}
stopifnot(all(!is.na(feat)))
invisible(feat)
}
This diff is collapsed.
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Check for potential confounders in the metadata
#' @description This function checks for associations between class labels and potential confounders (e.g. age, sex, or BMI) that are present in the metadata.
#' Statistical testing is performed with Fisher's exact test or Wilcoxon test, while associations are visualized either as barplot or Q-Q plot, depending on the type of metadata.
#' @param meta metadata object
#' @param label labels object
#' @param fn.plot string, filename for the pdf-plot
#' @keywords SIAMCAT confounder.check
#' @export
#' @return Does not return anything, but produces a single plot for each metadata category
confounder.check <- function(meta, label, fn.plot){
# TODO: implement color.scheme selection as function parameter
pdf(fn.plot, onefile=TRUE)
par(xpd=FALSE)
for (m in 1:ncol(meta)) {
mname <- gsub('[_.-]', ' ', colnames(meta)[m])
mname <- paste(toupper(substring(mname, 1, 1)), substring(mname, 2), sep="")
cat('checking', mname, 'as a potential confounder...\n')
mvar <- as.numeric(meta[,m])
u.val <- unique(mvar)
u.val <- u.val[!is.na(u.val)]
colors <- brewer.pal(5,"Dark2")
if (length(u.val) == 1) {
cat(' skipped because all subjects have the same value\n')
} else if (length(u.val) <= 5) {
cat(' using a bar plot\n')
par(mar=c(6.1,4.1,4.1,4.1))
ct <- matrix(NA, nrow=2, ncol=length(u.val))
for (i in 1:length(u.val)) {
ct[1,i] = sum(mvar[label$n.idx] == u.val[i], na.rm=TRUE)
ct[2,i] = sum(mvar[label$p.idx] == u.val[i], na.rm=TRUE)
}
freq <- t(ct)
for (i in 1:dim(freq)[2]) {
freq[,i] <- freq[,i] / sum(freq[,i])
}
barplot(freq, ylim=c(0,1), main=mname, names.arg=c(label$n.lab, label$p.lab), col=colors)
#legend("right", legend=u.val, col=colors)
p.val <- fisher.test(ct)$p.value
mtext(paste('Fisher test p-value:', format(p.val, digits=4)), side=1, line=3, at=1, adj=0)
} else {
cat(' using a Q-Q plot\n')
par(mar=c(5.1,4.1,4.1,4.1))
ax.int <- c(min(mvar, na.rm=TRUE), max(mvar, na.rm=TRUE))
qqplot(mvar[label$n.idx], mvar[label$p.idx], xlim=ax.int, ylim=ax.int, pch=16, cex=0.6,
xlab=label$n.lab, ylab=label$p.lab, main=paste('Q-Q plot for', mname))
abline(0, 1, lty=3)
p.val <- wilcox.test(mvar[label$n.idx], mvar[label$p.idx], exact=FALSE)$p.value
text(ax.int[1]+0.9*(ax.int[2]-ax.int[1]), ax.int[1]+0.1*(ax.int[2]-ax.int[1]),
paste('MWW test p-value:', format(p.val, digits=4)), pos=2)
}
}
tmp <- dev.off()
}
This diff is collapsed.
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Evaluate prediction results
#' @description This function takes the correct labels and predictions for all samples and evaluates the results using the \itemize{
#' \item Area Under the Receiver Operating Characteristic (ROC) Curve (AU-ROC)
#' \item and the Precision-recall Curve (PR)
#' }
#' as metric. Predictions can be supplied either for a single case or as matrix after resampling of the dataset.
#'
#' Prediction results are usually produced with the function \link{plm.predictor}.
#' @param label label object
#' @param pred prediction for each sample by the model, should be a matrix with dimensions \code{length(label) x 1} or \code{length(label) x num.resample}
#' @keywords SIAMCAT eval.result
#' @export
#' @return list containing \itemize{
#' \item \code{$roc.average} average ROC-curve across repeats or a single ROC-curve on complete dataset;
#' \item \code{$auc.average} AUC value for the average ROC-curve;
#' \item \code{$ev.list} list of \code{length(num.folds)}, containing for different decision thresholds the number of false positives, false negatives, true negatives, and true positives;
#' \item \code{$pr.list} list of \code{length(num.folds)}, containing the positive predictive value (precision) and true positive rate (recall) values used to plot the PR curves;
#' }. If \code{prediction} had more than one column, i.e. if the models has been trained with several repeats, the function will additonally return \itemize{
#' \item \code{$roc.all} list of roc objects (see \link[pROC]{roc}) for every repeat;
#' \item \code{$aucspr} vector of AUC values for the PR curves for every repeat;
#' \item \code{$auc.all} vector of AUC values for the ROC curves for every repeat
#'}
eval.predictions <- function(label, pred){
# TODO compare header to label
### make sure that label and prediction are in the same order
#stopifnot(all(names(label) %in% rownames(pred)) && all(rownames(pred) %in% names(label)))
m <- match(names(label$label), rownames(pred))
#cat(m, '\n')
pred <- pred[m,,drop=FALSE]
stopifnot(all(names(label$label) == rownames(pred)))
# ROC curve
auroc = 0
if (ncol(pred) > 1) {
rocc = list(NULL)
aucs = vector('numeric', ncol(pred))
for (c in 1:ncol(pred)) {
rocc[c] = list(roc(response=label$label, predictor=pred[,c], ci=FALSE))
aucs[c] = rocc[[c]]$auc
}
l.vec = rep(label$label, ncol(pred))
} else {
l.vec = label$label
}
# average data for plotting one mean prediction curve
summ.stat = 'mean'
rocsumm = list(roc(response=label$label, predictor=apply(pred, 1, summ.stat),
ci=TRUE, of="se", sp=seq(0, 1, 0.05)))
auroc = list(rocsumm[[1]]$auc)
# precision recall curve
pr = list(NULL)
ev = list(NULL)
if (ncol(pred) > 1) {
aucspr = vector('numeric', dim(pred)[2])
for (c in 1:ncol(pred)) {
ev[c] = list(eval.classifier(pred[,c], label$label, label))
pr[c] = list(get.pr(ev[[c]]))
aucspr[c] = calc.aupr(ev[[c]])
}
ev = append(ev,list(eval.classifier(apply(pred, 1, summ.stat), label$label, label)))
} else {
ev[1] = list(eval.classifier(as.vector(pred), label$label, label))
pr[1] = list(get.pr(ev[[1]]))
}
if (ncol(pred) > 1) {
return(list("roc.all" = rocc,
"auc.all"=aucs,
"roc.average"=rocsumm,
"auc.average"=auroc,
"ev.list"=ev,
"pr.list"=pr,
"aucspr"=aucspr))
} else {
return(list("roc.average"=rocsumm,
"auc.average"=auroc,
"ev.list"=ev,
"pr.list"=pr))
}
}
File mode changed from 100755 to 100644
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# RScript flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 26.06.2017
# GNU GPL 3.0
###
#' @title Perform unsupervised feature filtering.
#' @description This function may convert absolute abundances into relative abundances and then performs unsupervised feature filtering. Features can be filtered based on abundance or prevalence. Additionally, unmapped reads may be removed.
#' @param feat feature object
#' @param filter.method method used for filtering the features, can be one of these: \code{c("abundance", "cum.abundance", "prevalence")}
#' @param cutoff float, abundace or prevalence cutoff
#' @param recomp.prop boolean, should absolute abundances be converted into relative abundances?
#' @param rm.unmapped boolean, should unmapped reads be discarded?
#' @details Currently, there are three filtering methods implemented:
#' \itemize{
#' \item \code{"abundance"} remove features for which the abundance is never above the threshold value (e.g. 0.5\%) in any of the samples
#' \item \code{"cum.abundance"} remove features with very low abundance in all samples, i.e. features that are never among the most abundant entities that collectively make up \code{(1-cutoff)} of the reads in any samples
#' \item \code{"prevalence"} remove features with low prevalence across samples, i.e. features that are undetected in more than \code{(1-cutoff)} proportion of samples
#' }
#' @keywords SIAMCAT filter.feat
#' @export
#' @return Returns the filtered feature matrix
filter.feat <- function(feat, filter.method=abundance, cutoff=0.001, recomp.prop=FALSE, rm.unmapped=TRUE){
### this statement does not have the purpose to calculate relative abundances on the fly and return them.
### Instead, it's purpose is to be able to calculate f.idx (specifying the indices of features which are to be kept)
### when feature list has already been transformed to relative abundances, but e.g. certain features have been removed manually.
## TODO check filter.method, add default value for cutoff, recomp.prop, and rm.unmapped?
if (recomp.prop) {
# recompute relative abundance values (proportions)
ra.feat <- prop.table(feat, 2)
} else {
ra.feat <- feat
}
### apply filters
if (filter.method == 'abundance') {
# remove features whose abundance is never above the threshold value (e.g. 0.5%) in any of the samples
f.max <- apply(ra.feat, 1, max)
f.idx <- which(f.max >= cutoff)
} else if (filter.method == 'cum.abundance') {
# remove features with very low abundance in all samples i.e. ones that are never among the most abundant
# entities that collectively make up (1-cutoff) of the reads in any sample
f.idx <- vector('numeric', 0)
# sort features per sample and apply cumsum to identify how many collectively have weight K
for (s in 1:ncol(ra.feat)) {
srt <- sort(ra.feat[,s], index.return=TRUE)
cs <- cumsum(srt$x)
m <- max(which(cs < cutoff))
f.idx <- union(f.idx, srt$ix[-(1:m)])
}
# an index of those features that collectively make up more than 1-K of the read mass in any sample
f.idx <- sort(f.idx)
} else if (filter.method == 'prevalence') {
# remove features with low prevalence across samples
# i.e. ones that are 0 (undetected) in more than (1-cutoff) proportion of samples
f.idx <- which(rowSums(ra.feat > 0) / ncol(ra.feat) > cutoff)
} else {
stop('unrecognized filter.method, exiting!\n')
}
cat('Removed ', nrow(feat)-length(f.idx), ' features whose values did not exceed ', cutoff,
' in any sample (retaining ', length(f.idx), ')\n', sep='')
feat <- feat[f.idx,]
### postprocessing and output generation
if (rm.unmapped) {
# remove 'unmapped' feature
unm.idx <- rownames(feat) == 'UNMAPPED' | rownames(feat) == 'unmapped' | rownames(feat) == '-1' | rownames(feat) == 'UNCLASSIFIED' | rownames(feat) == 'unclassified' | rownames(feat) == 'UNASSIGNED' | rownames(feat) == 'unassigned'
if (any(unm.idx)) {
feat <- feat[!unm.idx,]
cat('Removed ', sum(unm.idx), ' features corresponding to UNMAPPED reads',
' (retaining ', nrow(feat), ')\n', sep='')
} else {
cat('tried to remove unmapped reads, but could not find them. Continue anyway.')
}
}
return(feat)
}
###
# SIAMCAT - Statistical Inference of Associations between Microbial Communities And host phenoTypes
# R package flavor
#
# written by Georg Zeller
# with additions by Nicolai Karcher and Konrad Zych
# EMBL Heidelberg 2012-2017
#
# version 0.2.0
# file last updated: 12.06.2017
# GNU GPL-3.0
###
#' @title Read feature file
#' @description This file reads in the tsv file with features and converts it into a matrix.
#'
#' The file should be oragnized as follows:
#' features (in rows) x samples (in columns).
#'
#' First row should contain sample labels (consistent with label data), while the first column should contain feature labels (e.g. taxonomic identifiers). The remaining entries are expected to be real values \code{>= 0} that quantify the abundance of each feature in each sample.
#' @param fn.in.feat name of the tsv file containing features
#' @export
#' @return matrix containing features from the file
read.features <- function(fn.in.feat){
if(is.null(fn.in.feat)) stop("Filename for features file not provided!\n")
if(!file.exists(fn.in.feat)) stop("Feature file ", fn.in.feat, " does not exist!\n")
feat <- read.table(file=fn.in.feat, sep='\t', header=TRUE, stringsAsFactors=FALSE, check.names=FALSE, quote='')
feat <- as.matrix(feat)
featNames <- make.names(rownames(feat)) ### making the names semantically correct
if(any(rownames(feat)==featNames)){
cat("The provided feature names were not semantically correct for use in R, they were updated.\n")
rownames(feat) <- featNames
}
invisible(feat)
}
#' @title Read labels file
#' @description This file reads in the tsv file with labels and converts it into a matrix.
#'
#' First row is expected to be \code{#BINARY:1=[label for cases];-1=[label for controls]}. Second row should contain the sample identifiers as tab-separated list (consistent with feature and metadata).
#'
#' Third row is expected to contain the actual class labels (tab-separated): \code{1} for each case and \code{-1} for each control.
#'
#' Note: Labels can take other numeric values (but not characters or strings); importantly, the label for cases has to be greater than the one for controls.
#' @param fn.in.label name of the tsv file containing labels
#' @export
#' @return list with nine values:\itemize{
#' \item \code{$label} named vector containing the numerical labels from the file;
#' \item \code{$header} first row of the label file;
#' \item \code{$info} information about the type of label (e.g. \code{BINARY});
#' \item \code{$positive.lab} numerical label for controls, e.g. \code{-1};
#' \item \code{$negative.lab} numerical label for cases, e.g. \code{1};
#' \item \code{$n.idx} logical vector of labels (\code{TRUE} for controls, \code{FALSE} otherwise);
#' \item \code{$n.lab} label for controls, e.g. \code{healthy};
#' \item \code{$p.idx} logical vector of labels (\code{TRUE} for cases, \code{FALSE} otherwise);
#' \item \code{$p.lab} label for cases, e.g. \code{cancer}
#'}
read.labels <- function(fn.in.label,feat=NULL){
# TODO move feature/label agreement check to validate data?
if (is.null(fn.in.label)) stop("Filename for labels file not provided!\n")
if(!file.exists(fn.in.label)) stop("Label file ", fn.in.label, " does not exist!\n")
label <- read.table(file=fn.in.label, sep='\t', header=TRUE, row.names=NULL, stringsAsFactors=FALSE,
check.names=FALSE, quote='', comment.char="#")
label <- as.matrix(label)
if (dim(label)[1] > dim(label)[2]){
temp <- names(label)
names(label) <- NULL
label <- rbind(temp,label)
rownames(label) <- label[,1]
label[,1] <- NULL
label <- t(label)
}
namesL <- colnames(label)
label <- as.numeric(label)
names(label) <- namesL
if(!all(is.null(feat))){
if(any(names(label) != colnames(feat))) stop("read.labels: Names in label object and feat object do not match!\n")
}
# Check general suitablity of supplied dataset
classes <- unique(label)
for (i in classes){
if(sum(label==i) <= 5) stop("Data set has only",sum(label==i), "training examples of class",i," This is not enough for SIAMCAT to proceed")
if (sum(label==i) < 10){
cat("Data set has only",sum(label==i), "training examples of class",i," . Note that a dataset this small/skewed is not necessarily suitable for analysis in this pipe line." )
}
}
#Check label header!
con <- file(fn.in.label, 'rt')
header <- readLines(con, 1)
if (substring(header,1,1) != "#"){
stop("Label header seems to be missing or broken.")
}
close(con)
label <- list("label" = label, "header" = header)
label$info <- parse.label.header(label$header)
stopifnot(label$info$type == 'BINARY')
label$positive.lab <- max(label$info$class.descr)
label$negative.lab <- min(label$info$class.descr)
label$n.idx <- label$label==label$negative.lab
label$n.lab <- gsub('[_.-]', ' ', names(label$info$class.descr)[label$info$class.descr==label$negative.lab])
label$p.idx <- label$label==label$positive.lab
label$p.lab <- gsub('[_.-]', ' ', names(label$info$class.descr)[label$info$class.descr==label$positive.lab])
invisible(label)
}
#' @title Read metadata file
#' @description This file reads in the tsv file with numerical metadata and converts it into a matrix.
#'
#' The file should be organized as follows:
#' samples (in rows) x metadata (in columns). Metadata needs to be converted to numerical values by the user.
#'
#' Metadata may be optional for the SIAMCAT workflow, but are necessary for heatmap displays, see \link{interpretor.model.plot}
#' @param fn.in.meta name of the tsv file containing metadata
#' @export
#' @return matrix containing metadata from the file
read.meta <- function(fn.in.meta){
if (is.null(fn.in.meta) || toupper(fn.in.meta)=='NULL' || toupper(fn.in.meta)=='NONE' || toupper(fn.in.meta)=='UNKNOWN') {
cat("Filename for metadata file not provided, continuing without it.\n")
}else{
if(!file.exists(fn.in.meta)) stop("Metadata file ", fn.in.meta, " does not exist!\n")
meta <- read.table(file=fn.in.meta, sep='\t', header=TRUE, row.names=1, check.names=FALSE, quote='')
meta <- as.matrix(meta)
}
invisible(meta)
}
##### auxiliary function to trim whitespace from string
# returns string without leading or trailing whitespace
trim <- function (x) {
gsub("^\\s+|\\s+$", "", x)
}
##### function to parse the header of a label file
### label.header - string in the format: #<TYPE>:<L1>=<class1>;<L2>=<class2>[;<L3>=<class3>]