Commit f5c2c01c authored by Jean-Karim Heriche's avatar Jean-Karim Heriche

Fix problem with imbalanced partitioning of data and issue when making predictions.

parent 7326e390
......@@ -12,7 +12,7 @@ RUN R -e "install.packages('BiocManager', repos=c('https://cloud.r-project.org/
RUN R -e "BiocManager::install('aoles/RBioFormats')"
RUN R -e "BiocManager::install('EBImage')"
RUN R -e "install.packages(c('DT', 'shiny', 'shinyFiles', 'shinycssloaders', 'shinydashboard', 'shinyjs', 'shinyWidgets', 'shinybusy', 'assertthat'), repos=c('https://cloud.r-project.org/', 'https://ftp.gwdg.de/pub/misc/cran/'))"
RUN R -e "install.packages(c('ggplot2', 'plotly', 'RANN', 'MASS', 'uwot', 'xgboost', 'Ckmeans.1d.dp', 'caret'), repos=c('https://cloud.r-project.org/', 'https://ftp.gwdg.de/pub/misc/cran/'))"
RUN R -e "install.packages(c('ggplot2', 'plotly', 'RANN', 'MASS', 'uwot', 'xgboost', 'Ckmeans.1d.dp', 'e1071', 'caret'), repos=c('https://cloud.r-project.org/', 'https://ftp.gwdg.de/pub/misc/cran/'))"
# Copy the app to the image
RUN mkdir -p /usr/local/app/image-data-explorer
......
......@@ -34,7 +34,6 @@ explore_image_server <-function(input, output, session, rv){
if(!is.null(rv$imgPath1) && length(unique(rv$imgPath1)) == 1 &&
length(unique(rv$selectedFrame)) <= 1 &&
!is.null(rv$fileCol1) && rv$fileCol1 != ""){
rv$imgPath1 <- gsub("[[:space:]]", "", rv$imgPath1)
filePath1 <- file.path(rootDir, strsplit(rv$imgPath1[1], '\\\\'))
validate(need(filePath1 != "" && file.exists(filePath1), "File not found. Check that you selected the correct image root directory."))
slice.def <- list()
......
......@@ -29,7 +29,6 @@ explore_image_server2 <-function(input, output, session, rv){
if(!is.null(rv$imgPath2) && length(unique(rv$imgPath2)) == 1 &&
length(unique(rv$selectedFrame)) <= 1 &&
!is.null(rv$fileCol2) && rv$fileCol2 != ""){
rv$imgPath2 <- gsub("[[:space:]]", "", rv$imgPath2)
filePath2 <- file.path(rootDir, strsplit(rv$imgPath2[1], '\\\\'))
validate(need(filePath2 != "" && file.exists(filePath2), "File not found. Check that you selected the correct image root directory."))
slice.def <- list()
......
......@@ -46,8 +46,12 @@ plot_server <- function(input, output, session, rv) {
geom_point(colour = "grey") +
coord_cartesian(expand = TRUE)
if(!is.null(rv$clusters)) {
idx.to.na <- which(clusters == "")
if(length(idx.to.na)>0) {
clusters[idx.to.na] <- NA
}
p <- p + geom_point(aes(colour = factor(clusters[1:nrow(D)])), alpha = 0.5) +
scale_colour_brewer(palette = 'Set1') +
scale_colour_brewer(palette = 'Set1') +
theme(plot.title = element_blank(), legend.title = element_blank())
}
p <- p + geom_point(data = D[rv$selectedRows,],
......
......@@ -101,27 +101,25 @@ feature_selection_server <- function(input, output, session, rv, session_parent
req(rv$data)
tmp <- rv$data[,c(input$targetCol, input$featuresToProcess)]
# Remove rows with NAs and infinite values
tmp <- tmp[is.finite(rowSums(tmp[,input$featuresToProcess])),]
idx.valid.data <- which(is.finite(rowSums(tmp[,input$featuresToProcess])))
tmp <- tmp[idx.valid.data,]
# Target vector
target <- tmp[, input$targetCol]
tmp <- tmp[, input$featuresToProcess]
target <- as.character(tmp[, input$targetCol])
# Extract data with annotations
idx.to.keep <- which(!is.na(target) & tolower(target) != 'none' & target != "")
target <- target[idx.to.keep]
tmp <- tmp[idx.to.keep, input$featuresToProcess]
classes <- levels(as.factor(target))
classifier.data$classes <- classes
# Extract data with annotations
idx.to.keep <- which(!is.na(target) & tolower(target) != 'none' & target != "")
# Split into training and test set indices
# Note: if unlucky we may get class imbalance
# If this becomes a problem, we can use functions from the caret package
train.idx <- sample(idx.to.keep, floor(0.67*length(idx.to.keep)))
test.idx <- idx.to.keep[-train.idx]
# Split into training and test set preserving class distribution
train.idx <- createDataPartition(as.factor(target), p = 0.67, list = FALSE, times = 1)
# Form training and test sets
train.data <- as.matrix(tmp[train.idx, input$featuresToProcess])
train.labels <- as.factor(target[train.idx])
test.data <- as.matrix(tmp[test.idx, input$featuresToProcess])
test.labels <- as.factor(target[test.idx])
train.labels <- factor(target[train.idx])
test.data <- as.matrix(tmp[-train.idx, input$featuresToProcess])
test.labels <- factor(target[-train.idx])
# Tune xgboost hyperparameters using caret
nrounds <- seq(from = 100, to = 500, by = 50)
eta <- c(0.025, 0.05, 0.1, 0.3, 0.4)
......@@ -167,7 +165,7 @@ feature_selection_server <- function(input, output, session, rv, session_parent
# Evaluate on held out data
xgbpred <- predict(xgbModel, newdata = test.data)
confusion.matrix <- confusionMatrix(xgbpred, test.labels, mode = "everything")
# Get feature importance using the xgboost library
# This also clusters the features. The number of clusters is automatically determined (using BIC)
feature.importance <- xgb.importance(model = xgbModel$finalModel, feature_names = xgbModel$finalModel$feature_names)
......@@ -218,10 +216,9 @@ feature_selection_server <- function(input, output, session, rv, session_parent
rv$data$xgboost.predictions <- NA
}
tmp <- rv$data[,input$featuresToProcess]
# Remove rows with NAs and infinite values
tmp <- as.matrix(tmp[is.finite(rowSums(tmp)),])
preds <- predict(classifier.data$model, newdata = tmp)
rv$data[rownames(tmp),]$xgboost.predictions <- classifier.data$classes[preds]
idx.valid.data <- which(is.finite(rowSums(tmp)))
preds <- predict(classifier.data$model, newdata = tmp[idx.valid.data,])
rv$data[idx.valid.data,]$xgboost.predictions <- classifier.data$classes[preds]
showNotification("Model predictions have been added to the data.", type = "warning")
}
})
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.