pkgs <- c("arrow", "class", "data.table", "DT", "future", "ggplot2", "glmnet",
          "htmlTable", "LiblineaR", "Matrix", "Metrics", "promises", "quanteda",
          "quanteda.textmodels", "ranger", "rlang", "Rtsne",
          "RJSONIO", "shiny", "SparseM", "stringi", "uwot")
pkgs <- pkgs[! pkgs %in% rownames(installed.packages())]
if (length(pkgs)) {
  warning(paste0("Packages ", paste0(pkgs, collapse = ", "), " not available. ",
                 "App might crash."))
}
# for (i in seq_along(pkgs)) install.packages(pkgs[i])

verbose <- TRUE
# sink("ActiveTigger.log")

options(shiny.maxRequestSize = 30 * 1024^3) # Max file upload size

modelnames <- read.csv("modelnames.csv")
modelnames_labels <- modelnames$short
names(modelnames_labels) <- modelnames$short_lang

library(quanteda)
library(ggplot2)
library(promises)
library(future)
plan(multisession)

################################################################################
## Global functions
################################################################################

label2hash <- function(label) rlang::hash(label)

cleanFileName <- function(name)
  gsub('\\s|[/]|[\\]|[:]|[*]|[?]|["]|[<]|[>]|[+]|[$]|[=]', "_",
       gsub("^\\s+|\\s+$", "", name))

wtdF1 <- function(truth, pred) {
  pred <- pred[!is.na(truth)]
  truth <- truth[!is.na(truth)]
  sum(sapply(unique(truth), function(ilab)
    Metrics::fbeta_score(truth == ilab, pred == ilab) *
      sum(truth == ilab) / length(truth)), na.rm = TRUE)
}
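
## Example (illustrative, not run): each class F1 is weighted by its share of
## the non-NA truth labels.
# wtdF1(truth = c("a", "a", "b", NA), pred = c("a", "b", "b", "a"))
# #> [1] 0.6666667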

metricsTable <- function(truth, pred) {
  do.call(rbind, lapply(sort(unique(truth)), function(ilab) {
    tmptrue <- truth == ilab
    tmppred <- pred == ilab
    tmp <- data.frame(
      "Tag" = ilab,
      "N.cases" = sum(tmptrue),
      "N.predict" = sum(tmppred),
      "Accuracy" = sprintf("%f", round(Metrics::accuracy(tmptrue, tmppred), 6)),
      "Precision" = sprintf("%f", round(Metrics::precision(tmptrue, tmppred), 6)),
      "Recall" = sprintf("%f", round(Metrics::recall(tmptrue, tmppred), 6)),
      "F1" = sprintf("%f", round(Metrics::fbeta_score(tmptrue, tmppred), 6)))
    tmp$F1[tmp$F1 == "NaN"] <- sprintf("%f", 0)
    tmp
  }))
}
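
## Example (illustrative, not run): one row of one-vs-all metrics per tag.
# metricsTable(truth = c("a", "b", "a"), pred = c("a", "b", "b"))
# #>   Tag N.cases N.predict Accuracy Precision   Recall       F1
# #> 1   a       2         1 0.666667  1.000000 0.500000 0.666667
# #> 2   b       1         2 0.666667  0.500000 1.000000 0.666667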

bertPlotFromPath <- function(path) {
  if (!file.exists(paste0(path, "/trainer_state.json"))) return(NULL)
  state <- RJSONIO::fromJSON(paste0(path, "/trainer_state.json"))
  logs <- data.table::rbindlist(lapply(state$log_history, as.list), fill = TRUE)
  epochmin <- logs$epoch[which.min(logs$eval_loss)]

  gdat <- data.table::rbindlist(
    use.names = FALSE,
    list(
      cbind("train", logs[!is.na(logs$loss), c("epoch", "loss")]),
      cbind("valid", logs[!is.na(logs$eval_loss), c("epoch", "eval_loss")])
    ))
  colnames(gdat) <- c("data", "epoch", "loss")

  ggplot(gdat, aes_string("epoch", "loss", color = "data", shape = "data")) +
    geom_point(size = 2) + geom_line() +
    geom_vline(xintercept = epochmin, linetype = 2) +
    theme_bw() + expand_limits(y = 0)
}
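
## Expected trainer_state.json excerpt (HuggingFace Trainer format, abridged
## and illustrative):
# {"log_history": [{"epoch": 1.0, "loss": 0.68},
#                  {"epoch": 1.0, "eval_loss": 0.59}, ...]}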

bertHyperToMsg <- function(pars) {
  if (is.null(pars)) return(NULL)
  paste0("Scheme : ", pars$bertScheme, "; ",
         "Model : ", pars$bertModel, "\n* ",
         "Epochs : ", pars$bertEpochs, " ; ",
         "Lrate : ", pars$bertLrate, " ; ",
         "Wdecay : ", pars$bertWdecay, " ; ",
         "Batch size : ", pars$bertBatchsize, " ; ",
         "Grad. accum. : ", pars$bertGradacc, "\n* ",
         "Keep best : ", pars$bertBest, " ; ",
         "ValidFrac : ", pars$bertValidFrac, " ; ",
         "ValidSeed : ", pars$bertValidSeed, " \n* ",
         pars$bertNtrain, " training samples : ",
         paste0(names(pars$bertTrainTable), " (",
                as.numeric(pars$bertTrainTable), ")",
                collapse = " ; "))
}

get_status <- function(dafile)
  scan(dafile, what = "character", sep = "\n", quiet = TRUE)
set_status <- function(dafile, msg) write(msg, dafile)
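
## Together these implement a minimal file-based status channel between the
## async futures and the main session. Example (illustrative token, not run):
# set_status("tigger_abc123_data", "Importing main data file...")
# get_status("tigger_abc123_data")
# #> [1] "Importing main data file..."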


################################################################################
## Main server function
################################################################################

# options(shiny.maxRequestSize=2^34) # Max filesize

shinyServer(function(input, output, session) {

  values <- reactiveValues()
  values$visuGo <- FALSE
  values$dfmGo <- FALSE
  # queryTrigger <- reactiveVal(0)
  # predlabTrigger <- reactiveVal(0)
  initDataTrigger <- reactiveVal(0)
  queryNext <- reactiveVal(0)
  trainTrigger <- reactiveVal(0)
  predTrigger <- reactiveVal(0)
  diagTrigger <- reactiveVal(0)
  ready2tag <- reactiveVal(0)
  trainCountdown <- reactiveVal(0)

  ok.data <- reactiveVal()
  ok.data.async <- reactiveVal()
  ok.data.running <- reactiveVal(FALSE)

  bpall_nclicks <- reactiveVal(0)
  bertpred_nclicks <- reactiveVal(0)

  ## Keep the active regressors list in values$new_use_regressors
  observeEvent(input$use_regressors, {
    values$new_use_regressors <- input$use_regressors
    cat("DEBUG new_use_regressors: ", paste(input$use_regressors, collapse = " "), "\n")
  })

  ## Sync trainCountdown with input$trainCountdown
  ## Policy: 0, don't train ; 1, trigger train on tag ; >1, decrement on tag
  observeEvent(input$trainCountdown, trainCountdown(input$trainCountdown))

  ####################
  ## Projects and data

  ## Read config files, update UI forms accordingly
  ## This should happen only once
  observe({
    if (verbose)
      cat("DEBUG read project configs\n")
    # values$conf <- RJSONIO::fromJSON("conf.json")
    projfiles <- dir(pattern = "^tigger_.*[.]json$")
    isolate({
      if (!length(projfiles)) {
        values$conf <- NULL
        return(NULL)
      }
      values$confnames <- gsub("^tigger_(.*)[.]json$", "\\1", projfiles)

      values$conf <- RJSONIO::fromJSON(projfiles[1])
    })
  })

  ## When projects are added, update the selector
  observeEvent(values$confnames, {
    if (verbose)
      cat("DEBUG update selectProject from confnames\n")
    updateSelectInput(session, "selectProject", choices = values$confnames)
  })

  ## Event from project change
  observeEvent(input$selectProject, {
    if (is.null(input$selectProject)) return(NULL)
    if (!nchar(input$selectProject)) return(NULL)
    if (verbose)
      cat("DEBUG project change step 1\n")
    # values$project <- input$selectProject
    # values$conf <- values$confs[[values$project]]
    values$conf <- RJSONIO::fromJSON(
      paste0("tigger_", input$selectProject, ".json"))
    ok.data(NULL)
    ok.visu(NULL)
    initDataTrigger(initDataTrigger() + 1)

    if (verbose)
      cat("DEBUG project change step 2\n")
    if (!is.null(values$conf$dataNrows))
      updateNumericInput(session, "dataNrows", value = as.numeric(values$conf$dataNrows))
    if (!is.null(values$conf$dataSkipRows))
      updateNumericInput(session, "dataSkipRows", value = as.numeric(values$conf$dataSkipRows))

    #### Python ####
    ## Default use_python if not found in conf file: FALSE
    if (is.null(values$conf$use_python))
      values$conf$use_python <- FALSE
    updateCheckboxInput(session, "sys_use_python", value = values$conf$use_python)

    ## Set default python if not found in conf file
    if (is.null(values$conf$python))
      values$conf$python <- "python3"
    updateTextInput(session, "sys_which_python", value = values$conf$python)

    values$python_ok <- FALSE
    if (values$conf$use_python) {
      ## Check whether python is working
      pytest <- try(system(paste(values$conf$python, "--version"),
                           intern = TRUE))
      if (inherits(pytest, "try-error")) {
        showNotification(paste(
          "Python path `", values$conf$python,
          "` not valid, try changing it in Project/System tab"),
          duration = 10, type = "error")
      } else
        values$python_ok <- TRUE
    }
    if (verbose)
      cat("Python ok:", values$python_ok, "\n")

    #### GPU ####
    if (!is.null(values$conf$use_gpu))
      updateCheckboxInput(session, "sys_use_gpu", value = values$conf$use_gpu)

    #### Spacy ####
    if (!is.null(values$conf$use_spacy))
      updateCheckboxInput(session, "sys_use_spacy", value = values$conf$use_spacy)
    updateTextInput(session, "sys_use_spacy_model", value = values$conf$use_spacy_model)

    #### fastText ####
    if (!is.null(values$conf$use_ft))
      updateCheckboxInput(session, "sys_use_ft", value = values$conf$use_ft)
    updateTextInput(session, "sys_use_ft_model", value = values$conf$use_ft_model)

    #### SBERT ####
    if (!is.null(values$conf$use_sb))
      updateCheckboxInput(session, "sys_use_sb", value = values$conf$use_sb)
    updateTextInput(session, "sys_use_sb_model", value = values$conf$use_sb_model)

  })


  ##############################################################################
  ## Reactives : project settings UI
  ##############################################################################

  output$sys_datadir <- renderText(values$conf$datadir)
  output$sys_datafile <- renderText(paste0(values$conf$dataname, ".feather"))
  output$sys_var_id <- renderUI(renderText(values$conf$idcol))
  output$sys_var_text <- renderUI(renderText(values$conf$textcols))
  output$sys_var_tag <- renderUI(renderText(values$conf$tagcols))
  output$sys_var_context_ui <- renderUI(selectInput(
    "sys_var_context", NULL, values$data_orig_colnames, values$conf$contextcols,
    multiple = TRUE))
  output$sys_var_comm_ui <- renderUI(selectInput(
    "sys_var_comm", NULL, c("(none)", values$data_orig_colnames),
    values$conf$commcol))

  output$sys_ex_lang_ui <- renderUI(selectInput(
    "sys_ex_lang", NULL, modelnames_labels, "en"
  ))
  output$sys_ex_spacy <- renderText(
    modelnames$spacy_name[modelnames$short == input$sys_ex_lang])
  output$sys_ex_spacy_dl <- renderText(paste0(
    "Download with python module: ", values$conf$python, " -m spacy download ",
    modelnames$spacy_name[modelnames$short == input$sys_ex_lang]))
  output$sys_ex_ft <- renderText(
    modelnames$fasttext_name[modelnames$short == input$sys_ex_lang])
  output$sys_ex_ft_dl <- renderText(paste0(
    "Manual download link : ",
    modelnames$fasttext_url[modelnames$short == input$sys_ex_lang]))
  output$sys_ex_sb <- renderText(
    ifelse(input$sys_ex_lang %in% c("ar", "zh", "nl", "en", "fr", "de",
                                    "it", "ko", "pl", "pt", "ru", "es", "tr"),
           "distiluse-base-multilingual-cased-v1",
           "distiluse-base-multilingual-cased-v2"))


  ## Save system changes on button click, with modal
  output$save_config_msg <- renderUI({
    req(values$save_config_msg)
    p(strong(values$save_config_msg))
  })
  output$save_embed_msg <- renderUI({
    tmp <- ""
    if (input$sys_use_spacy) if (!identical(input$sys_use_spacy_model, values$conf$use_spacy_model))
      tmp <- paste0(tmp, "<li> Changing the spacy model will delete the current tokenized text and fasttext word embeddings, to prevent conflicts.</li>")
    if (input$sys_use_ft) if (!identical(input$sys_use_ft_model, values$conf$use_ft_model))
      tmp <- paste0(tmp, "<li> Changing the fasttext model will delete the current fasttext word embeddings, to prevent conflicts.</li>")
    if (input$sys_use_sb) if (!identical(input$sys_use_sb_model, values$conf$use_sb_model))
      tmp <- paste0(tmp, "<li> Changing the SBERT model will delete the current SBERT sentence embeddings, to prevent conflicts.</li>")

    if (nchar(tmp)) {
      return(HTML(paste("<br><p> <strong>Warning:</strong> <ul>", tmp, "</ul></p>")))
    } else
      return(NULL)
  })
  observeEvent(input$saveSystem, {
    values$save_config_msg <- NULL
    showModal(modalDialog(
      title = "Save project configuration",
      paste0("Click 'Save' to check and confirm the new configuration"),
      uiOutput("save_embed_msg"),
      uiOutput("save_config_msg"),
      footer = tagList(actionButton("save_conf_confirm", "Save"),
                       modalButton("Cancel"))))
  })
  observeEvent(input$save_conf_confirm, {
    newconf <- values$conf
    newconf$contextcols <- input$sys_var_context
    newconf$commcol <- input$sys_var_comm

    if (input$sys_use_python) {
      pytest <- try(system(paste(input$sys_which_python, "--version"),
                           intern = TRUE))
      if (inherits(pytest, "try-error")) {
        values$save_config_msg <- paste0(
          "Error: python path `", input$sys_which_python, "` not valid")
        return(NULL)
      }
    }
    newconf$use_python <- input$sys_use_python
    newconf$python <- input$sys_which_python
    newconf$use_gpu <- input$sys_use_gpu

    if (input$sys_use_spacy) {
      sptest <- system(paste0(
        newconf$python, " -m spacy info ", input$sys_use_spacy_model), intern = TRUE)
      if (length(sptest) == 0) {
        values$save_config_msg <- paste(
          "Error loading spacy, check that it is installed in the specified python env")
        return(NULL)
      }
      if (length(attr(sptest, "status"))) {
        values$save_config_msg <- paste(
          "Error loading spacy model, check that it has been downloaded")
        return(NULL)
      }
      newconf$use_spacy_model <- input$sys_use_spacy_model
    }
    newconf$use_spacy <- input$sys_use_spacy

    if (input$sys_use_ft) {
      if (!file.exists(input$sys_use_ft_model)) {
        values$save_config_msg <- paste(
          "Error loading fasttext model, check the specified path")
        return(NULL)
      }
      newconf$use_ft_model <- input$sys_use_ft_model
    }
    newconf$use_ft <- input$sys_use_ft

    if (input$sys_use_sb)
      newconf$use_sb_model <- input$sys_use_sb_model
    newconf$use_sb <- input$sys_use_sb

    # if (newconf$use_spacy) if (newconf$use_spacy_model != values$conf$use_spacy_model) {
    if (newconf$use_spacy) if (!identical(newconf$use_spacy_model, values$conf$use_spacy_model)) {
      file.remove(paste0(values$conf$datadir, values$conf$dataname, "_spa.feather"))
      file.remove(paste0(values$conf$datadir, values$conf$dataname, "_ft.feather"))
    }
    # if (newconf$use_ft) if (newconf$use_ft_model != values$conf$use_ft_model)
    if (newconf$use_ft) if (!identical(newconf$use_ft_model, values$conf$use_ft_model))
      file.remove(paste0(values$conf$datadir, values$conf$dataname, "_ft.feather"))
    # if (newconf$use_sb) if (newconf$use_sb_model != values$conf$use_sb_model)
    if (newconf$use_sb) if (!identical(newconf$use_sb_model, values$conf$use_sb_model))
      file.remove(paste0(values$conf$datadir, values$conf$dataname, "_sb.feather"))

    writeLines(RJSONIO::toJSON(newconf),
               paste0("tigger_", newconf$projectid, ".json"))
    values$conf <- newconf
    initDataTrigger(initDataTrigger() + 1)
    removeModal()
  })
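
  ## Example of the resulting tigger_<projectid>.json (abridged; field values
  ## are illustrative):
  # {"projectid": "demo", "datadir": "data/", "dataname": "corpus",
  #  "idcol": "id", "textcols": "text", "tagcols": "tag1",
  #  "use_python": true, "python": "python3",
  #  "use_spacy": true, "use_spacy_model": "en_core_web_sm", ...}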


  ##############################################################################
  ## Reactives : data
  ##############################################################################

  ok.commentCol <- reactive(values$conf$commcol)

  ## Current imported data (async)
  observeEvent(input$dataImport, {
    if (ok.data.running()) return(NULL)
    ## Save nrows to conf file on change
    if (!is.numeric(input$dataNrows) | !is.numeric(input$dataSkipRows)) return(NULL)
    values$conf$dataNrows <- input$dataNrows
    values$conf$dataSkipRows <- input$dataSkipRows
    if (!is.null(values$conf$projectid))
      writeLines(RJSONIO::toJSON(values$conf), paste0(
        "tigger_", values$conf$projectid, ".json"))
    initDataTrigger(initDataTrigger() + 1)
  })


  ## Read data event
  observeEvent(initDataTrigger(), {
    if (is.null(values$conf)) {
      ok.data(NULL)
      return(NULL)
    }
    if (initDataTrigger() == 0) return(NULL)
    if (verbose)
      cat("DEBUG enter ok.data trigger", initDataTrigger(), "\n")
    ok.data.running(TRUE)

    ## Prepare variables for async launch
    coco <- values$conf
    coco$python_ok <- values$python_ok

    da_token <- session$token
    data_status <- paste0("tigger_", da_token, "_data")

    file_orig_colnames <- paste0(data_status, "_origcols")
    file_colstok <- paste0(data_status, "_tok")
    file_colsft <- paste0(data_status, "_ft")
    file_colssb <- paste0(data_status, "_sb")
    file_colsbertpred <- paste0(data_status, "_bertpred")
    da_missing <- paste0(data_status, "_bertpred_missing")

    projpath <- paste0(coco$datadir, coco$projectid)
    if (!file.exists(projpath))
      dir.create(projpath)

    ## Read data: async launch
    data_async <- future({
      #################
      ## Main data file

      set_status(data_status, "Importing main data file...")
      res <- arrow::read_feather(
        paste0(coco$datadir, coco$dataname, ".feather"))
      if (is.null(coco$dataNrows)) return(NULL)
      rowselect <-
        intersect(1:nrow(res), (1:(coco$dataNrows)) + coco$dataSkipRows)
      if (!length(rowselect)) return(NULL)
      res <- res[rowselect, , drop = FALSE]

      writeLines(colnames(res), file_orig_colnames)

      #################
      ## Tokenized text

      ## Import data : feather of two cols, one the id, the other the tokenized text
      set_status(data_status, "Importing tokenized text...")
      file_tok <- paste0(coco$datadir, coco$dataname, "_spa.feather")
      if (file.exists(file_tok)) {
        tokdat <- arrow::read_feather(file_tok)
      } else
        tokdat <- NULL
      tok_remaining <- res[[coco$idcol]]
      tok_remaining <-
        tok_remaining[! tok_remaining %in% tokdat[[coco$idcol]]]

      ## Tokenize remaining texts
      if (length(tok_remaining)) {
        if (!coco$python_ok | !coco$use_spacy) {
          ## If texts remain to tokenize but python is unavailable, fall back
          ## on the untokenized text for those texts
          ## TODO here use quanteda tokenizer with options?
          tokdat_new <- data.frame(
            tok_remaining,
            do.call(paste, c(res[match(tok_remaining, res[[coco$idcol]]),
                                 coco$textcols, drop = FALSE], sep = "\n\n")))
          colnames(tokdat_new) <- c(coco$idcol, "text_spa")
          tokdat <- rbind(tokdat, tokdat_new)
          set_status(data_status, "Importing tokenized text: no spacy ok")
        } else {
          ## If python present, tokenize with spacy
          set_status(data_status,
                     paste("Tokenizing", length(tok_remaining), "new texts"))
          tmpfile <- paste0("tigger_tok_", da_token)
          arrow::write_feather(
            data.frame(
              id = tok_remaining,
              text = do.call(
                paste, c(res[match(tok_remaining, res[[coco$idcol]]),
                             coco$textcols, drop = FALSE], sep = "\n\n"))),
            paste0(tmpfile, ".feather")
          )
          system(paste0(
            coco$python, " tokenize_spacy.py -d ", tmpfile, ".feather",
            " -m ", coco$use_spacy_model))
          tok_new <- arrow::read_feather(paste0(tmpfile, "_spa.feather"))
          colnames(tok_new)[colnames(tok_new) == "id"] <- coco$idcol
          tokdat <- rbind(tokdat, tok_new)
          arrow::write_feather(tokdat, file_tok)
          system(paste0("rm ", tmpfile, "*"))
          set_status(data_status, "Importing tokenized text: spacy ok")
        }
      }

      ## Save colname for tokenized text
      tmp_cols_tok <- colnames(tokdat)[colnames(tokdat) != coco$idcol]
      writeLines(tmp_cols_tok, file_colstok)

      set_status(data_status, "Importing tokenized text: wrote colnames")

      ## Merge with current data
      res <- merge(res, tokdat, by = coco$idcol,
                   all.y = FALSE, sort = FALSE)
      rm(tokdat)

      set_status(data_status, "Importing tokenized text: merged")
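
      ## Cache layout note (illustrative): <dataname>_spa.feather is an
      ## append-only cache with one row per already-tokenized text, e.g.
      ##   idcol   text_spa
      ##   doc01   "tokenized text ..."
      ## Only ids absent from it are (re)tokenized above.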

      #################
      ## FastText embeddings

      set_status(data_status, "Loading fastText embeddings")

      ## FastText embeddings : feather file with id and embeddings
      file_ft <- paste0(coco$datadir, coco$dataname, "_ft.feather")
      if (file.exists(file_ft)) {
        ftdat <- arrow::read_feather(file_ft)
      } else
        ftdat <- NULL
      ft_remaining <- res[[coco$idcol]]
      ft_remaining <-
        ft_remaining[! ft_remaining %in% ftdat[[coco$idcol]]]

      ## Embed remaining texts, if FT in use
      if (length(ft_remaining)) {
        if (!coco$python_ok | !coco$use_ft) {
          ## If no python or no FT, drop the embeddings since not all rows
          ## are already embedded
          ftdat <- NULL
        } else {
          set_status(data_status, paste(
            "Embedding", length(ft_remaining), "new texts with fastText"))
          tmpfile <- paste0("tigger_ft_", da_token)
          arrow::write_feather(
            data.frame(
              id = ft_remaining,
              text = do.call(
                paste, res[match(ft_remaining, res[[coco$idcol]]),
                           tmp_cols_tok, drop = FALSE])),
            paste0(tmpfile, ".feather")
          )
          system(paste0(
            coco$python, " embed_fasttext.py -d ", tmpfile, ".feather",
            " -m ", coco$use_ft_model))
          ft_new <- arrow::read_feather(paste0(tmpfile, "_ft.feather"))
          colnames(ft_new)[colnames(ft_new) == "id"] <- coco$idcol
          ftdat <- rbind(ftdat, ft_new)
          arrow::write_feather(ftdat, file_ft)
          system(paste0("rm ", tmpfile, "*"))
        }
      }

      if (!is.null(ftdat)) {
        ## Merge with current data
        writeLines(colnames(ftdat)[colnames(ftdat) != coco$idcol], file_colsft)
        res <- merge(res, ftdat, by = coco$idcol, all.y = FALSE, sort = FALSE)
        rm(ftdat)
      }

      #################
      ## SBERT embeddings

      set_status(data_status, "Loading SBERT embeddings")

      ## SBERT file : feather of id and embeddings
      file_sb <- paste0(coco$datadir, coco$dataname, "_sb.feather")
      if (file.exists(file_sb)) {
        sbdat <- arrow::read_feather(file_sb)
      } else
        sbdat <- NULL
      sb_remaining <- res[[coco$idcol]]
      sb_remaining <-
        sb_remaining[! sb_remaining %in% sbdat[[coco$idcol]]]

      ## Embed remaining texts
      if (length(sb_remaining)) {
        if (!coco$python_ok | !coco$use_sb) {
          ## If no python or no SB, drop the embeddings since not all rows
          ## are already embedded
          sbdat <- NULL
        } else {
          set_status(data_status, paste(
            "Embedding", length(sb_remaining), "new texts with SBERT"))
          tmpfile <- paste0("tigger_sb_", da_token)
          arrow::write_feather(
            data.frame(
              id = sb_remaining,
              text = do.call(
                paste, res[match(sb_remaining, res[[coco$idcol]]),
                           coco$textcols, drop = FALSE])),
            paste0(tmpfile, ".feather")
          )
          system(paste0(
            coco$python, " embed_sbert.py -d ", tmpfile, ".feather",
            " -m ", coco$use_sb_model))
          sb_new <- arrow::read_feather(paste0(tmpfile, "_sb.feather"))
          colnames(sb_new)[colnames(sb_new) == "id"] <- coco$idcol
          sbdat <- rbind(sbdat, sb_new)
          arrow::write_feather(sbdat, file_sb)
          system(paste0("rm ", tmpfile, "*"))
        }
      }

      if (!is.null(sbdat)) {
        ## Merge with current data
        writeLines(colnames(sbdat)[colnames(sbdat) != coco$idcol], file_colssb)
        res <- merge(res, sbdat, by = coco$idcol, all.y = FALSE, sort = FALSE)
        rm(sbdat)
      }

      #################
      ## BERT predictions

      set_status(data_status, "Loading BERT predictions")

      ## BERT predictions : feathers of id and probs
      ## TODO: take full prediction if existing

      files_pred <- dir(
        coco$datadir, paste0("^", coco$dataname, "_bertpred_.*[.]feather$"))
      # files_predall <- dir(
      #   coco$datadir, paste0("^", coco$dataname, "_bertpredall_.*[.]feather$"))
      # files_pred <- files_pred[
      #   ! gsub("_bertpred_", "_bertpredall_", files_pred) %in% files_predall]
      # files_pred <- c(files_predall, files_pred)

      tmp_bertpred_cols <- list()
      if (length(files_pred)) {
        for (ipred in files_pred) {
          preddat <- arrow::read_feather(paste0(coco$datadir, ipred))
          predname <-
            gsub(paste0("^", coco$dataname, "_bertpred_(.*)[.]feather$"),
                 "\\1", ipred)
          # predname <-
          #   gsub(paste0("^", coco$dataname, "_bertpred(all)?_(.*)[.]feather$"),
          #        "\\2", ipred, perl = TRUE)
          n_missing <- sum(! res[[coco$idcol]] %in% preddat[[coco$idcol]])
          if (n_missing) {
            write(
              paste0("BERT prediction ", predname, " : ", n_missing,
                     " texts missing in prediction."),
              da_missing, append = TRUE)
          } else {
            colnames(preddat) <- gsub(
              "^bertpred_", paste0("bertpred_", predname), colnames(preddat))
            tmp_bertpred_cols[[predname]] <-
              colnames(preddat)[colnames(preddat) != coco$idcol]
            res <- merge(res, preddat, by = coco$idcol,
                         all.y = FALSE, sort = FALSE)
          }
          rm(preddat)
        }
        if (length(tmp_bertpred_cols)) {
          writeLines(names(tmp_bertpred_cols), file_colsbertpred)
          lapply(names(tmp_bertpred_cols), function(x) {
            writeLines(tmp_bertpred_cols[[x]], paste0(file_colsbertpred, "_", x))
          })
        }
      }

      ## Comment col
      set_status(data_status, "Loading comments")
      tmpfile <- paste0(projpath, "/", coco$commcol, ".csv")
      if (file.exists(tmpfile)) {
        tmpcol <- read.csv(tmpfile)
        colnames(tmpcol)[2] <- coco$commcol
        ## Remove duplicates, keep newest
        tmpcol <- tmpcol[nrow(tmpcol):1, , drop = FALSE]
        tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), , drop = FALSE]
        tmpcol <- tmpcol[nrow(tmpcol):1, , drop = FALSE]
        res <- merge(res, tmpcol, by = coco$idcol,
                     all.x = TRUE, all.y = FALSE, sort = FALSE)
        res[[coco$commcol]][is.na(res[[coco$commcol]])] <- ""
      } else
        res[[coco$commcol]] <- ""

      ## Tagging cols
      set_status(data_status, "Loading tags")
      for (itag in coco$tagcols) {
        itaghist <- paste0("hist_", itag)
        if (is.null(res[[itag]]) | all(is.na(res[[itag]]))) {
          res[[itag]] <- NA
          res[[itaghist]] <- NA
        } else {
          res[[itaghist]][!is.na(res[[itag]])] <- 1:sum(!is.na(res[[itag]]))
        }
        tmpfile <- paste0(projpath, "/", itag, ".csv")
        if (file.exists(tmpfile)) {
          tmpcol <- read.csv(tmpfile)
          tmpcol[[itaghist]] <- 1:nrow(tmpcol) +
            max(0, suppressWarnings(max(na.omit(res[[itaghist]]))))
          ## Remove duplicates, keep newest
          tmpcol <- tmpcol[nrow(tmpcol):1, ]
          tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), ]
          tmpcol <- tmpcol[nrow(tmpcol):1, ]
          tmpcol <- tmpcol[tmpcol[[coco$idcol]] %in% res[[coco$idcol]], ]
          if (verbose)
            cat("DEBUG imported", itag, nrow(tmpcol), "\n")
          # res <- merge(res, tmpcol, by = coco$idcol,
          #              all.x = TRUE, all.y = FALSE, sort = FALSE)
          if (nrow(tmpcol)) {
            idmatches <- match(tmpcol[[coco$idcol]], res[[coco$idcol]])
            # idmatches <- idmatches[!is.na(idmatches)]
            res[[itag]][idmatches] <- tmpcol[[itag]]
            res[[itaghist]][idmatches] <- tmpcol[[itaghist]]
          }
        }
      }
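
      ## Expected <projpath>/<tag>.csv layout (illustrative): an append-only
      ## log with the text id in the first column and the tag in a column
      ## named after the scheme; on duplicate ids the newest row wins, e.g.
      ##   id,tag1
      ##   doc01,positive
      ##   doc01,negative    (this row is kept)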

      set_status(data_status, "Finalizing data import")
      res
    }) %...>% ok.data.async()
    data_async <- catch(data_async, function(error) {
      values$modelDiagno <- paste0("Error in data import: ", error)
    })
    data_async <- finally(data_async, function() {
      #################
      ## Finalize data importation
      res <- ok.data.async()
      if (is.null(res)) return(NULL)

      set_status(data_status, "Finalizing data import: read columns")
      ## Read column (tok, ft, sb, ...) names from dedicated files
      values$tagcols <- coco$tagcols
      values$cols_tok <- readLines(file_colstok)
      values$cols_ft <- NULL
      if (file.exists(file_colsft)) {
        values$cols_ft <- readLines(file_colsft)
      }
      values$cols_sb <- NULL
      if (file.exists(file_colssb)) {
        values$cols_sb <- readLines(file_colssb)
      }
      set_status(data_status, "Finalizing data import: bertpred")
      if (file.exists(file_colsbertpred)) {
        tmp_bertpred <- readLines(file_colsbertpred)
        values$cols_bertpred <- lapply(tmp_bertpred, function(x)
          readLines(paste0(file_colsbertpred, "_", x)))
        names(values$cols_bertpred) <- tmp_bertpred
      } else
        values$cols_bertpred <- NULL
      set_status(data_status, "Finalizing data import: bertpred missing")
      if (file.exists(da_missing)) {
        lapply(readLines(da_missing), function(x)
          showNotification(paste(
            x, "\n",
            "You may predict on current text in BERT panel, ",
            "and then reload the data.")))
      }

      ## Populate regressors list in prediction panel
      set_status(data_status, "Finalizing data import: populating regressors")
      regr_choices <- c(
        "Regex" = "regex",
        "Word counts (DFM)" = "dfm")
      if (!is.null(values$cols_ft))
        regr_choices <- c(regr_choices, "Word embeddings (FastText)" = "ft")
      if (!is.null(values$cols_sb))
        regr_choices <- c(regr_choices, "Sentence embeddings (SBERT)" = "sb")
      regr_choices <- c(regr_choices, "Extra predictors" = "extra")

      if (length(values$cols_bertpred)) {
        tmp_choices <- paste0("bertpred_", names(values$cols_bertpred))
        names(tmp_choices) <- paste0("BERT_", names(values$cols_bertpred))
        regr_choices <- c(regr_choices, tmp_choices)
      }

      # if (length(regr_choices) > 3) {
      regr_choices_sel <-
        regr_choices[regr_choices %in% c("regex", "ft", "sb")]
      # } else
      #   regr_choices_sel <- c("regex", "dfm")

      updateSelectizeInput(
        session, "use_regressors", choices = regr_choices,
        selected = regr_choices_sel)
      values$new_use_regressors <- regr_choices_sel

      othercols <- colnames(res)
      othercols <- othercols[! othercols %in% c(
        values$cols_tok, values$cols_ft, values$cols_sb, values$cols_bertpred,
        values$tagcols, paste0("hist_", values$tagcols),
        coco$commcol, coco$textcols)]
      updateSelectInput(
        session, "dlTagSelect",
        choices = c("tags", "comments", "predictions", othercols),
        selected = c("tags", "comments", "predictions"))

      ## Populate extra regressors lists in prediction panel
      updateSelectizeInput(
        session, "use_ootregnum", choices = othercols, selected = NULL)
      updateSelectizeInput(
        session, "use_ootregcat", choices = othercols, selected = NULL)

      ## Project path
      values$projectdir <- paste0(projpath, "/")

      ## Create tagmat: data.frame of all current taggings
      values$tagmat <- res[, coco$tagcols, drop = FALSE]
      ## Create tagmathist: data.frame of historical order of taggings
      values$tagmathist <- res[, paste0("hist_", coco$tagcols), drop = FALSE]
      colnames(values$tagmathist) <- coco$tagcols

      ## Save data original colnames
      values$data_orig_colnames <- readLines(file_orig_colnames)

      ok.data(ok.data.async())
      ok.data.async(NULL)
      ok.data.running(FALSE)
      set_status(data_status, "Exit data async")
      system(paste0("rm ", data_status, "*"))
      values$modelDiagno <- "No model"

      if (trainTrigger() == 0) trainTrigger(1) ## For first run. TODO set trainTrigger to 0 when changing data?
      if (verbose) cat("DEBUG Exit data async finally\n")
    })
    NULL
  })

  ## Read data import status from file
  observe({
    if (ok.data.running()) {
      invalidateLater(100)
      tmpfile <- paste0("tigger_", session$token, "_data")
      if (file.exists(tmpfile)) {
        values$modelDiagno <- paste(collapse = "\n", scan(
          tmpfile, what = "character", sep = "\n", quiet = TRUE))
      } else
        values$modelDiagno <- ""
    }
  })


  ###############
  ## ok.text() reactive
  ###############
  ok.text <- eventReactive(ok.data(), {
    if (is.null(ok.data())) return(NULL)
    res <- ok.data()[, values$cols_tok[1]]
    if (length(values$cols_tok) > 1) {
      for (icol in values$cols_tok[-1]) {
        res <- paste0(res, "\n\n", ok.data()[, icol])
      }
    }
    # queryTrigger(queryTrigger() + 1)
    if (verbose)
      cat("DEBUG ok.text", length(res), "\n")
    res
  })

  ok.labelcol <- reactive({
    if (is.null(ok.data())) return(NULL)
    if (verbose)
      cat("DEBUG enter ok.labelcol\n")
    if (is.null(input$selectScheme)) return(values$conf$tagcols[1])
    input$selectScheme
  })

  ok.nicetext <- reactive({
    req(ok.data(), values$conf$textcols)
    # if (is.null(ok.data())) return(NULL)
    do.call(paste, c(ok.data()[, values$conf$textcols, drop = FALSE],
                     sep = "\n\n"))
  })

  ok.regressor.names <- reactive({
    daregs <- values$new_use_regressors
    if (verbose)
      cat("DEBUG enter ok.regressor.names ", paste(daregs, collapse = " "), "\n")
    res <- NULL
    if ("ft" %in% daregs)
      res <- c(res, values$cols_ft)
    if ("sb" %in% daregs)
      res <- c(res, values$cols_sb)
    if (length(values$cols_bertpred)) {
      for (ipred in 1:length(values$cols_bertpred)) {
        if (paste0("bertpred_", names(values$cols_bertpred)[ipred]) %in% daregs)
          res <- c(res, values$cols_bertpred[[ipred]])
      }
    }
    if (verbose)
      cat("DEBUG exit ok.regressor.names:", length(res), "\n")
    res
  })

  ok.contextCol <- reactive({
    if (verbose)
      cat("DEBUG enter ok.contextCol\n")
    # res <- isolate(values$conf$contextcols)
    res <- values$conf$contextcols
    names(res) <- res
    res
  })

  ## Initiate label and queries history, on ok.text() or labelcol change
  observeEvent(list(ok.text(), ok.labelcol()), {
    if (verbose)
      cat("DEBUG initiate label\n")
    if (is.null(ok.labelcol())) {
      values$label <- NULL
      values$labelhist <- NULL
      values$queries <- NULL
      values$comment <- NULL
      values$wasTagged <- NULL
      values$wasRetagged <- NULL
    } else {
      values$label <- values$tagmat[, ok.labelcol()]
      values$labelhist <- values$tagmathist[, ok.labelcol()]
      values$queries <- which(!is.na(values$label))
      values$comment <- ok.data()[[values$conf$commcol]]
      values$wasTagged <- !is.na(values$label)
      values$wasRetagged <- rep(FALSE, length(values$label))
    }
  })

  ## On values$label change, values$tagmat is updated for separate history
  observeEvent(values$label, {
    if (is.null(values$label)) return(NULL)
    if (verbose)
      cat("DEBUG Update tagmat from label\n")
    values$tagmat[, ok.labelcol()] <- values$label
    values$tagmathist[, ok.labelcol()] <- values$labelhist
  })


  ##############################################################################
  ## Reactives : training
  ##############################################################################

  ## Message of how many tagged and untagged, for model diagnostics box
  ok.msgtagged <- reactive({
    if (verbose)
      cat("DEBUG enter ok.msgtagged\n")
    # if (is.null(ok.data()) | ok.data.running()) {
    if (is.null(ok.data())) {
      return("No data yet ")
    }

    tmp <- ""
    # if (sum(is.na(values$label)) == 0)
    #   tmp <- paste0(tmp, "Data correction mode (0 NA labels left).\n")
    paste0(tmp, "Tagged: ", sum(!is.na(values$label)),
           " ; Untagged: ", sum(is.na(values$label)))
  })

  ## Training possible? At least 3 texts for 2 tags
  ok.train.possible <- reactive({
    if (verbose)
      cat("DEBUG enter ok.train.possible\n")
    labtab <- table(values$label)
    if (length(labtab) < 2) return(FALSE)
    if (sum(labtab > 2) < 2) return(FALSE)
    TRUE
  })
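
  ## Example (illustrative, not run): training requires at least two tags
  ## with more than two tagged texts each.
  # table(c("a", "a", "a", "b", "b", "b", NA))  # -> TRUE
  # table(c("a", "a", "b"))                     # -> FALSE (no tag count > 2)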

  ## Compute DFM (async)
  ## TODO make this less reactive
  observe({
    # if ("dfm" %in% input$use_regressors & !values$dfmGo)
    if ("dfm" %in% values$new_use_regressors & !values$dfmGo)
      values$dfmGo <- TRUE
  })
  ok.dfm <- reactiveVal()
  observe({
    if (is.null(ok.text())) return(NULL)
    if (! values$dfmGo) return(NULL)
    if (!is.numeric(input$dfmNgrams)) return(NULL)
    if (verbose)
      cat("DEBUG enter dfm observer\n")

    datext <- ok.text()
    dfmNgrams <- input$dfmNgrams
    dfmTfIdf <- input$dfmTfIdf
    dfmMinDocfreq <- input$dfmMinDocfreq
    dfmMinTermfreq <- input$dfmMinTermfreq
    dfmTfScheme <- input$dfmTfScheme
    dfmDfScheme <- input$dfmDfScheme

    dfm_async <- future(seed = TRUE, {
      dadfm <- datext %>%
        tolower() %>% tokens(remove_punct = FALSE) %>%
        # tokens_select(pattern = stopwords(language = "fr"), selection = "remove") %>%
        tokens_ngrams(n = 1:dfmNgrams) %>% dfm() %>%
        dfm_trim(min_docfreq = dfmMinDocfreq, min_termfreq = dfmMinTermfreq,
                 verbose = TRUE)
      if (dfmTfIdf) {
        dadfm <- dadfm %>%
          dfm_tfidf(scheme_tf = dfmTfScheme, scheme_df = dfmDfScheme)
      }
      suppressWarnings({ # TODO : check
        dadfm <- quanteda::convert(dadfm, to = "tm")
      })
      dadfm <- Matrix::sparseMatrix(i = dadfm$i, j = dadfm$j, x = dadfm$v,
                                    dimnames = dadfm$dimnames)
      dadfm
    }) %...>% ok.dfm()
    dfm_async <- catch(dfm_async, function(error) {
      values$modelDiagno <- paste0("Error in DFM: ", error)
    })
  })
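
  ## Standalone sketch of the same DFM pipeline (illustrative, assumes
  ## quanteda attached; not run):
  # dadfm <- c("a b a b c", "b c d") %>%
  #   tolower() %>% tokens(remove_punct = FALSE) %>%
  #   tokens_ngrams(n = 1:2) %>% dfm() %>%
  #   dfm_trim(min_docfreq = 1, min_termfreq = 1)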

  ## Predictor : df or matrix of dfm and extra
  ok.regressor.matrix <- reactive({
    if (is.null(ok.text())) return(NULL)
    if (is.null(ok.regressor.names())) return(NULL)
    if (verbose)
      cat("DEBUG enter ok.regressor.matrix\n")
    model.matrix(~., data = ok.data()[, ok.regressor.names()])[, -1]
  })

  ok.extraregressor.matrix <- reactive({
    req(ok.text())
    if (! "extra" %in% values$new_use_regressors) return(NULL)
    if (verbose)
      cat("DEBUG enter ok.extraregressor.matrix\n")

    res <- NULL
    for (inum in input$use_ootregnum) {
      suppressWarnings({tmp <- as.numeric(ok.data()[[inum]])})
      if (all(is.na(tmp))) {
        showNotification(paste0(inum, " is all NAs or non-numeric ; predictor not used"),
                         duration = 5, type = "warning")
        next
      }
      tmp[is.na(tmp)] <- median(tmp, na.rm = TRUE)
      res <- cbind(res, tmp)
    }
    for (icat in input$use_ootregcat) {
      tmp <- ok.data()[[icat]]
      if (! length(unique(tmp)) %in% 2:100) {
        showNotification(paste0(icat, " has ", length(unique(tmp)),
                                " unique values ; predictor not used"),
                         duration = 5, type = "warning")
        next
      }
      tmp <- as.character(tmp)
      tmp[is.na(tmp)] <- "<NA>"
      tmp <- as.factor(tmp)
      tmpmat <- model.matrix(~., data.frame(cat = tmp))[, -1, drop = FALSE]
      res <- cbind(res, tmpmat)
    }
    if (verbose)
      cat("DEBUG exit ok.extraregressor.matrix", ncol(res), "\n")
    res
  })

  ok.extraRegex.matrix <- reactive({
    if (is.null(ok.nicetext())) return(NULL)
    if (is.null(values$extraRegex)) return(NULL)
    if (is.null(values$regex_inuse)) return(NULL)
    if (!any(values$regex_inuse)) return(NULL)
    if (is.null(values$regex_case)) return(NULL)
    if (verbose)
      cat("DEBUG enter ok.extraRegex.matrix\n")
    tmp <- as.matrix(sapply(which(values$regex_inuse), function(iregex) {
      1 * stringi::stri_detect_regex(
        ok.nicetext(), values$extraRegex[iregex],
        opts_regex = list(case_insensitive = !values$regex_case[iregex]))
    }))
    colnames(tmp) <- values$extraRegex[values$regex_inuse]
    tmp
  })
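
  ## Example (illustrative, not run): each active regex becomes one 0/1
  ## column, 1 when the text matches.
  # 1 * stringi::stri_detect_regex(c("Price: 5$", "free"), "[0-9]",
  #                                opts_regex = list(case_insensitive = TRUE))
  # #> [1] 1 0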

  ok.predictor <- reactive({
    req(values$new_use_regressors)
    if (ok.data.running()) return(NULL)
    if (is.null(ok.text())) return(NULL)
    if (verbose)
      cat("DEBUG enter ok.predictor\n")
    dadfm <- NULL
    if ("dfm" %in% values$new_use_regressors) {
      dadfm <- ok.dfm()
    } else {
      if (input$predModel == "naive bayes") {
        values$modelDiagno <-
          "Error in training: Naive Bayes requires DFM word counts as predictors."
        return(NULL)
      }
    }
    if (length(ok.regressor.names()) > 0) {
      if (input$predModel == "random forest") {
        if (is.null(dadfm)) {
          dadfm <- ok.data()[, ok.regressor.names()]
        } else
          dadfm <- cbind(ok.data()[, ok.regressor.names()], as.matrix(dadfm))
      }
      if (input$predModel %in% c("lasso", "linear", "knn")) {
        dadfm <- cbind(ok.regressor.matrix(), dadfm)
      }
    }
    # if ("regex" %in% input$use_regressors && length(values$regex_inuse) > 0) {
    if ("regex" %in% values$new_use_regressors & !is.null(ok.extraRegex.matrix())) {
      dadfm <- cbind(dadfm, ok.extraRegex.matrix())
    }
    if ("extra" %in% values$new_use_regressors) {
      dadfm <- cbind(dadfm, ok.extraregressor.matrix())
    }

    if (is.null(dadfm)) {
      values$modelDiagno <- "Error in training: no predictors selected"
      if (verbose)
        cat("DEBUG ok.predictor exit: no predictors\n")
      return(NULL)
    }

    if (input$predModel == "linear")
      dadfm <- SparseM::as.matrix.csr(dadfm, nrow(dadfm), ncol(dadfm))
    if (input$predModel == "naive bayes")
      dadfm <- as.dfm(dadfm)
    if (verbose)
      cat("DEBUG exit ok.predictor: ", ncol(dadfm), "\n")
    dadfm
  })


  ## Train model async
  ok.train <- reactiveVal()
  ok.train.async <- reactiveVal()
  observeEvent(
    list(trainTrigger(), input$modelTrain),
    {
      if (is.null(ok.predictor())) return(NULL)
      if (verbose)
        cat("DEBUG enter ok.train observer\n")
      if (!ok.train.possible()) {
        values$modelDiagno <-
          "Tag at least 3 texts for 2 tags to train prediction model."
        ok.train(NULL)
        # queryTrigger(queryTrigger() + 1)
        return(NULL)
      }
      ready2tag(0)
      values$modelDiagno <- paste0("Training ", input$predModel)

      tmp_model <- input$predModel
      if (tmp_model == "random forest") {
        if (!is.numeric(input$rfNumTrees) | !is.numeric(input$rfMtry) |
            !is.numeric(input$rfSampleFrac)) {
          ok.train(NULL)
          return(NULL)
        }
        train_fun <- ranger::ranger
        train_args <- list(
          x = ok.predictor()[!is.na(values$label), ],
          y = as.factor(values$label[!is.na(values$label)]),
          num.trees = input$rfNumTrees,
          mtry = if (input$rfMtry > 0) input$rfMtry,
          sample.fraction = input$rfSampleFrac, probability = TRUE)
      } else {
        if (tmp_model == "linear") {
          tmpvalues <- values$label
          if (!is.numeric(input$liblinCost)) {
            ok.train(NULL)
            return(NULL)
          }
          ## Drop tags that have a single tagged text
          for (ival in na.omit(unique(tmpvalues)))
            if (sum(tmpvalues == ival, na.rm = TRUE) == 1)
              tmpvalues[which(tmpvalues == ival)] <- NA
          train_fun <- LiblineaR::LiblineaR
          train_args <- list(
            data = ok.predictor()[!is.na(tmpvalues), ],
            target = as.factor(tmpvalues[!is.na(tmpvalues)]),
            type = 0, cost = input$liblinCost)
        } else if (tmp_model == "lasso") {
          tmpvalues <- values$label
          for (ival in na.omit(unique(tmpvalues)))
            if (sum(tmpvalues == ival, na.rm = TRUE) == 1)
              tmpvalues[which(tmpvalues == ival)] <- NA
          train_fun <- glmnet::glmnet
          train_args <- list(
            x = ok.predictor()[!is.na(tmpvalues), ],
            y = as.factor(tmpvalues[!is.na(tmpvalues)]),
            family = "multinomial")
        } else if (tmp_model == "naive bayes") {
          if (!is.numeric(input$naiveSmooth)) {
            ok.train(NULL)
            return(NULL)
          }
          tmpvalues <- values$label
          for (ival in na.omit(unique(tmpvalues)))
            if (sum(tmpvalues == ival, na.rm = TRUE) == 1)
              tmpvalues[which(tmpvalues == ival)] <- NA
          train_fun <- quanteda.textmodels::textmodel_nb
          train_args <- list(
            x = ok.predictor()[!is.na(tmpvalues), ],
            y = as.factor(tmpvalues[!is.na(tmpvalues)]),
            smooth = input$naiveSmooth, prior = input$naivePrior,
            distribution = input$naiveDistri)
        } else if (tmp_model == "knn") {
          train_fun <- function(x) return(TRUE)
          train_args <- list(x = "knn")
        }
      }

      train_async <- future(seed = TRUE, {
        suppressWarnings(do.call(train_fun, train_args))
      }) %...>% ok.train.async()
      train_async <- catch(train_async, function(error) {
        values$modelDiagno <- paste0("Model training error: ", error)
        ok.train(NULL)
        ok.train.async(NULL)
      })
      train_async <- finally(train_async, function() {
        values$modelDiagno <- "Training done, predicting..."
        ok.train(ok.train.async())
        ok.train.async(NULL)
        predTrigger(predTrigger() + 1)
      })
      NULL
    }
  )

  ## Model prediction on full dataset (async)
  ok.pred <- reactiveVal()
  ok.pred.async <- reactiveVal()
  # observeEvent(list(ok.train(), input$glmLambda, input$knnK, values$retagged), {
  observeEvent(predTrigger(), {
    if (is.null(ok.train())) {
      ok.pred(NULL)
      return(NULL)
    }
    if (verbose)
      cat("DEBUG ok.pred()\n")

    damodel <- input$predModel
    if (input$predModel == "random forest") {
      pred_args <- list(ok.train(), data = ok.predictor())
      dacolnames <- colnames(ok.train()$predictions)
    } else if (input$predModel == "lasso") {
      if (!is.numeric(input$glmLambda)) {
        ok.pred(NULL)
        return(NULL)
      }
      pred_args <- list(ok.train(), newx = ok.predictor(), s = input$glmLambda,
                        type = "response")
    } else if (input$predModel == "linear") {
      pred_args <- list(ok.train(), newx = ok.predictor(), proba = TRUE)
    } else if (input$predModel == "naive bayes") {
      pred_args <- list(ok.train(), newdata = ok.predictor(),
                        type = "probability")
    } else if (input$predModel == "knn") {
      if (!is.numeric(input$knnK)) {
        ok.pred(NULL)
        return(NULL)
      }
      pred_args <- list(train = ok.predictor()[!is.na(values$label), ],
                        test = ok.predictor(),
                        cl = values$label[!is.na(values$label)],
                        k = input$knnK, prob = TRUE)
    }

    pred_async <- future(seed = TRUE, {
      if (damodel == "random forest") {
        res <- do.call(predict, pred_args)$predictions
        colnames(res) <- dacolnames
        res
      } else if (damodel == "lasso") {
        do.call(predict, pred_args)[, , 1]
      } else if (damodel == "linear") {
        do.call(predict, pred_args)$probabilities
      } else if (damodel == "naive bayes") {
        do.call(predict, pred_args)
      } else if (damodel == "knn") {
        tmp <- do.call(class::knn, pred_args)
        ## A bit flaky here: probabilities of the non-best classes uniformly
        ## share the remainder, and the distance is euclidean
        tmpvalues <- sort(unique(pred_args$cl))
        res <- matrix(0, nrow(pred_args$test), length(tmpvalues),
                      dimnames = list(NULL, tmpvalues))
        res[cbind(1:nrow(res), tmp)] <- attr(tmp, "prob") -
          ((1 - attr(tmp, "prob")) / (ncol(res) - 1))
        res <- res + (1 - attr(tmp, "prob")) / (ncol(res) - 1)
        res
      }
    }) %...>% ok.pred.async()
    pred_async <- catch(pred_async, function(error) {
      values$modelDiagno <- paste0("Error in prediction: ", error)
    })
    pred_async <- finally(pred_async, function() {
      if (verbose)
        cat("DEBUG ok.pred() out of async\n")
      ok.pred(ok.pred.async())
      ok.pred.async(NULL)
      # predlabTrigger(predlabTrigger() + 1)
      diagTrigger(diagTrigger() + 1)
      queryNext(queryNext() + 1)
    })
  })
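
  ## Worked example of the knn probability fill-in above (illustrative): with
  ## 3 classes and a winning vote share of 0.7, the winner ends up with 0.7
  ## and each other class with (1 - 0.7) / 2 = 0.15, so every row sums to 1.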

  ## Best predicted label for full dataset
  # ok.predlab1 <- eventReactive(predlabTrigger(), {
  #   if (is.null(ok.pred())) return(NULL)
  #   if (verbose)
  #     cat("DEBUG ok.predlab1\n")
  #   # queryTrigger(queryTrigger() + 1)
  #   colnames(ok.pred())[max.col(ok.pred())]
  # })
  ok.predlab1 <- reactive({
    if (is.null(ok.pred())) return(NULL)
    if (verbose)
      cat("DEBUG ok.predlab1\n")
    colnames(ok.pred())[max.col(ok.pred())]
  })

  ## On scheme change, forget existing model, queue, query and hist,
  ## load scheme description and predictor-regex
  observeEvent(ok.labelcol(), {
    if (verbose)
      cat("DEBUG event selectScheme", ok.labelcol(), "\n")
    ok.pred(NULL)
    ok.train(NULL)
    values$queryQueue <- NULL
    values$modelDiagno <- "No model"
    values$histSaveStack <- NULL

    ## Load description file
    descr <- ""
    if (file.exists(paste0(values$conf$datadir, values$conf$projectid,
                           "/", input$selectScheme, ".txt"))) {
      descr <- paste(collapse = "\n", readLines(paste0(
        values$conf$datadir, values$conf$projectid,
        "/", input$selectScheme, ".txt")))
    }
    updateTextAreaInput(session, "schemeDescr", value = descr)

    ## Load regex from json
    regexfile <- paste0(values$projectdir, input$selectScheme, "_regex.json")
    if (file.exists(regexfile)) {
      if (verbose)
        cat("DEBUG enter regex.json\n")
      tmp <- RJSONIO::fromJSON(paste(collapse = "\n", readLines(regexfile)))
      if (is.null(tmp$regex)) return(NULL)
      values$extraRegex <- tmp$regex
      names(values$extraRegex) <- sapply(tmp$regex, label2hash)
      for (iregex in 1:length(tmp$regex)) {
        values[[paste0("useregex_", names(values$extraRegex)[iregex])]] <-
          tmp$use[iregex]
        values[[paste0("caseregex_", names(values$extraRegex)[iregex])]] <-
          tmp$case[iregex]
      }
    } else
      values$extraRegex <- NULL

  })


  ## queryNext event: update values$newQuery ; called by queryNext(), or by
  ## queryQueue or regex change
  ## Requires :
  ## - queryQueue (reactive: indices of current queue)
  ## - wasTagged (boolean vector indicating whether each element has been
  ##   tagged this session, used in "on untagged")
  ## - wasRetagged (boolean vector indicating whether each element has been
  ##   retagged this session, used in "on tagged")
  ## TODO: make a scheme-list of wasTagged, now it will be wiped at each scheme change?
  observeEvent(
    list(queryNext(), values$queryQueue,
         input$sampleChoice, input$taggedWhich, input$maxprobWhich,
         input$regexFilter, input$regexCaseSens,
         input$visuLock, values$visuZoom$xlim, values$visuZoom$ylim),
    {
      if (is.null(values$queryQueue)) return(NULL)
      if (verbose)
        cat("DEBUG queryNext\n")

      queue <- values$queryQueue

      if (input$sampleChoice == "untagged") {
        queue <- queue[!values$wasTagged[queue]]
      } else if (input$sampleChoice == "tagged") {
        queue <- queue[!is.na(values$label)[queue] & !values$wasRetagged[queue]]
        if (input$taggedWhich != "all") {
          queue <- queue[values$label[queue] == input$taggedWhich]
        }
      } else if (input$sampleChoice == "all") {
        queue <- queue[!values$wasRetagged[queue]]
      }

      if (input$visuLock) if (!is.null(values$visuZoom$xlim)) {
        if (!is.null(ok.visu()))
          queue <- queue[ok.visu()[queue, 1] >= values$visuZoom$xlim[1] &
                           ok.visu()[queue, 1] <= values$visuZoom$xlim[2] &
                           ok.visu()[queue, 2] >= values$visuZoom$ylim[1] &
                           ok.visu()[queue, 2] <= values$visuZoom$ylim[2]]
      }

      if (length(queue)) if (nchar(input$regexFilter)) {
        try_regex <- try(silent = TRUE, {
          queue <- queue[
            stringi::stri_detect_regex(
              ok.nicetext()[queue], input$regexFilter,
              opts_regex = list(case_insensitive = !input$regexCaseSens))]
        })
        if (inherits(try_regex, "try-error")) {
          showNotification("Invalid regex", duration = 2, type = "error")
          values$newQuery <- NULL
          return(NULL)
        }
      }

      if (!length(queue)) {
        if (nchar(input$regexFilter)) {
          showNotification("No texts matching the regex",
                           duration = 2, type = "warning")
        } else
          showNotification("No more texts to be tagged in this set",
                           duration = 2, type = "warning")
        values$newQuery <- NULL
        return(NULL)
      }

      values$lastQuery <- queue[1] # ??
      values$newQuery <- queue[1]
      ready2tag(1)
    }
  )

  ## queryQueue event: fix values$queryQueue (indices of current queue)
  observeEvent(
    list(ok.data(), ok.pred(), input$strategy, input$maxprobWhich,
         input$selectProject, input$selectScheme), {
      if (input$strategy %in% c("entropy", "maxprob") & is.null(ok.pred())) {
        values$queryQueue <- NULL
        values$lastQuery <- NULL
        values$newQuery <- NULL
        return(NULL)
      }
      if (is.null(ok.text())) return(NULL)
      if (verbose)
        cat("DEBUG queryQueue change\n")

      if (input$strategy == "sequential") {
        values$queryQueue <- 1:length(ok.text())
      } else if (input$strategy == "random") {
        values$queryQueue <- sample(length(ok.text()))
      } else {
        dapred <- ok.pred()
        if (input$strategy == "entropy") {
          ## Minimize the negative entropy, treating 0 * log(0) as 0
          tmp <- dapred * log(dapred)
          tmp[dapred == 0] <- 0
          tominimize <- rowSums(tmp)
        }
        if (input$strategy == "maxprob") {
          if (! input$maxprobWhich %in% colnames(dapred)) return(NULL)
          tominimize <- -dapred[, input$maxprobWhich]
        }
        values$queryQueue <- order(tominimize)
      }
    }
  )
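
  ## Example (illustrative, not run): "entropy" queries the most uncertain
  ## predictions first, "maxprob" the most confident for a chosen tag.
  # dapred <- rbind(c(0.5, 0.5), c(0.9, 0.1))
  # rowSums(dapred * log(dapred))  # -0.69 < -0.33 : row 1 is queried first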


  ## Update uniqueLabels and uniqueLabelsAll if necessary on values$label change
  observeEvent(values$label, {
    if (is.null(values$label)) return(NULL)
    if (verbose)
      cat("DEBUG uniqueLabels\n")
    dalabs <- sort(na.omit(unique(values$label)))
    # if (!length(dalabs)) return(NULL)
    if (!identical(dalabs, values$uniqueLabels))
      values$uniqueLabels <- dalabs
    if (any(! dalabs %in% values$uniqueLabelsAll))
      values$uniqueLabelsAll <- c(values$uniqueLabelsAll,
                                  dalabs[! dalabs %in% values$uniqueLabelsAll])
  })


  ##############################################################################
  ## Project management
  ##############################################################################

  output$cpDataMessage <- renderUI({
    req(values$cpDataMessage)
    p(values$cpDataMessage)
  })

  output$cpConfirmMessage <- renderUI({
    req(values$cpConfirmMessage)
    p(strong(paste0("Error: ", values$cpConfirmMessage)))
  })

  output$cpIdcolUI <- renderUI({
    req(values$cpNewData)
    if (!is.data.frame(values$cpNewData)) return(NULL)
    selectInput("cpIdcol", NULL, colnames(values$cpNewData))
  })

  output$cpTextcolUI <- renderUI({
    req(values$cpNewData)
    if (!is.data.frame(values$cpNewData)) return(NULL)
    selectInput("cpTextcol", NULL, colnames(values$cpNewData), multiple = TRUE)
  })

  output$cpContextcolUI <- renderUI({
    req(values$cpNewData)
    if (!is.data.frame(values$cpNewData)) return(NULL)
    selectInput("cpContextcol", NULL, colnames(values$cpNewData), multiple = TRUE)
  })

  output$cpTagcolUI <- renderUI({
    req(values$cpNewData)
    if (!is.data.frame(values$cpNewData)) return(NULL)
    selectInput("cpTagcol", NULL, colnames(values$cpNewData), multiple = TRUE)
  })

  output$cpCommcolUI <- renderUI({
    req(values$cpNewData)
    if (!is.data.frame(values$cpNewData)) return(NULL)
    selectInput("cpCommcol", NULL, c("none", colnames(values$cpNewData)))
  })

  output$cp_spacyUI <- renderUI(
    textInput("cp_use_spacy_model", NULL,
              modelnames$spacy_name[modelnames$short == input$cp_lang],
              placeholder = "(spacy model name)")
  )

  output$cp_spacyDlUI <- renderUI(HTML(paste(
    "<em>Model must be downloaded once, from python:</em><br>",
    input$cp_which_python, "-m spacy download",
    modelnames$spacy_name[modelnames$short == input$cp_lang], "<br><br>"
  )))

  output$cp_ftUI <- renderUI(
    textInput("cp_use_ft_model", NULL,
              modelnames$fasttext_name[modelnames$short == input$cp_lang],
              placeholder = "(fasttext model path)")
  )

  output$cp_ftDlUI <- renderUI(list(HTML(paste(
    "<em>Model can be downloaded <a href='",
    modelnames$fasttext_url[modelnames$short == input$cp_lang],
    "'>here</a></em>")),
    br(), br()))

  output$cp_sbUI <- renderUI(
    textInput("cp_use_sb_model", NULL, ifelse(
      input$cp_lang %in% c("ar", "zh", "nl", "en", "fr", "de",
                           "it", "ko", "pl", "pt", "ru", "es", "tr"),
      "distiluse-base-multilingual-cased-v1",
      "distiluse-base-multilingual-cased-v2"),
      placeholder = "(custom sentence_transformers model)")
  )


## Create new project, with modal dialog
observeEvent(input$createProject, {
  if (verbose)
    cat("DEBUG create project\n")
  showModal(modalDialog(
    title = "New project",

    p("Create a new text tagging project."),
    p("This mainly involves choosing a dataset, and how to process it."),
    p("Items marked by a star ", strong("*"),
      " cannot be changed once the project is created."),

    hr(),
    h4("Files and folders"),

    fluidRow(
      column(3, p(strong("*Project name"))),
      column(5, textInput("cpName", NULL, placeholder = "(Required)")),
      column(4, p("Tags will be stored in a new directory by this name."))
    ),
    fluidRow(
      column(3, p(strong("*Data directory"))),
      column(5, textInput("cpDatadir", NULL, placeholder = "(Required)")),
      column(4, p("Place (on the server) where the data and project are stored"))
    ),
    fluidRow(
      column(3, p(strong("*Data filename"))),
      column(5, checkboxInput("cpDoImport", "New data", TRUE)),
      column(4, p("Main file, containing id and text columns"))
    ),

    conditionalPanel("!input.cpDoImport", list(
      ## Existing data
      fluidRow(
        column(3, p(strong("Existing file"))),
        column(5, textInput("cpDatafile", NULL, placeholder = "(Required)")),
        column(4, actionButton("cpDatafileGo", "Import"))
      )
    )),
    conditionalPanel("input.cpDoImport", {
      ## Import data
      wellPanel(
        selectInput('cpNDfiletype', NULL,
                    choices = c("Text file (.csv, .txt)" = "csv_txt",
                                "Feather" = "feather",
                                "Parquet" = "parquet",
                                "OpenDocument (.ods)" = "ods",
                                "Microsoft Excel (.xls, .xlsx)" = "excel",
                                "SPSS (.sav, .por)" = "spss",
                                "SAS (.sas7bdat)" = "sas_data",
                                "Stata v5-v12 (.dta)" = "stata"),
                    selected = "csv_txt"
        ),
        fileInput('cpNDfile', 'Choose File'),

        checkboxInput("cpNDparams", "File parameters"),

        conditionalPanel(
          "input.cpNDparams",
          conditionalPanel(
            "input.cpNDfiletype == 'csv_txt'",

            fluidRow(column(4, p('Header')),
                     column(8, selectInput('cpCSVheader', NULL,
                                           c("Header" = TRUE,
                                             "No Header" = FALSE),
                                           "Auto"))),
            fluidRow(column(4, p('Separator')),
                     column(8, selectInput('cpCSVsep', NULL,
                                           c("Comma ','" = ",",
                                             "Semicolon ';'" = ";",
                                             "Tab" = "\t",
                                             "Space" = " "),
                                           "Auto"))),
            fluidRow(column(4, p('Quote')),
                     column(8, selectInput('cpCSVquote', NULL,
                                           c("Double Quote \"" = "\"",
                                             "Single Quote '" = "'",
                                             "None" = ""),
                                           "Double Quote \""))),
            fluidRow(column(4, p('Decimal mark')),
                     column(8, selectInput('cpCSVdec', NULL,
                                           c('Period "."' = ".",
                                             'Comma ","' = ","),
                                           'Period "."'))),
            fluidRow(column(4, p('File Encoding')),
                     column(8, selectInput('cpCSVencoding', NULL,
                                           c("unknown", "UTF-8", "Latin-1"),
                                           "unknown")))),

          ## Options for ods files
          conditionalPanel(
            "input.cpNDfiletype == 'ods'",
            fluidRow(column(4, p('Sheet')),
                     column(8, numericInput('cpODSsheet', NULL, 1, 1,
                                            step = 1))),
            checkboxInput("cpODScolnames", "Column names", TRUE),
            fluidRow(column(4, p('NA symbol')),
                     column(8, textInput('cpODSna', NULL, value = ""))),
            fluidRow(column(4, p('Skip rows')),
                     column(8, numericInput('cpODSskip', NULL, 0, 0,
                                            step = 1))),
            fluidRow(column(4, p('Specify range')),
                     column(8, textInput('cpODSrange', NULL, value = "")))
          ),

          ## Options for xls / xlsx files
          conditionalPanel(
            "input.cpNDfiletype == 'excel'",
            fluidRow(column(4, p('Column names')),
                     column(8, selectInput('cpXLScolnames', NULL,
                                           choices = c("Yes" = "TRUE",
                                                       "No" = "FALSE"),
                                           selected = TRUE))),
            fluidRow(column(4, p('Skip rows')),
                     column(8, numericInput('cpXLSskip', NULL, 0, 0,
                                            step = 1))),
            fluidRow(column(4, p('Trim whitespaces')),
                     column(8, selectInput('cpXLStrim', NULL,
                                           choices = c("Yes" = "TRUE",
                                                       "No" = "FALSE"),
                                           selected = TRUE))),
            checkboxInput("cpXLSsheet",
                          "Specify Worksheet", FALSE),
            conditionalPanel("input.cpXLSsheet",
                             textInput("cpXLSsheet_specs", NULL, "")),
            checkboxInput("cpXLSrange", "Specify Range", FALSE),
            conditionalPanel("input.cpXLSrange",
                             textInput("cpXLSrange_specs", NULL, "")
            )),

          ## Options for SPSS files
          conditionalPanel(
            "input.cpNDfiletype == 'spss'",
            fluidRow(
              column(4, p('Skip rows')),
              column(8, numericInput('cpSPSSskip', NULL, 0, 0, step = 1))),
            checkboxInput("cpSPSSna", "User-defined NA", FALSE)),

          ## Options for SAS files
          conditionalPanel(
            "input.cpNDfiletype == 'sas_data'",
            fluidRow(
              column(4, p('Skip rows')),
              column(8, numericInput("cpSASskip", NULL, 0, 0, step = 1))),
            checkboxInput("cpSAScatalog", "Use catalog file", FALSE),
            conditionalPanel(
              "input.cpSAScatalog",
              fileInput("cpSAScatalog_file", "Catalog file"))
          ),

          ## Options for STATA dta files
          conditionalPanel(
            "input.cpNDfiletype == 'stata'",
            checkboxInput("cpSTATAfactors", "Convert factors", FALSE))
        )
      )
    }),

    uiOutput("cpDataMessage"),
    hr(),
    h4("Data"),

    fluidRow(
      column(6, numericInput(
        "cpNrows", "N. rows for working sample", 500, 1, step = 1)),
      column(6, numericInput(
        "cpNskip", "Rows to skip before working sample", 0, 0, step = 1))
    ),

    fluidRow(
      column(3, p(strong("*ID"))),
      column(5, uiOutput("cpIdcolUI")),
      column(4, p("Name of the id variable, unique identifier of each text"))
    ),
    fluidRow(
      column(3, p(strong("*Text"))),
      column(5, uiOutput("cpTextcolUI")),
      column(4, p("Names of the text variables: if more than one, texts are concatenated in the specified order"))
    ),
    fluidRow(
      column(3, p(strong("*Tags"))),
      column(5, uiOutput("cpTagcolUI")),
      column(4, p("(optional) Names of variables that are already tagged: each will create a new scheme"))
    ),
    fluidRow(
      column(3, p(strong("Comments"))),
      column(5, uiOutput("cpCommcolUI")),
      column(4, p("(optional) Name of the comments variable"))
    ),
    fluidRow(
      column(3, p(strong("Context"))),
      column(5, uiOutput("cpContextcolUI")),
      column(4, p("(optional) Names of variables not used in the models, but that may be displayed during tagging"))
    ),

    hr(),
    h4("System"),

    fluidRow(
      column(3, checkboxInput("cp_use_python", "Python backend", FALSE)),
      column(5, conditionalPanel(
        "input.cp_use_python",
        textInput("cp_which_python", NULL, value = "python3",
                  placeholder = "(custom python path)"))),
      column(4, conditionalPanel(
        "input.cp_use_python",
        p("This must be a working python3 environment, with the required modules installed (see documentation)")))
    ),
    conditionalPanel("input.cp_use_python", list(
      fluidRow(
        column(3, p("Language")),
        column(5, selectInput("cp_lang", NULL, modelnames_labels, "en")),
        column(4, p("Used to preset tokenization and embedding models"))
      ),
      fluidRow(
        column(3, checkboxInput("cp_use_spacy", "SpaCy tokenization", FALSE)),
        column(5, conditionalPanel("input.cp_use_spacy", uiOutput("cp_spacyUI"))),
        column(4, p("Name of the spacy tokenizer model, used in DTM and word embeddings"))
      ),
      conditionalPanel("input.cp_use_spacy", fluidRow(
        column(3),
        column(9, uiOutput("cp_spacyDlUI")))
      ),
      fluidRow(
        column(3, checkboxInput("cp_use_ft", "FastText word embeddings", FALSE)),
        column(5, conditionalPanel("input.cp_use_ft", uiOutput("cp_ftUI"))),
        column(4, p("Path to the local fasttext model binary"))
      ),
      conditionalPanel("input.cp_use_ft", fluidRow(
        column(3),
        column(9, uiOutput("cp_ftDlUI")))),
      fluidRow(
        column(3, checkboxInput("cp_use_sb", "SBERT sentence embeddings", FALSE)),
        column(5, conditionalPanel("input.cp_use_sb", uiOutput("cp_sbUI"))),
        column(4, p("(GPU recommended) Name or path of the sentence-transformers model"))
      ),

      conditionalPanel("input.cp_use_python", {
        checkboxInput("cp_use_gpu", "GPU support (CUDA, for SBERT and BERT)", FALSE)
      })
    )),

    uiOutput("cpConfirmMessage"),
    footer = tagList(actionButton("cpConfirm", "Create"),
                     modalButton("Cancel"))))
})

observeEvent(input$cpDatafileGo, {
  dataname <- input$cpDatafile
  dataname <- gsub("[.]feather$", "", dataname)
  tmpdat <- try(arrow::read_feather(
    paste0(gsub("/+$", "/", paste0(input$cpDatadir, "/")),
           dataname, ".feather")))
  if (inherits(tmpdat, "try-error")) {
    values$cpDataMessage <- paste0(
      "Data import error: ", as.character(tmpdat))
    values$cpNewData <- NULL
    return(NULL)
  }
  if (!is.data.frame(tmpdat)) {
    values$cpDataMessage <- "Data import error: object not a data.frame"
    values$cpNewData <- NULL
    return(NULL)
  }
  if (ncol(tmpdat) < 2) {
    values$cpDataMessage <- "Data import error: only one column detected"
    values$cpNewData <- NULL
    return(NULL)
  }

  values$cpDataMessage <- paste0(
    "Data import success: ", nrow(tmpdat), " rows, ", ncol(tmpdat), " cols.")
  values$cpNewData <- tmpdat
  values$cpNewDataname <- dataname
})

## Data import mechanism
observeEvent(input$cpNDfile, {
  req(input$cpNDfile)
  if (input$cpNDfiletype == "csv_txt") {
    res <- try(read.csv(
      input$cpNDfile$datapath,
      header = as.logical(input$cpCSVheader), sep = input$cpCSVsep,
      quote = input$cpCSVquote, dec = input$cpCSVdec, stringsAsFactors = FALSE,
      encoding = input$cpCSVencoding, check.names = TRUE))

  } else if (input$cpNDfiletype == "feather") {
    res <- try(arrow::read_feather(input$cpNDfile$datapath))

  } else if (input$cpNDfiletype == "parquet") {
    res <- try(arrow::read_parquet(input$cpNDfile$datapath))

  } else if (input$cpNDfiletype == "ods") {
    range <- input$cpODSrange
    if (range == "") range <- NULL
    res <- try(readODS::read_ods(
      path = input$cpNDfile$datapath, sheet = input$cpODSsheet,
      col_names = input$cpODScolnames, na = input$cpODSna,
      skip = input$cpODSskip, range = range))

  } else if (input$cpNDfiletype == "excel") {
    column_names <- as.logical(input$cpXLScolnames)
    trim_spaces <- as.logical(input$cpXLStrim)
    the.range <- NULL
    if (input$cpXLSrange & input$cpXLSrange_specs != "")
      the.range <- input$cpXLSrange_specs
    the.sheet <- NULL
    if (input$cpXLSsheet & input$cpXLSsheet_specs != "")
      the.sheet <- input$cpXLSsheet_specs
    res <- try(data.frame(readxl::read_excel(
      input$cpNDfile$datapath, col_names = column_names, range = the.range,
      sheet = the.sheet, trim_ws = trim_spaces, skip = input$cpXLSskip)))

  } else if (input$cpNDfiletype == "spss") {
    res <- try(data.frame(haven::read_spss(
      file = input$cpNDfile$datapath,
      skip = input$cpSPSSskip, user_na = input$cpSPSSna)))
    if (!inherits(res, "try-error")) {
      res <- data.frame(lapply(res, function(x) {
        attr(x, "format.spss") <- NULL
        if ("haven_labelled" %in% class(x))
          x <- haven::as_factor(x, levels = "labels")
        x
      }))
    }

  } else if (input$cpNDfiletype == "sas_data") {
    res <- try(data.frame(haven::read_sas(
      data_file = input$cpNDfile$datapath,
      catalog_file = input$cpSAScatalog_file$datapath,
      skip = input$cpSASskip)))

  } else if (input$cpNDfiletype == "stata") {
    res <- try(data.frame(foreign::read.dta(
      file = input$cpNDfile$datapath, convert.factors = input$cpSTATAfactors)))
  }

  if (inherits(res, "try-error")) {
    values$cpDataMessage <- paste0("Data import error: ", res)
    values$cpNewData <- NULL
    return(NULL)
  }
  if (ncol(res) < 2) {
    values$cpDataMessage <- "Data import error: only one column detected"
    values$cpNewData <- NULL
    return(NULL)
  }
  values$cpDataMessage <- paste0(
    "Data import success: ", nrow(res), " rows, ", ncol(res), " columns.")
  values$cpNewData <- res
})
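
## Note (added): standalone sketch of the haven_labelled conversion done in
## the SPSS branch above, on a made-up labelled vector. Illustrative only.
if (FALSE) {
  x <- haven::labelled(c(1, 2, 1), labels = c(Male = 1, Female = 2))
  haven::as_factor(x, levels = "labels")
  #> [1] Male   Female Male
}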

## New project confirm
observeEvent(input$cpConfirm, {
  if (verbose)
    cat("DEBUG cpConfirm\n")
  newconf <- list()

  tmpdir <- gsub("/+$", "/", paste0(input$cpDatadir, "/"))
  if (!dir.exists(tmpdir))
    dir.create(tmpdir)
  newconf$datadir <- tmpdir

  tmpname <- cleanFileName(input$cpName)
  if (!nchar(tmpname)) {
    values$cpConfirmMessage <- "Project name is required"
    return(NULL)
  }
  if (dir.exists(paste0(newconf$datadir, tmpname))) {
    values$cpConfirmMessage <- paste0(
      "A project with the same name (", tmpname,
      ") already exists in the data directory.")
    return(NULL)
  }
  newconf$projectid <- tmpname

  if (!is.data.frame(values$cpNewData)) {
    values$cpConfirmMessage <- "Import data first"
    return(NULL)
  }

  if (input$cpDoImport) {
    newconf$dataname <- newconf$projectid
  } else
    newconf$dataname <- values$cpNewDataname

  if (any(is.na(values$cpNewData[[input$cpIdcol]]))) {
    values$cpConfirmMessage <- "ID variable contains missing values"
    return(NULL)
  }
  if (any(duplicated(values$cpNewData[[input$cpIdcol]]))) {
    values$cpConfirmMessage <- "ID variable contains duplicates"
    return(NULL)
  }
  newconf$idcol <- input$cpIdcol

  if (!length(input$cpTextcol)) {
    values$cpConfirmMessage <- "Select at least one text variable"
    return(NULL)
  }
  newconf$textcols <- input$cpTextcol

  if (input$cpCommcol == "none") {
    newconf$commcol <- paste0("comm_", newconf$projectid)
  } else
    newconf$commcol <- input$cpCommcol

  if (!length(input$cpTagcol)) {
    newconf$tagcols <- "scheme0"
  } else
    newconf$tagcols <- input$cpTagcol

  newconf$contextcols <- input$cpContextcol

  newconf$dataNrows <- input$cpNrows
  newconf$dataSkipRows <- input$cpNskip

  if (input$cp_use_python) {
    pytest <- try(system(paste(input$cp_which_python, "--version"),
                         intern = TRUE))
    if (inherits(pytest, "try-error")) {
      values$cpConfirmMessage <- paste0(
        "Python path `", input$cp_which_python, "` not valid")
      return(NULL)
    }
  }
  newconf$use_python <- input$cp_use_python
  newconf$python <- input$cp_which_python
  newconf$use_gpu <- input$cp_use_gpu

  if (input$cp_use_spacy) {
    sptest <- system(paste0(
      newconf$python, " -m spacy info ", input$cp_use_spacy_model), intern = TRUE)
    if (length(sptest) == 0) {
      values$cpConfirmMessage <-
        "Error loading spacy, check that it is installed in the specified python env"
      return(NULL)
    }
    if (length(attr(sptest, "status"))) {
      values$cpConfirmMessage <-
        "Error loading spacy model, check that it has been downloaded"
      return(NULL)
    }
  }
  newconf$use_spacy <- input$cp_use_spacy
  newconf$use_spacy_model <- input$cp_use_spacy_model

  if (input$cp_use_ft) if (!file.exists(input$cp_use_ft_model)) {
    values$cpConfirmMessage <-
      "Error loading fasttext model, check the specified path"
    return(NULL)
  }
  newconf$use_ft <- input$cp_use_ft
  newconf$use_ft_model <- input$cp_use_ft_model

  newconf$use_sb <- input$cp_use_sb
  newconf$use_sb_model <- input$cp_use_sb_model

  writeLines(RJSONIO::toJSON(newconf),
             paste0("tigger_", newconf$projectid, ".json"))

  values$confnames <- c(newconf$projectid, values$confnames)
  values$conf <- newconf

  ## On new data import, write a feather file with the name of the project
  if (input$cpDoImport)
    arrow::write_feather(values$cpNewData, paste0(
      newconf$datadir, newconf$dataname, ".feather"
    ))

  removeModal()
})
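
## Note (added): minimal sketch of the project-config round trip used above
## (write with RJSONIO::toJSON, read back with fromJSON). Illustrative only,
## with a made-up config; not executed by the app.
if (FALSE) {
  conf <- list(projectid = "demo", idcol = "id", textcols = c("title", "body"))
  confpath <- tempfile(fileext = ".json")
  writeLines(RJSONIO::toJSON(conf), confpath)
  RJSONIO::fromJSON(confpath)$textcols  # "title" "body"
}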

##############################################################################
## Scheme management
##############################################################################

## Print active scheme in scheme management panel
output$printScheme <- renderUI(h4(input$selectScheme))

## Create new scheme on request, with modal dialog
observeEvent(input$createScheme, {
  if (verbose)
    cat("DEBUG create scheme\n")
  showModal(modalDialog(
    title = "Create tagging scheme",
    p("This will create a new scheme for this project."),

    textInput("newScheme", label = NULL, placeholder = "Required: new scheme name"),
    textAreaInput("newSchemeDescr", label = NULL,
                  placeholder = "Optional: scheme description"),
    checkboxInput("schemeDuplicate", "Duplicate existing scheme:"),
    conditionalPanel(
      "input.schemeDuplicate",
      selectInput("schemeDuplicateFrom", NULL, rev(values$conf$tagcols))),

    footer = tagList(actionButton("createSchemeConfirm", "Create"),
                     modalButton("Cancel"))))
})

observeEvent(input$createSchemeConfirm, {
  if (is.null(ok.data())) return(NULL)
  if (verbose)
    cat("DEBUG create scheme event\n")

  ## Make sure new scheme name is a correct file name
  if (!grepl("\\S", input$newScheme)) {
    showNotification(type = "error",
                     "Cannot create scheme: name empty.")
    return(NULL)
  }

  ## Check that new scheme name doesn't exist
  ## Re-read tagcols from conf file, to prevent scheme disappearances
  tmpname <- cleanFileName(input$newScheme)
  tagpath <- paste0(values$projectdir, tmpname, ".csv")
  confpath <- paste0("tigger_", values$conf$projectid, ".json")
  if (verbose)
    cat("DEBUG create scheme check json:", confpath, "\n")
  tmp_tagcols <- RJSONIO::fromJSON(confpath)$tagcols
  if (verbose)
    cat("DEBUG create scheme json OK\n")
  if (tmpname %in% tmp_tagcols | file.exists(tagpath)) {
    showNotification(type = "error",
                     "Cannot create scheme: name already exists.")
    return(NULL)
  }

  ## Create new scheme description
  if (nchar(input$newSchemeDescr))
    writeLines(input$newSchemeDescr,
               paste0(values$projectdir, tmpname, ".txt"))

  ## Update conf file and reactives
  values$conf$tagcols <- c(tmp_tagcols, tmpname)
  # values$confs[[values$conf$projectid]]$tagcols <- c(tmp_tagcols, tmpname)
  writeLines(RJSONIO::toJSON(values$conf), confpath)

  if (input$schemeDuplicate) {
    tmpmat <- matrix(values$tagmat[, input$schemeDuplicateFrom])
    tmpmathist <- matrix(values$tagmathist[, input$schemeDuplicateFrom])
  } else {
    tmpmat <- matrix(rep(NA, nrow(ok.data())))
    tmpmathist <- matrix(rep(NA, nrow(ok.data())))
  }
  colnames(tmpmat) <- tmpname
  colnames(tmpmathist) <- tmpname

  values$tagmat <- cbind(values$tagmat, tmpmat)
  values$tagmathist <- cbind(values$tagmathist, tmpmathist)

  ## Create new tagging file if duplicating
  if (input$schemeDuplicate) {
    export <- data.frame(ok.data()[, values$conf$idcol],
                         values$tagmat[, tmpname])
    colnames(export) <- c(values$conf$idcol, tmpname)
    export <- na.omit(export)
    write.table(export, tagpath, sep = ",", qmethod = "double",
                fileEncoding = "UTF-8", row.names = FALSE,
                append = FALSE, col.names = TRUE)
  }

  if (verbose)
    cat("DEBUG create scheme exit\n")
  removeModal()
})
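
## Note (added): minimal sketch of the scheme-duplication export above: only
## non-missing tags are written to the new scheme's csv. Illustrative only,
## with made-up ids and tags; not executed by the app.
if (FALSE) {
  export <- data.frame(id = c("a", "b", "c"), scheme1 = c("pos", NA, "neg"))
  export <- na.omit(export)
  write.table(export, tempfile(fileext = ".csv"), sep = ",",
              qmethod = "double", fileEncoding = "UTF-8",
              row.names = FALSE, append = FALSE, col.names = TRUE)
}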

## Button action: delete scheme
observeEvent(input$schemeDelete, {
  if (verbose)
    cat("DEBUG delete scheme\n")
  if (length(values$conf$tagcols) < 2) {
    showNotification(type = "error",
                     "Cannot delete: at least one scheme must remain.")
    return(NULL)
  }
  showModal(modalDialog(
    title = "Delete tagging scheme",
    paste0("Confirm delete '", input$selectScheme,
           "': ", sum(!is.na(values$label)), " tags will be deleted",
           ", along with the scheme description."),
    footer = tagList(actionButton("schemeDeleteConfirm", "Delete"),
                     modalButton("Cancel"))))
})

observeEvent(input$schemeDeleteConfirm, {
  file.remove(paste0(values$projectdir, input$selectScheme, ".csv"))
  file.remove(paste0(values$projectdir, input$selectScheme, ".txt"))
  file.remove(paste0(values$projectdir, input$selectScheme, "_regex.json"))
  values$conf$tagcols <-
    values$conf$tagcols[values$conf$tagcols != input$selectScheme]
  writeLines(RJSONIO::toJSON(values$conf),
             paste0("tigger_", input$selectProject, ".json"))
  removeModal()
})


observeEvent(input$schemeDescrSave, {
  req(input$selectScheme)
  writeLines(input$schemeDescr,
             paste0(values$conf$datadir, values$conf$projectid,
                    "/", input$selectScheme, ".txt"))
})


##############################################################################
##############################################################################

## Tag selection: populate
observe({
  values$conf$tagcols
  updateSelectInput(
    # session, "selectScheme", choices = c(rev(values$conf$tagcols), "<NEW>"))
    session, "selectScheme", choices = rev(values$conf$tagcols))
})


## Populate oracle buttons
output$oracleButtons <- renderUI({
  if (is.null(ok.data())) return(NULL)
  # if (is.null(values$label)) return(NULL)
  if (is.null(values$uniqueLabels)) return(NULL)
  if (verbose)
    cat("DEBUG populate oracle buttons\n")

  do.call(fluidRow, lapply(values$uniqueLabels, function(ilab)
    actionButton(paste0("oracle_", label2hash(ilab)),
                 strong(ilab))))
})

## Populate maxprob variable selector
observe({
  updateSelectInput(session, "maxprobWhich", choices = values$uniqueLabels)
  updateSelectInput(session, "taggedWhich",
                    choices = c("All" = "all", values$uniqueLabels))
})

## Data panel message
observeEvent(ok.data(), {
  if (verbose)
    cat("DEBUG data panel message\n")
  if (is.null(ok.data())) {
    output$dataMessage <- renderUI(p("Importing data..."))
  } else
    output$dataMessage <- NULL
})

## On config change, update dataView and retag options
observeEvent(list(values$conf), {

  if (is.null(values$conf)) return(NULL)
  output$panelData <- renderUI(list(
    DT::dataTableOutput("dataView")
  ))

  output$panelRetag <- renderUI(c(
    list(
      br(),
      h4("Rename tags"),
      fluidRow(
        column(4, strong("Current")),
        column(2, strong("Count")),
        column(6, strong("New"))
      ),
      br()),
    lapply(values$uniqueLabels, function(iLab) {
      iHash <- label2hash(iLab)
      fluidRow(
        column(4, p(iLab)),
        column(2, p(sum(values$label == iLab, na.rm = TRUE))),
        column(4, textInput(
          paste0("newtag_", iHash), NULL,
          placeholder = "(new name, leave empty to untag)")),
        column(2, actionButton(paste0("retag_", iHash), "Rename")))
    })
  ))
})

output$panelExtraRegex <- renderUI(list(
  p(strong("Regex predictors")),
  fluidRow(
    column(8, textInput("extraRegexText", NULL,
                        placeholder = "(new regex)", width = "100%")),
    column(4, actionButton(
      "extraRegexSearch", strong("+"), width = "100%"))
  ),
  fluidRow(
    style = "overflow-y:scroll; max-height: 25vh",
    fluidRow(
      column(2, strong("")),
      column(1, strong("Use")),
      column(1, strong("Case")),
      column(1, p(strong("N"))),
      column(7, p(strong("Regex")))
    ),

    lapply(values$extraRegex, function(iRegex) {
      iHash <- label2hash(iRegex)
      tmp_cased <- values[[paste0("caseregex_", iHash)]]
      if (is.null(tmp_cased)) tmp_cased <- FALSE
      fluidRow(
        column(2, actionButton(paste0("delregex_", iHash), "🗑️", width = "100%")),
        column(1, checkboxInput(
          paste0("useregex_", iHash), NULL,
          value = isolate(!isFALSE(values[[paste0("useregex_", iHash)]])))),
        column(1, checkboxInput(
          paste0("caseregex_", iHash), NULL,
          value = isolate(isTRUE(values[[paste0("caseregex_", iHash)]])))),
        column(1, br(), p(sum(
          stringi::stri_detect_regex(
            ok.nicetext(), iRegex,
            opts_regex = list(case_insensitive = !tmp_cased)),
          na.rm = TRUE))),
        column(7, br(), p(iRegex))
      )
    })
  )
))

## Process new regex
observeEvent(input$extraRegexSearch, {
  if (is.null(ok.nicetext())) return(NULL)
  req(nchar(input$extraRegexText) > 0)
  if (input$extraRegexText %in% values$extraRegex) return(NULL)
  try_regex <- try(stringi::stri_detect_regex(
    ok.nicetext(), input$extraRegexText,
    opts_regex = list(case_insensitive = TRUE)))
  if (inherits(try_regex, "try-error")) return(NULL)
  new_regex <- input$extraRegexText
  new_hash <- label2hash(new_regex)
  values$extraRegex <- c(new_regex, values$extraRegex)
  names(values$extraRegex)[1] <- new_hash
  if (verbose)
    cat("DEBUG regex search", input$extraRegexText, "\n")
})
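
## Note (added): standalone sketch of the validity check above: malformed
## regexes make stringi raise an error, which try() turns into a value that
## can be tested. Illustrative only.
if (FALSE) {
  bad <- try(stringi::stri_detect_regex("abc", "("), silent = TRUE)
  inherits(bad, "try-error")  # TRUE: the pattern is rejected up front
}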

## Regex predictors: populate delete and use events on first creation
useregex_trigger <- reactiveVal(0)
delregex_trigger <- reactiveVal(0)
caseregex_trigger <- reactiveVal(0)
observeEvent(list(values$extraRegex), {
  if (!length(values$extraRegex)) return(NULL)
  if (any(! values$extraRegex %in% values$extraRegexHist)) {
    if (verbose)
      cat("DEBUG regex predictor names updates\n")
    tmpregex <- values$extraRegex[! values$extraRegex %in% values$extraRegexHist]
    values$extraRegexHist <- c(tmpregex, values$extraRegexHist)

    c(
      lapply(seq_along(tmpregex), function(iregex) {
        observeEvent(
          input[[paste0("delregex_", names(tmpregex)[iregex])]], {
            if (verbose)
              cat("DEBUG regex delete", names(tmpregex)[iregex], "\n")
            values$extraRegex <- values$extraRegex[values$extraRegex != tmpregex[iregex]]
            delregex_trigger(delregex_trigger() + 1)
          })
      }),
      lapply(seq_along(tmpregex), function(iregex) {
        tmpname <- paste0("useregex_", names(tmpregex)[iregex])
        observeEvent(
          input[[tmpname]], {
            values[[tmpname]] <- input[[tmpname]]
            useregex_trigger(useregex_trigger() + 1)
          })
      }),
      lapply(seq_along(tmpregex), function(iregex) {
        tmpname <- paste0("caseregex_", names(tmpregex)[iregex])
        observeEvent(
          input[[tmpname]], {
            values[[tmpname]] <- input[[tmpname]]
            caseregex_trigger(caseregex_trigger() + 1)
          })
      })
    )
  }
})

observeEvent(list(delregex_trigger(), useregex_trigger(), caseregex_trigger()), {
  req(ok.data(), input$selectScheme)
  regex_all <- values$extraRegex
  values$regex_inuse <- sapply(names(regex_all), function(iname)
    values[[paste0("useregex_", iname)]])
  values$regex_case <- sapply(names(regex_all), function(iname)
    values[[paste0("caseregex_", iname)]])
  if (verbose) if (any(values$regex_inuse))
    cat("DEBUG regex in use", paste0(collapse = " ", regex_all[values$regex_inuse]), "\n")
  writeLines(
    RJSONIO::toJSON(lapply(FUN = unname, list(
      regex = regex_all, use = values$regex_inuse, case = values$regex_case))),
    paste0(values$projectdir, input$selectScheme, "_regex.json"))
})

output$dataView <- DT::renderDataTable({
  # if (is.null(ok.text()) | ok.data.running()) return(NULL)
  req(ok.text())
  if (verbose)
    cat("DEBUG render dataView\n")
  DT::datatable(
    rownames = FALSE,
    data.frame(
      row = 1:nrow(ok.data()),
      id = ok.data()[, values$conf$idcol],
      text = ok.text()),
    options = list(
      pageLength = 3,
      columnDefs = list(list(
        targets = 2,
        ## Display mode shows only the first 200 characters, with the full
        ## text as a tooltip; sorting and filtering use the full string.
        render = DT::JS(
          "function(data, type, row, meta) {",
          "return type === 'display' && data.length > 200 ?",
          "'<span title=\"' + data + '\">' + data.substr(0, 200) + '...</span>' : data;",
          "}")
      ))
    ),
    callback = DT::JS('table.page(0).draw(false);')
  )
})

## Triggers for mass rename, created with each new label, with confirm
observeEvent(values$uniqueLabelsAll, {
  if (is.null(values$uniqueLabelsAll)) return(NULL)
  notthere <- unique(na.omit(
    values$uniqueLabelsAll[! values$uniqueLabelsAll %in% values$retagLabels]))
  lapply(
    notthere,
    function(iTag) {
      iHash <- label2hash(iTag)

      observeEvent(input[[paste0("retag_", iHash)]], {
        tmp_oldtag <- iTag
        tmp_retagged <- which(values$label == tmp_oldtag)
        tmp_newtag <- gsub(
          "^\\s+|\\s+$", "", input[[paste0("newtag_", iHash)]])
        showModal(modalDialog(
          title = "Mass retag",
          paste0("Confirm retag: ", length(tmp_retagged), " ",
                 tmp_oldtag, " tags will be ",
                 ifelse(nchar(tmp_newtag),
                        paste("renamed to", tmp_newtag), "deleted")),
          ## Use the hash in the input id: raw tags may contain characters
          ## that are not valid in Shiny input ids.
          footer = tagList(actionButton(paste0("retag_confirm_", iHash), "Retag"),
                           modalButton("Cancel"))))
      })

      observeEvent(input[[paste0("retag_confirm_", iHash)]], {
        tmp_oldtag <- iTag
        tmp_retagged <- which(values$label == tmp_oldtag)
        tmp_newtag <- gsub(
          "^\\s+|\\s+$", "", input[[paste0("newtag_", iHash)]])
        if (!nchar(tmp_newtag)) tmp_newtag <- NA
        values$tagTrigger <- list(
          id = ok.data()[[values$conf$idcol]][tmp_retagged],
          tag = rep(tmp_newtag, length(tmp_retagged)))
        values$queries <- c(values$queries, tmp_retagged)

        removeModal()
      })

    })
  values$retagLabels <- c(values$retagLabels, notthere)
})



##############################################################################
## UI rendering of Panel Tagging / Visualization
##############################################################################

####################
## Tagging left panel

# ## Trigger from Oops
# observeEvent(input$oops, {
#   if (is.null(ok.data()))
#     return(NULL)
#   if (verbose)
#     cat("DEBUG enter Oops\n")
#   if (!is.null(values$retagged)) if (values$queries[length(values$queries)] == values$retagged[length(values$retagged)])
#     values$retagged <- values$retagged[-length(values$retagged)]
#   values$label[values$queries[length(values$queries)]] <- NA
#   if (ok.commentCol() != "(None)")
#     values$comment[values$queries[length(values$queries)]] <- ok.data()[values$queries[length(values$queries)], ok.commentCol()]
#   values$queries <- values$queries[-length(values$queries)]
#   ## tagTrigger ?
#
#   ready2tag(0)
#   queryNext(queryNext() + 1)
# })

## Tagging triggers for Oracle buttons, created with each new label
observeEvent(values$uniqueLabelsAll, {
  if (is.null(values$label)) return(NULL)
  if (is.null(values$uniqueLabelsAll)) return(NULL)
  if (all(is.na(values$label))) return(NULL)
  notthere <- unique(na.omit(
    values$uniqueLabelsAll[! values$uniqueLabelsAll %in% values$oracleLabels]
  ))
  if (!length(notthere)) return(NULL)
  if (verbose)
    cat("DEBUG triggers for oracle buttons\n")
  lapply(
    notthere,
    function(iLab) {
      observeEvent(
        input[[paste0("oracle_", label2hash(iLab))]], {
          if (is.null(ok.text())) return(NULL)
          if (is.null(values$newQuery)) return(NULL)
          if (!ready2tag()) {cat("DEBUG not ready to tag in Button\n"); return(NULL)}
          # ready2tag(0)
          # if (input$allowRetag | sum(is.na(values$label)) == 0)
          if (!is.na(values$label[values$lastQuery]))
            values$retagged <- c(values$retagged, values$lastQuery)
          # values$label[values$lastQuery] <- iLab
          values$comment[values$lastQuery] <- input$currentComment
          # values$queries <- c(values$queries, values$lastQuery)
          values$tagTrigger <- list(
            id = ok.data()[values$lastQuery, values$conf$idcol],
            tag = iLab,
            comment = input$currentComment)
        })
    })
  values$oracleLabels <- c(values$oracleLabels, notthere)
})

## Trigger from Oracle Create value button
observeEvent(input$currentAction, {
  if (is.null(ok.data())) return(NULL)
  if (is.null(values$newQuery)) return(NULL)
  if (verbose)
    cat("DEBUG enter create value\n")
  # if (input$allowRetag | sum(is.na(values$label)) == 0)
  if (!is.na(values$label[values$lastQuery]))
    values$retagged <- c(values$retagged, values$lastQuery)
  if (input$newLab == "") {
    return(NULL)
  } else {
    # values$label[values$lastQuery] <- input$newLab
    updateTextInput(session, "newLab", value = "")
  }
  values$comment[values$lastQuery] <- input$currentComment
  # values$queries <- c(values$queries, values$lastQuery)
  values$tagTrigger <- list(
    id = ok.data()[values$lastQuery, values$conf$idcol],
    tag = input$newLab,
    comment = input$currentComment)
})

## Trigger from Oracle Confirm button
observeEvent(input$oracleConfirm, {
  req(ok.data(), ok.train(), values$newQuery)
  # if (is.null(ok.data())) return(NULL)
  # if (is.null(values$newQuery)) return(NULL)
  if (!ready2tag()) {cat("DEBUG not ready to tag in OC\n"); return(NULL)}
  # ready2tag(0)
  if (!ok.train.possible()) return(NULL)
  if (verbose)
    cat("DEBUG oracle confirm\n")
  # if (input$allowRetag | sum(is.na(values$label)) == 0)
  if (!is.na(values$label[values$lastQuery]))
    values$retagged <- c(values$retagged, values$lastQuery)
  # values$label[values$lastQuery] <- ok.predlab1()[values$lastQuery]
  # values$queries <- c(values$queries, values$lastQuery)
  values$comment[values$lastQuery] <- input$currentComment
  values$tagTrigger <- list(
    id = ok.data()[values$lastQuery, values$conf$idcol],
    tag = ok.predlab1()[values$lastQuery],
    comment = input$currentComment)
})

## Trigger for CV glm (async)
glmCV_nclicks <- reactiveVal(0)
observeEvent(input$glmCV, {
  if (is.null(ok.predictor())) return(NULL)
  if (glmCV_nclicks() != 0) {
    showNotification("LASSO CV already running")
    return(NULL)
  }
  glmCV_nclicks(1)
  glmCV_future <- reactiveVal()

  tmpvalues <- values$label
  tmppred <- ok.predictor()
  ## Drop tags with too few examples for cross-validation
  for (ival in na.omit(unique(tmpvalues)))
    if (sum(tmpvalues == ival, na.rm = TRUE) <= 8)
      tmpvalues[which(tmpvalues == ival)] <- NA
  glmCV_async <- future(seed = TRUE, {
    glmnet::cv.glmnet(
      x = tmppred[!is.na(tmpvalues), ],
      y = as.factor(tmpvalues[!is.na(tmpvalues)]),
      family = "multinomial")
  }) %...>% glmCV_future()

  glmCV_async <- catch(glmCV_async, function(e) {
    showNotification(type = "error", paste0("LASSO CV error: ", e))
  })
  glmCV_async <- finally(glmCV_async, function() {
    glmCV_nclicks(0)
    if (is.null(glmCV_future())) return(NULL)
    updateNumericInput(session, "glmLambda",
                       value = glmCV_future()$lambda.min)
  })
  NULL
})
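
## Note (added): minimal sketch of the cross-validation step above, on
## made-up data: cv.glmnet picks the LASSO penalty, whose minimum is then
## fed back into the "glmLambda" input. Illustrative only; not executed.
if (FALSE) {
  set.seed(1)
  x <- matrix(rnorm(100 * 5), ncol = 5)
  y <- factor(sample(c("a", "b", "c"), 100, replace = TRUE))
  cv <- glmnet::cv.glmnet(x, y, family = "multinomial")
  cv$lambda.min
}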

## Trigger for CV liblinear (async)
liblinCV_nclicks <- reactiveVal(0)
observeEvent(input$liblinCV, {
  if (is.null(ok.predictor())) return(NULL)
  if (liblinCV_nclicks() != 0) {
    showNotification("LibLineaR CV already running")
    return(NULL)
  }
  liblinCV_nclicks(1)
  liblinCV_future <- reactiveVal()

  tmpvalues <- values$label
  tmppred <- ok.predictor()
  for (ival in na.omit(unique(tmpvalues)))
    if (sum(tmpvalues == ival, na.rm = TRUE) <= 3)
      tmpvalues[which(tmpvalues == ival)] <- NA

  liblinCV_async <- future(seed = TRUE, {
    LiblineaR::LiblineaR(
      data = tmppred[!is.na(tmpvalues), ],
      target = as.factor(tmpvalues[!is.na(tmpvalues)]),
      type = 0, findC = TRUE)
  }) %...>% liblinCV_future()
  liblinCV_async <- catch(liblinCV_async, function(e) {
    showNotification(type = "error", paste0("LibLineaR CV error: ", e))
  })
  liblinCV_async <- finally(liblinCV_async, function() {
    liblinCV_nclicks(0)
    if (is.null(liblinCV_future())) return(NULL)
    updateNumericInput(session, "liblinCost", value = liblinCV_future())
  })
  NULL
})


output$currentContext <- renderUI({
  if (verbose)
    cat("DEBUG currentContext\n")
  if ((! is.null(ok.text())) & (length(ok.contextCol()) > 0))
    HTML(paste(paste(ok.contextCol(),
                     as.character(ok.data()[values$newQuery, ok.contextCol()]),
                     sep = " : "),
               collapse = "<br/><br/>"))
})

output$makeOracleConfirm <- renderUI({
  if (verbose)
    cat("DEBUG update makeOracleConfirm\n")
  if (is.null(ok.pred())) {
    tmp <- "No prediction"
  } else if (is.null(values$newQuery)) {
    tmp <- "No query"
  } else {
    tmp <- paste0(
      ifelse(is.na(values$label[values$newQuery]), "", "Pred: "),
      ok.predlab1()[values$newQuery],
      " (", round(max(ok.pred()[values$newQuery, ]), 3), ")")
  }
  res <- list(actionButton("oracleConfirm", label = paste("🤖", tmp), width = "100%"))
  if (!is.null(values$label) & !is.null(values$newQuery)) {
    if (!is.na(values$label[values$newQuery]))
      res <- list(res[[1]], HTML(paste0(
        "<center><strong> Current: ", values$label[values$newQuery],
        "</strong></center>")))
  }
  res
})


## Show training diagnostics on new model train
output$trainDiagno <-
  renderText(paste0(ok.msgtagged(), "\n", values$modelDiagno))

## Diagnostics event
observeEvent(diagTrigger(), {
  if (verbose)
    cat("DEBUG update modelDiagno\n")
  if (is.null(ok.data())) {
    values$modelDiagno <- "No model"
    return(NULL)
  }
  if (!ok.train.possible()) {
    values$modelDiagno <- "No model (not enough labelled data)."
    return(NULL)
  }

  if (input$predModel == "random forest") {
    ## Out-of-bag predictions: majority class over the forest's votes
    oobpred <- colnames(ok.train()$predictions)[
      apply(ok.train()$predictions, 1, which.max)]
    tmp <- paste0(
      "Train Accuracy: ",
      round(100 * mean(ok.predlab1()[!is.na(values$label)] ==
                         values$label[!is.na(values$label)]), 1),
      "% ; Wtd. F1: ",
      sprintf("%.3f", wtdF1(values$label[!is.na(values$label)],
                            ok.predlab1()[!is.na(values$label)])),
      "\nOOB Accuracy : ",
      round(100 * mean(oobpred == values$label[!is.na(values$label)]), 1),
      "% ; Wtd. F1: ",
      sprintf("%.3f", wtdF1(values$label[!is.na(values$label)], oobpred))
    )
  } else if (input$predModel %in% c("lasso", "linear", "naive bayes", "knn")) {
    tmp <- paste0(
      "Train Accuracy: ",
      round(100 * mean(ok.predlab1()[!is.na(values$label)] ==
                         values$label[!is.na(values$label)]), 1),
      "% ; Wtd. F1: ",
      sprintf("%.3f", wtdF1(
        values$label[!is.na(values$label)],
        ok.predlab1()[!is.na(values$label)])))
  }
  values$modelDiagno <- tmp
})
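
## Note (added): minimal sketch of the train-accuracy figure above, on
## made-up labels; untagged texts (NA) are excluded before comparing
## predictions to tags. Illustrative only.
if (FALSE) {
  truth <- c("a", "b", "a", NA)
  pred  <- c("a", "b", "b", "a")
  keep <- !is.na(truth)
  round(100 * mean(pred[keep] == truth[keep]), 1)  # 66.7
}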

## Update comment input field
observeEvent(values$newQuery, {
  if (is.null(ok.text())) return(NULL)
  if (verbose)
    cat("DEBUG update comment field\n")
  updateTextInput(session, "currentComment",
                  value = values$comment[values$newQuery])
})


## Save event: tag + comment in dedicated csv
observeEvent(values$tagTrigger, {
  if (is.null(values$tagTrigger)) return(NULL)
  if (verbose)
    cat("DEBUG enter event tagTrigger", length(values$tagTrigger$id), "\n")
  ready2tag(0) # Prevent new tagging until next query
  dapath <- paste0(values$projectdir, ok.labelcol(), ".csv")
  darows <- match(values$tagTrigger$id, ok.data()[, values$conf$idcol])

  export <- data.frame(values$tagTrigger$id, values$tagTrigger$tag)
  colnames(export) <- c(values$conf$idcol, ok.labelcol())
  if (verbose)
    cat("DEBUG tagTrigger tag",
        paste(export[, 1], export[, 2], collapse = " ; "), "\n")
  write.table(export, dapath, sep = ",", qmethod = "double",
              fileEncoding = "UTF-8", row.names = FALSE,
              append = file.exists(dapath),
              col.names = !file.exists(dapath))

  if (any(nchar(values$tagTrigger$comment))) {
    tmp_filter <- which(nchar(values$tagTrigger$comment) > 0)
    dapath <- paste0(values$projectdir, values$conf$commcol, ".csv")
    export <- data.frame(values$tagTrigger$id[tmp_filter],
                         values$tagTrigger$comment[tmp_filter])
    colnames(export) <- c(values$conf$idcol, values$conf$commcol)
    if (verbose)
      cat("DEBUG tagTrigger comment", export[1, 1], export[1, 2], "\n")
    write.table(export, dapath, sep = ",", qmethod = "double",
                fileEncoding = "UTF-8", row.names = FALSE,
                append = file.exists(dapath),
                col.names = !file.exists(dapath))
  }

  ## Update label hist
  values$labelhist[darows] <- max(values$labelhist, na.rm = TRUE) + 1
  ## Update wasTagged and wasRetagged
  ## (vectorized: mass retags can pass several ids at once)
  itag <- darows
  if (input$sampleChoice == "untagged") {
    values$wasTagged[itag] <- TRUE
  } else if (input$sampleChoice == "tagged") {
    values$wasRetagged[itag] <- TRUE
  } else if (input$sampleChoice == "all") {
    wasna <- is.na(values$label[itag])
    values$wasTagged[itag[wasna]] <- TRUE
    values$wasRetagged[itag[!wasna]] <- TRUE
  }

  ## Call queryNext event, or launch training
  if (trainCountdown() == 1) {
    trainCountdown(input$trainCountdown)
    trainTrigger(trainTrigger() + 1)
  } else if (trainCountdown() > 1) {
    trainCountdown(trainCountdown() - 1)
    queryNext(queryNext() + 1)
  } else {
    queryNext(queryNext() + 1)
  }

  ## Update values$label etc
  values$label[darows] <- values$tagTrigger$tag
  values$queries <- c(values$queries, darows)

  if (verbose)
    cat("DEBUG exit event tagTrigger", nrow(export), "rows\n")
})
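
## Note (added): minimal sketch of the append-to-csv pattern above: the
## header is written only when the file is created, later batches are
## appended without it. Illustrative only, on a temp file.
if (FALSE) {
  dapath <- tempfile(fileext = ".csv")
  for (batch in list(data.frame(id = 1, tag = "a"),
                     data.frame(id = 2, tag = "b")))
    write.table(batch, dapath, sep = ",", qmethod = "double",
                row.names = FALSE,
                append = file.exists(dapath),
                col.names = !file.exists(dapath))
  read.csv(dapath)  # two rows, one header
}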


####################
## Tagging main panel

output$textVisuCols <- renderUI({
  if (verbose)
    cat("DEBUG render text/visu panel\n")
  leftsize <- 6
  if (!input$panelText) leftsize <- 1
  if (input$panelText & !input$panelVisu) leftsize <- 11

  fluidRow(
    column(
      leftsize,
      conditionalPanel(
        "input.panelText",
        wellPanel(htmlOutput("currentText"))
      )
    ),
    column(
      12 - leftsize,
      conditionalPanel(
        "input.panelVisu",
        fluidRow(
          column(2, HTML(paste0('<div title="Compute new visualization">',
                                actionButton("visuCompute", "🎲"),
                                '</div>'))),
          column(2, HTML(paste0('<div title="Zoom out/into selected area">',
                                actionButton("visuGoZoom", "🔍±"),
                                '</div>'))),
          column(2, HTML(paste0('<div title="Query only from zoom area">',
                                checkboxInput("visuLock", "🔒", value = FALSE),
                                '</div>'))),
          column(2, HTML(paste0('<div title="Visualization options">',
                                checkboxInput("visuOptions", "🔧"),
                                '</div>'))),
          column(3, HTML(paste0('<div title="Plot size">',
                                numericInput("visuHeight", NULL, 400, 20, 2e12, 10),
                                '</div>')))
        ),
        conditionalPanel(
          'input.visuOptions',
          flowLayout(
            selectInput(
              "visuAlgo", "Visu. method", c("umap", "t-sne"), "umap", width = "100%"),
            selectizeInput(
              "visuSource", "Visu. source",
              c("Word counts (DFM)" = "dfm",
                "Word embeddings (FastText)" = "ft",
                "Sentence embeddings (SBERT)" = "sb",
                "BERT predictions" = "bertpred",
                "Regex" = "regex"),
              c("sb", "ft"), multiple = TRUE)
          )
        ),
        conditionalPanel( # umap options
          'input.visuAlgo == "umap" & input.visuOptions',
          flowLayout(
            numericInput(
              "umapNeighb", "Nb. Neighbors", 15, 1, 500, 1),
            numericInput(
              "umapSpread", "Spread", 1, 0, 16, 1e-2),
            numericInput(
              "umapMindist", "Min dist.", .001, 0, 16, 1e-3),
            selectInput(
              "umapMetric", "Metric", selected = "cosine",
              c("euclidean", "cosine", "manhattan", "hamming",
                "correlation", "categorical")
            )
          )
        ),
        conditionalPanel( # t-SNE options
          'input.visuAlgo == "t-sne" & input.visuOptions',
          flowLayout(
            numericInput(
              "tsnePerplex", "Perplexity", 15, 1, 500, 1),
            numericInput(
              "tsneTheta", "Theta", .5, 0, 1, 1e-2),
            numericInput(
              "tsnePcaDim", "PCA dims", 50, 1, 100, 1)
          )
        ),
        uiOutput("visuMsg"),
        plotOutput(
          "visuPlot", width = "100%",
          dblclick = "visuDblclick",
          brush = brushOpts("visuBrush", resetOnNew = TRUE))
      )
    )
  )
})

##################
## Subpanel "Text"
##################

output$currentText <- renderUI({
  if (verbose)
    cat("DEBUG render currentText\n")
  if (is.null(ok.data()) | is.null(ok.nicetext()))
    return(HTML("Import data to start training."))

  if (input$strategy %in% c("entropy", "maxprob") & is.null(ok.train()))
    return(HTML("<em>No model: train one, or try 'random' or 'sequential' strategy</em>"))

  if (is.null(values$newQuery))
    return(HTML("<em>No query</em>"))

  bastext <- ok.nicetext()[values$newQuery]
  if (nchar(input$regexFilter) > 0)
    bastext <- gsub(paste0("(", isolate(input$regexFilter), ")"),
                    "<b>\\1</b>", bastext, ignore.case = isolate(!input$regexCaseSens))
  baslen <- nchar(bastext)
  if (!length(baslen)) {
    if (nchar(input$regexFilter) > 0)
      return(HTML("<b><em>Regex filter: no match</em></b>"))
    return(HTML("<b><em>No text matches the filters</em></b>"))
  }
  if (is.na(baslen))
    return(HTML("<em>No query</em>"))
  if (baslen > 1200) {
    bastext <- paste0(substring(bastext, 1, 1200),
                      '<em title="> 512 tokens (not read by BERT)">',
                      substring(bastext, 1201), "</em>")
  }
  bastext <- gsub("\n", "<br/>", bastext)
  HTML(bastext)
})


##############################################################################
## Panel "Visualize"
##############################################################################

output$visuMsg <- renderUI({
  if (!is.null(values$visuMsg))
    # strong(values$visuMsg)
    h5(values$visuMsg, style = "text-align:center")
})

## Compute visualization points (async)
visu_nclicks <- reactiveVal(0)
ok.visu <- reactiveVal()
observe({
  if (!input$panelVisu)
    values$visuGo <- FALSE
  if (input$panelVisu & !values$visuGo)
    values$visuGo <- TRUE
})
observeEvent(list(values$visuGo, input$visuCompute), {
  if (is.null(input$visuSource)) return(NULL)
  if (!values$visuGo) return(NULL)
  if (is.null(ok.data())) {
    values$visuMsg <- "No data imported."
    return(NULL)
  }

  if (visu_nclicks() > 0) {
    values$visuMsg <- "Already computing visualization"
    return(NULL)
  }
  visu_nclicks(1)

  values$visuMsg <- "Computing visualization..."

  ## Prepare options and objects for async computation
  dasources <- input$visuSource
  daalgo <- input$visuAlgo

  if (verbose)
    cat("DEBUG compute visu embedding: ", daalgo, paste(dasources, collapse = " "), "\n")

  visudat <- NULL
  if ("dfm" %in% dasources) dadfm <- ok.dfm()
  if ("ft" %in% dasources) {
    visudat <- ok.data()[, values$cols_ft]
  }
  if ("sb" %in% dasources) {
    if (is.null(visudat)) {
      visudat <- ok.data()[, values$cols_sb]
    } else
      visudat <- cbind(visudat, ok.data()[, values$cols_sb])
  }
  if ("bertpred" %in% dasources) {
    if (is.null(visudat)) {
      visudat <- ok.data()[, do.call(c, values$cols_bertpred)]
    } else
      visudat <- cbind(visudat, ok.data()[, do.call(c, values$cols_bertpred)])
  }
  if ("regex" %in% dasources) {
    if (is.null(visudat)) {
      visudat <- ok.extraRegex.matrix()
    } else
      visudat <- cbind(visudat, ok.extraRegex.matrix())
  }
  if (daalgo == "umap") {
    visu_args <- list(
      n_neighbors = input$umapNeighb, metric = input$umapMetric,
      spread = input$umapSpread, min_dist = input$umapMindist)
  } else if (daalgo == "t-sne") {
    visu_args <- list(
      perplexity = input$tsnePerplex, theta = input$tsneTheta,
      initial_dims = input$tsnePcaDim, check_duplicates = FALSE)
  }

  visu_async <- future(seed = TRUE, {
    if ("dfm" %in% dasources) {
      ## TODO: CA treatment buggy, switch to SVD?
      # da_ca <- quanteda.textmodels::textmodel_ca(
      #   quanteda::as.dfm(dadfm), nd = min(100, nrow(dadfm)))
      # da_ca <- (da_ca$rowcoord %*% diag(da_ca$sv))[, !is.na(da_ca$sv)]
      da_ca <- as.matrix(dadfm)
      if (is.null(visudat)) {
        visudat <- da_ca
      } else
        visudat <- cbind(da_ca, visudat)
    }

    if (daalgo == "umap") {
      do.call(uwot::umap, c(list(visudat), visu_args))
    } else if (daalgo == "t-sne") {
      ## Rtsne refuses duplicate rows: jitter them instead of failing
      visudat_dupli <- duplicated(visudat)
      if (any(visudat_dupli))
        visudat[visudat_dupli, ] <- jitter(visudat[visudat_dupli, ])
      do.call(Rtsne::Rtsne, c(list(visudat), visu_args))$Y
    }
  }) %...>% ok.visu()

  visu_async <- catch(visu_async, function(error) {
    values$visuMsg <- paste0("Error: ", error)
  })
  visu_async <- finally(visu_async, function() {
    visu_nclicks(0)
    if (input$visuGoZoom == 0) {
      values$visuMsg <- "Double-click to select, drag to select zooming zone."
    } else
      values$visuMsg <- NULL

    nice_choices <- c("FastText" = "ft", "SBERT" = "sb", "visu" = daalgo)
    names(nice_choices)[3] <- ifelse(daalgo == "t-sne", "t-SNE", "UMAP")
    updateSelectInput(
      session, "dlEmbedSelect", choices = nice_choices, selected = daalgo)

    if (verbose)
      cat("DEBUG Visu async out\n")
  })

  NULL
})
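
## Note (added): minimal sketch of the projection step above, on a made-up
## embedding matrix: uwot::umap turns n x p features into n x 2 coordinates
## for the plot. Illustrative only; not executed by the app.
if (FALSE) {
  emb <- matrix(rnorm(200 * 10), nrow = 200)
  xy <- uwot::umap(emb, n_neighbors = 15, metric = "cosine",
                   spread = 1, min_dist = 0.001)
  dim(xy)  # 200 2
}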

## Compute visualization plot (async)
ok.ggvisu <- reactiveVal()
observe({
  # if (is.null(ok.visu())) return(NULL)
  if (is.null(ok.visu())) {
    ok.ggvisu(NULL)
    return(NULL)
  }
  # if (!isTRUE(values$visugo)) return(NULL)
  if (!input$panelVisu) return(NULL)

  if (verbose)
    cat("DEBUG compute visu plot\n")
  ggdat <- data.frame(ok.visu(), tag = values$label)
  if (!is.null(ok.predlab1()))
    ggdat$tag[is.na(values$label)] <- ok.predlab1()[is.na(values$label)]
  da_predlabs <- ok.predlab1()
  da_tagged <- ! is.na(values$label)
  da_query <- values$newQuery

  ggvisu_async <- future({
    ggres <- ggplot(ggdat, aes(X1, X2)) + theme_bw() +
      xlab("") + ylab("") +
      theme(legend.position = "bottom") +
      scale_shape_manual(values = rep(1:8, 40)) # TODO : adapt to tags

    ## Tagged points
    if (any(da_tagged)) {
      ggres <- ggres +
        geom_point(data = ggdat[da_tagged, ],
                   aes(X1, X2, color = tag, shape = tag),
                   size = 1.5, stroke = 1.5, alpha = .8)
    }

    ## Not tagged points
    if (!is.null(da_predlabs)) {
      ggres <- ggres +
        geom_point(data = ggdat[!da_tagged, ],
                   aes(color = tag, shape = tag), show.legend = FALSE)
    } else {
      ggres <- ggres +
        geom_point(alpha = .8, data = ggdat[is.na(ggdat$tag), ])
    }

    ## Current query
    if (!is.null(da_query)) {
      if (is.null(da_predlabs)) {
        ggres <- ggres +
          geom_point(data = ggdat[da_query, ], color = 2,
                     stroke = 3, shape = 3, show.legend = FALSE)
      } else {
        ggres <- ggres +
          geom_point(data = ggdat[da_query, ],
                     stroke = 3, shape = 3, show.legend = FALSE)
      }
    }
    ggres
  }) %...>% ok.ggvisu()

  ggvisu_async <- catch(ggvisu_async, function(error) {
    values$visuMsg <- paste0("Error in ggplot: ", error)
  })

})
|
||
|
|
||
|
  observe({
    if (verbose)
      cat("DEBUG update visuPlot\n")
    if (is.null(ok.ggvisu())) {
      values$visuMsg <- "No visualization yet."
      output$visuPlot <- NULL
      values$visuZoom$xlim <- NULL
      values$visuZoom$ylim <- NULL
      return(NULL)
    }
    output$visuPlot <- renderPlot(height = input$visuHeight, {
      # if (is.null(ok.ggvisu())) return(NULL)
      ok.ggvisu() +
        coord_fixed(xlim = values$visuZoom$xlim, ylim = values$visuZoom$ylim)
    })
  })

  observeEvent(input$visuDblclick, {
    if (!is.null(values$visuMsg))
      values$visuMsg <- NULL
    tmp <- input$visuDblclick
    bmu <- t(ok.visu()) - c(tmp$x, tmp$y)
    bmu <- which.min(colSums(bmu^2))
    values$lastQuery <- bmu
    values$newQuery <- bmu
    ready2tag(1)
  })

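  ## Nearest-point lookup above: t(ok.visu()) - c(x, y) recycles the clicked
  ## coordinates over the 2 x N matrix of projected points, so colSums(bmu^2)
  ## holds each point's squared euclidean distance to the click and
  ## which.min() returns the row index of the closest text.
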
  observeEvent(input$visuGoZoom, {
    if (!is.null(values$visuMsg))
      values$visuMsg <- NULL
    values$visuZoom$xlim <- c(input$visuBrush$xmin, input$visuBrush$xmax)
    values$visuZoom$ylim <- c(input$visuBrush$ymin, input$visuBrush$ymax)
  })


  ##############################################################################
  ## Panel History
  ##############################################################################

  ## Create history link and retag events as they appear
  observeEvent(input$histDTable_rows_current, {
    histrows <- order(values$labelhist)
    histrows <- rev(histrows[!is.na(values$labelhist[histrows])])
    histrows <- histrows[!is.na(values$label[histrows])]
    histrows <- histrows[input$histDTable_rows_current]
    notthere <- histrows[! histrows %in% values$histQueries]

    if (!length(notthere)) return(NULL)
    lapply(notthere, function(daquery) {
      if (verbose) cat("DEBUG create history events", daquery, "\n")
      observeEvent(input[[paste0("hist", daquery)]], {
        if (verbose) cat("DEBUG history link", daquery, "\n")
        values$lastQuery <- daquery
        values$newQuery <- daquery
        updateTabsetPanel(session, "mainPanelset", selected = "Tagging")
        ready2tag(1)
      })

      observeEvent(input[[paste0("histTag", daquery)]], {
        if (verbose) cat("DEBUG history retag", daquery, "\n")
        values$histSaveStack <- rbind(
          values$histSaveStack,
          data.frame(id = ok.data()[daquery, values$conf$idcol],
                     tag = input[[paste0("histTag", daquery)]],
                     comment = values$comment[daquery])
        )
      })
    })
    values$histQueries <- c(values$histQueries, notthere)
  })

  ## History save action
  observeEvent(input$histSave, {
    req(is.data.frame(values$histSaveStack))
    values$tagTrigger <- as.list(values$histSaveStack)
    values$label[match(values$histSaveStack$id, ok.data()[[values$conf$idcol]])] <-
      values$histSaveStack$tag
    values$histSaveStack <- NULL
  })

  ## Update save button label with number of unsaved changes
  observeEvent(length(values$histSaveStack$id), {
    updateActionButton(
      session, "histSave",
      label = paste0("Save changes (", length(unique(values$histSaveStack$id)), ")"))
  })

  ## History table, with clickable links and retagging
  output$histDTable <- DT::renderDataTable({
    histrows <- order(values$labelhist)
    histrows <- rev(histrows[!is.na(values$labelhist[histrows])])
    histrows <- histrows[!is.na(values$label[histrows])]

    # histrows <- rev(values$queries)
    hist_df <- data.frame(
      # history = length(histrows):1,
      history = values$labelhist[histrows],
      id = sapply(histrows, function(irow) {
        as.character(actionLink(
          paste0("hist", irow),
          label = ok.data()[irow, values$conf$idcol],
          onclick = paste0('Shiny.onInputChange(\"hist', irow, '\", this.id)')))
      }),
      Tag = values$label[histrows],
      Retag = sapply(histrows, function(irow) {
        paste0(
          "<select id=\"histTag", irow, "\" onchange=\"Shiny.onInputChange('histTag", irow, "', this.value)\">",
          paste0("<option value=\"", values$uniqueLabels, "\" ",
                 ifelse(values$uniqueLabels == values$label[irow], "selected", ""),
                 ">", values$uniqueLabels, "</option>", collapse = " "),
          "</select>"
        )
      }),
      Comment = values$comment[histrows],
      Text = ok.nicetext()[histrows])
    if (!is.null(values$conf$contextcols))
      hist_df <- cbind(hist_df,
                       ok.data()[histrows, values$conf$contextcols])

    DT::datatable(hist_df, escape = FALSE, rownames = FALSE, filter = "top",
                  options = list(
                    columnDefs = list(list(
                      targets = 5,
                      render = DT::JS(
                        "function(data, type, row, meta) {",
                        "return type === 'display' && data.length > 200 ?",
                        "'<span title=\"' + data + '\">' + data.substr(0, 200) + '...</span>' : data;",
                        "}")))
                  ))
  })

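  ## Note on the table above: the id links and Retag selects are raw HTML
  ## whose onclick/onchange handlers call Shiny.onInputChange, so each row
  ## surfaces as a regular input (input$hist<row>, input$histTag<row>) that
  ## the per-row observers created above can react to. The columnDefs JS
  ## renderer only truncates the *display* of the Text column (hover shows
  ## the full string); filtering still operates on the complete text.
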
  ##############################################################################
  ## Panel "Stats"
  ##############################################################################

  output$statsTagTable <- renderTable(rownames = F, {
    table(tag = values$label)
  })


  ## Model diagnostics by CV
  output$statsCVoutput <- reactive({
    if (is.null(values$modelcv))
      return("Model diagnostics will appear here")
    values$modelcv
  })

  output$statsCVtable <- DT::renderDataTable({
    if (is.null(values$cvtable))
      return(NULL)
    DT::datatable(values$cvtable, list(pageLength = 25), rownames = F)
  })

  ## Button action: compute CV (async)
  cv_nclicks <- reactiveVal(0)
  observeEvent(input$statsCVgo, {
    if (is.null(ok.train())) {
      values$modelcv <- "Error in model training"
      return(NULL)
    }
    if (cv_nclicks() != 0) {
      values$modelcv <- "CV already running"
      return(NULL)
    }
    cv_nclicks(cv_nclicks() + 1)

    predmat <- ok.predictor()[!is.na(values$label), ]
    predlab <- values$label[!is.na(values$label)]
    cvsamp <- sample(10, nrow(predmat), replace = TRUE)
    cvres <- rep(NA, nrow(predmat))
    cvres_future <- reactiveVal()

    tmp_model <- input$predModel
    if (tmp_model == "random forest") {
      cv_args <- list(
        num.trees = input$rfNumTrees, mtry = if (input$rfMtry > 0) input$rfMtry,
        sample.fraction = input$rfSampleFrac, probability = FALSE)
    } else if (tmp_model == "linear") {
      cv_args <- list(type = 0, cost = input$liblinCost)
    } else if (tmp_model == "lasso") {
      cv_args <- input$glmLambda
    } else if (tmp_model == "naive bayes") {
      cv_args <- list(smooth = input$naiveSmooth, prior = input$naivePrior,
                      distribution = input$naiveDistri)
    } else if (tmp_model == "knn") {
      cv_args <- list(k = input$knnK)
    }
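    ## Each branch above only stores hyperparameters; the fold loop below
    ## assembles the actual fit with do.call. For "random forest" this
    ## amounts to, e.g. (illustrative values, not run):
    ##   cv_args <- list(num.trees = 500, mtry = NULL,
    ##                   sample.fraction = 1, probability = FALSE)
    ##   do.call(ranger::ranger,
    ##           c(cv_args, list(x = predmat, y = as.factor(predlab))))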

    cv_status_file <- paste0(session$token, "_cv")

    cv_async <- future(seed = TRUE, {
      for (icv in 1:10) {
        set_status(cv_status_file, paste0("Computing CV fold ", icv, " / 10"))
        if (tmp_model == "random forest") {
          cvmodel <- do.call(ranger::ranger, c(cv_args, list(
            x = predmat[cvsamp != icv, ],
            y = as.factor(predlab[cvsamp != icv]))))
          cvres[cvsamp == icv] <- as.character(
            predict(cvmodel, data = predmat[cvsamp == icv, ])$predictions)
        } else {
          tmpvalues <- predlab[cvsamp != icv]
          # for (ival in na.omit(unique(tmpvalues)))
          #   if (sum(tmpvalues == ival, na.rm = T) == 1)
          #     tmpvalues[which(tmpvalues == ival)] <- NA

          if (tmp_model == "lasso") {
            cvmodel <- suppressWarnings(
              glmnet::glmnet(x = predmat[cvsamp != icv, ][!is.na(tmpvalues), ],
                             y = as.factor(tmpvalues[!is.na(tmpvalues)]),
                             family = "multinomial"))
            cvres[cvsamp == icv] <- predict(
              cvmodel, newx = predmat[cvsamp == icv, ],
              type = "class", s = cv_args)
          } else if (tmp_model == "linear") {
            cvmodel <- do.call(LiblineaR::LiblineaR, c(
              list(data = predmat[cvsamp != icv, ][!is.na(tmpvalues), ],
                   target = as.factor(tmpvalues[!is.na(tmpvalues)])),
              cv_args))
            cvres[cvsamp == icv] <- as.character(
              predict(cvmodel, newx = predmat[cvsamp == icv, ])$predictions)
          } else if (tmp_model == "naive bayes") {
            cvmodel <- do.call(quanteda.textmodels::textmodel_nb, c(
              list(x = predmat[cvsamp != icv, ][!is.na(tmpvalues), ],
                   y = as.factor(tmpvalues[!is.na(tmpvalues)])),
              cv_args))
            cvres[cvsamp == icv] <- as.character(
              predict(
                cvmodel, newdata = predmat[cvsamp == icv, ], type = "class"))
          } else if (tmp_model == "knn") {  # isolate() dropped: tmp_model is a plain local
            cvres[cvsamp == icv] <-
              as.character(do.call(class::knn, c(cv_args, list(
                train = predmat[cvsamp != icv, ],
                test = predmat[cvsamp == icv, ],
                cl = predlab[cvsamp != icv]))))
          }
        }
      }
      cvres
    }) %...>% cvres_future()

    cv_async <- catch(cv_async, function(e) {
      cvres_future(NULL)
      values$modelcv <- e$message
    })

    cv_async <- finally(cv_async, function() {
      cv_nclicks(0)
      if (file.exists(cv_status_file)) unlink(cv_status_file)
      if (is.null(cvres_future())) return(NULL)
      cvres <- cvres_future()
      predlab <- values$label[!is.na(values$label)]
      dametrics <- metricsTable(predlab, cvres)
      values$cvtable <- dametrics
      res <- "10-fold CV evaluation (out-of-fold predictions):\n"
      # if (any(grepl("^bertpred", input$use_regressors)))
      if (any(grepl("^bertpred", values$new_use_regressors)))
        res <- paste0(
          res,
          "⚠️ Scores are biased upwards if BERT predictions were trained on current data ⚠️ \n")
      res <- paste0(
        res,
        "* Accuracy : ", round(100 * mean(cvres == predlab), 2), "%\n",
        "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n",
        "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1),
                                                        dametrics$N.cases)), "\n"
      )
      values$modelcv <- res
    })

    NULL
  })
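  ## Macro F1 above averages per-tag F1 scores with equal weight; weighted F1
  ## weights each tag by its case count. E.g. with per-tag F1 of c(0.9, 0.5)
  ## and counts c(90, 10):
  ##   mean(c(0.9, 0.5))                      # macro    : 0.70
  ##   weighted.mean(c(0.9, 0.5), c(90, 10))  # weighted : 0.86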

  ## Automatic message update during CV computation
  observe({
    if (cv_nclicks() == 0) return(NULL)
    invalidateLater(200)
    if (!file.exists(paste0(session$token, "_cv"))) return(NULL)
    values$modelcv <- get_status(paste0(session$token, "_cv"))
    values$cvtable <- NULL
  })


  ##############################################################################
  ## Panel BERT
  ##############################################################################

  ##########
  ## Bert UI left panel

  ## Update list of saved models and dlSelect on init and on saveTrigger

  observeEvent(list(ok.data(), values$bertSaveTrigger), {
    damodels <- dir(pattern = "^tiggerbert_saved_")
    if (!length(damodels)) return(NULL)
    damodels <- gsub("^tiggerbert_saved_", "", damodels)
    values$bertSaved <- damodels
    updateSelectInput(session, "dlBPSelect", choices = damodels)
  })

  ## Populate saved models list, with buttons

  output$bertSaved <- renderUI({
    if (verbose)
      cat("DEBUG bert update saved list\n")
    HTML(paste0(
      "<table><tbody>",
      paste0(sapply(values$bertSaved, function(dabert) {
        bertshort <- dabert
        if (nchar(bertshort) > 23)
          bertshort <- paste0(substr(bertshort, 1, 20), "...")
        paste0(
          "<tr><td>",
          actionLink(paste0("saved_", dabert), bertshort, title = dabert,
                     style = "margin-right: 5px;"), "</td><td>",
          actionButton(paste0("bert_copypars_", dabert), "📋",
                       title = "Copy parameters"), "</td><td>",
          actionButton(paste0("bert_infer_", dabert), "🤖",
                       title = "Infer predictions on current data"), "</td><td>",
          actionButton(paste0("bert_delete_", dabert), "🗑",
                       title = "Delete model"),
          "</td></tr>")
      }), collapse = "\n"),
      "</tbody></table>"
    ))
  })

  ##########
  ## Bert UI main panel

  ## UI outputs : message and stats table and plot
  output$bertMsg <- reactive(
    ifelse(is.null(values$bertMsg),
           "Train new BERT, or select saved model.",
           values$bertMsg))

  output$bertValstats <- DT::renderDataTable(
    DT::datatable(values$bertValstats, list(pageLength = 25)))

  output$bertValPlot <- renderPlot(values$bertValPlot)

  output$bertMsgHyperpar <- reactive(values$bertMsgHyperpar)

  ##################################
  ## Async handling of BERT training

  ## Status File
  bert_status_file <- paste0("tiggerbert_", session$token, "_trainstatus")

  fire_interrupt <- function(dafile) set_status(dafile, "interrupt")
  fire_ready <- function(dafile) set_status(dafile, "Ready")
  fire_running <- function(dafile, perc_complete = NULL) {
    if (is.null(perc_complete))
      msg <- "Training..."
    else
      msg <- paste0("Training... ", perc_complete, "% complete")
    set_status(dafile, msg)
  }
  interrupted <- function(dafile) get_status(dafile) == "interrupt"

  ## Create Status File
  fire_ready(bert_status_file)
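  ## set_status()/get_status() (defined earlier in this file) implement a
  ## small file-based protocol so the async worker and the polling observers
  ## can communicate across processes. Illustrative round-trip (not run):
  ##   fire_running(bert_status_file, 40)  # worker side
  ##   get_status(bert_status_file)        # UI side: "Training... 40% complete"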

  ## Button action preparation
  bert_nclicks <- reactiveVal(0)
  bert_result_val <- reactiveVal()
  bert_hyperpars <- reactiveVal()

  ## Button appearance: write "Stop" if training is ongoing
  observeEvent(bert_nclicks(), {
    updateActionButton(session, "bertTrain",
                       label = ifelse(bert_nclicks() > 0, "Stop", "Train BERT"))
  })

  ## Button action: Train new BERT
  observeEvent(input$bertTrain, {

    ## Don't do anything if analysis is already being run
    # if(bert_nclicks() != 0){
    #   values$bertMsg <- "BERT is already training"
    #   return(NULL)
    # }
    tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token)
    if (bert_nclicks() != 0) {
      write("stop", paste0(tmpsession, "_stop"))
      bert_nclicks(0)
      return(NULL)
    }
    if (file.exists(paste0(tmpsession, "_stop")))
      file.remove(paste0(tmpsession, "_stop"))

    ## Increment clicks and prevent concurrent analyses
    bert_nclicks(bert_nclicks() + 1)

    bert_result_val(data.frame(Status = "Running..."))
    fire_running(bert_status_file)

    ## Prepare async training
    values$bertLoadSaved <- NULL
    # tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token)
    tmpmodel <- input$bertModel
    system(paste0("rm ", tmpsession, "_done"))

    tmpdat <- data.frame(
      id = ok.data()[, values$conf$idcol],
      text = ok.nicetext(),
      label = values$label
    )
    tmpdat <- tmpdat[!is.na(tmpdat$label), ]
    tmpdat <- tmpdat[grepl("\\S", tmpdat$text), ]

    ## Filter data: min occurrences, balance classes, validation split
    tmplabtable <- table(tmpdat$label)
    if (is.numeric(input$bertMinOccur)) if (input$bertMinOccur > 1) {
      tmpdat <- tmpdat[tmpdat$label %in%
                         names(which(tmplabtable >= input$bertMinOccur)), ]
      tmplabtable <- table(tmpdat$label)
    }
    set.seed(input$bertValidSeed)
    if (input$bertBalance) {
      tmpminocc <- min(tmplabtable)
      tmptrain <- sample(do.call(c, lapply(unique(tmpdat$label), function(dalabel)
        sample(which(tmpdat$label == dalabel), round((1 - input$bertValidFrac) * tmpminocc)))))
      tmpvalid <- sample(do.call(c, lapply(unique(tmpdat$label), function(dalabel) {
        dasamp <- which(tmpdat$label == dalabel)
        dasamp <- dasamp[! dasamp %in% tmptrain]
        sample(dasamp, round(input$bertValidFrac * tmpminocc))
      })))
    } else {
      tmptrain <- sample(1:nrow(tmpdat), round((1 - input$bertValidFrac) * nrow(tmpdat)))
      tmpvalid <- (1:nrow(tmpdat))[-tmptrain]
    }
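    ## Toy illustration of the balanced split above: with labels
    ## c(rep("a", 30), rep("b", 10)) and bertValidFrac = 0.2, tmpminocc is 10,
    ## so each class contributes round(0.8 * 10) = 8 training texts and
    ## round(0.2 * 10) = 2 validation texts, drawn without overlap.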

    write.csv(tmpdat[tmptrain, ], paste0(tmpsession, "_dat_train.csv"), row.names = FALSE)
    write.csv(tmpdat[tmpvalid, ], paste0(tmpsession, "_dat_valid.csv"), row.names = FALSE)

    ## Wait (up to 5s) for the validation file to hit the disk
    iwait <- 0
    while (!file.exists(paste0(tmpsession, "_dat_valid.csv")) & iwait < 10) {
      Sys.sleep(0.5)
      iwait <- iwait + 1
    }

    tmp_cmd <- paste0(
      values$conf$python, " gobert.py -m ", tmpmodel,
      " -t ", tmpsession, "_dat_train.csv",
      " -v ", tmpsession, "_dat_valid.csv",
      " -b ", input$bertBatchsize,
      " -g ", input$bertGradacc,
      " -e ", input$bertEpochs,
      " -l ", input$bertLrate,
      " -w ", input$bertWdecay,
      " -B ", ifelse(input$bertBest, 1, 0),
      " -E ", input$bertNeval,
      " -s ", tmpsession,
      " -G ", as.numeric(values$conf$use_gpu),
      " -A ", as.numeric(input$bertAdapt),
      " 2>&1 ; touch ", tmpsession, "_done"
    )
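    ## The assembled command is a plain shell call to the training script,
    ## e.g. (hypothetical values):
    ##   python3 gobert.py -m camembert-base -t <session>_dat_train.csv \
    ##     -v <session>_dat_valid.csv -b 16 -g 1 -e 3 -l 5e-05 -w 0.01 \
    ##     -B 1 -E 10 -s <session> -G 1 -A 1 2>&1 ; touch <session>_done
    ## The trailing "touch <session>_done" creates the sentinel file that the
    ## status observer below uses to detect completion.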

    ## Save hyperpars
    bert_hyperpars(list(bertModel = input$bertModel,
                        bertEpochs = input$bertEpochs,
                        bertLrate = input$bertLrate,
                        bertWdecay = input$bertWdecay,
                        bertBatchsize = input$bertBatchsize,
                        bertGradacc = input$bertGradacc,
                        bertNeval = input$bertNeval,
                        bertBest = input$bertBest,
                        bertValidFrac = input$bertValidFrac,
                        bertValidSeed = input$bertValidSeed,
                        bertNtrain = length(tmptrain),
                        bertTrainTable = as.list(table(tmpdat$label[tmptrain])),
                        bertScheme = ok.labelcol()))

    ## Launch async training
    bert_async <- future({
      if (file.exists(tmpsession))
        system(paste0("rm -R ", tmpsession, "*"))

      # system(tmp_cmd, wait = FALSE, intern = FALSE)
      tmp_exec <- system(tmp_cmd, intern = TRUE)  # intern = TRUE implies waiting
      writeLines(tmp_exec, paste0(tmpsession, "_log"))

      ## After training, get trainer state from last checkpoint for diagnostics
      chkpt <- dir(paste0(tmpsession, "_train"), pattern = "^checkpoint")
      chkpt <-
        chkpt[order(as.numeric(gsub("\\D", "", chkpt)), decreasing = T)[1]]
      system(paste0(
        "cp ", tmpsession, "_train/", chkpt, "/trainer_state.json ",
        tmpsession, "/"))
    }) %...>% bert_result_val()

    ## Catch interrupt (or any other error) and notify user
    bert_async <- catch(bert_async, function(e) {
      bert_result_val(NULL)
      showNotification(e$message)
      values$bertMsg <- e$message
    })

    ## After the promise has been evaluated set nclicks to 0 to allow another run
    bert_async <- finally(bert_async, function() {
      system(paste0("rm -R ", tmpsession, "_train"))
      system(paste0("rm ", tmpsession, "_done"))

      fire_ready(bert_status_file)
      bert_nclicks(0)

      if (is.null(bert_result_val())) return(NULL)

      bertpreds <- try(read.csv(paste0(tmpsession, "_predval.csv")))
      if (inherits(bertpreds, "try-error")) {
        # values$bertMsg <- "BERT error - try again"
        values$bertMsg <- try(paste(readLines(paste0(tmpsession, "_log")), collapse = "\n"))
        values$bertValstats <- NULL
        values$bertValPlot <- NULL
        values$bertLastUnsaved <- NULL
        values$bertMsgHyperpar <- NULL
        values$bertLastHyperpar <- NULL
        return(NULL)
      }

      datruth <- tmpdat[tmpvalid, "label"]
      dapred <- bertpreds$bertpred
      dametrics <- metricsTable(datruth, dapred)
      damsg <- paste0(
        "Unsaved model (", tmpmodel, ")\n\n",
        "BERT validation stats:\n",
        "* Accuracy : ", round(100 * mean(datruth == dapred), 2), "%\n",
        "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n",
        "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1),
                                                        dametrics$N.cases)), "\n")
      hypermsg <- bertHyperToMsg(bert_hyperpars())
      writeLines(RJSONIO::toJSON(bert_hyperpars()),
                 paste0(tmpsession, "/tiggertrain.json"))

      values$bertValstats <- dametrics
      values$bertMsg <- damsg
      values$bertValPlot <- bertPlotFromPath(tmpsession)
      values$bertLastUnsaved <- list(
        metrics = dametrics, msg = damsg, session = tmpsession)
      values$bertMsgHyperpar <- hypermsg
      values$bertLastHyperpar <- bert_hyperpars()
    })

    values$bertMsg <- "BERT is training"
    values$bertValstats <- NULL
    values$bertValPlot <- NULL
    values$bertLastUnsaved <- NULL
    values$bertMsgHyperpar <- NULL
    values$bertLastHyperpar <- NULL

    ## Return something other than the promise so shiny remains responsive
    NULL
  })

  ## Automatic check of BERT training status
  observe({
    if (bert_nclicks() == 0) return(NULL)
    isolate({
      tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token)
      ## Check for checkpoint dirs
      checkdirs <-
        length(dir(paste0(tmpsession, "_train"), pattern = "^checkpoint"))
      # fire_running(bert_status_file,
      #              min(100, round(100 * checkdirs / input$bertNeval, 1)))
      # dastate <- try(RJSONIO::fromJSON(paste0(tmpsession, "_train/trainer_state.json")))
      # if (!inherits(dastate, "try-error"))
      #   fire_running(bert_status_file,
      #                round(100 * as.numeric(dastate$epoch) / input$bertEpochs, 1))
      try({
        dapoints <- dir(paste0(tmpsession, "_train"),
                        pattern = "trainer_state[.]json$", recursive = T)
        dapoints_num <- as.numeric(gsub("^checkpoint-(\\d+)/.*$", "\\1", dapoints))
        statefile <- paste0(
          tmpsession, "_train/", dapoints[which.max(dapoints_num)])
        if (file.exists(statefile)) {
          state <- RJSONIO::fromJSON(statefile)
          fire_running(bert_status_file,
                       round(100 * as.numeric(state$epoch) / input$bertEpochs, 1))
        }
      })
    })
    if (checkdirs < isolate(input$bertNeval) & file.exists(paste0(tmpsession, "_done"))) {
      values$bertMsg <- "BERT Error - try again"
      # values$bertMsg <- paste(readLines(paste0(tmpsession, "_log")), collapse = "\n")
    } else {
      invalidateLater(1e3)
      loss_progress <- ""
      if (checkdirs > 0) {
        ## Get training diagnostics
        try({
          dapoints <- dir(paste0(tmpsession, "_train"),
                          pattern = "trainer_state[.]json$", recursive = T)
          dapoints_num <- as.numeric(gsub("^checkpoint-(\\d+)/.*$", "\\1", dapoints))
          statefile <- paste0(
            tmpsession, "_train/", dapoints[which.max(dapoints_num)])
          if (file.exists(statefile)) {
            state <- RJSONIO::fromJSON(statefile)
            logs <- data.table::rbindlist(lapply(state$log_history, as.list),
                                          fill = TRUE)
            logs <- logs[, .(loss = max(loss, na.rm = T),
                             eval_loss = max(eval_loss, na.rm = T)), by = epoch]
            loss_progress <- paste0(
              "\n* Epoch : ", sprintf("%.02f", logs$epoch),
              "\ttrain loss : ", sprintf("%.05f", logs$loss),
              "\teval loss : ", sprintf("%.05f", logs$eval_loss), collapse = "")
          }
        })
      }
      values$bertMsg <-
        ifelse(checkdirs == 0, "Pre-processing...",
               paste0(get_status(bert_status_file), loss_progress))
    }
    values$bertValstats <- NULL
    values$bertValPlot <- NULL
    values$bertLoadSaved <- NULL
    values$bertMsgHyperpar <- NULL
  })
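  ## Progress above is read from the most recent checkpoint's
  ## trainer_state.json: epoch / bertEpochs gives the completion percentage
  ## written to the status file, and the per-epoch train/eval losses are
  ## appended to the status message shown in the UI.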

  ## Action: show last BERT model
  observeEvent(input$bertLast, {
    values$bertMsg <- values$bertLastUnsaved$msg
    values$bertValstats <- values$bertLastUnsaved$metrics
    values$bertValPlot <- bertPlotFromPath(values$bertLastUnsaved$session)
    values$bertMsgHyperpar <- bertHyperToMsg(values$bertLastHyperpar)
    values$bertLoadSaved <- NULL
  })

  ## Action: save BERT model
  observeEvent(input$bertSave, {
    tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token)
    if (!file.exists(tmpsession))
      return(NULL)

    if (!grepl("\\S", input$bertSaveName)) {
      tmpname <- paste0(
        ok.labelcol(), "_", gsub("\\D+$", "", gsub("\\D", "-", Sys.time())))
    } else
      tmpname <- cleanFileName(input$bertSaveName)

    savedir <- paste0("tiggerbert_saved_", tmpname)
    system(paste("cp -R", tmpsession, savedir))
    system(paste0("cp ", tmpsession, "_dat_valid.csv ",
                  savedir, "/tiggerbert_dat_valid.csv"))
    system(paste0("cp ", tmpsession, "_predval.csv ",
                  savedir, "/tiggerbert_pred_valid.csv"))

    system(paste0("rm -R ", tmpsession, "*"))
    values$bertMsg <- paste0("BERT model successfully saved: ", tmpname)
    values$bertValstats <- NULL
    values$bertValPlot <- NULL
    values$bertMsgHyperpar <- NULL

    if (is.null(values$bertSaveTrigger)) {
      values$bertSaveTrigger <- 0
    } else
      values$bertSaveTrigger <- values$bertSaveTrigger + 1
    values$bertLastUnsaved <- NULL
    updateTextInput(session, "bertSaveName", value = "")
  })

  ## Actions: load saved model (list updated only with new models)
  observeEvent(values$bertSaved, {
    if (!length(values$bertSaved)) return(NULL)
    tmpsaved <- values$bertSaved[! values$bertSaved %in% values$bertSavedAll]

    lapply(tmpsaved, function(dabert) {
      ############
      ## Actions to load saved model
      observeEvent(input[[paste0("saved_", dabert)]], {
        bertconf <- RJSONIO::fromJSON(paste0(
          "tiggerbert_saved_", dabert, "/config.json"))
        truth <- read.csv(paste0(
          "tiggerbert_saved_", dabert, "/tiggerbert_dat_valid.csv"))$label
        pred <- read.csv(paste0(
          "tiggerbert_saved_", dabert, "/tiggerbert_pred_valid.csv"))$bertpred

        dametrics <- metricsTable(truth, pred)
        values$bertValstats <- dametrics
        values$bertMsg <- paste0(
          "Saved model ", dabert, " (", bertconf[["_name_or_path"]], ")\n\n",
          "BERT validation stats:\n",
          "* Accuracy : ", round(100 * mean(truth == pred), 2), "%\n",
          "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n",
          "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1),
                                                          dametrics$N.cases)), "\n")

        ## Hyperparameters
        tiggertrain_file <- paste0("tiggerbert_saved_", dabert, "/tiggertrain.json")
        if (file.exists(tiggertrain_file)) {
          try({
            values$bertMsgHyperpar <- bertHyperToMsg(RJSONIO::fromJSON(tiggertrain_file))
          })
        } else
          values$bertMsgHyperpar <- "Training parameters not saved."

        values$bertValPlot <-
          bertPlotFromPath(paste0("tiggerbert_saved_", dabert))
        values$bertLoadSaved <- dabert
      })

      ############
      ## Actions to copy hyperparameters
      observeEvent(input[[paste0("bert_copypars_", dabert)]], {
        tiggertrain_file <- paste0("tiggerbert_saved_", dabert, "/tiggertrain.json")
        if (!file.exists(tiggertrain_file)) return(NULL)
        try({
          tmppars <- RJSONIO::fromJSON(tiggertrain_file)
          updateSelectInput(session, "bertModel", selected = tmppars$bertModel)
          updateCheckboxInput(session, "bertBest", value = tmppars$bertBest)
          numinputs <- c("bertEpochs", "bertLrate", "bertWdecay",
                         "bertBatchsize", "bertGradacc")
          for (iinput in numinputs)
            updateNumericInput(session, iinput, value = tmppars[[iinput]])
        })
      })

      ############
      ## Actions to delete saved model (with confirm)
      observeEvent(input[[paste0("bert_delete_", dabert)]], {
        showModal(modalDialog(
          title = "Delete BERT model",
          paste0("Confirm delete: ", dabert,
                 " (model and predictions will be deleted)."),
          footer = tagList(actionButton(paste0("bert_delete_confirm_", dabert), "Delete"),
                           modalButton("Cancel"))))
      })

      observeEvent(input[[paste0("bert_delete_confirm_", dabert)]], {
        system(paste0("rm -R tiggerbert_saved_", dabert))
        system(paste0(
          "rm ", values$conf$datadir, values$conf$dataname, "_bertpred_",
          dabert, ".feather"))
        # values$bertLoadSaved <- NULL
        if (is.null(values$bertSaveTrigger)) {
          values$bertSaveTrigger <- 0
        } else
          values$bertSaveTrigger <- values$bertSaveTrigger + 1
        removeModal()
        values$bertMsg <- NULL
        values$bertValstats <- NULL
        values$bertMsgHyperpar <- NULL
      })

      ############
      ## Actions to infer on current data from saved BERT (async)
      observeEvent(input[[paste0("bert_infer_", dabert)]], {
        if (bertpred_nclicks() > 0 | bpall_nclicks() > 0) {
          values$bertMsg <- "Inference already running"
          values$bertMsgHyperpar <- NULL
          values$bertValPlot <- NULL
          values$bertValstats <- NULL
          return(NULL)
        }
        bertpred_nclicks(1)

        saved_model <- paste0("tiggerbert_saved_", dabert)
        out_path <- paste0(
          values$conf$datadir, values$conf$dataname, "_bertpred_",
          dabert, ".feather")

        if (file.exists(out_path)) {
          old_infer <- arrow::read_feather(out_path)
        } else
          old_infer <- NULL

        to_infer <- ok.data()[[values$conf$idcol]]
        to_infer <- to_infer[! to_infer %in% old_infer[[values$conf$idcol]]]
        to_infer_row <- match(to_infer, ok.data()[[values$conf$idcol]])

        if (!length(to_infer)) {
          values$bertMsg <- paste0(
            "All current ", nrow(ok.data()),
            " texts already inferred by ", dabert)
          values$bertMsgHyperpar <- NULL
          values$bertValPlot <- NULL
          values$bertValstats <- NULL
          bertpred_nclicks(0)
          return(NULL)
        }

        values$bertValPlot <- NULL
        values$bertValstats <- NULL
        values$bertMsgHyperpar <- NULL
        values$bertMsg <- paste0(
          "Preparing inference on ", length(to_infer), " new texts")
        tmp_dat <- paste0("tiggerbert_infer_", session$token, ".feather")
        arrow::write_feather(
          data.frame(
            ok.data()[to_infer_row, values$conf$idcol, drop = FALSE],
            text = ok.nicetext()[to_infer_row]),
          tmp_dat)

        values$bertMsg <- paste0(
          "Inferring ", dabert, " predictions on ",
          length(to_infer), " new texts")
        infer_cmd <- paste0(
          values$conf$python, " gobert_infer.py -m ", saved_model,
          " -o ", out_path,
          " -d ", tmp_dat,
          " -i ", values$conf$idcol,
          " -l ", paste0("tiggerbert_infer_", session$token, ".log"),
          " -G ", as.numeric(values$conf$use_gpu),
          " -b ", 128)
        bertpred_res <- reactiveVal()
        bertpred_async <- future({
          system(infer_cmd)
          system(paste0("rm ", tmp_dat))

          if (!is.null(old_infer)) {
            new_infer <- arrow::read_feather(out_path)
            arrow::write_feather(rbind(old_infer, new_infer), out_path)
          }
        }) %...>% bertpred_res
        bertpred_async <- catch(bertpred_async, function(error) {
          bertpred_nclicks(0)
          values$bertMsg <- paste0("BERT inference error: ", error)
          system(paste0("rm ", tmp_dat))
        })
        bertpred_async <- finally(bertpred_async, function() {
          bertpred_nclicks(0)
          values$bertMsg <-
            "Inference complete!\nRe-import data in Project/Sample panel to use the predictions for tagging."
          values$bertValPlot <- NULL
          values$bertValstats <- NULL
          values$bertMsgHyperpar <- NULL
        })
        NULL
      })
    })
    values$bertSavedAll <- c(values$bertSavedAll, tmpsaved)
  })

  ## Automatic check of BERT inference progress
  observe({
    if (bertpred_nclicks() == 0) return(NULL)
    invalidateLater(1e3)
    logfile <- paste0("tiggerbert_infer_", isolate(session$token), ".log")
    if (file.exists(logfile)) {
      values$bertMsg <- readLines(logfile)[1]
      values$bertValPlot <- NULL
      values$bertValstats <- NULL
      values$bertMsgHyperpar <- NULL
    }
  })

  ##############################################################################
  ## Export panel
  ##############################################################################

  ## Export tags and full model predictions
  output$dlTagSave <-
    downloadHandler(
      function()
        paste0(Sys.Date(), "-activeTigger-", ok.labelcol(), ".", input$dlTagFormat),
      function(con) {
        if (is.null(ok.data())) return(NULL)
        dadat <- ok.data()[, values$conf$idcol, drop = FALSE]
        if ("tags" %in% input$dlTagSelect)
          dadat[[ok.labelcol()]] <- values$label
        if ("comments" %in% input$dlTagSelect) if (ok.commentCol() != "(None)")
          dadat[, ok.commentCol()] <- values$comment
        if ("predictions" %in% input$dlTagSelect) if (ok.train.possible()) {
          dadat$tigger.predict <- ok.predlab1()
          dadat$tigger.predprob <- apply(ok.pred(), 1, max)
          tmp_pred <- ok.pred()
          colnames(tmp_pred) <- paste0("tiggerpred_", colnames(tmp_pred))
          dadat <- cbind(dadat, tmp_pred)
        }
        othercols <- input$dlTagSelect
        othercols <- othercols[! othercols %in% c("tags", "comments", "predictions")]
        if (length(othercols))
          dadat <- cbind(dadat, ok.data()[, othercols, drop = FALSE])

        if (input$dlTagFormat == "csv") {
          write.csv(dadat, row.names = F, con)
        } else
          arrow::write_feather(dadat, con)
      })

  ## Export embeddings (ft, sb, umap, t-sne)
  output$dlEmbedSave <- downloadHandler(
    function()
      paste0(Sys.Date(), "-activeTigger-emb-",
             paste(input$dlEmbedSelect, collapse = "-"),
             ".", input$dlEmbedFormat),
    function(file) {
      if (is.null(input$dlEmbedSelect)) return(NULL)
      export <- ok.data()[, values$conf$idcol, drop = FALSE]
      if ("ft" %in% input$dlEmbedSelect)
        export <- data.frame(export, ok.data()[, values$cols_ft])
      if ("sb" %in% input$dlEmbedSelect)
        export <- data.frame(export, ok.data()[, values$cols_sb])
      if (any(c("t-sne", "umap") %in% input$dlEmbedSelect)) {
        tmpvisu <- ok.visu()
        ## Only one projection is stored at a time: use the first matching
        ## algo name to label its two columns
        tmpalgo <- input$dlEmbedSelect[input$dlEmbedSelect %in% c("t-sne", "umap")][1]
        colnames(tmpvisu) <- paste0(gsub("-", "", tmpalgo), 1:2)
        export <- data.frame(export, tmpvisu)
      }
      if (input$dlEmbedFormat == "csv") {
        write.csv(export, file, row.names = FALSE)
      } else if (input$dlEmbedFormat == "feather") {
        arrow::write_feather(export, file)
      }
    })

  ## Export BERT predictions

  output$dlBPMsg <- reactive(values$dlBPMsg)


  # output$dlBertPredCsv <-
  #   downloadHandler(paste0(Sys.Date(), "-bertpred-", ok.labelcol(),".csv"),
  #                   content= function(con) {
  #                     if (is.null(ok.data())) return(NULL)
  #                     dadat <- ok.data()[, c(values$conf$idcol, ok.labelcol())]
  #                     dadat[[ok.labelcol()]] <- values$label
  #                     if (ok.commentCol() != "(None)") {
  #                       dadat[, ok.commentCol()] <- values$comment
  #                     }
  #                     if (ok.train.possible()) {
  #                       dadat$tigger.predict <- ok.predlab1()
  #                       dadat$tigger.predprob <- apply(ok.pred(), 1, max)
  #                     }
  #                     dadat <- cbind(dadat, ok.pred())
  #                     write.csv(dadat, row.names = F, con)
  #                   })

  ############
  ## Action to infer on ALL data from saved BERT (async)
  bpall_res <- reactiveVal()
  observeEvent(input$dlBPInfer, {
    if (bpall_nclicks() > 0 | bertpred_nclicks() > 0) {
      values$dlBPMsg <- "Inference already running"
      bpall_res(NULL)
      return(NULL)
    }
    bpall_nclicks(1)
    bpall_res_async <- reactiveVal()
    dabert <- input$dlBPSelect

    saved_model <- paste0("tiggerbert_saved_", dabert)
    out_path <- paste0(
      values$conf$datadir, values$conf$dataname, "_bertpredall_",
      dabert, ".feather")
    tmp_dat <- paste0("tiggerbert_inferall_", session$token, ".feather")
    tmp_log <- paste0("tiggerbert_inferall_", session$token, ".log")
    tmp_done <- paste0("tiggerbert_inferall_", session$token, "_done")
    if (file.exists(tmp_done))
      system(paste0("rm ", tmp_done))

    if (file.exists(out_path)) {
      values$dlBPMsg <- "Inference already complete, ready for download"
      bpall_nclicks(0)
      bpall_res(out_path)
      return(NULL)
      # old_infer <- arrow::read_feather(out_path)
    }

    fulldat <- arrow::read_feather(paste0(
      values$conf$datadir, values$conf$dataname, ".feather"))
    values$dlBPMsg <- paste0(
      "Preparing inference on ", nrow(fulldat), " texts")
    if (length(values$conf$textcols) > 1) {
      fulldat$text <- do.call(paste, c(
        fulldat[, values$conf$textcols, drop = FALSE], sep = "\n\n"))
    } else {
      fulldat$text <- fulldat[, values$conf$textcols]
    }
    arrow::write_feather(fulldat[, c(values$conf$idcol, "text")], tmp_dat)

    infer_cmd <- paste0(
      values$conf$python, " gobert_infer.py -m ", saved_model,
      " -o ", out_path,
      " -d ", tmp_dat,
      " -i ", values$conf$idcol,
      " -l ", tmp_log,
      " -G ", as.numeric(values$conf$use_gpu),
      " -b ", 128,
      "; touch ", tmp_done)

    bpall_async <- future({
      system(infer_cmd)
      system(paste0("rm ", tmp_dat))
    }) %...>% bpall_res_async

    bpall_async <- catch(bpall_async, function(error) {
      values$dlBPMsg <- paste0("BERT inference error: ", error)
      system(paste0("rm ", tmp_dat))
      system(paste0("rm ", tmp_done))
    })

    bpall_async <- finally(bpall_async, function() {
      bpall_nclicks(0)
      values$dlBPMsg <-
        "Inference complete, ready for download"
      bpall_res(out_path)
    })
    NULL
  })
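  ## Per-model prediction caches: predictions on the current sample are
  ## stored as <dataname>_bertpred_<model>.feather, and full-dataset
  ## predictions as <dataname>_bertpredall_<model>.feather. The download
  ## handler below recovers the model name from that path to check it still
  ## matches the selected model.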

  ## Automatic check of BERT all-data prediction progress
  observe({
    if (bpall_nclicks() == 0) return(NULL)
    invalidateLater(1e3)
    logfile <- paste0("tiggerbert_inferall_", isolate(session$token), ".log")
    if (file.exists(logfile))
      values$dlBPMsg <- readLines(logfile)[1]
  })

  ## Show BERTpred download button when data is ready
  output$dlBPDlButton <- renderUI({
    if (is.null(bpall_res())) return(NULL)
    downloadButton("dlBPDl", NULL)
  })

  output$dlBPDl <- downloadHandler(
    function()
      paste0(Sys.Date(), "-tiggerbert-", values$conf$dataname,
             input$dlBPSelect, ".", input$dlBPFormat),
    function(file) {
      if (is.null(bpall_res())) return(NULL)
      tmp_bert <- gsub("^.*_bertpredall_(.*)[.]feather$", "\\1", bpall_res())
      if (tmp_bert != input$dlBPSelect) {
        values$dlBPMsg <- "BERT model changed - predict again or select current model"
        return(NULL)
      }
      tmp <- arrow::read_feather(bpall_res())
      if (input$dlBPFormat == "csv") {
        write.csv(tmp, file, row.names = FALSE)
      } else if (input$dlBPFormat == "feather") {
        arrow::write_feather(tmp, file)
      }
    }
  )

  ##############################################################################
  ## Delete files at end of session
  onStop(function() {
    if (file.exists(bert_status_file)) unlink(bert_status_file)
    if (length(dir(pattern = session$token)))
      system(paste0("rm -R *", session$token, "*"))
  })

})