pkgs <- c("arrow", "class", "DT", "future", "ggplot2", "glmnet", "htmlTable", "LiblineaR", "Matrix","Metrics", "promises", "quanteda", "quanteda.textmodels", "ranger", "rlang", "Rtsne", "RJSONIO", "shiny", "SparseM", "stringi", "uwot") pkgs <- pkgs[! pkgs %in% rownames(installed.packages())] if (length(pkgs)) { warning(paste0("Packages ", paste0(pkgs, collapse = " , "), " not available. ", "App might crash.")) } # for (i in seq_along(pkgs)) install.packages(pkgs[i]) verbose <- TRUE # sink("ActiveTigger.log") options(shiny.maxRequestSize = 30*1024^3) # File upload size modelnames <- read.csv("modelnames.csv") modelnames_labels <- modelnames$short names(modelnames_labels) <- modelnames$short_lang library(quanteda) library(ggplot2) library(promises) library(future) plan(multisession) ################################################################################ ## Global functions ################################################################################ label2hash <- function(label) rlang::hash(label) cleanFileName <- function(name) gsub('\\s|[/]|[\\]|[:]|[*]|[?]|["]|[<]|[>]|[+]|[$]|[=]', "_", gsub("^\\s+|\\s+$", "", name)) wtdF1 <- function(truth, pred) { pred <- pred[!is.na(truth)] truth <- truth[!is.na(truth)] sum(sapply(unique(truth), function(ilab) Metrics::fbeta_score(truth == ilab, pred == ilab) * sum(truth == ilab) / length(truth)), na.rm = TRUE) } metricsTable <- function(truth, pred) { do.call(rbind, lapply(sort(unique(truth)), function(ilab) { tmptrue <- truth == ilab tmppred <- pred == ilab tmp <- data.frame( "Tag" = ilab, "N.cases" = sum(tmptrue), "N.predict" = sum(tmppred), "Accuracy" = sprintf("%f", round(Metrics::accuracy(tmptrue, tmppred), 6)), "Precision" = sprintf("%f", round(Metrics::precision(tmptrue, tmppred), 6)), "Recall" = sprintf("%f", round(Metrics::recall(tmptrue, tmppred), 6)), "F1" = sprintf("%f", round(Metrics::fbeta_score(tmptrue, tmppred), 6))) tmp$F1[tmp$F1 == "NaN"] <- sprintf("%f", 0) tmp })) } bertPlotFromPath <- function(path) { if (!file.exists(paste0(path, "/trainer_state.json"))) return(NULL) state <- RJSONIO::fromJSON(paste0(path, "/trainer_state.json")) logs <- data.table::rbindlist(lapply(state$log_history, as.list), fill = TRUE) epochmin <- logs$epoch[which.min(logs$eval_loss)] gdat <- data.table::rbindlist( use.names = FALSE, list( cbind("train", logs[!is.na(logs$loss), c("epoch", "loss")]), cbind("valid", logs[!is.na(logs$eval_loss), c("epoch", "eval_loss")]) )) colnames(gdat) <- c("data", "epoch", "loss") ggplot(gdat, aes_string("epoch", "loss", color = "data", shape = "data")) + geom_point(size = 2) + geom_line() + geom_vline(xintercept = epochmin, linetype = 2) + theme_bw() + expand_limits(y=0) } bertHyperToMsg <- function(pars) { if (is.null(pars)) return(NULL) paste0("Scheme : ", pars$bertScheme, "; ", "Model : ", pars$bertModel, "\n* ", "Epochs : ", pars$bertEpochs, " ; ", "Lrate : ", pars$bertLrate, " ; ", "Wdecay : ", pars$bertWdecay, " ; ", "Batch size : ", pars$bertBatchsize, " ; ", "Grad. accum. : ", pars$bertGradacc, "\n* ", "Keep best : ", pars$bertBest, " ; ", "ValidFrac : ", pars$bertValidFrac, " ; ", "ValidSeed : ", pars$bertValidSeed, " \n* ", pars$bertNtrain, " training samples : ", paste0(names(pars$bertTrainTable), " (", as.numeric(pars$bertTrainTable), ")", collapse = " ; ")) } get_status <- function(dafile) scan(dafile, what = "character", sep="\n", quiet = TRUE) set_status <- function(dafile, msg) write(msg, dafile) ################################################################################ ## Main server function ################################################################################ # options(shiny.maxRequestSize=2^34) # Max filesize shinyServer(function(input, output, session) { values <- reactiveValues() values$visuGo <- FALSE values$dfmGo <- FALSE # queryTrigger <- reactiveVal(0) # predlabTrigger <- reactiveVal(0) initDataTrigger <- reactiveVal(0) queryNext <- reactiveVal(0) trainTrigger <- reactiveVal(0) predTrigger <- reactiveVal(0) diagTrigger <- reactiveVal(0) ready2tag <- reactiveVal(0) trainCountdown <- reactiveVal(0) ok.data <- reactiveVal() ok.data.async <- reactiveVal() ok.data.running <- reactiveVal(FALSE) bpall_nclicks <- reactiveVal(0) bertpred_nclicks <- reactiveVal(0) ## NEW use_regressors list as values$new_use_regressors observeEvent(input$use_regressors, { values$new_use_regressors <- input$use_regressors cat("DEBUG new_use_regressors: ", paste(input$use_regressors, collapse = " "), "\n") }) ## NEW trainCountdown with input$trainCountdown ## Policy: 0, don't train ; 1, trigger train on tag ; >1, decrement on tag observeEvent(input$trainCountdown, trainCountdown(input$trainCountdown)) #################### ## Projects and data ## Read config files, update UI forms accordingly ## This should happen only once observe({ if (verbose) cat("DEBUG read project configs\n") # values$conf <- RJSONIO::fromJSON("conf.json") projfiles <- dir(pattern = "^tigger_.*[.]json$") isolate({ if (!length(projfiles)) { values$conf <- NULL return(NULL) } values$confnames <- gsub("^tigger_(.*)[.]json$", "\\1", projfiles) values$conf <- RJSONIO::fromJSON(projfiles[1]) }) }) ## When projects added, update selector observeEvent(values$confnames, { if (verbose) cat("DEBUG update selectProject from confnames\n") updateSelectInput(session, "selectProject", choices = values$confnames) }) ## Event from project change observeEvent(input$selectProject, { if (is.null(input$selectProject)) return(NULL) if (!nchar(input$selectProject)) return(NULL) if (verbose) cat("DEBUG project change step 1\n") # values$project <- input$selectProject # values$conf <- values$confs[[values$project]] values$conf <- RJSONIO::fromJSON( paste0("tigger_", input$selectProject, ".json")) ok.data(NULL) ok.visu(NULL) initDataTrigger(initDataTrigger() + 1) if (verbose) cat("DEBUG project change step 2\n") if (!is.null(values$conf$dataNrows)) updateNumericInput(session, "dataNrows", value = as.numeric(values$conf$dataNrows)) if (!is.null(values$conf$dataSkipRows)) updateNumericInput(session, "dataSkipRows", value = as.numeric(values$conf$dataSkipRows)) #### Python #### ## Default use_python if not found in conf file: FALSE if(is.null(values$conf$use_python)) values$conf$use_python <- FALSE updateCheckboxInput(session, "sys_use_python", value = values$conf$use_python) ## Set default python if not found in conf file if(is.null(values$conf$python)) values$conf$python <- "python3" updateTextInput(session, "sys_which_python", value = values$conf$python) values$python_ok <- FALSE if (values$conf$use_python) { ## Check whether python is working pytest <- try(system(paste(values$conf$python, "--version"), intern = TRUE)) if (inherits(pytest, "try-error")) { showNotification(paste( "Python path `", values$conf$python, "` not valid, try changing it in Project/System tab"), duration = 10, type = "error") } else values$python_ok <- TRUE } if (verbose) cat("Python ok:", values$python_ok, "\n") #### GPU #### if (!is.null(values$conf$use_gpu)) updateCheckboxInput(session, "sys_use_gpu", value = values$conf$use_gpu) #### Spacy #### if (!is.null(values$conf$use_spacy)) updateCheckboxInput(session, "sys_use_spacy", value = values$conf$use_spacy) updateTextInput(session, "sys_use_spacy_model", value = values$conf$use_spacy_model) #### fastText #### if (!is.null(values$conf$use_ft)) updateCheckboxInput(session, "sys_use_ft", value = values$conf$use_ft) updateTextInput(session, "sys_use_ft_model", value = values$conf$use_ft_model) #### SBERT #### if (!is.null(values$conf$use_sb)) updateCheckboxInput(session, "sys_use_sb", value = values$conf$use_sb) updateTextInput(session, "sys_use_sb_model", value = values$conf$use_sb_model) }) ############################################################################## ## Reactives : project settings UI ############################################################################## output$sys_datadir <- renderText(values$conf$datadir) output$sys_datafile <- renderText(paste0(values$conf$dataname, ".feather")) output$sys_var_id <- renderUI(renderText(values$conf$idcol)) output$sys_var_text <- renderUI(renderText(values$conf$textcols)) output$sys_var_tag <- renderUI(renderText(values$conf$tagcols)) output$sys_var_context_ui <- renderUI(selectInput( "sys_var_context", NULL, values$data_orig_colnames, values$conf$contextcols, multiple = TRUE)) output$sys_var_comm_ui <- renderUI(selectInput( "sys_var_comm", NULL, c("(none)", values$data_orig_colnames), values$conf$commcol)) output$sys_ex_lang_ui <- renderUI(selectInput( "sys_ex_lang", NULL, modelnames_labels, "en" )) output$sys_ex_spacy <- renderText( modelnames$spacy_name[modelnames$short == input$sys_ex_lang]) output$sys_ex_spacy_dl <- renderText(paste0( "Download with python module: ", values$conf$python, " -m spacy download ", modelnames$spacy_name[modelnames$short == input$sys_ex_lang])) output$sys_ex_ft <- renderText( modelnames$fasttext_name[modelnames$short == input$sys_ex_lang]) output$sys_ex_ft_dl <- renderText(paste0( "Manual download link : ", modelnames$fasttext_url[modelnames$short == input$sys_ex_lang])) output$sys_ex_sb <- renderText( ifelse(input$sys_ex_lang %in% c("ar", "zh", "nl", "en", "fr", "de", "it", "ko", "pl", "pt", "ru", "es", "tr"), "distiluse-base-multilingual-cased-v1", "distiluse-base-multilingual-cased-v2")) ## Save system changes on button click, with modal output$save_config_msg <- renderUI({ req(values$save_config_msg) p(strong(values$save_config_msg)) }) output$save_embed_msg <- renderUI({ tmp <- "" if (input$sys_use_spacy) if (!identical(input$sys_use_spacy_model, values$conf$use_spacy_model)) tmp <- paste0(tmp, "
  • Changing the spacy model will delete the current tokenized text and fasttext word embeddings, to prevent conflicts
  • ") if (input$sys_use_ft) if (!identical(input$sys_use_ft_model, values$conf$use_ft_model)) tmp <- paste0(tmp, "
  • Changing the fasttext model will delete the current fasttext word embeddings, to prevent conflicts.
  • ") if (input$sys_use_sb) if (!identical(input$sys_use_sb_model, values$conf$use_sb_model)) tmp <- paste0(tmp, "
  • Changing the SBERT model will delete the current SBERT sentence embeddings, to prevent conflicts.
  • ") if (nchar(tmp)) { return(HTML(paste("

    Warning:

    "))) } else return(NULL) }) observeEvent(input$saveSystem, { values$save_config_msg <- NULL showModal(modalDialog( title = "Save project configuration", paste0("Click 'Save' to check and confirm the new configuration"), uiOutput("save_embed_msg"), uiOutput("save_config_msg"), footer = tagList(actionButton(paste0("save_conf_confirm"), "Save"), modalButton("Cancel")))) }) observeEvent(input$save_conf_confirm, { newconf <- values$conf newconf$contextcols <- input$sys_var_context newconf$commcol <- input$sys_var_comm if (input$sys_use_python) { pytest <- try(system(paste(input$sys_which_python, "--version"), intern = TRUE)) if (inherits(pytest, "try-error")) { values$save_config_msg <- paste0( "Error: python path `", input$sys_which_python, "` not valid") return(NULL) } } newconf$use_python <- input$sys_use_python newconf$python <- input$sys_which_python newconf$use_gpu <- input$sys_use_gpu if (input$sys_use_spacy) { sptest <- system(paste0( newconf$python, " -m spacy info ", input$sys_use_spacy_model), intern = TRUE) if (length(sptest) == 0) { values$save_config_msg <- paste( "Error loading spacy, check that it is installed in the specified python env") return(NULL) } if (length(attr(sptest, "status"))) { values$save_config_msg <- paste( "Error loading spacy model, check that it has been downloaded") return(NULL) } newconf$use_spacy_model <- input$sys_use_spacy_model } newconf$use_spacy <- input$sys_use_spacy if (input$sys_use_ft) { if (!file.exists(input$sys_use_ft_model)) { values$save_config_msg <- paste( "Error loading fasttext model, check the specified path") return(NULL) } newconf$use_ft_model <- input$sys_use_ft_model } newconf$use_ft <- input$sys_use_ft if (input$sys_use_sb) newconf$use_sb_model <- input$sys_use_sb_model newconf$use_sb <- input$sys_use_sb # if (newconf$use_spacy) if (newconf$use_spacy_model != values$conf$use_spacy_model) { if (newconf$use_spacy) if (!identical(newconf$use_spacy_model, values$conf$use_spacy_model)) { file.remove(paste0(values$conf$datadir, values$conf$dataname, "_spa.feather")) file.remove(paste0(values$conf$datadir, values$conf$dataname, "_ft.feather")) } # if (newconf$use_ft) if (newconf$use_ft_model != values$conf$use_ft_model) if (newconf$use_ft) if (!identical(newconf$use_ft_model, values$conf$use_ft_model)) file.remove(paste0(values$conf$datadir, values$conf$dataname, "_ft.feather")) # if (newconf$use_sb) if (newconf$use_sb_model != values$conf$use_sb_model) if (newconf$use_sb) if (!identical(newconf$use_sb_model, values$conf$use_sb_model)) file.remove(paste0(values$conf$datadir, values$conf$dataname, "_sb.feather")) writeLines(RJSONIO::toJSON(newconf), paste0("tigger_", newconf$projectid, ".json")) values$conf <- newconf initDataTrigger(initDataTrigger() + 1) removeModal() }) ############################################################################## ## Reactives : data ############################################################################## ok.commentCol <- reactive(values$conf$commcol) ## Current imported data (async) observeEvent(input$dataImport, { if (ok.data.running()) return(NULL) ## Save nrows to conf file on change if (!is.numeric(input$dataNrows) | !is.numeric(input$dataSkipRows)) return(NULL) values$conf$dataNrows <- input$dataNrows values$conf$dataSkipRows <- input$dataSkipRows if (!is.null(values$conf$projectid)) writeLines(RJSONIO::toJSON(values$conf), paste0( "tigger_", values$conf$projectid, ".json")) initDataTrigger(initDataTrigger() + 1) }) ## Read data event observeEvent(initDataTrigger(), { if (is.null(values$conf)) { ok.data(NULL) return(NULL) } if (initDataTrigger() == 0) return(NULL) if (verbose) cat("DEBUG enter ok.data\n") if (verbose) cat("DEBUG enter ok.data trigger", initDataTrigger(), "\n") ok.data.running(TRUE) ## Prepare variables for async launch coco <- values$conf coco$python_ok <- values$python_ok da_token <- session$token data_status <- paste0("tigger_", da_token, "_data") file_orig_colnames <- paste0(data_status, "_origcols") file_colstok <- paste0(data_status, "_tok") file_colsft <- paste0(data_status, "_ft") file_colssb <- paste0(data_status, "_sb") file_colsbertpred <- paste0(data_status, "_bertpred") da_missing <- paste0(data_status, "_bertpred_missing") projpath <- paste0(coco$datadir, coco$projectid) if (!file.exists(projpath)) dir.create(projpath) ## Read data: async launch data_async <- future({ ################# ## Main data file set_status(data_status, "Importing main data file...") res <- arrow::read_feather( paste0(coco$datadir, coco$dataname, ".feather")) if (is.null(coco$dataNrows)) return(NULL) rowselect <- intersect(1:nrow(res), (1:(coco$dataNrows)) + coco$dataSkipRows) if (!length(rowselect)) return(NULL) res <- res[rowselect, , drop = FALSE] writeLines(colnames(res), file_orig_colnames) ################# ## Tokenized text ## Import data : feather of two cols, one the id, other the tokenized set_status(data_status, "Importing tokenized text...") file_tok <- paste0(coco$datadir, coco$dataname, "_spa.feather") if (file.exists(file_tok)) { tokdat <- arrow::read_feather(file_tok) } else tokdat <- NULL tok_remaining <- res[[coco$idcol]] tok_remaining <- tok_remaining[! tok_remaining %in% tokdat[[coco$idcol]]] ## Tokenize remaining texts if (length(tok_remaining)) { if (!coco$python_ok | !coco$use_spacy) { ## If remaining texts to tokenize but no python, fall back on ## untokenized text ## TODO here use quanteda tokenizer with options? tokdat <- data.frame( res[[coco$idcol]], do.call(paste, c(res[match(tok_remaining, res[[coco$idcol]]), coco$textcols, drop = FALSE], sep = "\n\n"))) colnames(tokdat) <- c(coco$idcol, "text_spa") set_status(data_status, "Importing tokenized text: no spacy ok") } else { ## If python present, tokenize with spacy set_status(data_status, paste("Tokenizing", length(tok_remaining), "new texts")) tmpfile <- paste0("tigger_tok_", da_token) arrow::write_feather( data.frame( id = tok_remaining, text = do.call( paste, c(res[match(tok_remaining, res[[coco$idcol]]), coco$textcols, drop = FALSE], sep = "\n\n"))), paste0(tmpfile, ".feather") ) system(paste0( coco$python, " tokenize_spacy.py -d ", tmpfile, ".feather", " -m ", coco$use_spacy_model)) tok_new <- arrow::read_feather(paste0(tmpfile, "_spa.feather")) colnames(tok_new)[colnames(tok_new) == "id"] <- coco$idcol tokdat <- rbind(tokdat, tok_new) arrow::write_feather(tokdat, file_tok) system(paste0("rm ", tmpfile, "*")) set_status(data_status, "Importing tokenized text: spacy ok") } } ## Save colname for tokenized text tmp_cols_tok <- colnames(tokdat)[colnames(tokdat) != coco$idcol] writeLines(tmp_cols_tok, file_colstok) set_status(data_status, "Importing tokenized text: wrote colnames") ## Merge with current data res <- merge(res, tokdat, by = coco$idcol, all.y = FALSE, sort = FALSE) rm(tokdat) set_status(data_status, "Importing tokenized text: merged") ################# ## FastText embeddings set_status(data_status, paste("Loading fastText embeddings")) ## FastText embeddings : feather file with id and embeddings file_ft <- paste0(coco$datadir, coco$dataname, "_ft.feather") if (file.exists(file_ft)) { ftdat <- arrow::read_feather(file_ft) } else ftdat <- NULL ft_remaining <- res[[coco$idcol]] ft_remaining <- ft_remaining[! ft_remaining %in% ftdat[[coco$idcol]]] ## Embed remaining texts, if FT in use if (length(ft_remaining)) { if (!coco$python_ok | !coco$use_ft) { ## If no python or FT, abort if not all rows already embedded ftdat <- NULL } else { set_status(data_status, paste( "Embedding", length(ft_remaining), "new texts with fastText")) tmpfile <- paste0("tigger_ft_", da_token) arrow::write_feather( data.frame( id = ft_remaining, text = do.call( paste, res[match(ft_remaining, res[[coco$idcol]]), tmp_cols_tok, drop = FALSE])), paste0(tmpfile, ".feather") ) system(paste0( coco$python, " embed_fasttext.py -d ", tmpfile, ".feather", " -m ", coco$use_ft_model)) ft_new <- arrow::read_feather(paste0(tmpfile, "_ft.feather")) colnames(ft_new)[colnames(ft_new) == "id"] <- coco$idcol ftdat <- rbind(ftdat, ft_new) arrow::write_feather(ftdat, file_ft) system(paste0("rm ", tmpfile, "*")) } } if (!is.null(ftdat)) { ## Merge with current data writeLines(colnames(ftdat)[colnames(ftdat) != coco$idcol], file_colsft) res <- merge(res, ftdat, by = coco$idcol, all.y = FALSE, sort = FALSE) rm(ftdat) } ################# ## SBERT embeddings set_status(data_status, "Loading SBERT embeddings") ## SBERT file : feather of id and embeddings file_sb <- paste0(coco$datadir, coco$dataname, "_sb.feather") if (file.exists(file_sb)) { sbdat <- arrow::read_feather(file_sb) }else sbdat <- NULL sb_remaining <- res[[coco$idcol]] sb_remaining <- sb_remaining[! sb_remaining %in% sbdat[[coco$idcol]]] ## Embed remaining texts if (length(sb_remaining)) { if (!coco$python_ok | !coco$use_sb) { ## If no python or SB, abort if not all rows already embedded sbdat <- NULL } else { set_status(data_status, paste( "Embedding", length(sb_remaining), "new texts with SBERT")) tmpfile <- paste0("tigger_sb_", da_token) arrow::write_feather( data.frame( id = sb_remaining, text = do.call( paste, res[match(sb_remaining, res[[coco$idcol]]), coco$textcols, drop = FALSE])), paste0(tmpfile, ".feather") ) system(paste0( coco$python, " embed_sbert.py -d ", tmpfile, ".feather", " -m ", coco$use_sb_model)) sb_new <- arrow::read_feather(paste0(tmpfile, "_sb.feather")) colnames(sb_new)[colnames(sb_new) == "id"] <- coco$idcol sbdat <- rbind(sbdat, sb_new) arrow::write_feather(sbdat, file_sb) system(paste0("rm ", tmpfile, "*")) } } if (!is.null(sbdat)) { ## Merge with current data writeLines(colnames(sbdat)[colnames(sbdat) != coco$idcol], file_colssb) res <- merge(res, sbdat, by = coco$idcol, all.y = FALSE, sort = FALSE) rm(sbdat) } ################# ## BERT predictions set_status(data_status, "Loading BERT predictions") ## BERT predictions : feathers of id and probs ## TODO: take full prediction if existing files_pred <- dir( coco$datadir, paste0("^", coco$dataname, "_bertpred_.*[.]feather$")) # files_predall <- dir( # coco$datadir, paste0("^", coco$dataname, "_bertpredall_.*[.]feather$")) # files_pred <- files_pred[ # ! gsub("_bertpred_", "_bertpredall_", files_pred) %in% files_predall] # files_pred <- c(files_predall, files_pred) tmp_bertpred_cols <- list() if (length(files_pred)) { for (ipred in files_pred) { preddat <- arrow::read_feather(paste0(coco$datadir, ipred)) predname <- gsub(paste0("^", coco$dataname, "_bertpred_(.*)[.]feather$"), "\\1", ipred) # predname <- # gsub(paste0("^", coco$dataname, "_bertpred(all)?_(.*)[.]feather$"), # "\\2", ipred, perl = TRUE) n_missing <- sum(! res[[coco$idcol]] %in% preddat[[coco$idcol]]) if (n_missing) { write( paste0("BERT prediction ", predname, " : ", n_missing, " texts missing in prediction."), da_missing, append = TRUE) } else { colnames(preddat) <- gsub( "^bertpred_", paste0("bertpred_", predname), colnames(preddat)) tmp_bertpred_cols[[predname]] <- colnames(preddat)[colnames(preddat) != coco$idcol] res <- merge(res, preddat, by = coco$idcol, all.y = FALSE, sort = FALSE) } rm(preddat) } if (length(tmp_bertpred_cols)) { writeLines(names(tmp_bertpred_cols), file_colsbertpred) lapply(names(tmp_bertpred_cols), function(x) { writeLines(tmp_bertpred_cols[[x]], paste0(file_colsbertpred, "_", x)) }) } } ## Comment col set_status(data_status, "Loading comments") tmpfile <- paste0(projpath, "/", coco$commcol, ".csv") if (file.exists(tmpfile)) { tmpcol <- read.csv(tmpfile) colnames(tmpcol)[2] <- coco$commcol ## Remove duplicates, keep newest tmpcol <- tmpcol[nrow(tmpcol):1, , drop = FALSE] tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), , drop = FALSE] tmpcol <- tmpcol[nrow(tmpcol):1, , drop = FALSE] res <- merge(res, tmpcol, by = coco$idcol, all.x = TRUE, all.y = FALSE, sort = FALSE) res[[coco$commcol]][is.na(res[[coco$commcol]])] <- "" } else res[[coco$commcol]] <- "" # ## Tagging cols # set_status(data_status, "Loading tags") # for (itag in coco$tagcols) { # tmpfile <- paste0(projpath, "/", itag, ".csv") # if (file.exists(tmpfile)) { # tmpcol <- read.csv(tmpfile) # tmpcol[[paste0("hist_", itag)]] <- 1:nrow(tmpcol) # ## Remove duplicates, keep newest # tmpcol <- tmpcol[nrow(tmpcol):1, ] # tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), ] # tmpcol <- tmpcol[nrow(tmpcol):1, ] # if (verbose) # cat("DEBUG imported ", itag, nrow(tmpcol), "\n") # res <- merge(res, tmpcol, by = coco$idcol, # all.x = TRUE, all.y = FALSE, sort = FALSE) # } else { # res[[itag]] <- NA # res[[paste0("hist_", itag)]] <- NA # } # } ## Tagging cols set_status(data_status, "Loading tags") for (itag in coco$tagcols) { itaghist <- paste0("hist_", itag) if (is.null(res[[itag]]) | all(is.na(res[[itag]]))) { res[[itag]] <- NA res[[itaghist]] <- NA } else { res[[itaghist]][!is.na(res[[itag]])] <- 1:sum(!is.na(res[[itag]])) } tmpfile <- paste0(projpath, "/", itag, ".csv") if (file.exists(tmpfile)) { tmpcol <- read.csv(tmpfile) tmpcol[[itaghist]] <- 1:nrow(tmpcol) + max(0, suppressWarnings(max(na.omit(res[[itaghist]])))) ## Remove duplicates, keep newest tmpcol <- tmpcol[nrow(tmpcol):1, ] tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), ] tmpcol <- tmpcol[nrow(tmpcol):1, ] tmpcol <- tmpcol[tmpcol[[coco$idcol]] %in% res[[coco$idcol]], ] if (verbose) cat("DEBUG imported", itag, nrow(tmpcol), "\n") # res <- merge(res, tmpcol, by = coco$idcol, # all.x = TRUE, all.y = FALSE, sort = FALSE) if (nrow(tmpcol)) { idmatches <- match(tmpcol[[coco$idcol]], res[[coco$idcol]]) # idmatches <- idmatches[!is.na(idmatches)] res[[itag]][idmatches] <- tmpcol[[itag]] res[[itaghist]][idmatches] <- tmpcol[[itaghist]] } } } set_status(data_status, "Finalizing data import") res }) %...>% ok.data.async() data_async <- catch(data_async, function(error) { values$modelDiagno <- paste0("Error in data import: ", error) }) data_async <- finally(data_async, function() { ################# ## Finalize data importation res <- ok.data.async() if (is.null(res)) return(NULL) set_status(data_status, "Finalizing data import: read columns") ## Read column (tok, ft, sb, ...) names from dedicated files values$tagcols <- coco$tagcols values$cols_tok <- readLines(file_colstok) values$cols_ft <- NULL if (file.exists(file_colsft)) { values$cols_ft <- readLines(file_colsft) } values$cols_sb <- NULL if (file.exists(file_colssb)) { values$cols_sb <- readLines(file_colssb) } set_status(data_status, "Finalizing data import: bertpred") if (file.exists(file_colsbertpred)) { tmp_bertpred <- readLines(file_colsbertpred) values$cols_bertpred <- lapply(tmp_bertpred, function(x) readLines(paste0(file_colsbertpred, "_", x))) names(values$cols_bertpred) <- tmp_bertpred } else values$cols_bertpred <- NULL set_status(data_status, "Finalizing data import: bertpred missing") if (file.exists(da_missing)) { lapply(readLines(da_missing), function(x) showNotification(paste( x, "\n", "You may predict on current text in BERT panel, ", "and then reload the data."))) } ## Populate regressors list in prediction panel set_status(data_status, "Finalizing data import: populating regressors") regr_choices <- c( "Regex" = "regex", "Word counts (DFM)" = "dfm") if (!is.null(values$cols_ft)) regr_choices <- c(regr_choices, "Word embeddings (FastText)" = "ft") if (!is.null(values$cols_sb)) regr_choices <- c(regr_choices, "Sentence embeddings (SBERT)" = "sb") regr_choices <- c(regr_choices, "Extra predictors" = "extra") if (length(values$cols_bertpred)) { tmp_choices <- paste0("bertpred_", names(values$cols_bertpred)) names(tmp_choices) <- paste0("BERT_", names(values$cols_bertpred)) regr_choices <- c(regr_choices, tmp_choices) } # if (length(regr_choices) > 3) { regr_choices_sel <- regr_choices[regr_choices %in% c("regex", "ft", "sb")] # } else # regr_choices_sel <- c("regex", "dfm") updateSelectizeInput( session, "use_regressors", choices = regr_choices, selected = regr_choices_sel) values$new_use_regressors <- regr_choices_sel othercols <- colnames(res) othercols <- othercols[! othercols %in% c( values$cols_tok, values$cols_ft, values$cols_sb, values$cols_bertpred, values$tagcols, paste0("hist_", values$tagcols), coco$commcol, coco$textcols)] updateSelectInput( session, "dlTagSelect", choices = c("tags", "comments", "predictions", othercols), selected = c("tags", "comments", "predictions")) ## Populate extra regressors lists in prediction panel updateSelectizeInput( session, "use_ootregnum", choices = othercols, selected = NULL) updateSelectizeInput( session, "use_ootregcat", choices = othercols, selected = NULL) ## Project path values$projectdir <- paste0(projpath, "/") ## Create tagmat: data.frame of all current taggings values$tagmat <- res[, coco$tagcols, drop = FALSE] ## Create tagmathist: data.frame of historical order of taggings values$tagmathist <- res[, paste0("hist_", coco$tagcols), drop = FALSE] colnames(values$tagmathist) <- coco$tagcols ## Save data original colnames values$data_orig_colnames <- readLines(file_orig_colnames) ok.data(ok.data.async()) ok.data.async(NULL) ok.data.running(FALSE) set_status(data_status, "Exit data async") system(paste0("rm ", data_status, "*")) values$modelDiagno <- "No model" if(trainTrigger() == 0) trainTrigger(1) ## For first run. TODO set trainTrigger to 0 when changing data? if (verbose) cat("DEBUG Exit data async finally\n") }) NULL }) ## Read data import status from file observe({ if (ok.data.running()) { invalidateLater(100) tmpfile <- paste0("tigger_", session$token, "_data") if (file.exists(tmpfile)) { values$modelDiagno <- paste(collapse = "\n", scan( tmpfile, what = "character", sep="\n", quiet = TRUE)) } else values$modelDiagno <- "" } }) ############### ## ok.text() reactive ############### ok.text <- eventReactive(ok.data(), { if (is.null(ok.data())) return(NULL) res <- ok.data()[, values$cols_tok[1]] if (length(values$cols_tok) > 1) { for (icol in values$cols_tok[-1]) { res <- paste0(res, "\n\n", ok.data()[, icol]) } } # queryTrigger(queryTrigger() + 1) if (verbose) cat("DEBUG ok.text", length(res), "\n") res }) ok.labelcol <- reactive({ if (is.null(ok.data())) return(NULL) if (verbose) # cat("DEBUG enter ok.labelcol\n") if (is.null(input$selectScheme)) return(values$conf$tagcols[1]) input$selectScheme }) ok.nicetext <- reactive({ req(ok.data(), values$conf$textcols) # if (is.null(ok.data())) return(NULL) do.call(paste, c(ok.data()[, values$conf$textcols, drop = FALSE], sep = "\n\n")) }) ok.regressor.names <- reactive({ daregs <- values$new_use_regressors if (verbose) cat("DEBUG enter ok.regressor.names ", paste(daregs, collapse = " "), "\n") res <- NULL if ("ft" %in% daregs) res <- c(res, values$cols_ft) if ("sb" %in% daregs) res <- c(res, values$cols_sb) if (length(values$cols_bertpred)) { for (ipred in 1:length(values$cols_bertpred)) { if (paste0("bertpred_", names(values$cols_bertpred)[ipred]) %in% daregs) res <- c(res, values$cols_bertpred[[ipred]]) } } if (verbose) cat("DEBUG exit ok.regressor.names:", length(res), "\n") res }) ok.contextCol <- reactive({ if (verbose) cat("DEBUG enter ok.contextCol\n") # res <- isolate(values$conf$contextcols) res <- values$conf$contextcols names(res) <- res res }) ## Initiate label and queries history, on ok.text() or labelcol change, observeEvent(list(ok.text(), ok.labelcol()), { if (verbose) cat("DEBUG initiate label\n") if (is.null(ok.labelcol())) { values$label <- NULL values$labelhist <- NULL values$queries <- NULL values$comment <- NULL values$wasTagged <- NULL values$wasRetagged <- NULL } else { values$label <- values$tagmat[, ok.labelcol()] values$labelhist <- values$tagmathist[, ok.labelcol()] values$queries <- which(!is.na(values$label)) values$comment <- ok.data()[[values$conf$commcol]] values$wasTagged <- !is.na(values$label) values$wasRetagged <- rep(FALSE, length(values$label)) } }) ## On values$label change, values$tagmat is updated for separate history observeEvent(values$label, { if (is.null(values$label)) return(NULL) if (verbose) cat("DEBUG Update tagmat from label\n") values$tagmat[, ok.labelcol()] <- values$label values$tagmathist[, ok.labelcol()] <- values$labelhist }) ############################################################################## ## Reactives : training ############################################################################## ## Message of how many tagged and untagged, for model diagnostics box ok.msgtagged <- reactive({ if (verbose) cat("DEBUG enter ok.msgtagged\n") # if (is.null(ok.data()) | ok.data.running()) { if (is.null(ok.data())) { return("No data yet ") } tmp <- "" # if (sum(is.na(values$label)) == 0) # tmp <- paste0(tmp, "Data correction mode (0 NA labels left).\n") paste0(tmp, "Tagged: ", sum(!is.na(values$label)), " ; Untagged: ", sum(is.na(values$label))) }) ## Training possible? At least 3 texts for 2 tags ok.train.possible <- reactive({ if (verbose) cat("DEBUG enter ok.train.possible\n") labtab <- table(values$label) if (length(labtab) < 2) return(FALSE) if (sum(labtab > 2) < 2) return(FALSE) TRUE }) ## Compute DFM (async) ## TODO make this less reactive observe({ # if ("dfm" %in% input$use_regressors & !values$dfmGo) if ("dfm" %in% values$new_use_regressors & !values$dfmGo) values$dfmGo <- TRUE }) ok.dfm <- reactiveVal() observe({ if (is.null(ok.text())) return(NULL) if (! values$dfmGo) return(NULL) if (!is.numeric(input$dfmNgrams)) return(NULL) if (verbose) cat("DEBUG enter dfm observer\n") datext <- ok.text() dfmNgrams <- input$dfmNgrams dfmTfIdf <- input$dfmTfIdf dfmMinDocfreq <- input$dfmMinDocfreq dfmMinTermfreq <- input$dfmMinTermfreq dfmTfScheme <- input$dfmTfScheme dfmDfScheme <- input$dfmDfScheme dfm_async <- future(seed = TRUE, { dadfm <- datext %>% tolower() %>% tokens(remove_punct = F) %>% # tokens_select(pattern = stopwords(language= "fr"), selection = "remove") %>% tokens_ngrams(n= 1:dfmNgrams) %>% dfm() %>% dfm_trim(min_docfreq = dfmMinDocfreq, min_termfreq = dfmMinTermfreq, verbose = TRUE) if (dfmTfIdf) { dadfm <- dadfm %>% dfm_tfidf(scheme_tf = dfmTfScheme, scheme_df = dfmDfScheme) } suppressWarnings({ # TODO : check dadfm <- quanteda::convert(dadfm, to= "tm") }) dadfm <- Matrix::sparseMatrix(i= dadfm$i, j= dadfm$j, x= dadfm$v, dimnames = dadfm$dimnames) dadfm }) %...>% ok.dfm() dfm_async <- catch(dfm_async, function(error) { values$modelDiagno <- paste0("Error in DFM: ", error) }) }) ## Predictor : df or matrix of dfm and extra ok.regressor.matrix <- reactive({ if (is.null(ok.text())) return(NULL) if (is.null(ok.regressor.names())) return(NULL) if (verbose) cat("DEBUG enter ok.regressor.matrix\n") model.matrix(~., data= ok.data()[, ok.regressor.names()])[, -1] }) ok.extraregressor.matrix <- reactive({ req(ok.text()) if (! "extra" %in% values$new_use_regressors) return(NULL) if (verbose) cat("DEBUG enter ok.extraregressor.matrix\n") res <- NULL for (inum in input$use_ootregnum) { suppressWarnings({tmp <- as.numeric(ok.data()[[inum]])}) if (all(is.na(tmp))) { showNotification(paste0(inum, " is all NAs or non-numeric ; predictor not used"), duration = 5, type = "warning") next } tmp[is.na(tmp)] <- median(tmp, na.rm = TRUE) res <- cbind(res, tmp) } for (icat in input$use_ootregcat) { tmp <- ok.data()[[icat]] if (! length(unique(tmp)) %in% 2:100) { showNotification(paste0(icat, " has ", length(unique(tmp)), " unique values ; predictor not used"), duration = 5, type = "warning") next } tmp <- as.character(tmp) tmp[is.na(tmp)] <- "" tmp <- as.factor(tmp) tmpmat <- model.matrix(~., data.frame(cat = tmp))[, -1, drop = FALSE] res <- cbind(res, tmpmat) } if (verbose) cat("DEBUG exit ok.extraregressor.matrix", ncol(res), "\n") res }) ok.extraRegex.matrix <- reactive({ if (is.null(ok.nicetext())) return(NULL) if (is.null(values$extraRegex)) return(NULL) if (is.null(values$regex_inuse)) return(NULL) if (!any(values$regex_inuse)) return(NULL) if (is.null(values$regex_case)) return(NULL) if (verbose) cat("DEBUG enter ok.extraRegex.matrix\n") tmp <- as.matrix(sapply(which(values$regex_inuse), function(iregex) { 1 * stringi::stri_detect_regex( ok.nicetext(), values$extraRegex[iregex], opts_regex = list(case_insensitive = !values$regex_case[iregex])) })) colnames(tmp) <- values$extraRegex[values$regex_inuse] tmp }) ok.predictor <- reactive({ req(values$new_use_regressors) if (ok.data.running()) return(NULL) if (is.null(ok.text())) return(NULL) if (verbose) cat("DEBUG enter ok.predictor\n") dadfm <- NULL if ("dfm" %in% values$new_use_regressors) { dadfm <- ok.dfm() } else { if (input$predModel == "naive bayes") { values$modelDiagno <- "Error in training: Naive Bayes requires DFM word counts as predictors." return(NULL) } } if (length(ok.regressor.names()) > 0) { if (input$predModel == "random forest") { if (is.null(dadfm)) { dadfm <- ok.data()[, ok.regressor.names()] } else dadfm <- cbind(ok.data()[, ok.regressor.names()], as.matrix(dadfm)) } if (input$predModel %in% c("lasso", "linear", "knn")) { dadfm <- cbind(ok.regressor.matrix(), dadfm) } } # if ("regex" %in% input$use_regressors && length(values$regex_inuse) > 0) { if ("regex" %in% values$new_use_regressors & !is.null(ok.extraRegex.matrix())) { dadfm <- cbind(dadfm, ok.extraRegex.matrix()) } if ("extra" %in% values$new_use_regressors) { dadfm <- cbind(dadfm, ok.extraregressor.matrix()) } if (is.null(dadfm)) { values$modelDiagno <- "Error in training: no predictors selected" if (verbose) cat("DEBUG ok.predictor exit: no predictors\n") return(NULL) } if (input$predModel == "linear") dadfm <- SparseM::as.matrix.csr(dadfm, nrow(dadfm), ncol(dadfm)) if (input$predModel == "naive bayes") dadfm <- as.dfm(dadfm) if (verbose) cat("DEBUG exit ok.predictor: ", ncol(dadfm), "\n") dadfm }) ## Train model async ok.train <- reactiveVal() ok.train.async <- reactiveVal() observeEvent( list(trainTrigger(), input$modelTrain), { if (is.null(ok.predictor())) return(NULL) if (verbose) cat("DEBUG enter ok.train observer\n") if (!ok.train.possible()) { values$modelDiagno <- "Tag at least 3 texts for 2 tags to train prediction model." ok.train(NULL) # queryTrigger(queryTrigger() + 1) return(NULL) } ready2tag(0) values$modelDiagno <- paste0("Training ", input$predModel) tmp_model <- input$predModel if (tmp_model == "random forest") { if (!is.numeric(input$rfNumTrees) | !is.numeric(input$rfMtry) | !is.numeric(input$rfSampleFrac)) { ok.train(NULL) return(NULL) } train_fun <- ranger::ranger train_args <- list( x= ok.predictor()[!is.na(values$label), ], y= as.factor(values$label[!is.na(values$label)]), num.trees = input$rfNumTrees, mtry = if (input$rfMtry > 0) input$rfMtry, sample.fraction = input$rfSampleFrac, probability = TRUE) } else { if (tmp_model == "linear") { tmpvalues <- values$label if (!is.numeric(input$liblinCost)) { ok.train(NULL) return(NULL) } for (ival in na.omit(unique(tmpvalues))) if (sum(tmpvalues == ival, na.rm = T) == 1) tmpvalues[which(tmpvalues == ival)] <- NA train_fun <- LiblineaR::LiblineaR train_args <- list( data= ok.predictor()[!is.na(tmpvalues), ], target= as.factor(tmpvalues[!is.na(tmpvalues)]), type= 0, cost = input$liblinCost) } else if (tmp_model == "lasso") { tmpvalues <- values$label for (ival in na.omit(unique(tmpvalues))) if (sum(tmpvalues == ival, na.rm = T) == 1) tmpvalues[which(tmpvalues == ival)] <- NA train_fun <- glmnet::glmnet train_args <- list( x= ok.predictor()[!is.na(tmpvalues), ], y= as.factor(tmpvalues[!is.na(tmpvalues)]), family= "multinomial") } else if (tmp_model == "naive bayes") { if (!is.numeric(input$naiveSmooth)) { ok.train(NULL) return(NULL) } tmpvalues <- values$label for (ival in na.omit(unique(tmpvalues))) if (sum(tmpvalues == ival, na.rm = T) == 1) tmpvalues[which(tmpvalues == ival)] <- NA train_fun <- quanteda.textmodels::textmodel_nb train_args <- list( x= ok.predictor()[!is.na(tmpvalues), ], y= as.factor(tmpvalues[!is.na(tmpvalues)]), smooth = input$naiveSmooth, prior = input$naivePrior, distribution = input$naiveDistri) } else if (tmp_model == "knn") { train_fun <- function(x) return(TRUE) train_args <- list(x = "knn") } } train_async <- future(seed = TRUE, { suppressWarnings(do.call(train_fun, train_args)) }) %...>% ok.train.async() train_async <- catch(train_async, function(error) { values$modelDiagno <- paste0("Model training error: ", error) ok.train(NULL) ok.train.async(NULL) }) train_async <- finally(train_async, function() { values$modelDiagno <- "Training done, predicting..." ok.train(ok.train.async()) ok.train.async(NULL) predTrigger(predTrigger() + 1) }) NULL } ) ## Model prediction on full dataset (async) ok.pred <- reactiveVal() ok.pred.async <- reactiveVal() # observeEvent(list(ok.train(), input$glmLambda, input$knnK, values$retagged), { observeEvent(predTrigger(), { if (is.null(ok.train())) { ok.pred(NULL) return(NULL) } if (verbose) cat("DEBUG ok.pred()\n") damodel <- input$predModel if (input$predModel == "random forest") { pred_args <- list(ok.train(), data = ok.predictor()) dacolnames <- colnames(ok.train()$predictions) } else if (input$predModel == "lasso") { if (!is.numeric(input$glmLambda)) { ok.pred(NULL) return(NULL) } pred_args <- list(ok.train(), newx = ok.predictor(), s = input$glmLambda, type = "response") } else if (input$predModel == "linear") { pred_args <- list(ok.train(), newx = ok.predictor(), proba = TRUE) } else if (input$predModel == "naive bayes") { pred_args <- list(ok.train(), newdata = ok.predictor(), type = "probability") } else if (input$predModel == "knn") { if (!is.numeric(input$knnK)) { ok.pred(NULL) return(NULL) } pred_args <- list(train = ok.predictor()[!is.na(values$label), ], test = ok.predictor(), cl = values$label[!is.na(values$label)], k = input$knnK, prob = TRUE) } pred_async <- future(seed = TRUE, { if (damodel == "random forest") { res <- do.call(predict, pred_args)$predictions colnames(res) <- dacolnames res } else if (damodel == "lasso") { do.call(predict, pred_args)[, , 1] } else if (damodel == "linear") { do.call(predict, pred_args)$probabilities } else if (damodel == "naive bayes") { do.call(predict, pred_args) } else if (damodel == "knn") { tmp <- do.call(class::knn, pred_args) ## A bit flaky here: probs of non-best classes uniformly share the rest, ## and distance is euc tmpvalues <- sort(unique(pred_args$cl)) res <- matrix(0, nrow(pred_args$test), length(tmpvalues), dimnames = list(NULL, tmpvalues)) res[cbind(1:nrow(res), tmp)] <- attr(tmp, "prob") - ((1 - attr(tmp, "prob")) / (ncol(res) - 1)) res <- res + (1 - attr(tmp, "prob")) / (ncol(res) - 1) res } }) %...>% ok.pred.async() pred_async <- catch(pred_async, function(error) { values$modelDiagno <- paste0("Error in prediction: ", error) }) pred_async <- finally(pred_async, function() { if (verbose) cat("DEBUG ok.pred() out of async\n") ok.pred(ok.pred.async()) ok.pred.async(NULL) # predlabTrigger(predlabTrigger() + 1) diagTrigger(diagTrigger() + 1) queryNext(queryNext() + 1) }) }) ## Best predicted label for full dataset # ok.predlab1 <- eventReactive(predlabTrigger(), { # if (is.null(ok.pred())) return(NULL) # if (verbose) # cat("DEBUG ok.predlab1\n") # # queryTrigger(queryTrigger() + 1) # colnames(ok.pred())[max.col(ok.pred())] # }) ok.predlab1 <- reactive({ if (is.null(ok.pred())) return(NULL) if (verbose) cat("DEBUG ok.predlab1\n") colnames(ok.pred())[max.col(ok.pred())] }) ## On scheme change, forget existing model, queue, query and hist ## load scheme description and predictor-regex observeEvent(ok.labelcol(), { if (verbose) cat("DEBUG event selectScheme", ok.labelcol(), "\n") ok.pred(NULL) ok.train(NULL) values$queryqueue <- NULL values$modelDiagno <- "No model" values$histSaveStack <- NULL ## Load description file descr <- "" if (file.exists(paste0(values$conf$datadir, values$conf$projectid, "/", input$selectScheme, ".txt"))) { descr <- paste(collapse = "\n", readLines(paste0( values$conf$datadir, values$conf$projectid, "/", input$selectScheme, ".txt"))) } updateTextAreaInput(session, "schemeDescr", value = descr) ## Load regex from json regexfile <- paste0(values$projectdir, input$selectScheme, "_regex.json") if (file.exists(regexfile)) { if (verbose) cat("DEBUG enter regex.json\n") tmp <- RJSONIO::fromJSON(paste(collapse = "\n", readLines(regexfile))) if (is.null(tmp$regex)) return(NULL) values$extraRegex <- tmp$regex names(values$extraRegex) <- sapply(tmp$regex, label2hash) for (iregex in 1:length(tmp$regex)) { values[[paste0("useregex_", names(values$extraRegex)[iregex])]] <- tmp$use[iregex] values[[paste0("caseregex_", names(values$extraRegex)[iregex])]] <- tmp$case[iregex] } } else values$extraRegex <- NULL }) ## queryNext event: update values$newQuery ; called by queryNext(), or by ## queryQueue or regex change ## Requires : ## - queryQueue (reactive: indices of current queue), and ## - wasTagged (boolean vector indicating whether each element has been tagged this session, used in "on untagged") ## - wasRetagged (boolean vector indicating whether each element has been tagged this session, used in "on tagged") ## TODO: make a scheme-list of wasTagged, now it will be wiped at each scheme change? observeEvent( list(queryNext(), values$queryQueue, input$sampleChoice, input$taggedWhich, input$maxprobWhich, input$regexFilter, input$regexCaseSens, input$visuLock, values$visuZoom$xlim, values$visuZoom$ylim), { if (is.null(values$queryQueue)) return(NULL) if (verbose) cat("DEBUG queryNext\n") queue <- values$queryQueue if (input$sampleChoice == "untagged") { queue <- queue[!values$wasTagged[queue]] } else if (input$sampleChoice == "tagged") { queue <- queue[!is.na(values$label)[queue] & !values$wasRetagged[queue]] if (input$taggedWhich != "all") { queue <- queue[values$label[queue] == input$taggedWhich] } } else if (input$sampleChoice == "all") { queue <- queue[!values$wasRetagged[queue]] } if (input$visuLock) if (!is.null(values$visuZoom$xlim)) { if (!is.null(ok.visu())) queue <- queue[ok.visu()[queue, 1] >= values$visuZoom$xlim[1] & ok.visu()[queue, 1] <= values$visuZoom$xlim[2] & ok.visu()[queue, 2] >= values$visuZoom$ylim[1] & ok.visu()[queue, 2] <= values$visuZoom$ylim[2]] } if (length(queue)) if (nchar(input$regexFilter)) { try_regex <- try(silent = TRUE, { queue <- queue[ stringi::stri_detect_regex( ok.nicetext()[queue], input$regexFilter, opts_regex = list(case_insensitive = !input$regexCaseSens))] }) if (inherits(try_regex, "try-error")) { showNotification("Invalid regex", duration = 2, type = "error") values$newQuery <- NULL return(NULL) } } if (!length(queue)) { if (nchar(input$regexFilter)) { showNotification("No texts matching the regex", duration = 2, type = "warning") } else showNotification("No more texts to be tagged in this set", duration = 2, type = "warning") values$newQuery <- NULL return(NULL) } values$lastQuery <- queue[1] # ?? values$newQuery <- queue[1] ready2tag(1) } ) ## queryQueue event: fix values$queryQueue, (indices of current queue) observeEvent( list(ok.data(), ok.pred(), input$strategy, input$maxprobWhich, input$selectProject, input$selectScheme), { if (input$strategy %in% c("entropy", "maxprob") & is.null(ok.pred())) { values$queryQueue <- NULL values$lastQuery <- NULL values$newQuery <- NULL return(NULL) } if (is.null(ok.text())) return(NULL) if (verbose) cat("DEBUG queryQueue change\n") if (input$strategy == "sequential") { values$queryQueue <- 1:length(ok.text()) } else if (input$strategy == "random") { values$queryQueue <- sample(length(ok.text())) } else { dapred <- ok.pred() if (input$strategy == "entropy") { tominimize <- rowSums(dapred * log(dapred)) if (any(dapred == 0)) { tominimize[dapred == 0] <- 0 } } if (input$strategy == "maxprob") { if (! input$maxprobWhich %in% colnames(dapred)) return(NULL) tominimize <- -dapred[, input$maxprobWhich]; } values$queryQueue <- order(tominimize) } } ) ## Update uniqueLabels and uniqueLabelsAll if necessary on values$label change observeEvent(values$label, { if (is.null(values$label)) return(NULL) if (verbose) cat("DEBUG uniqueLabels\n") dalabs <- sort(na.omit(unique(values$label))) # if (!length(dalabs)) return(NULL) if (!identical(dalabs, values$uniqueLabels)) values$uniqueLabels <- dalabs if (any(! dalabs %in% values$uniqueLabelsAll)) values$uniqueLabelsAll <- c(values$uniqueLabelsAll, dalabs[! dalabs %in% values$uniqueLabelsAll]) }) ############################################################################## ## Project management ############################################################################## output$cpDataMessage <- renderUI({ req(values$cpDataMessage) p(values$cpDataMessage) }) output$cpConfirmMessage <- renderUI({ req(values$cpConfirmMessage) p(strong(paste0("Error: ", values$cpConfirmMessage))) }) output$cpIdcolUI <- renderUI({ req(values$cpNewData) if (!is.data.frame(values$cpNewData)) return(NULL) selectInput("cpIdcol", NULL, colnames(values$cpNewData)) }) output$cpTextcolUI <- renderUI({ req(values$cpNewData) if (!is.data.frame(values$cpNewData)) return(NULL) selectInput("cpTextcol", NULL, colnames(values$cpNewData), multiple = TRUE) }) output$cpContextcolUI <- renderUI({ req(values$cpNewData) if (!is.data.frame(values$cpNewData)) return(NULL) selectInput("cpContextcol", NULL, colnames(values$cpNewData), multiple = TRUE) }) output$cpTagcolUI <- renderUI({ req(values$cpNewData) if (!is.data.frame(values$cpNewData)) return(NULL) selectInput("cpTagcol", NULL, colnames(values$cpNewData), multiple = TRUE) }) output$cpCommcolUI <- renderUI({ req(values$cpNewData) if (!is.data.frame(values$cpNewData)) return(NULL) selectInput("cpCommcol", NULL, c("none", colnames(values$cpNewData))) }) output$cp_spacyUI <- renderUI( textInput("cp_use_spacy_model", NULL, modelnames$spacy_name[modelnames$short == input$cp_lang], placeholder = "(spacy model name)") ) output$cp_spacyDlUI <- renderUI(HTML(paste( "Model must be downloaded once, from python:
    ", input$cp_which_python, "-m spacy download", modelnames$spacy_name[modelnames$short == input$cp_lang], "

    " ))) output$cp_ftUI <- renderUI( textInput("cp_use_ft_model", NULL, modelnames$fasttext_name[modelnames$short == input$cp_lang], placeholder = "(fasttext model path)") ) output$cp_ftDlUI <- renderUI(list(HTML(paste( "Model can be downloaded here")), br(), br())) output$cp_sbUI <- renderUI( textInput("cp_use_sb_model", NULL, ifelse( input$cp_lang %in% c("ar", "zh", "nl", "en", "fr", "de", "it", "ko", "pl", "pt", "ru", "es", "tr"), "distiluse-base-multilingual-cased-v1", "distiluse-base-multilingual-cased-v2"), placeholder = "(custom sentence_transformers model)") ) ## Create new project, with modal dialog observeEvent(input$createProject, { if (verbose) cat("DEBUG create project\n") showModal(modalDialog( title = "New project", p("Create a new text tagging project."), p("This mainly involves choosing a dataset, and how to process it."), p("Items marked by a star ", strong("*"), " cannot be changed once the project is created."), hr(), h4("Files and folders"), fluidRow( column(3, p(strong("*Project name"))), column(5, textInput("cpName", NULL, placeholder = "(Required)")), column(4, p("Tags will be stored in a new directory by this name.")) ), fluidRow( column(3, p(strong("*Data directory"))), column(5, textInput("cpDatadir", NULL, placeholder = "(Required)")), column(4, p("Place (on the server) where the data and project are stored")) ), fluidRow( column(3, p(strong("*Data filename"))), column(5, checkboxInput("cpDoImport", "New data", TRUE)), column(4, p("Main file, containing id and text columns")) ), conditionalPanel("!input.cpDoImport", list( ## Existing data fluidRow( column(3, p(strong("Existing file"))), column(5, textInput("cpDatafile", NULL, placeholder = "(Required)")), column(4, actionButton("cpDatafileGo", "Import")) ) )), conditionalPanel("input.cpDoImport", { ## Import data wellPanel( selectInput('cpNDfiletype', NULL, choices = c("Text file (.csv, .txt)" = "csv_txt", "Feather" = "feather", "Parquet" = "parquet", "OpenDocument (.ods)" = "ods", "Microsoft Excel (.xls, .xlsx)" = "excel", "SPSS (.sav, .por)" = "spss", "SAS (.sas7bdat)" = "sas_data", "Stata v5-v12 (.dta)" = "stata"), selected = "csv_txt" ), fileInput('cpNDfile', 'Choose File'), checkboxInput("cpNDparams", "File parameters"), conditionalPanel( "input.cpNDparams", conditionalPanel( "input.cpNDfiletype == 'csv_txt'", fluidRow(column(4, p('Header')), column(8, selectInput('cpCSVheader', NULL, c("Header" = TRUE, "No Header" = FALSE), "Auto"))), fluidRow(column(4, p('Separator')), column(8, selectInput('cpCSVsep', NULL, c("Comma ','" = ",", "Semicolon ';'" = ";", "Tab" = "\t", "Space" = " "), "Auto"))), fluidRow(column(4, p('Quote')), column(8, selectInput('cpCSVquote', NULL, c("Double Quote \"" = "\"", "Single Quote '" = "'", "None" = ""), "Double Quote \""))), fluidRow(column(4, p('Decimal mark')), column(8, selectInput('cpCSVdec', NULL, c('Period "."' = ".", 'Comma ","' = ","), 'Period "."'))), fluidRow(column(4, p('File Encoding')), column(8, selectInput('cpCSVencoding', NULL, c("unknown", "UTF-8", "Latin-1"), "unknown")))), ## Options for ods files conditionalPanel( "input.cpNDfiletype == 'ods'", fluidRow(column(4, p('Sheet')), column(8, numericInput('cpODSsheet', NULL, 1, 1, step = 1))), checkboxInput("cpODScolnames", "Column names", TRUE), fluidRow(column(4, p('NA symbol')), column(8, textInput('cpODSna', NULL, value = ""))), fluidRow(column(4, p('Skip rows')), column(8, numericInput('cpODSskip', NULL, 0, 0, step = 1))), fluidRow(column(4, p('Specify range')), column(8, textInput('cpODSrange', NULL, value = ""))) ), ## Options for xls / xlsx files conditionalPanel( "input.cpNDfiletype == 'excel'", fluidRow(column(4, p('Column names')), column(8, selectInput('cpXLScolnames', NULL, choices = c("Yes" = "TRUE", "No" = "FALSE"), selected = TRUE ))), fluidRow(column(4, p('Skip rows')), column(8, numericInput('cpXLSskip', NULL, 0, 0, step = 1))), fluidRow(column(4, p('Trim whitespaces')), column(8, selectInput('cpXLStrim', NULL, choices = c("Yes" = "TRUE", "No" = "FALSE"), selected = TRUE ))), checkboxInput("cpXLSsheet", "Specify Worksheet", FALSE), conditionalPanel("input.cpXLSsheet", textInput("cpXLSsheet_specs", NULL, "")), checkboxInput("cpXLSrange", "Specify Range", FALSE), conditionalPanel("input.cpXLSrange", textInput("cpXLSrange_specs", NULL, "") )), ## Options for SPSS files conditionalPanel( "input.cpNDfiletype == 'spss'", fluidRow( column(4, p('Skip rows')), column(8, numericInput('cpSPSSskip', NULL, 0, 0, step = 1))), checkboxInput("cpSPSSna", "User-defined NA", FALSE)), ## Options for SAS files conditionalPanel( "input.cpNDfiletype == 'sas_data'", fluidRow( column(4, p('Skip rows')), column(8, numericInput("cpSASskip", NULL, 0, 0, step = 1))), checkboxInput("cpSAScatalog", "Use catalog file", FALSE), conditionalPanel( "input.cpSAScatalog", fileInput("cpSAScatalog_file", "Catalog file")) ), ## Options for STATA dta files conditionalPanel( "input.cpNDfiletype == 'stata'", checkboxInput("cpSTATAfactors", "Convert factors", FALSE)) ) ) }), uiOutput("cpDataMessage"), hr(), h4("Data"), fluidRow( column(6, numericInput( "cpNrows", "N. rows for working sample", 500, 1, step = 1)), column(6, numericInput( "cpNskip", "Rows to skip before working sample", 0, 0, step = 1)) ), fluidRow( column(3, p(strong("*ID"))), column(5, uiOutput("cpIdcolUI")), column(4, p("Name of the id variable, unique identifier of each text")) ), fluidRow( column(3, p(strong("*Text"))), column(5, uiOutput("cpTextcolUI")), column(4, p("Name of the text variables: if more than one, texts are concatenated in the specified order")) ), fluidRow( column(3, p(strong("*Tags"))), column(5, uiOutput("cpTagcolUI")), column(4, p("(optional) Names of variables that are already tagged: each will create a new scheme")) ), fluidRow( column(3, p(strong("Comments"))), column(5, uiOutput("cpCommcolUI")), column(4, p("(optional) Name of the comments variable")) ), fluidRow( column(3, p(strong("Context"))), column(5, uiOutput("cpContextcolUI")), column(4, p("(optional) Names of variables not used in the models, but may be displayed during tagging")) ), hr(), h4("System"), fluidRow( column(3, checkboxInput("cp_use_python", "Python backend", FALSE)), column(5, conditionalPanel( "input.cp_use_python", textInput("cp_which_python", NULL, value = "python3", placeholder = "(custom python path)"))), column(4, conditionalPanel( "input.cp_use_python", p("This must be a working python3 environment, with the required modules installed (see documentation)"))) ), conditionalPanel("input.cp_use_python", list( fluidRow( column(3, p("Language")), column(5, selectInput("cp_lang", NULL, modelnames_labels, "en")), column(4, p("Used to preset tokenization and embedding models")) ), fluidRow( column(3, checkboxInput("cp_use_spacy", "SpaCy tokenization", FALSE)), column(5, conditionalPanel("input.cp_use_spacy", uiOutput("cp_spacyUI"))), column(4, p("Name of the spacy tokenizer model, used in DTM and word embeddings")) ), conditionalPanel("input.cp_use_spacy", fluidRow( column(3), column(9, uiOutput("cp_spacyDlUI"))) ), fluidRow( column(3, checkboxInput("cp_use_ft", "FastText word embeddings", FALSE)), column(5, conditionalPanel("input.cp_use_ft", uiOutput("cp_ftUI"))), column(4, p("Path to the local fasttext model binary")) ), conditionalPanel("input.cp_use_ft", fluidRow( column(3), column(9, uiOutput("cp_ftDlUI")))), fluidRow( column(3, checkboxInput("cp_use_sb", "SBERT sentence embeddings", FALSE)), column(5, conditionalPanel("input.cp_use_sb", uiOutput("cp_sbUI"))), column(4, p("(GPU recommended) Name or path of the sentence-transformers model")) ), conditionalPanel("input.cp_use_python", { checkboxInput("cp_use_gpu", "GPU support (CUDA, for SBERT and BERT)", FALSE) }) )), uiOutput("cpConfirmMessage"), footer = tagList(actionButton("cpConfirm", "Create"), modalButton("Cancel")))) }) observeEvent(input$cpDatafileGo, { dataname <- input$cpDatafile dataname <- gsub("[.]feather$", "", dataname) tmpdat <- try(arrow::read_feather( paste0(gsub("/+$", "/", paste0(input$cpDatadir, "/")), dataname, ".feather"))) if (inherits(tmpdat, "try-error")) { values$cpDataMessage <- paste0( "Data import error: ", as.character(tmpdat)) values$cpNewData <- NULL return(NULL) } if (!is.data.frame(tmpdat)) { values$cpDataMessage <- "Data import error: object not a data.frame" values$cpNewData <- NULL return(NULL) } if (ncol(tmpdat) < 2) { values$cpDataMessage <- "Data import error: only one column detected" values$cpNewData <- NULL return(NULL) } values$cpDataMessage <- paste0( "Data import success: ", nrow(tmpdat), " rows, ", ncol(tmpdat), " cols.") values$cpNewData <- tmpdat values$cpNewDataname <- dataname }) ## Data import mechanism observeEvent(input$cpNDfile, { req(input$cpNDfile) if(input$cpNDfiletype == "csv_txt") { res <- try(read.csv( input$cpNDfile$datapath, header = as.logical(input$cpCSVheader), sep = input$cpCSVsep, quote = input$cpCSVquote, dec = input$cpCSVdec, stringsAsFactors = FALSE, encoding = input$cpCSVencoding, check.names = TRUE)) } else if(input$cpNDfiletype == "feather") { res <- try(arrow::read_feather(input$cpNDfile$datapath)) } else if(input$cpNDfiletype == "parquet") { res <- try(arrow::read_parquet(input$cpNDfile$datapath)) } else if (input$cpNDfiletype == "ods") { range <- input$cpODSrange if (range == "") range <- NULL res <- try(readODS::read_ods( path = input$cpNDfile$datapath, sheet = input$cpODSsheet, col_names = input$cpODScolnames, na = input$cpODSna, skip = input$cpODSskip, range = input$cpODSrange)) } else if(input$cpNDfiletype == "excel"){ column_names <- as.logical(input$cpXLScolnames) trim_spaces <- as.logical(input$cpXLStrim) the.range <- NULL if(input$cpXLSrange == TRUE & input$cpXLSrange_specs != "") the.range <- input$cpXLSrange_specs the.sheet <- NULL if(input$cpXLSsheet == TRUE & input$cpXLSsheet_specs != "") the.sheet <- input$cpXLSsheet_specs res <- try(data.frame(readxl::read_excel( dataFile$datapath, col_names = column_names, range = the.range, sheet= the.sheet, trim_ws = trim_spaces, skip = rows_to_skip))) } else if(input$cpNDfiletype == "spss"){ res <- try(data.frame(haven::read_spss( file = input$cpNDfile$datapath, skip= input$cpSPSSskip, user_na = input$cpSPSSna))) if (!inherits(res, "try-error")) { res <- data.frame(lapply(res, function(x) { attr(x, "format.spss") <- NULL if ("haven_labelled" %in% class(x)) x <- haven::as_factor(x, levels= "labels") x })) } } else if(input$cpNDfiletype == "sas_data") { res <- try(data.frame(haven::read_sas( data_file = input$cpNDfile$datapath, catalog_file = input$cpSAScatalog_file$datapath, skip = input$cpSASskip))) } else if(input$cpNDfiletype == "stata") { res <- try(data.frame(foreign::read.dta( file = input$cpNDfile$datapath, convert.factors = input$cpSTATAfactors))) } if(inherits(res, "try-error")) { values$cpDataMessage <- paste0("Data import error: ", res) values$cpNewData <- NULL return(NULL) } if (ncol(res) < 2) { values$cpDataMessage <- "Data import error: only one column detected" values$cpNewData <- NULL return(NULL) } values$cpDataMessage <- paste0( "Data import success: ", nrow(res), " rows, ", ncol(res), " columns.") values$cpNewData <- res }) ## New project confirm observeEvent(input$cpConfirm, { if (verbose) cat("DEBUG cpConfirm\n") newconf <- list() tmpdir <- gsub("/+$", "/", paste0(input$cpDatadir, "/")) if (!dir.exists(tmpdir)) dir.create(tmpdir) newconf$datadir <- tmpdir tmpname <- cleanFileName(input$cpName) if (!nchar(tmpname)) { values$cpConfirmMessage <- "Project name is required" return(NULL) } if (dir.exists(paste0(newconf$datadir, tmpname))) { values$cpConfirmMessage <- paste0( "A project with the same name (", tmpname, ") already exists in the data directory.") return(NULL) } newconf$projectid <- tmpname if (!is.data.frame(values$cpNewData)) { values$cpConfirmMessage <- "Import data first" return(NULL) } if (input$cpDoImport) { newconf$dataname <- newconf$projectid } else newconf$dataname <- values$cpNewDataname if (any(is.na(values$cpNewData[[input$cpIdcol]]))) { values$cpConfirmMessage <- "ID variable contains missing values" return(NULL) } if (any(duplicated(values$cpNewData[[input$cpIdcol]]))) { values$cpConfirmMessage <- "ID variable contains duplicates" return(NULL) } newconf$idcol <- input$cpIdcol if (!length(input$cpTextcol)) { values$cpConfirmMessage <- "Select at least one text variable" return(NULL) } newconf$textcols <- input$cpTextcol if (input$cpCommcol == "none") { newconf$commcol <- paste0("comm_", newconf$projectid) } else newconf$commcol <- input$cpCommcol if (!length(input$cpTagcol)) { newconf$tagcols <- "scheme0" } else newconf$tagcols <- input$cpTagcol newconf$contextcols <- input$cpContextcol newconf$dataNrows <- input$cpNrows newconf$dataSkipRows <- input$cpNskip if (input$cp_use_python) { pytest <- try(system(paste(input$cp_which_python, "--version"), intern = TRUE)) if (inherits(pytest, "try-error")) { values$cpConfirmMessage <- paste( "Python path `", input$cp_which_python, "` not valid") return(NULL) } } newconf$use_python <- input$cp_use_python newconf$python <- input$cp_which_python newconf$use_gpu <- input$cp_use_gpu if (input$cp_use_spacy) { sptest <- system(paste0( newconf$python, " -m spacy info ", input$cp_use_spacy_model), intern = TRUE) if (length(sptest) == 0) { values$cpConfirmMessage <- paste( "Error loading spacy, check that it is installed in the specified python env") return(NULL) } if (length(attr(sptest, "status"))) { values$cpConfirmMessage <- paste( "Error loading spacy model, check that it has been downloaded") return(NULL) } } newconf$use_spacy <- input$cp_use_spacy newconf$use_spacy_model <- input$cp_use_spacy_model if (input$cp_use_ft) if (!file.exists(input$cp_use_ft_model)) { values$cpConfirmMessage <- paste( "Error loading fasttext model, check the specified path") return(NULL) } newconf$use_ft <- input$cp_use_ft newconf$use_ft_model <- input$cp_use_ft_model newconf$use_sb <- input$cp_use_sb newconf$use_sb_model <- input$cp_use_sb_model writeLines(RJSONIO::toJSON(newconf), paste0("tigger_", newconf$projectid, ".json")) values$confnames <- c(newconf$projectid, values$confnames) values$conf <- newconf ## On new data import, write a feather file with the name of the project if (input$cpDoImport) arrow::write_feather(values$cpNewData, paste0( newconf$datadir, newconf$dataname, ".feather" )) removeModal() }) ############################################################################## ## Scheme management ############################################################################## ## Print active scheme in scheme management panel output$printScheme <- renderUI(h4(input$selectScheme)) ## Create new scheme on request, with modal dialog observeEvent(input$createScheme, { if (verbose) cat("DEBUG create scheme\n") showModal(modalDialog( title = "Create tagging scheme", p(paste0("This will create a new scheme for this project")), textInput("newScheme", label = NULL, placeholder = "Required: new scheme name"), textAreaInput("newSchemeDescr", label = NULL, placeholder = "Optional: scheme description"), checkboxInput("schemeDuplicate", "Duplicate existing scheme:"), conditionalPanel( "input.schemeDuplicate", selectInput("schemeDuplicateFrom", NULL, rev(values$conf$tagcols))), footer = tagList(actionButton("createSchemeConfirm", "Create"), modalButton("Cancel")))) }) observeEvent(input$createSchemeConfirm, { if (is.null(ok.data())) return(NULL) if (verbose) cat("DEBUG create scheme event\n") ## Make sure new scheme name is a correct file name if (!grepl("\\S", input$newScheme)) { showNotification(type = "error", "Cannot create scheme: name empty.") return(NULL) } ## Check that new scheme name doesn't exist ## Re-read tagcols from conf file, to prevent scheme disappearances tmpname <- cleanFileName(input$newScheme) tagpath <- paste0(values$projectdir, tmpname, ".csv") confpath <- paste0("tigger_", values$conf$projectid, ".json") if (verbose) cat("DEBUG create scheme check json:", confpath,"\n") tmp_tagcols <- RJSONIO::fromJSON(confpath)$tagcols if (verbose) cat("DEBUG create scheme json OK\n") if (tmpname %in% tmp_tagcols | file.exists(tagpath)) { showNotification(type = "error", "Cannot create scheme: name already exists.") return(NULL) } ## Create new scheme description if (nchar(input$newSchemeDescr)) writeLines(input$newSchemeDescr, paste0(values$projectdir, tmpname, ".txt")) ## Update conf file and reactives values$conf$tagcols <- c(tmp_tagcols, tmpname) # values$confs[[values$conf$projectid]]$tagcols <- c(tmp_tagcols, tmpname) writeLines(RJSONIO::toJSON(values$conf), confpath) if (input$schemeDuplicate) { tmpmat <- matrix(values$tagmat[, input$schemeDuplicateFrom]) tmpmathist <- matrix(values$tagmathist[, input$schemeDuplicateFrom]) } else { tmpmat <- matrix(rep(NA, nrow(ok.data()))) tmpmathist <- matrix(rep(NA, nrow(ok.data()))) } colnames(tmpmat) <- tmpname colnames(tmpmathist) <- tmpname values$tagmat <- cbind(values$tagmat, tmpmat) values$tagmathist <- cbind(values$tagmat, tmpmathist) ## Create new tagging file if duplicating if (input$schemeDuplicate) { export <- data.frame(ok.data()[, values$conf$idcol], values$tagmat[, tmpname]) colnames(export) <- c(values$conf$idcol, tmpname) export <- na.omit(export) write.table(export, tagpath, sep = ",", qmethod = "double", fileEncoding = "UTF-8", row.names = FALSE, append = FALSE, col.names = TRUE) } cat("DEBUG create scheme exit\n") removeModal() }) ## Button action: delete scheme observeEvent(input$schemeDelete, { if (verbose) cat("DEBUG delete scheme\n") if (length(values$conf$tagcols) < 2) { showNotification(type = "error", "Cannot delete: at least one scheme must remain.") return(NULL) } showModal(modalDialog( title = "Delete tagging scheme", paste0("Confirm delete '", input$selectScheme, "': ", sum(!is.na(values$label)), " tags will be deleted", ", along with scheme description."), footer = tagList(actionButton("schemeDeleteConfirm", "Delete"), modalButton("Cancel")))) }) observeEvent(input$schemeDeleteConfirm, { file.remove(paste0(values$projectdir, input$selectScheme, ".csv")) file.remove(paste0(values$projectdir, input$selectScheme, ".txt")) file.remove(paste0(values$projectdir, input$selectScheme, "_regex.json")) values$conf$tagcols <- values$conf$tagcols[values$conf$tagcols != input$selectScheme] writeLines(RJSONIO::toJSON(values$conf), paste0("tigger_", input$selectProject, ".json")) removeModal() }) observeEvent(input$schemeDescrSave, { req(input$selectScheme) writeLines(input$schemeDescr, paste0(values$conf$datadir, values$conf$projectid, "/", input$selectScheme, ".txt")) }) ############################################################################## ############################################################################## ## Tag selection : populate observe({ values$conf$tagcols updateSelectInput( # session, "selectScheme", choices = c(rev(values$conf$tagcols), "")) session, "selectScheme", choices = rev(values$conf$tagcols)) }) ## Populate oracle buttons output$oracleButtons <- renderUI({ if (is.null(ok.data())) return(NULL) # if (is.null(values$label)) return(NULL) if (is.null(values$uniqueLabels)) return(NULL) if (verbose) cat("DEBUG populate oracle buttons\n") do.call(fluidRow, lapply(values$uniqueLabels, function(ilab) actionButton(paste0("oracle_", label2hash(ilab)), strong(ilab)))) }) ## Populate maxprob variable selector observe({ updateSelectInput(session, "maxprobWhich", choices = values$uniqueLabels) updateSelectInput(session, "taggedWhich", choices = c("All" = "all", values$uniqueLabels)) }) ## Data panel message observeEvent(ok.data(), { if (verbose) cat("DEBUG data panel message\n") if (is.null(ok.data())) { output$dataMessage <- renderUI(p("Importing data...")) } output$dataMessage <- NULL }) ## On config change, update dataView and retag options observeEvent(list(values$conf), { if (is.null(values$conf)) return(NULL) output$panelData <- renderUI(list( DT::dataTableOutput("dataView") )) output$panelRetag <- renderUI(c( list( br(), h4("Rename tags"), fluidRow( column(4, strong("Current")), column(2, strong("Count")), column(6, strong("New")) ), br()), lapply(values$uniqueLabels, function(iLab) { iHash <- label2hash(iLab) fluidRow( column(4, p(iLab)), column(2, p(sum(values$label == iLab, na.rm = TRUE))), column(4, textInput( paste0("newtag_", iHash), NULL, placeholder = "(new name, leave empty to untag)")), column(2, actionButton(paste0("retag_", iHash), "Rename"))) }) )) }) output$panelExtraRegex <- renderUI(list( p(strong("Regex predictors")), fluidRow( column(8, textInput("extraRegexText", NULL, placeholder = "(new regex)", width = "100%")), column(4, actionButton( "extraRegexSearch", strong("+"), width = "100%")) ), fluidRow( style="overflow-y:scroll; max-height: 25vh", fluidRow( column(2, strong("")), column(1, strong("Use")), column(1, strong("Case")), column(1, p(strong("N"))), column(7, p(strong("Regex"))) ), lapply(values$extraRegex, function(iRegex) { iHash <- label2hash(iRegex) tmp_cased <- values[[paste0("caseregex_", iHash)]] if (is.null(tmp_cased)) tmp_cased <- FALSE fluidRow( column(2, actionButton(paste0("delregex_", iHash), "🗑️", width="100%")), column(1, checkboxInput( paste0("useregex_", iHash), NULL, value = isolate(!isFALSE(values[[paste0("useregex_", iHash)]])))), column(1, checkboxInput( paste0("caseregex_", iHash), NULL, value = isolate(isTRUE(values[[paste0("caseregex_", iHash)]])))), column(1, br(), p(sum( stringi::stri_detect_regex( ok.nicetext(), iRegex, opts_regex = list(case_insensitive = !tmp_cased)), na.rm = TRUE))), column(7, br(), p(iRegex)) ) }) ) )) ## Process new regex observeEvent(input$extraRegexSearch, { if (is.null(ok.nicetext())) return(NULL) req(nchar(input$extraRegexText) > 0) if (input$extraRegexText %in% values$extraRegex) return(NULL) try_regex <- try(stringi::stri_detect_regex( ok.nicetext(), input$extraRegexText, opts_regex = list(case_insensitive = TRUE))) if (inherits(try_regex, "try-error")) return(NULL) new_regex <- input$extraRegexText new_hash <- label2hash(new_regex) values$extraRegex <- c(new_regex, values$extraRegex) names(values$extraRegex)[1] <- new_hash if (verbose) cat("DEBUG regex search", input$extraRegexText, "\n") }) ## Regex predictors: populate delete and use events on first creation useregex_trigger <- reactiveVal(0) delregex_trigger <- reactiveVal(0) caseregex_trigger <- reactiveVal(0) observeEvent(list(values$extraRegex), { if (!length(values$extraRegex)) return(NULL) if (any(! values$extraRegex %in% values$extraRegexHist)) { if (verbose) cat("DEBUG regex predictor names updates\n") tmpregex <- values$extraRegex[! values$extraRegex %in% values$extraRegexHist] values$extraRegexHist <- c(tmpregex, values$extraRegexHist) c(lapply(1:length(tmpregex), function(iregex) { observeEvent( input[[paste0("delregex_", names(tmpregex)[iregex])]], { if (verbose) cat("DEBUG regex delete", names(tmpregex)[iregex], "\n") values$extraRegex <- values$extraRegex[values$extraRegex != tmpregex[iregex]] delregex_trigger(delregex_trigger() + 1) }) }), lapply(1:length(tmpregex), function(iregex) { tmpname <- paste0("useregex_", names(tmpregex)[iregex]) observeEvent( input[[tmpname]], { values[[tmpname]] <- input[[tmpname]] useregex_trigger(useregex_trigger() + 1) }) }), lapply(1:length(tmpregex), function(iregex) { tmpname <- paste0("caseregex_", names(tmpregex)[iregex]) observeEvent( input[[tmpname]], { values[[tmpname]] <- input[[tmpname]] caseregex_trigger(caseregex_trigger() + 1) }) }) ) } }) observeEvent(list(delregex_trigger(), useregex_trigger(), caseregex_trigger()), { req(ok.data(), input$selectScheme) regex_all <- values$extraRegex values$regex_inuse <- sapply(names(regex_all), function(iname) values[[paste0("useregex_", iname)]]) values$regex_case <- sapply(names(regex_all), function(iname) values[[paste0("caseregex_", iname)]]) if (verbose) if (any(values$regex_inuse)) cat("DEBUG regex in use", paste0(collapse = " ", regex_all[values$regex_inuse]), "\n") writeLines( RJSONIO::toJSON(lapply(FUN = unname, list( regex = regex_all, use = values$regex_inuse, case = values$regex_case))), paste0(values$projectdir, input$selectScheme, "_regex.json")) }) output$dataView <- DT::renderDataTable({ # if (is.null(ok.text()) | ok.data.running()) return(NULL) req(ok.text()) if (verbose) cat("DEBUG enter dataView\n") if (verbose) cat("DEBUG render dataView\n") DT::datatable( rownames = F, data.frame( row = 1:nrow(ok.data()), id = ok.data()[, values$conf$idcol], text = ok.text()), options = list( pageLength = 3, columnDefs = list(list( targets = 2, render = DT::JS( "function(data, type, row, meta) {", "return type === 'display' && data.length > 200 ?", "'' + data.substr(0, 200) + '...' : data;", "}") )) ), callback = DT::JS('table.page(0).draw(false);') ) }) ## Triggers for mass rename, created with each new label, with confirm observeEvent(values$uniqueLabelsAll, { if (is.null(values$uniqueLabelsAll)) return(NULL) notthere <- unique(na.omit( values$uniqueLabelsAll[! values$uniqueLabelsAll %in% values$retagLabels])) lapply( notthere, function(iTag) { iHash <- label2hash(iTag) observeEvent(input[[paste0("retag_", iHash)]], { tmp_oldtag <- iTag tmp_retagged <- which(values$label == tmp_oldtag) tmp_newtag <- gsub( "^\\s+|\\s+$", "", input[[paste0("newtag_", iHash)]]) showModal(modalDialog( title = "Mass retag", paste0("Confirm retag: ", length(tmp_retagged), " ", tmp_oldtag, " tags will be ", ifelse(nchar(tmp_newtag), paste("renamed to", tmp_newtag) , "deleted")), footer = tagList(actionButton(paste0("retag_confirm_", iTag), "Retag"), modalButton("Cancel")))) }) observeEvent(input[[paste0("retag_confirm_", iTag)]], { tmp_oldtag <- iTag tmp_retagged <- which(values$label == tmp_oldtag) tmp_newtag <- gsub( "^\\s+|\\s+$", "", input[[paste0("newtag_", iHash)]]) if (!nchar(tmp_newtag)) tmp_newtag <- NA values$tagTrigger <- list( id = ok.data()[[values$conf$idcol]][tmp_retagged], tag = rep(tmp_newtag, length(tmp_retagged))) values$queries <- c(values$queries, tmp_retagged) removeModal() }) }) values$retagLabels <- c(values$retagLabels, notthere) }) ############################################################################## ## UI rendering of Panel Tagging / Visualization ############################################################################## #################### ## Tagging left panel # ## Trigger from Oops # observeEvent(input$oops, { # if (is.null(ok.data())) # return(NULL) # if (verbose) # cat("DEBUG enter Oops\n") # if (!is.null(values$retagged)) if (values$queries[length(values$queries)] == values$retagged[length(values$retagged)]) # values$retagged <- values$retagged[-length(values$retagged)] # values$label[values$queries[length(values$queries)]] <- NA # if (ok.commentCol() != "(None)") # values$comment[values$queries[length(values$queries)]] <- ok.data()[values$queries[length(values$queries)], ok.commentCol()] # values$queries <- values$queries[-length(values$queries)] # ## tagTrigger ? # # ready2tag(0) # queryNext(queryNext() + 1) # }) ## Tagging triggers for Oracle buttons, created with each new label observeEvent(values$uniqueLabelsAll, { if (is.null(values$label)) return(NULL) if (is.null(values$uniqueLabelsAll)) return(NULL) if (all(is.na(values$label))) return(NULL) notthere <- unique(na.omit( values$uniqueLabelsAll[! values$uniqueLabelsAll %in% values$oracleLabels] )) if (!length(notthere)) return(NULL) if (verbose) cat("DEBUG triggers for oracle buttons\n") lapply( notthere, function(iLab) { observeEvent( input[[paste0("oracle_", label2hash(iLab))]], { if (is.null(ok.text())) return(NULL) if (is.null(values$newQuery)) return(NULL) if (!ready2tag()) {cat("DEBUG not ready to tag in Button\n"); return(NULL)} # ready2tag(0) # if (input$allowRetag | sum(is.na(values$label)) == 0) if (!is.na(values$label[values$lastQuery])) values$retagged <- c(values$retagged, values$lastQuery) # values$label[values$lastQuery] <- iLab values$comment[values$lastQuery] <- input$currentComment # values$queries <- c(values$queries, values$lastQuery) values$tagTrigger <- list( id = ok.data()[values$lastQuery, values$conf$idcol], tag = iLab, comment = input$currentComment) }) }) values$oracleLabels <- c(values$oracleLabels, notthere) }) ## Trigger from Oracle Create value button observeEvent(input$currentAction, { if (is.null(ok.data())) return(NULL) if (is.null(values$newQuery)) return(NULL) if (verbose) cat("DEBUG enter create value\n") # if (input$allowRetag | sum(is.na(values$label)) == 0) if (!is.na(values$label[values$lastQuery])) values$retagged <- c(values$retagged, values$lastQuery) if (input$newLab == "") { return(NULL) } else { # values$label[values$lastQuery] <- input$newLab updateTextInput(session, "newLab", value= "") } values$comment[values$lastQuery] <- input$currentComment # values$queries <- c(values$queries, values$lastQuery) values$tagTrigger <- list( id = ok.data()[values$lastQuery, values$conf$idcol], tag = input$newLab, comment = input$currentComment) }) ## Trigger from Oracle Confirm button observeEvent(input$oracleConfirm, { req(ok.data(), ok.train(), values$newQuery) # if (is.null(ok.data())) return(NULL) # if (is.null(values$newQuery)) return(NULL) if (!ready2tag()) {cat("DEBUG not ready to tag in OC\n"); return(NULL)} # ready2tag(0) if (!ok.train.possible()) return(NULL) if (verbose) cat("DEBUG oracle confirm\n") # if (input$allowRetag | sum(is.na(values$label)) == 0) if (!is.na(values$label[values$lastQuery])) values$retagged <- c(values$retagged, values$lastQuery) # values$label[values$lastQuery] <- ok.predlab1()[values$lastQuery] # values$queries <- c(values$queries, values$lastQuery) values$comment[values$lastQuery] <- input$currentComment values$tagTrigger <- list( id = ok.data()[values$lastQuery, values$conf$idcol], tag = ok.predlab1()[values$lastQuery], comment = input$currentComment) }) ## Trigger for CV glm (async) glmCV_nclicks <- reactiveVal(0) observeEvent(input$glmCV, { if (is.null(ok.predictor())) return(NULL) if (glmCV_nclicks() != 0) { showNotification("LASSO CV already running") return(NULL) } glmCV_nclicks(1) glmCV_future <- reactiveVal() tmpvalues <- values$label tmppred <- ok.predictor() for (ival in na.omit(unique(tmpvalues))) if (sum(tmpvalues == ival, na.rm = T) <= 8) tmpvalues[which(tmpvalues == ival)] <- NA glmCV_async <- future(seed = TRUE, { glmnet::cv.glmnet( x= tmppred[!is.na(tmpvalues), ], y= as.factor(tmpvalues[!is.na(tmpvalues)]), family= "multinomial") }) %...>% glmCV_future() glmCV_async <- catch(glmCV_async, function(e) { showNotification(type = "error", paste0("LASSO CV error: ", e)) }) glmCV_async <- finally(glmCV_async, function() { glmCV_nclicks(0) if (is.null(glmCV_future())) return(NULL) updateNumericInput(session, "glmLambda", value = glmCV_future()$lambda.min) }) NULL }) ## Trigger for CV liblinear (async) liblinCV_nclicks <- reactiveVal(0) observeEvent(input$liblinCV, { if (is.null(ok.predictor())) return(NULL) if (liblinCV_nclicks() != 0) { showNotification("LibLineaR CV already running") return(NULL) } liblinCV_nclicks(1) liblinCV_future <- reactiveVal() tmpvalues <- values$label tmppred <- ok.predictor() for (ival in na.omit(unique(tmpvalues))) if (sum(tmpvalues == ival, na.rm = T) <= 3) tmpvalues[which(tmpvalues == ival)] <- NA liblinCV_async <- future(seed = TRUE, { LiblineaR::LiblineaR( data= tmppred[!is.na(tmpvalues), ], target= as.factor(tmpvalues[!is.na(tmpvalues)]), type= 0, findC= T) }) %...>% liblinCV_future() liblinCV_async <- catch(liblinCV_async, function(e) { showNotification(type = "error", paste0("LibLineaR CV error: ", e)) }) liblinCV_async <- finally(liblinCV_async, function() { liblinCV_nclicks(0) if (is.null(liblinCV_future())) return(NULL) updateNumericInput(session, "liblinCost", value= liblinCV_future()) }) NULL }) output$currentContext <- renderUI({ if (verbose) cat("DEBUG 1rentContext\n") if ((! is.null(ok.text())) & (length(ok.contextCol()) > 0)) HTML(paste(paste(ok.contextCol(), as.character(ok.data()[values$newQuery, ok.contextCol()]), sep = " : "), collapse= "

    ")) }) output$makeOracleConfirm <- renderUI({ if (verbose) cat("DEBUG update printPredicted\n") if (is.null(ok.pred())) { tmp <- "No prediction" } else if (is.null(values$newQuery)) { tmp <- "No query" } else { tmp <- paste0( ifelse(is.na(values$label[values$newQuery]), "", "Pred: "), ok.predlab1()[values$newQuery], " (", round(max(ok.pred()[values$newQuery, ]), 3), ")") } res <- list(actionButton("oracleConfirm", label= paste("🤖", tmp), width = "100%")) if (!is.null(values$label) & !is.null(values$newQuery)) { if (!is.na(values$label[values$newQuery])) res <- list(res[[1]], HTML(paste0( "
    Current: ", values$label[values$newQuery], "
    "))) } res }) ## Show training diagnostics on new model train output$trainDiagno <- renderText(paste0(ok.msgtagged(), "\n", values$modelDiagno)) ## Diagnostics event observeEvent(diagTrigger(), { if (verbose) cat("DEBUG update modelDiagno\n") if (is.null(ok.data())) { values$modelDiagno <- "No model" return(NULL) } if (!ok.train.possible()) { values$modelDiagno <- "No model (not enough labelled data)." return(NULL) } if (input$predModel == "random forest") { tmp <- paste0( "Train Accuracy: ", round(100 * mean(ok.predlab1()[!is.na(values$label)] == values$label[!is.na(values$label)]), 1), "% ; Wtd. F1: ", sprintf("%f", round(wtdF1(values$label[!is.na(values$label)], ok.predlab1()[!is.na(values$label)]), 3)), "%\nOOB Accuracy : ", round(100 * mean( colnames(ok.train()$predictions)[apply(ok.train()$predictions, 1, which.max)] == values$label[!is.na(values$label)]), 1), "% ; Wtd. F1: ", sprintf("%.3f", wtdF1( values$label[!is.na(values$label)], colnames(ok.train()$predictions)[apply(ok.train()$predictions, 1, which.max)])) ) } else if (input$predModel %in% c("lasso", "linear", "naive bayes", "knn")) { tmp <- paste0( "Train Accuracy: ", round(100 * mean(ok.predlab1()[!is.na(values$label)] == values$label[!is.na(values$label)]), 1), "% ; Wtd. F1: ", sprintf("%.3f", wtdF1( values$label[!is.na(values$label)], ok.predlab1()[!is.na(values$label)]))) } values$modelDiagno <- tmp }) ## Update comment input field observeEvent(values$newQuery, { if (is.null(ok.text())) return(NULL) if (verbose) cat("DEBUG update comment field\n") updateTextInput(session, "currentComment", value = values$comment[values$newQuery]) }) ## Save event: tag + comment in dedicated csv observeEvent(values$tagTrigger, { if (is.null(values$tagTrigger)) return(NULL) if (verbose) cat("DEBUG enter event tagTrigger", length(values$tagTrigger$id), "\n") ready2tag(0) # Prevent new tagging until next query dapath <- paste0(values$projectdir, ok.labelcol(), ".csv") darows <- match(values$tagTrigger$id, ok.data()[, values$conf$idcol]) export <- data.frame(values$tagTrigger$id, values$tagTrigger$tag) colnames(export) <- c(values$conf$idcol, ok.labelcol()) if (verbose) cat("DEBUG tagTrigger tag", paste(export[, 1], export[, 2], collapse = " ; "), "\n") write.table(export, dapath, sep = ",", qmethod = "double", fileEncoding = "UTF-8", row.names = F, append = ifelse(file.exists(dapath), T, F), col.names = ifelse(file.exists(dapath), F, T)) if (any(nchar(values$tagTrigger$comment))) { tmp_filter <- which(nchar(values$tagTrigger$comment) > 0) dapath <- paste0(values$projectdir, values$conf$commcol, ".csv") export <- data.frame(values$tagTrigger$id[tmp_filter], values$tagTrigger$comment[tmp_filter]) colnames(export) <- c(values$conf$idcol, values$conf$commcol) if (verbose) cat("DEBUG tagTrigger comment", export[1,1], export[1, 2]) write.table(export, dapath, sep = ",", qmethod = "double", fileEncoding = "UTF-8", row.names = F, append = ifelse(file.exists(dapath), T, F), col.names = ifelse(file.exists(dapath), F, T)) } ## Update label hist values$labelhist[match(values$tagTrigger$id, ok.data()[, values$conf$idcol])] <- max(values$labelhist, na.rm = T) + 1 ## Update wasTagged and wasRetagged itag <- match(values$tagTrigger$id, ok.data()[, values$conf$idcol]) if (input$sampleChoice == "untagged") { values$wasTagged[itag] <- TRUE } else if (input$sampleChoice == "tagged") { values$wasRetagged[itag] <- TRUE } else if (input$sampleChoice == "all") { if (is.na(values$label[itag])) { values$wasTagged[itag] <- TRUE } else values$wasRetagged[itag] <- TRUE } ## Call queryNext event, or launch training if (trainCountdown() == 1) { trainCountdown(input$trainCountdown) trainTrigger(trainTrigger() + 1) } else if (trainCountdown() > 1) { trainCountdown(trainCountdown() - 1) queryNext(queryNext() + 1) } else { queryNext(queryNext() + 1) } ## Update values$label etc values$label[darows] <- values$tagTrigger$tag values$queries <- c(values$queries, darows) if (verbose) cat("DEBUG exit event tagTrigger", nrow(export), "rows\n") }) #################### ## Tagging main panel output$textVisuCols <- renderUI({ if (verbose) cat("DEBUG render text/visu panel\n") leftsize <- 6 if (!input$panelText) leftsize <- 1 if (input$panelText & !input$panelVisu) leftsize <- 11 fluidRow( column( leftsize, conditionalPanel( "input.panelText", wellPanel(htmlOutput("currentText")) ) ), column( 12 - leftsize, conditionalPanel( "input.panelVisu", fluidRow( column(2, HTML(paste0('
    ', actionButton("visuCompute", "🎲"), '
    '))), column(2, HTML(paste0('
    ', actionButton("visuGoZoom", "🔍±"), '
    '))), column(2, HTML(paste0('
    ', checkboxInput("visuLock", "🔒", value = FALSE), '
    '))), column(2, HTML(paste0('
    ', checkboxInput("visuOptions", "🔧"), '
    '))), column(3, HTML(paste0('
    ', numericInput("visuHeight", NULL, 400, 20, 2e12, 10), '
    '))) ), conditionalPanel( 'input.visuOptions', flowLayout( selectInput( "visuAlgo", "Visu. method", c("umap", "t-sne"), "umap", width = "100%"), selectizeInput( "visuSource", "Visu. source", c("Word counts (DFM)" = "dfm", "Word embeddings (FastText)" = "ft", "Sentence embeddings (SBERT)" = "sb", "BERT predictions" = "bertpred", "Regex" = "regex"), c("sb", "ft"), multiple = TRUE) ) ), conditionalPanel( # umap options 'input.visuAlgo == "umap" & input.visuOptions', flowLayout( numericInput( "umapNeighb", "Nb. Neighbors", 15, 1, 500, 1), numericInput( "umapSpread", "Spread", 1, 0, 16, 1e-2), numericInput( "umapMindist", "Min dist.", .001, 0, 16, 1e-3), selectInput( "umapMetric", "Metric", selected = "cosine", c("euclidean", "cosine", "manhattan", "hamming", "correlation", "categorical") ) ) ), conditionalPanel( # t-SNE options 'input.visuAlgo == "t-sne" & input.visuOptions', flowLayout( numericInput( "tsnePerplex", "Perplexity", 15, 1, 500, 1), numericInput( "tsneTheta", "Theta", .5, 0, 1, 1e-2), numericInput( "tsnePcaDim", "PCA dims", 50, 1, 100, 1) ) ), uiOutput("visuMsg"), plotOutput( "visuPlot", width = "100%", dblclick = "visuDblclick", brush = brushOpts("visuBrush", resetOnNew = T)) ) ) ) }) ################## ## Subpanel "Text" ################## output$currentText <- renderUI({ if (verbose) cat("DEBUG render currentText\n") if (is.null(ok.data()) | is.null(ok.nicetext())) return(HTML("Import data to start training.")) if (input$strategy %in% c("entropy", "maxprob") & is.null(ok.train())) return(HTML("No model: train one, or try 'random' or 'sequential' strategy")) if (is.null(values$newQuery)) return(HTML("No query")) bastext <- ok.nicetext()[values$newQuery] if (nchar(input$regexFilter) > 0) bastext <- gsub(paste0("(", isolate(input$regexFilter), ")"), "\\1", bastext, ignore.case = isolate(!input$regexCaseSens)) baslen <- nchar(bastext) if (!length(baslen)) { if (nchar(input$regexFilter) > 0) return(HTML("Regex filter: no match")) return(HTML("No text matches the filters")) } if (is.na(baslen)) return(HTML("No query")) if (baslen > 1200) { bastext <- paste0(substring(bastext, 1, 1200), '', substring(bastext, 1201), "") } bastext <- gsub("\n", "
    ", bastext) HTML(bastext) }) ############################################################################## ## Panel "Visualize" ############################################################################## output$visuMsg <- renderUI({ if (!is.null(values$visuMsg)) # strong(values$visuMsg) h5(values$visuMsg, style = "text-align:center") }) ## Compute visualization points (async) visu_nclicks <- reactiveVal(0) ok.visu <- reactiveVal() observe({ if (!input$panelVisu) values$visuGo <- FALSE if (input$panelVisu & !values$visuGo) values$visuGo <- TRUE }) observeEvent(list(values$visuGo, input$visuCompute), { if (is.null(input$visuSource)) return(NULL) if (!values$visuGo) return(NULL) if (is.null(ok.data())) { values$visuMsg <- "No data imported." return(NULL) } if (visu_nclicks() > 0) { values$visuMsg <- "Already computing visualization" return(NULL) } visu_nclicks(1) values$visuMsg <- "Computing visualization..." ## Prepare options and objects for async computation dasources <- input$visuSource daalgo <- input$visuAlgo if (verbose) cat("DEBUG compute visu embedding: ", daalgo, paste(dasources, collapse = " "), "\n") visudat <- NULL if ("dfm" %in% dasources) dadfm <- ok.dfm() if ("ft" %in% dasources) { visudat <- ok.data()[, values$cols_ft] } if ("sb" %in% dasources) { if (is.null(visudat)) { visudat <- ok.data()[, values$cols_sb] } else visudat <- cbind(visudat, ok.data()[, values$cols_sb]) } if ("bertpred" %in% dasources) { if (is.null(visudat)) { visudat <- ok.data()[, do.call(c, values$cols_bertpred)] } else visudat <- cbind(visudat, ok.data()[, do.call(c, values$cols_bertpred)]) } if ("regex" %in% dasources) { if (is.null(visudat)) { visudat <- ok.extraRegex.matrix() } else visudat <- cbind(visudat, ok.extraRegex.matrix()) } if (daalgo == "umap") { visu_args <- list( n_neighbors = input$umapNeighb, metric = input$umapMetric, spread = input$umapSpread, min_dist = input$umapMindist) } else if (daalgo == "t-sne") { visu_args <- list( perplexity = input$tsnePerplex, theta = input$tsneTheta, initial_dims = input$tsnePcaDim, check_duplicates = FALSE) } visu_async <- future(seed = TRUE, { if ("dfm" %in% dasources) { ## TODO: CA treatment buggy, switch to SVD? # da_ca <- quanteda.textmodels::textmodel_ca( # quanteda::as.dfm(dadfm), nd = min(100, nrow(dadfm))) # da_ca <- (da_ca$rowcoord %*% diag(da_ca$sv))[, !is.na(da_ca$sv)] da_ca <- as.matrix(dadfm) if (is.null(visudat)) { visudat <- da_ca } else visudat <- cbind(da_ca, visudat) } if (daalgo == "umap") { do.call(uwot::umap, c(list(visudat), visu_args)) } else if (daalgo == "t-sne") { visudat_dupli <- duplicated(visudat) if (any(visudat_dupli)) visudat[visudat_dupli, ] <- jitter(visudat[visudat_dupli, ]) do.call(Rtsne::Rtsne, c(list(visudat), visu_args))$Y } }) %...>% ok.visu() visu_async <- catch(visu_async, function(error) { values$visuMsg <- paste0("Error: ", error) }) visu_async <- finally(visu_async, function() { visu_nclicks(0) if (input$visuGoZoom == 0) { values$visuMsg <- "Double-click to select, drag to select zooming zone." } else values$visuMsg <- NULL nice_choices <- c("FastText" = "ft", "SBERT" = "sb", "visu" = daalgo) names(nice_choices)[3] <- ifelse(daalgo == "t-sne", "t-SNE", "UMAP") updateSelectInput( session, "dlEmbedSelect", choices = nice_choices, selected = daalgo) if (verbose) cat("DEBUG Visu async out\n") }) NULL }) ## Compute visualization plot (async) ok.ggvisu <- reactiveVal() observe({ # if (is.null(ok.visu())) return(NULL) if (is.null(ok.visu())) { ok.ggvisu(NULL) return(NULL) } # if (!isTRUE(values$visugo)) return(NULL) if (!input$panelVisu) return(NULL) if (verbose) cat("DEBUG compute visu plot\n") ggdat <- data.frame(ok.visu(), tag = values$label) if (!is.null(ok.predlab1())) ggdat$tag[is.na(values$label)] <- ok.predlab1()[is.na(values$label)] da_predlabs <- ok.predlab1() da_tagged <- ! is.na(values$label) da_query <- values$newQuery ggvisu_async <- future({ ggres <- ggplot(ggdat, aes(X1, X2)) + theme_bw() + xlab("") + ylab("") + theme(legend.position = "bottom") + scale_shape_manual(values = rep(1:8, 40)) # TODO : adapt to tags ## Tagged points if (any(da_tagged)) { ggres <- ggres + geom_point(data = ggdat[da_tagged, ], aes(X1, X2, color = tag, shape = tag), size = 1.5, stroke = 1.5, alpha = .8) } ## Not tagged points if (!is.null(da_predlabs)) { ggres <- ggres + geom_point(data = ggdat[!da_tagged, ], aes(color = tag, shape = tag), show.legend = FALSE) } else { ggres <- ggres + geom_point(alpha = .8, data = ggdat[is.na(ggdat$tag), ]) } ## Current query if (!is.null(da_query)) { if (is.null(da_predlabs)) { ggres <- ggres + geom_point(data = ggdat[da_query, ], color = 2, stroke = 3, shape = 3, show.legend = FALSE) } else { ggres <- ggres + geom_point(data = ggdat[da_query, ], stroke = 3, shape = 3, show.legend = FALSE) } } ggres }) %...>% ok.ggvisu() ggvisu_async <- catch(ggvisu_async, function(error) { values$visuMsg <- paste0("Error in ggplot: ", error) }) }) observe({ if (verbose) cat("DEBUG update visuPlot\n") if (is.null(ok.ggvisu())) { values$visuMsg <- "No visualization yet." output$visuPlot <- NULL values$visuZoom$xlim <- NULL values$visuZoom$ylim <- NULL return(NULL) } output$visuPlot <- renderPlot(height = input$visuHeight, { # if (is.null(ok.ggvisu())) return(NULL) ok.ggvisu() + coord_fixed(xlim = values$visuZoom$xlim, ylim = values$visuZoom$ylim) }) }) observeEvent(input$visuDblclick, { if (!is.null(values$visuMsg)) values$visuMsg <- NULL tmp <- input$visuDblclick bmu <- t(ok.visu()) - c(tmp$x, tmp$y) bmu <- which.min(colSums(bmu^2)) values$lastQuery <- bmu values$newQuery <- bmu ready2tag(1) }) observeEvent(input$visuGoZoom, { if (!is.null(values$visuMsg)) values$visuMsg <- NULL values$visuZoom$xlim <- c(input$visuBrush$xmin, input$visuBrush$xmax) values$visuZoom$ylim <- c(input$visuBrush$ymin, input$visuBrush$ymax) }) ############################################################################## ## Panel History ############################################################################## ## Create history link and retag events as they appear observeEvent(input$histDTable_rows_current, { histrows <- order(values$labelhist) histrows <- rev(histrows[!is.na(values$labelhist[histrows])]) histrows <- histrows[!is.na(values$label[histrows])] histrows <- histrows[input$histDTable_rows_current] notthere <- histrows[! histrows %in% values$histQueries] if (!length(notthere)) return(NULL) lapply(notthere, function(daquery) { if (verbose) cat("DEBUG create history events", daquery, "\n") observeEvent(input[[paste0("hist", daquery)]], { if (verbose) cat("DEBUG history link", daquery, "\n") values$lastQuery <- daquery values$newQuery <- daquery updateTabsetPanel(session, "mainPanelset", selected = "Tagging") ready2tag(1) }) observeEvent(input[[paste0("histTag", daquery)]], { if (verbose) cat("DEBUG history retag", daquery, "\n") values$histSaveStack <- rbind( values$histSaveStack, data.frame(id = ok.data()[daquery, values$conf$idcol], tag = input[[paste0("histTag", daquery)]], comment = values$comment[daquery]) ) }) }) values$histQueries <- c(values$histQueries, notthere) }) ## History save action observeEvent(input$histSave, { req(is.data.frame(values$histSaveStack)) values$tagTrigger <- as.list(values$histSaveStack) values$label[match(values$histSaveStack$id, ok.data()[[values$conf$idcol]])] <- values$histSaveStack$tag values$histSaveStack <- NULL }) ## Update save name with number of unsaved changes observeEvent(length(values$histSaveStack$id), { updateActionButton( session, "histSave", label = paste0("Save changes (", length(unique(values$histSaveStack$id)),")")) }) ## History table, with clickable links and retagging output$histDTable <- DT::renderDataTable({ histrows <- order(values$labelhist) histrows <- rev(histrows[!is.na(values$labelhist[histrows])]) histrows <- histrows[!is.na(values$label[histrows])] # histrows <- rev(values$queries) hist_df <- data.frame( # history = length(histrows):1, history = values$labelhist[histrows], id = sapply(histrows, function(irow) { as.character(actionLink( paste0("hist", irow), label = ok.data()[irow, values$conf$idcol], onclick = paste0('Shiny.onInputChange(\"hist', irow, '\", this.id)'))) }), Tag = values$label[histrows], Retag = sapply(histrows, function(irow) { paste0( "" ) }), Comment = values$comment[histrows], Text = ok.nicetext()[histrows]) if (!is.null(values$conf$contextcols)) hist_df <- cbind(hist_df, ok.data()[histrows, values$conf$contextcols]) DT::datatable(hist_df, escape = FALSE, rownames = FALSE, filter = "top", options = list( columnDefs = list(list( targets = 5, render = DT::JS( "function(data, type, row, meta) {", "return type === 'display' && data.length > 200 ?", "'' + data.substr(0, 200) + '...' : data;", "}"))) )) }) ############################################################################## ## Panel "Stats" ############################################################################## output$statsTagTable <- renderTable(rownames = F, { table(tag = values$label) }) ## Model diagnostics by CV output$statsCVoutput <- reactive({ if (is.null(values$modelcv)) return("Model diagnostics will appear here") values$modelcv }) output$statsCVtable <- DT::renderDataTable({ if (is.null(values$cvtable)) return(NULL) DT::datatable(values$cvtable, list(pageLength = 25), rownames = F) }) ## Button action: compute CV (async) cv_nclicks <- reactiveVal(0) observeEvent(input$statsCVgo, { if (is.null(ok.train())) { values$modelcv <- "Error in model training" return(NULL) } if (cv_nclicks() != 0) { values$modelcv <- "CV already running" return(NULL) } cv_nclicks(cv_nclicks() + 1) predmat <- ok.predictor()[!is.na(values$label), ] predlab <- values$label[!is.na(values$label)] cvsamp <- sample(10, nrow(predmat), replace = TRUE) cvres <- rep(NA, nrow(predmat)) cvres_future <- reactiveVal() tmp_model <- input$predModel if (tmp_model == "random forest") { cv_args <- list( num.trees = input$rfNumTrees, mtry = if (input$rfMtry > 0) input$rfMtry, sample.fraction = input$rfSampleFrac, probability = FALSE) } else if (tmp_model == "linear") { cv_args <- list(type= 0, cost = input$liblinCost) } else if (tmp_model == "lasso") { cv_args <- input$glmLambda } else if (tmp_model == "naive bayes") { cv_args <- list(smooth = input$naiveSmooth, prior = input$naivePrior, distribution = input$naiveDistri) } else if (tmp_model == "knn") { cv_args <- list(k = input$knnK) } cv_status_file <- paste0(session$token, "_cv") cv_async <- future(seed = TRUE, { for (icv in 1:10) { set_status(cv_status_file, paste0("Computing CV fold ", icv, " / 10")) if (tmp_model == "random forest") { cvmodel <- do.call(ranger::ranger, c(cv_args, list( x= predmat[cvsamp != icv, ], y= as.factor(predlab[cvsamp != icv])))) cvres[cvsamp == icv] <- as.character( predict(cvmodel, data= predmat[cvsamp == icv, ])$predictions) } else { tmpvalues <- predlab[cvsamp != icv] # for (ival in na.omit(unique(tmpvalues))) # if (sum(tmpvalues == ival, na.rm = T) == 1) # tmpvalues[which(tmpvalues == ival)] <- NA if (tmp_model == "lasso") { cvmodel <- suppressWarnings( glmnet::glmnet(x= predmat[cvsamp != icv, ][!is.na(tmpvalues), ], y= as.factor(tmpvalues[!is.na(tmpvalues)]), family= "multinomial")) cvres[cvsamp == icv] <- predict( cvmodel, newx = predmat[cvsamp == icv, ], type = "class", s= cv_args) } else if (tmp_model == "linear") { cvmodel <- do.call(LiblineaR::LiblineaR, c( list(data= predmat[cvsamp != icv, ][!is.na(tmpvalues), ], target= as.factor(tmpvalues[!is.na(tmpvalues)])), cv_args)) cvres[cvsamp == icv] <- as.character( predict(cvmodel, newx= predmat[cvsamp == icv, ])$predictions) } else if (tmp_model == "naive bayes") { cvmodel <- do.call(quanteda.textmodels::textmodel_nb, c( list(x= predmat[cvsamp != icv, ][!is.na(tmpvalues), ], y= as.factor(tmpvalues[!is.na(tmpvalues)])), cv_args)) cvres[cvsamp == icv] <- as.character( predict( cvmodel, newdata= predmat[cvsamp == icv, ], type= "class")) } else if (isolate(tmp_model) == "knn") { cvres[cvsamp == icv] <- as.character(do.call(class::knn, c(cv_args, list( train = predmat[cvsamp != icv, ], test = predmat[cvsamp == icv, ], cl = predlab[cvsamp != icv])))) } } } cvres }) %...>% cvres_future() cv_async <- catch(cv_async, function(e) { cvres_future(NULL) values$modelcv <- e$message }) cv_async <- finally(cv_async, function() { cv_nclicks(0) if (file.exists(cv_status_file)) unlink(cv_status_file) if (is.null(cvres_future())) return(NULL) cvres <- cvres_future() predlab <- values$label[!is.na(values$label)] dametrics <- metricsTable(predlab, cvres) values$cvtable <- dametrics res <- "10-CV OOB evaluation:\n" # if (any(grepl("^bertpred", input$use_regressors))) if (any(grepl("^bertpred", values$new_use_regressors))) res <- paste0( res, "⚠️ Scores are biased upwards if BERT predictions were trained on current data ⚠️ \n") res <- paste0( res, "* Accuracy : ", round(100 * mean(cvres == predlab), 2), "%\n", "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n", "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1), dametrics$N.cases)), "\n" ) values$modelcv <- res }) NULL }) ## Automatic message update during CV computation observe({ if (cv_nclicks() == 0) return(NULL) invalidateLater(200) if (!file.exists(paste0(session$token, "_cv"))) return(NULL) values$modelcv <- get_status(paste0(session$token, "_cv")) values$cvtable <- NULL }) ############################################################################## ## Panel BERT ############################################################################## ########## ## Bert UI left panel ## Update list of saved models and dlSelect on init and on saveTrigger observeEvent(list(ok.data(), values$bertSaveTrigger), { damodels <- dir(pattern = "^tiggerbert_saved_") if (!length(damodels)) return(NULL) damodels <- gsub("^tiggerbert_saved_", "", damodels) values$bertSaved <- damodels updateSelectInput(session, "dlBPSelect", choices = damodels) }) ## Populate saved models list, with buttons output$bertSaved <- renderUI({ if (verbose) cat("DEBUG bert update saved list\n") HTML(paste0( "", paste0(sapply(values$bertSaved, function(dabert) { bertshort <- dabert if (nchar(bertshort) > 23) bertshort <- paste0(substr(bertshort, 1, 20), "...") paste0( "") }), collapse = "\n"), "
    ", actionLink(paste0("saved_", dabert), bertshort, title = dabert, style="margin-right: 5px;"), "", actionButton(paste0("bert_copypars_", dabert), "📋", title = "Copy parameters"), "", actionButton(paste0("bert_infer_", dabert), "🤖", title = "Infer predictions on current data"), "", actionButton(paste0("bert_delete_", dabert), "🗑", title = "Delete model"), "
    " )) }) ########## ## Bert UI main panel ## UI outputs : message and stats table and plot output$bertMsg <- reactive( ifelse(is.null(values$bertMsg), "Train new BERT, or select saved model.", values$bertMsg)) output$bertValstats <- DT::renderDataTable( DT::datatable(values$bertValstats, list(pageLength = 25))) output$bertValPlot <- renderPlot(values$bertValPlot) output$bertMsgHyperpar <- reactive(values$bertMsgHyperpar) ################################## ## Async handling of BERT training ## Status File bert_status_file <- paste0("tiggerbert_", session$token, "_trainstatus") fire_interrupt <- function(dafile) set_status(dafile, "interrupt") fire_ready <- function(dafile) set_status(dafile, "Ready") fire_running <- function(dafile, perc_complete = NULL){ if(is.null(perc_complete)) msg <- "Training..." else msg <- paste0("Training... ", perc_complete, "% complete") set_status(dafile, msg) } interrupted <- function(dafile) get_status(dafile) == "interrupt" ## Create Status File fire_ready(bert_status_file) ## Button action preparation bert_nclicks <- reactiveVal(0) bert_result_val <- reactiveVal() bert_hyperpars <- reactiveVal() ## Button appearance: write "Stop" if training is ongoing observeEvent(bert_nclicks(), { updateActionButton(session, "bertTrain", label = ifelse(bert_nclicks() > 0, "Stop", "Train BERT")) }) ## Button action: Train new BERT observeEvent(input$bertTrain,{ ## Don't do anything if analysis is already being run # if(bert_nclicks() != 0){ # values$bertMsg <- "BERT is already training" # return(NULL) # } tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token) if(bert_nclicks() != 0){ write("stop", paste0(tmpsession, "_stop")) bert_nclicks(0) return(NULL) } if (file.exists(paste0(tmpsession, "_stop"))) file.remove(paste0(tmpsession, "_stop")) ## Increment clicks and prevent concurrent analyses bert_nclicks(bert_nclicks() + 1) bert_result_val(data.frame(Status="Running...")) fire_running(bert_status_file) ## Prepare async training values$bertLoadSaved <- NULL # tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token) tmpmodel <- input$bertModel system(paste0("rm ", tmpsession, "_done")) tmpdat <- data.frame( id = ok.data()[, values$conf$idcol], text = ok.nicetext(), label = values$label ) tmpdat <- tmpdat[!is.na(tmpdat$label), ] tmpdat <- tmpdat[grepl("\\S", tmpdat$text), ] ## Filter data: min occurences, balance classes, validation split tmplabtable <- table(tmpdat$label) if (is.numeric(input$bertMinOccur)) if (input$bertMinOccur > 1) { tmpdat <- tmpdat[tmpdat$label %in% names(which(tmplabtable >= input$bertMinOccur)), ] tmplabtable <- table(tmpdat$label) } set.seed(input$bertValidSeed) if (input$bertBalance) { tmpminocc <- min(tmplabtable) tmptrain <- sample(do.call(c, lapply(unique(tmpdat$label), function(dalabel) sample(which(tmpdat$label == dalabel), round((1-input$bertValidFrac) * tmpminocc))))) tmpvalid <- sample(do.call(c, lapply(unique(tmpdat$label), function(dalabel) { dasamp <- which(tmpdat$label == dalabel) dasamp <- dasamp[! dasamp %in% tmptrain] sample(dasamp, round(input$bertValidFrac * tmpminocc)) }))) } else { tmptrain <- sample(1:nrow(tmpdat), round((1 - input$bertValidFrac) * nrow(tmpdat))) tmpvalid <- (1:nrow(tmpdat))[-tmptrain] } write.csv(tmpdat[tmptrain, ], paste0(tmpsession, "_dat_train.csv"), row.names = FALSE) write.csv(tmpdat[tmpvalid, ], paste0(tmpsession, "_dat_valid.csv"), row.names = FALSE) iwait <- 0 while (iwait < 10) { if (!file.exists(paste0(tmpsession, "_dat_valid.csv"))) Sys.sleep(500) iwait <- iwait + 1 } tmp_cmd <- paste0( values$conf$python, " gobert.py -m ", tmpmodel, " -t ", tmpsession, "_dat_train.csv", " -v ", tmpsession, "_dat_valid.csv", " -b ", input$bertBatchsize, " -g ", input$bertGradacc, " -e ", input$bertEpochs, " -l ", input$bertLrate, " -w ", input$bertWdecay, " -B ", ifelse(input$bertBest, 1, 0), " -E ", input$bertNeval, " -s ", tmpsession, " -G ", as.numeric(values$conf$use_gpu), " -A ", as.numeric(input$bertAdapt), " 2>&1 ; touch ", tmpsession, "_done" ) ## Save hyperpars bert_hyperpars(list(bertModel = input$bertModel, bertEpochs = input$bertEpochs, bertLrate = input$bertLrate, bertWdecay = input$bertWdecay, bertBatchsize = input$bertBatchsize, bertGradacc = input$bertGradacc, bertNeval = input$bertNeval, bertBest = input$bertBest, bertValidFrac = input$bertValidFrac, bertValidSeed = input$bertValidSeed, bertNtrain = length(tmptrain), bertTrainTable = as.list(table(tmpdat$label[tmptrain])), bertScheme = ok.labelcol())) ## Launch async training bert_async <- future({ if (file.exists(tmpsession)) system(paste0("rm -R ", tmpsession, "*")) # system(tmp_cmd, wait = FALSE, intern = FALSE) tmp_exec <- system(tmp_cmd, wait = FALSE, intern = TRUE) writeLines(tmp_exec, paste0(tmpsession, "_log")) ## After training, get trainer state from last checkpoint for diagnostics chkpt <- dir(paste0(tmpsession, "_train"), pattern = "^checkpoint") chkpt <- chkpt[order(as.numeric(gsub("\\D", "", chkpt)), decreasing = T)[1]] system(paste0( "cp ", tmpsession, "_train/", chkpt, "/trainer_state.json ", tmpsession, "/")) }) %...>% bert_result_val() ## Catch interrupt (or any other error) and notify user bert_async <- catch(bert_async, function(e){ bert_result_val(NULL) showNotification(e$message) values$bertMsg <- e$message }) ## After the promise has been evaluated set nclicks to 0 to allow another Run bert_async <- finally(bert_async, function() { system(paste0("rm -R ", tmpsession, "_train")) system(paste0("rm ", tmpsession, "_done")) fire_ready(bert_status_file) bert_nclicks(0) if (is.null(bert_result_val())) return(NULL) bertpreds <- try(read.csv(paste0(tmpsession, "_predval.csv"))) if (inherits(bertpreds, "try-error")) { # values$bertMsg <- "BERT error - try again" values$bertMsg <- try(paste(readLines(paste0(tmpsession, "_log")), collapse = "\n")) values$bertValstats <- NULL values$bertValPlot <- NULL values$bertLastUnsaved <- NULL values$bertMsgHyperpar <- NULL values$bertLastHyperpar <- NULL return(NULL) } datruth <- tmpdat[tmpvalid, "label"] dapred <- bertpreds$bertpred dametrics <- metricsTable(datruth, dapred) damsg <- paste0( "Unsaved model (", tmpmodel, ")\n\n", "BERT validation stats:\n", "* Accuracy : ", round(100 * mean(datruth == dapred), 2), "%\n", "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n", "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1), dametrics$N.cases)), "\n") hypermsg <- bertHyperToMsg(bert_hyperpars()) writeLines(RJSONIO::toJSON(bert_hyperpars()), paste0(tmpsession, "/tiggertrain.json")) values$bertValstats <- dametrics values$bertMsg <- damsg values$bertValPlot <- bertPlotFromPath(tmpsession) values$bertLastUnsaved <- list( metrics = dametrics, msg = damsg, session = tmpsession) values$bertMsgHyperpar <- hypermsg values$bertLastHyperpar <- bert_hyperpars() }) values$bertMsg <- "BERT is training" values$bertValstats <- NULL values$bertValPlot <- NULL values$bertLastUnsaved <- NULL values$bertMsgHyperpar <- NULL values$bertLastHyperpar <- NULL ## Return something other than the promise so shiny remains responsive NULL }) ## Automatic check of BERT training status observe({ if (bert_nclicks() == 0) return(NULL) isolate({ tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token) ## Check for checkpoint dirs checkdirs <- length(dir(paste0(tmpsession, "_train"), pattern = "^checkpoint")) # fire_running(bert_status_file, # min(100, round(100 * checkdirs / input$bertNeval, 1))) # dastate <- try(RJSONIO::fromJSON(paste0(tmpsession, "_train/trainer_state.json"))) # if (!inherits(dastate, "try-error")) # fire_running(bert_status_file, # round(100 * as.numeric(dastate$epoch) / input$bertEpochs, 1)) try({ dapoints <- dir(paste0(tmpsession, "_train"), pattern = "trainer_state[.]json$", recursive = T) dapoints_num <- as.numeric(gsub("^checkpoint-(\\d+)/.*$", "\\1", dapoints)) statefile <- paste0( tmpsession, "_train/", dapoints[which.max(dapoints_num)]) if (file.exists(statefile)) { state <- RJSONIO::fromJSON(statefile) fire_running(bert_status_file, round(100 * as.numeric(state$epoch) / input$bertEpochs, 1)) } }) }) if (checkdirs < isolate(input$bertNeval) & file.exists(paste0(tmpsession, "_done"))) { values$bertMsg <- "BERT Error - try again" # values$bertMsg <- paste(readLines(paste0(tmpsession, "_log")), collapse = "\n") } else { invalidateLater(1e3) loss_progress <- "" if (checkdirs > 0) { ## Get training diagnostics try({ dapoints <- dir(paste0(tmpsession, "_train"), pattern = "trainer_state[.]json$", recursive = T) dapoints_num <- as.numeric(gsub("^checkpoint-(\\d+)/.*$", "\\1", dapoints)) statefile <- paste0( tmpsession, "_train/", dapoints[which.max(dapoints_num)]) if (file.exists(statefile)) { state <- RJSONIO::fromJSON(statefile) logs <- data.table::rbindlist(lapply(state$log_history, as.list), fill = TRUE) logs <- logs[, .(loss = max(loss, na.rm = T), eval_loss = max(eval_loss, na.rm = T)), by = epoch] loss_progress <- paste0( "\n* Epoch : ", sprintf("%.02f", logs$epoch), "\ttrain loss : ", sprintf("%.05f",logs$loss), "\teval loss : ", sprintf("%.05f", logs$eval_loss), collapse = "") } }) } values$bertMsg <- ifelse(checkdirs == 0, "Pre-processing...", paste0(get_status(bert_status_file), loss_progress)) } values$bertValstats <- NULL values$bertValPlot <- NULL values$bertLoadSaved <- NULL values$bertMsgHyperpar <- NULL }) ## Action: show last BERT model observeEvent(input$bertLast, { values$bertMsg <- values$bertLastUnsaved$msg values$bertValstats <- values$bertLastUnsaved$metrics values$bertValPlot <- bertPlotFromPath(values$bertLastUnsaved$session) values$bertMsgHyperpar <- bertHyperToMsg(values$bertLastHyperpar) values$bertLoadSaved <- NULL }) ## Action: save BERT model observeEvent(input$bertSave, { tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token) if (!file.exists(paste0(tmpsession))) return(NULL) if (!grepl("\\S", input$bertSaveName)) { tmpname <- paste0( ok.labelcol(), "_", gsub("\\D+$", "", gsub("\\D", "-", Sys.time()))) } else tmpname <- cleanFileName(input$bertSaveName) savedir <- paste0("tiggerbert_saved_", tmpname) system(paste("cp -R", tmpsession, savedir)) system(paste0("cp ", tmpsession, "_dat_valid.csv ", savedir, "/tiggerbert_dat_valid.csv")) system(paste0("cp ", tmpsession, "_predval.csv ", savedir, "/tiggerbert_pred_valid.csv")) system(paste0("rm -R ", tmpsession, "*")) values$bertMsg <- paste0("BERT model successfully saved: ", tmpname) values$bertValstats <- NULL values$bertValPlot <- NULL values$bertMsgHyperpar <- NULL if (is.null(values$bertSaveTrigger)) { values$bertSaveTrigger <- 0 } else values$bertSaveTrigger <- values$bertSaveTrigger + 1 values$bertLastUnsaved <- NULL updateTextInput(session, "bertSaveName", value = "") }) ## Actions: load saved model (list updated only with new models) observeEvent(values$bertSaved, { if (!length(values$bertSaved)) return(NULL) tmpsaved <- values$bertSaved[! values$bertSaved %in% values$bertSavedAll] lapply(tmpsaved, function(dabert) { ############ ## Actions to load saved model observeEvent(input[[paste0("saved_", dabert)]], { bertconf <- RJSONIO::fromJSON(paste0( "tiggerbert_saved_", dabert, "/config.json")) truth <- read.csv(paste0( "tiggerbert_saved_", dabert, "/tiggerbert_dat_valid.csv"))$label pred <- read.csv(paste0( "tiggerbert_saved_", dabert, "/tiggerbert_pred_valid.csv"))$bertpred dametrics <- metricsTable(truth, pred) values$bertValstats <- dametrics values$bertMsg <- paste0( "Saved model ", dabert, " (", bertconf[["_name_or_path"]], ")\n\n", "BERT validation stats:\n", "* Accuracy : ", round(100 * mean(truth == pred), 2), "%\n", "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n", "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1), dametrics$N.cases)), "\n") ## Hyperparameters tiggertrain_file <- paste0("tiggerbert_saved_", dabert, "/tiggertrain.json") if (file.exists(tiggertrain_file)) { try({ values$bertMsgHyperpar <- bertHyperToMsg(RJSONIO::fromJSON(tiggertrain_file)) }) } else values$bertMsgHyperpar <- "Training parameters not saved." values$bertValPlot <- bertPlotFromPath(paste0("tiggerbert_saved_", dabert)) values$bertLoadSaved <- dabert }) ############ ## Actions to copy hyperparameters observeEvent(input[[paste0("bert_copypars_", dabert)]], { tiggertrain_file <- paste0("tiggerbert_saved_", dabert, "/tiggertrain.json") if (!file.exists(tiggertrain_file)) return(NULL) try({ tmppars <- RJSONIO::fromJSON(tiggertrain_file) updateSelectInput(session, "bertModel", selected = tmppars$bertModel) updateCheckboxInput(session, "bertBest", value = tmppars$bertBest) numinputs <- c("bertEpochs", "bertLrate", "bertWdecay", "bertBatchsize", "bertGradacc") for (iinput in numinputs) updateNumericInput(session, iinput, value = tmppars[[iinput]]) }) }) ############ ## Actions to delete saved model (with confirm) observeEvent(input[[paste0("bert_delete_", dabert)]], { showModal(modalDialog( title = "Delete BERT model", paste0("Confirm delete: ", dabert, " (model and predictions will be deleted)."), footer = tagList(actionButton(paste0("bert_delete_confirm_", dabert), "Delete"), modalButton("Cancel")))) }) observeEvent(input[[paste0("bert_delete_confirm_", dabert)]], { system(paste0("rm -R tiggerbert_saved_", dabert)) system(paste0( "rm ", values$conf$datadir, values$conf$dataname, "_bertpred_", dabert, ".feather")) # values$bertLoadSaved <- NULL if (is.null(values$bertSaveTrigger)) { values$bertSaveTrigger <- 0 } else values$bertSaveTrigger <- values$bertSaveTrigger + 1 removeModal() values$bertMsg <- NULL values$bertValstats <- NULL values$bertMsgHyperpar <- NULL }) ############ ## Actions to infer on current data from saved BERT (async) observeEvent(input[[paste0("bert_infer_", dabert)]], { if (bertpred_nclicks() > 0 | bpall_nclicks() > 0) { values$bertMsg <- "Inference already running" values$bertMsgHyperpar <- NULL values$bertValPlot <- NULL values$bertValstats <- NULL return(NULL) } bertpred_nclicks(1) saved_model <- paste0("tiggerbert_saved_", dabert) out_path <- paste0( values$conf$datadir, values$conf$dataname, "_bertpred_", dabert, ".feather") if (file.exists(out_path)) { old_infer <- arrow::read_feather(out_path) } else old_infer <- NULL to_infer <- ok.data()[[values$conf$idcol]] to_infer <- to_infer[! to_infer %in% old_infer[[values$conf$idcol]]] to_infer_row <- match(to_infer, ok.data()[[values$conf$idcol]]) if (!length(to_infer)) { values$bertMsg <- paste0( "All current ", nrow(ok.data()), " texts already infered by ", dabert) values$bertMsgHyperpar <- NULL values$bertValPlot <- NULL values$bertValstats <- NULL bertpred_nclicks(0) return(NULL) } values$bertValPlot <- NULL values$bertValstats <- NULL values$bertMsgHyperpar <- NULL values$bertMsg <- paste0( "Preparing inference on ", length(to_infer), " new texts") tmp_dat <- paste0("tiggerbert_infer_", session$token, ".feather") arrow::write_feather( data.frame( ok.data()[to_infer_row, values$conf$idcol, drop = FALSE], text = ok.nicetext()[to_infer_row]), tmp_dat) values$bertMsg <- paste0( "Infering ", dabert, " predictions on ", length(to_infer), " new texts") infer_cmd <- paste0( values$conf$python, " gobert_infer.py -m ", saved_model, " -o ", out_path, " -d ", tmp_dat, " -i ", values$conf$idcol, " -l ", paste0("tiggerbert_infer_", session$token, ".log"), " -G ", as.numeric(values$conf$use_gpu), " -b ", 128) bertpred_res <- reactiveVal() bertpred_async <- future({ system(infer_cmd) system(paste0("rm ", tmp_dat)) if (!is.null(old_infer)) { new_infer <- arrow::read_feather(out_path) arrow::write_feather(rbind(old_infer, new_infer), out_path) } }) %...>% bertpred_res bertpred_async <- catch(bertpred_async, function(error) { bertpred_nclicks(0) values$bertMsg <- paste0("BERT inference error: ", error) system(paste0("rm ", tmp_dat)) }) bertpred_async <- finally(bertpred_async, function() { bertpred_nclicks(0) values$bertMsg <- "Inference complete!\nRe-import data in Project/Sample panel to use the predictions for tagging." values$bertValPlot <- NULL values$bertValstats <- NULL values$bertMsgHyperpar <- NULL }) NULL }) }) values$bertSavedAll <- c(values$bertSavedAll, tmpsaved) }) ## Automatic check of BERT inference progress observe({ if (bertpred_nclicks() == 0) return(NULL) invalidateLater(1e3) logfile <- paste0("tiggerbert_infer_", isolate(session$token), ".log") if (file.exists(logfile)) { values$bertMsg <- readLines(logfile)[1] values$bertValPlot <- NULL values$bertValstats <- NULL values$bertMsgHyperpar <- NULL } }) ############################################################################## ## Export panel ############################################################################## ## Export tags and full model predictions output$dlTagSave <- downloadHandler( function() paste0(Sys.Date(), "-activeTigger-", ok.labelcol(),".", input$dlTagFormat), function(con) { if (is.null(ok.data())) return(NULL) dadat <- ok.data()[, values$conf$idcol, drop = FALSE] if ("tags" %in% input$dlTagSelect) dadat[[ok.labelcol()]] <- values$label if ("comments" %in% input$dlTagSelect) if (ok.commentCol() != "(None)") dadat[, ok.commentCol()] <- values$comment if ("predictions" %in% input$dlTagSelect) if (ok.train.possible()) { dadat$tigger.predict <- ok.predlab1() dadat$tigger.predprob <- apply(ok.pred(), 1, max) tmp_pred <- ok.pred() colnames(tmp_pred) <- paste0("tiggerpred_", colnames(tmp_pred)) dadat <- cbind(dadat, tmp_pred) } othercols <- input$dlTagSelect othercols <- othercols[! othercols %in% c("tags", "comments", "predictions")] if (length(othercols)) dadat <- cbind(dadat, ok.data()[, othercols, drop = FALSE]) if (input$dlTagFormat == "csv") { write.csv(dadat, row.names = F, con) } else arrow::write_feather(dadat, con) }) ## Export embeddings (ft, sb, umap, t-sne) output$dlEmbedSave <- downloadHandler( function() paste0(Sys.Date(), "-activeTigger-emb-", paste(input$dlEmbedSelect, collapse = "-"), ".", input$dlEmbedFormat), function (file) { if (is.null(input$dlEmbedSelect)) return(NULL) export <- ok.data()[, values$conf$idcol, drop = FALSE] if ("ft" %in% input$dlEmbedSelect) export <- data.frame(export, ok.data()[, values$cols_ft]) if ("sb" %in% input$dlEmbedSelect) export <- data.frame(export, ok.data()[, values$cols_sb]) if (any(c("t-sne", "umap") %in% input$dlEmbedSelect)) { tmpvisu <- ok.visu() tmpalgo <- input$dlEmbedSelect[input$dlEmbedSelect %in% c("t-sne", "umap")] colnames(tmpvisu) <- paste0(gsub("-", "", tmpalgo), 1:2) export <- data.frame(export, tmpvisu) } if (input$dlEmbedFormat == "csv") { write.csv(export, file, row.names = FALSE) } else if (input$dlEmbedFormat == "feather") { arrow::write_feather(export, file) } }) ## Export BERT predictions output$dlBPMsg <- reactive(values$dlBPMsg) # output$dlBertPredCsv <- # downloadHandler(paste0(Sys.Date(), "-bertpred-", ok.labelcol(),".csv"), # content= function(con) { # if (is.null(ok.data())) return(NULL) # dadat <- ok.data()[, c(values$conf$idcol, ok.labelcol())] # dadat[[ok.labelcol()]] <- values$label # if (ok.commentCol() != "(None)") { # dadat[, ok.commentCol()] <- values$comment # } # if (ok.train.possible()) { # dadat$tigger.predict <- ok.predlab1() # dadat$tigger.predprob <- apply(ok.pred(), 1, max) # } # dadat <- cbind(dadat, ok.pred()) # write.csv(dadat, row.names = F, con) # }) ############ ## Action to infer on ALL data from saved BERT (async) bpall_res <- reactiveVal() observeEvent(input$dlBPInfer, { if (bpall_nclicks() > 0 | bertpred_nclicks() > 0) { values$dlBPMsg <- "Inference already running" bpall_res(NULL) return(NULL) } bpall_nclicks(1) bpall_res_async <- reactiveVal() dabert <- input$dlBPSelect saved_model <- paste0("tiggerbert_saved_", dabert) out_path <- paste0( values$conf$datadir, values$conf$dataname, "_bertpredall_", dabert, ".feather") tmp_dat <- paste0("tiggerbert_inferall_", session$token, ".feather") tmp_log <- paste0("tiggerbert_inferall_", session$token, ".log") tmp_done <- paste0("tiggerbert_inferall_", session$token, "_done") if (file.exists(tmp_done)) system(paste0("rm ", tmp_done)) if (file.exists(out_path)) { values$dlBPMsg <- "Inference already complete, ready for download" bpall_nclicks(0) bpall_res(out_path) return(NULL) # old_infer <- arrow::read_feather(out_path) } fulldat <- arrow::read_feather(paste0( values$conf$datadir, values$conf$dataname, ".feather")) values$dlBPMsg <- paste0( "Preparing inference on ", nrow(fulldat), " texts") if (length(values$conf$textcols) > 1) { fulldat$text <- do.call(paste, c( fulldat[, values$conf$textcols, drop = FALSE], sep = "\n\n")) } else { fulldat$text <- fulldat[, values$conf$textcols] } arrow::write_feather(fulldat[, c(values$conf$idcol, "text")], tmp_dat) infer_cmd <- paste0( values$conf$python, " gobert_infer.py -m ", saved_model, " -o ", out_path, " -d ", tmp_dat, " -i ", values$conf$idcol, " -l ", tmp_log, " -G ", as.numeric(values$conf$use_gpu), " -b ", 128, "; touch ", tmp_done) bpall_async <- future({ system(infer_cmd) system(paste0("rm ", tmp_dat)) }) %...>% bpall_res_async bpall_async <- catch(bpall_async, function(error) { values$dlBPMsg <- paste0("BERT inference error: ", error) system(paste0("rm ", tmp_dat)) system(paste0("rm ", tmp_done)) }) bpall_async <- finally(bpall_async, function() { bpall_nclicks(0) values$dlBPMsg <- "Inference complete, ready for download" bpall_res(out_path) }) NULL }) ## Automatic check of BERT all prediction progress observe({ if (bpall_nclicks() == 0) return(NULL) invalidateLater(1e3) logfile <- paste0("tiggerbert_inferall_", isolate(session$token), ".log") if (file.exists(logfile)) values$dlBPMsg <- readLines(logfile)[1] }) ## Show BERTpred download button when data is ready output$dlBPDlButton <- renderUI({ if (is.null(bpall_res())) return(NULL) downloadButton("dlBPDl", NULL) }) output$dlBPDl <- downloadHandler( function() paste0(Sys.Date(), "-tiggerbert-", values$conf$dataname, input$dlBPSelect, ".", input$dlBPFormat), function(file) { if (is.null(bpall_res())) return(NULL) tmp_bert <- gsub("^.*_bertpredall_(.*)[.]feather$", "\\1", bpall_res()) if (tmp_bert != input$dlBPSelect) { values$dlBPMsg <- "BERT model changed - predict again or select current model" return(NULL) } tmp <- arrow::read_feather(bpall_res()) if (input$dlBPFormat == "csv") { write.csv(tmp, file, row.names = FALSE) } else if (input$dlBPFormat == "feather") { arrow::write_feather(tmp, file) } } ) ############################################################################## ## Delete files at end of session onStop(function() { if(file.exists(bert_status_file)) unlink(bert_status_file) if(length(dir(pattern = paste(session$token)))) system(paste0("rm -R *", session$token, "*")) }) })