diff --git a/docker-images-datalab/myactivetigger/.drone.yml b/docker-images-datalab/myactivetigger/.drone.yml
new file mode 100644
index 0000000..81a0799
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/.drone.yml
@@ -0,0 +1,48 @@
+kind: pipeline
+name: Build & publish main
+
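+# Two steps: build and push the image with Kaniko, then install kubectl/helm
+# on the fly and upgrade the Helm release in the activetigger namespace.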
+steps:
+ - name: publish-image
+ pull: always
+ image: plugins/kaniko:1.7.1-kaniko1.9.1
+ settings:
+ auto_tag: true
+ auto_tag_suffix: latest
+ registry: code.groupe-genes.fr
+ repo: code.groupe-genes.fr/datalab/docker-images-datalab/activetigger
+ username:
+ from_secret: docker_username
+ password:
+ from_secret: docker_password
+ when:
+ event:
+ exclude:
+ - pull_request
+ - name: deploy
+ image: alpine
+ environment:
+ kubernetes_server:
+ from_secret: kubernetes_server
+ kubernetes_cert:
+ from_secret: kubernetes_cert
+ kubernetes_token:
+ from_secret: kubernetes_token
+ commands:
+ - apk add --no-cache curl
+ - curl -fsSL -o /usr/bin/kubectl "https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl"
+ - curl -fsSL -o helm.tar.gz "https://get.helm.sh/helm-v3.14.0-linux-amd64.tar.gz"
+ - tar xf "helm.tar.gz" && mv ./linux-amd64/helm /usr/bin/helm
+ - chmod +x /usr/bin/kubectl
+ - chmod +x /usr/bin/helm
+ - kubectl config set-cluster default --server=$kubernetes_server --insecure-skip-tls-verify=true
+ - kubectl config set-credentials user --token=$kubernetes_token
+ - kubectl config set-context default --user=user --cluster=default --namespace=activetigger
+ - kubectl config use-context default
+ - kubectl get pods
+ - helm ls -n activetigger --debug
+ - helm dependency build ./helm-chart
+ - helm upgrade activetigger ./helm-chart -f ./helm-chart/values.yaml -n activetigger
+ when:
+ event:
+ exclude:
+ - pull_request
\ No newline at end of file
diff --git a/docker-images-datalab/myactivetigger/Dockerfile b/docker-images-datalab/myactivetigger/Dockerfile
new file mode 100644
index 0000000..107cdf3
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/Dockerfile
@@ -0,0 +1,130 @@
+
+FROM ubuntu:22.04
+# Build with --build-arg CACHEBUST=$(date +%s) to force a rebuild from here;
+# declared before FROM, this ARG would be out of scope for the build stage.
+ARG CACHEBUST=1
+ARG DEBIAN_FRONTEND=noninteractive
+
+COPY requirements.r /requirements.r
+COPY requirementspython.txt /requirementspython.txt
+
+
+# Install Python, R, and wget
+RUN apt-get update && apt-get install -y \
+ python3.10 \
+ python3.10-distutils \
+ python3.10-venv \
+ python3-pip \
+ r-base \
+ wget \
+ && apt-get clean
+
+# Install the shiny R package (r-base is already installed above)
+RUN R -e "install.packages('shiny', repos='https://cran.rstudio.com/')"
+
+# Install gdebi-core and shiny-server
+RUN apt-get update
+RUN apt-get install -y gdebi-core
+RUN wget https://download3.rstudio.org/ubuntu-18.04/x86_64/shiny-server-1.5.21.1012-amd64.deb
+RUN gdebi --non-interactive shiny-server-1.5.21.1012-amd64.deb
+
+
+## R packages (installed with the R executable used by Shiny Server)
+RUN Rscript /requirements.r
+
+## Python environment | check in requirementspython.txt that the cu118 (CUDA 11.8) wheels are installed this way
+
+
+
+# Install Miniconda
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+ bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
+ rm Miniconda3-latest-Linux-x86_64.sh
+
+# Add Conda binaries to PATH
+ENV PATH="/opt/conda/bin:${PATH}"
+
+# Create a Conda environment and activate it
+RUN conda create -n tigger python==3.10 && \
+ echo "conda activate tigger" >> ~/.bashrc
+
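+# Note: with Miniconda installed in /opt/conda, the tigger env's interpreter
+# ends up at /opt/conda/envs/tigger/bin/python.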
+# Clear the apt package lists to keep the image small
+RUN rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --no-cache-dir \
+    torch torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/cu118
+
+RUN apt-get update && apt-get install -y curl build-essential
+RUN pip3 install --no-cache-dir six
+
+# Install Rust using rustup
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+
+# Add Cargo's bin directory to the PATH environment variable
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+RUN pip3 install --no-cache-dir --upgrade setuptools
+
+
+# Install additional Python libraries (argparse ships with the standard
+# library, so it is not installed here)
+RUN pip3 install --no-cache-dir \
+    datasets \
+    fasttext \
+    numpy \
+    pandas \
+    pyarrow \
+    scikit-learn \
+    sentence-transformers \
+    transformers \
+    typing-inspect==0.8.0 \
+    typing-extensions==4.6.1 \
+    spacy
+
+# Add further configuration here if needed
+
+# (A bash CMD here would be overridden by the Shiny CMD at the end of the
+# file; kept commented for interactive debugging.)
+# CMD ["/bin/bash"]
+
+## Download the spaCy and fastText models
+
+### French
+#RUN python -m spacy download fr_core_news_sm
+
+RUN wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz \
+ && gunzip cc.fr.300.bin.gz
+
+
+# On each instance creation
+
+## Copy the app sources to create the new instance (the original workflow cloned
+## a git repo per instance, replacing "tigger-name" with the instance name,
+## i.e. https://analytics.huma-num.fr/Prenom.Nom/tigger-name/)
+COPY activetigger/ ./activetigger
+
+RUN R -e "install.packages('quanteda', repos='https://cran.rstudio.com/')"
+
+# Expose the default Shiny port
+EXPOSE 3838
+
+# Command to run the Shiny app
+CMD ["R", "-e", "shiny::runApp('activetigger', port=3838, host='0.0.0.0')"]
+
+
+## In the application
+
+## At the top left, use the "+" button to "create project". Then in the fields:
+
+## - data directory: e.g. ~/tagging/domain (such as ~/tagging/radio or ~/tagging/journaux); choose where the data and tags should be stored on your server
+## - checking all the boxes is recommended: python, spacy, fasttext, sbert, gpu
+## - python: "~/conda/envs/tigger/bin/python"
+## - fasttext: "~/cc.fr.300.bin" (i.e. give the full path of the model on your server, not just its name)
+## - spacy and SBERT: keep the default values for the chosen language
diff --git a/docker-images-datalab/myactivetigger/activetigger/embed_fasttext.py b/docker-images-datalab/myactivetigger/activetigger/embed_fasttext.py
new file mode 100644
index 0000000..5c72316
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/activetigger/embed_fasttext.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+## FastText embed sentences
+## Requires data file with columns id and text
+
+import argparse
+import fasttext
+from os.path import expanduser
+import pandas as pd
+import pyarrow.feather as feather
+import re
+
+
+def main(args):
+ print("FastText: Importing data")
+ datapath = expanduser(args.data)
+ dat = feather.read_feather(datapath)
+ outfile = re.sub("[.]feather$", "_ft.feather", datapath)
+
+ print("FastText: Loading model")
+ ft = fasttext.load_model(expanduser(args.model))
+ print("FastText: Embedding sentences")
+ emb = [ft.get_sentence_vector(re.sub("\n", " ", x)) for x in dat["text"]]
+
+ print("FastText: Exporting")
+ emb = pd.DataFrame(emb)
+ emb.columns = ["ft%03d" % (x + 1) for x in range(len(emb.columns))]
+ emb = pd.concat([dat["id"], emb], axis=1)
+ feather.write_feather(emb, outfile)
+ print("FastText: Done")
+
+
+if __name__ == "__main__":
+ argParser = argparse.ArgumentParser()
+ argParser.add_argument("-m", "--model", help="Model path", default="/data/user/b/jboelaert/cc.fr.100.bin")
+ argParser.add_argument("-d", "--data", help="Path to data (feather)")
+ args = argParser.parse_args()
+ main(args)
+
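+# Example invocation (hypothetical paths; the Dockerfile downloads cc.fr.300.bin):
+#   python3 embed_fasttext.py -m ~/cc.fr.300.bin -d ~/tagging/radio/data.feather
+# This writes data_ft.feather (columns id, ft001, ft002, ...) next to the input.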
diff --git a/docker-images-datalab/myactivetigger/activetigger/embed_sbert.py b/docker-images-datalab/myactivetigger/activetigger/embed_sbert.py
new file mode 100644
index 0000000..d944fd9
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/activetigger/embed_sbert.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+## SBERT embed sentences
+## Requires data file with columns id and text
+
+import argparse
+from os.path import expanduser
+import pandas as pd
+import pyarrow.feather as feather
+import re
+from sentence_transformers import SentenceTransformer
+
+
+def main(args):
+ print("SBERT: Importing data")
+ datapath = expanduser(args.data)
+ dat = feather.read_feather(datapath)
+ outfile = re.sub("[.]feather$", "_sb.feather", datapath)
+
+ print("SBERT: Loading model")
+ sbert = SentenceTransformer(expanduser(args.model))
+ sbert.max_seq_length = 512
+ print("SBERT: Embedding sentences")
+ emb = sbert.encode(dat["text"])
+
+ print("SBERT: Exporting")
+ emb = pd.DataFrame(emb)
+ emb.columns = ["sb%03d" % (x + 1) for x in range(len(emb.columns))]
+ emb = pd.concat([dat["id"], emb], axis=1)
+ feather.write_feather(emb, outfile)
+ print("SBERT: Done")
+
+
+if __name__ == "__main__":
+ argParser = argparse.ArgumentParser()
+ argParser.add_argument("-m", "--model", help="Model name or path", default="distiluse-base-multilingual-cased-v1")
+ argParser.add_argument("-d", "--data", help="Path to data (feather)")
+ args = argParser.parse_args()
+ main(args)
+
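+# Example invocation (hypothetical path; the default model is multilingual):
+#   python3 embed_sbert.py -d ~/tagging/radio/data.feather
+# This writes data_sb.feather (columns id, sb001, sb002, ...) next to the input.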
diff --git a/docker-images-datalab/myactivetigger/activetigger/gobert.py b/docker-images-datalab/myactivetigger/activetigger/gobert.py
new file mode 100644
index 0000000..32a9f04
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/activetigger/gobert.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+## BERT trainer to be called by server.R
+## Requires two data files with columns id, label and text
+
+import argparse
+import datasets
+from os.path import expanduser
+import os
+import pandas as pd
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import Trainer, TrainingArguments, TrainerCallback
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+def main(args):
+ print("Importing data")
+ dattrain = pd.read_csv(expanduser(args.traindat))
+ datval = pd.read_csv(expanduser(args.valdat))
+ datval_id = datval["id"]
+ classcolname = "label"
+
+ ## Make class_names
+ class_names = [x for x in dattrain[classcolname].unique()]
+
+ ## Labels to class number
+ dattrain[classcolname] = [class_names.index(x) for x in dattrain[classcolname].to_list()]
+ datval[classcolname] = [class_names.index(x) for x in datval[classcolname].to_list()]
+
+ ## Transform to datasets
+ dattrain = datasets.Dataset.from_pandas(dattrain[['text', 'label']])
+ datval = datasets.Dataset.from_pandas(datval[['text', 'label']])
+
+ # Model choice
+ modelname = expanduser(args.model)
+
+ ## Tokenizer
+ print("Tokenizing")
+
+ tokenizer = AutoTokenizer.from_pretrained(modelname)
+
+ # toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length"), batched=True)
+ # toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length"), batched=True)
+ if args.adapt:
+ toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding=True, max_length=512), batched=True)
+ toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding=True, max_length=512), batched=True)
+ else:
+ toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length", max_length=512), batched=True)
+ toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length", max_length=512), batched=True)
+
+ del(dattrain)
+
+ ## Model
+ print("Loading model")
+ model = AutoModelForSequenceClassification.from_pretrained(modelname, num_labels = len(class_names))
+ if (args.gpu):
+ model.cuda()
+
+ ## Train using Trainer interface
+ print("Training...")
+ BATCH_SIZE = args.batchsize
+ GRAD_ACC = args.gradacc
+ epochs = args.epochs
+
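+    # Each optimizer update consumes BATCH_SIZE * GRAD_ACC examples, so
+    # total_steps estimates the number of updates over the whole run; warmup
+    # covers the first 10% of steps, and evaluation/saving/logging happen at
+    # args.eval evenly spaced points.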
+ total_steps = (epochs * len(toktrain)) // (BATCH_SIZE * GRAD_ACC)
+    warmup_steps = total_steps // 10
+ eval_steps = total_steps // args.eval
+
+ training_args = TrainingArguments(
+ output_dir=args.session + "_train",
+ learning_rate=args.lrate,
+ weight_decay=args.wdecay,
+ num_train_epochs=epochs,
+ gradient_accumulation_steps=GRAD_ACC,
+ per_device_train_batch_size=BATCH_SIZE,
+ # per_device_eval_batch_size=BATCH_SIZE,
+ per_device_eval_batch_size=32,
+ warmup_steps=warmup_steps,
+
+ eval_steps=eval_steps,
+ evaluation_strategy="steps",
+ save_strategy="steps",
+ save_steps=eval_steps,
+ logging_steps=eval_steps,
+ do_eval=True,
+ greater_is_better=False,
+ load_best_model_at_end=bool(args.best),
+ metric_for_best_model="eval_loss"
+ )
+
+ trainer = Trainer(model=model, args=training_args,
+ train_dataset=toktrain, eval_dataset=toktest)
+
+ the_session = args.session
+    class HaltCallback(TrainerCallback):
+        "A callback that stops training when a <session>_stop file appears (created from the UI)"
+
+        def on_step_begin(self, args, state, control, **kwargs):
+            if os.path.exists(the_session + "_stop"):
+                print("\nHalted by user.\n")
+                control.should_training_stop = True
+                return control
+
+ trainer.add_callback(HaltCallback)
+
+ trainer.train()
+
+ ## Add class names to model
+ label_to_id = {v: i for i, v in enumerate(class_names)}
+ model.config.label2id = label_to_id
+ model.config.id2label = {id: label for label, id in model.config.label2id.items()}
+
+ ## Save model
+ model.save_pretrained(args.session)
+
+
+ ## Prediction functions
+
+
+ def get_predprobs(text):
+ # inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
+ inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
+ if (args.gpu):
+ inputs = inputs.to("cuda")
+ outputs = model(**inputs)
+ res = outputs[0]
+ if (args.gpu):
+ res = res.cpu()
+ res = res.softmax(1).detach().numpy()
+ return res
+
+
+ def get_prediction(text):
+ return class_names[get_predprobs(text).argmax()]
+
+ ## Metrics on validation set
+ print("Computing predictions")
+ testpred = [get_prediction(txt) for txt in datval["text"]]
+ testtruth = [class_names[x] for x in datval["label"]]
+
+ exportpred = pd.DataFrame(datval_id)
+ exportpred.columns = ["id"]
+ exportpred["bertpred"] = testpred
+ exportpred.to_csv(args.session + "_predval.csv", index=False)
+
+
+if __name__ == "__main__":
+ argParser = argparse.ArgumentParser()
+ argParser.add_argument("-m", "--model", help="Model name or path", default="microsoft/Multilingual-MiniLM-L12-H384")
+ argParser.add_argument("-t", "--traindat", help="Path to training data")
+ argParser.add_argument("-v", "--valdat", help="Path to validation data")
+ argParser.add_argument("-b", "--batchsize", help="Batch size for training", type=int, default=4)
+ argParser.add_argument("-g", "--gradacc", help="Gradient accumulation for training", type=int, default=1)
+ argParser.add_argument("-e", "--epochs", help="Number of training epochs", type=float, default=3)
+ argParser.add_argument("-l", "--lrate", help="Learning rate", type=float, default=5e-05)
+ argParser.add_argument("-w", "--wdecay", help="Weight decay", type=float, default=.01)
+ argParser.add_argument("-B", "--best", help="Load best model instead of last", type=int, choices=[0,1], default=1)
+ argParser.add_argument("-E", "--eval", help="Number of intermediary evaluations", type=int, default=10)
+ argParser.add_argument("-s", "--session", help="Session name (used to save results)")
+ argParser.add_argument("-G", "--gpu", help="Use GPU (CUDA)", type=int, choices=[0,1], default=0)
+ argParser.add_argument("-A", "--adapt", help="Adapt token length to batch", type=int, choices=[0,1], default=1)
+
+
+ args = argParser.parse_args()
+
+ main(args)
+
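+# Example invocation (hypothetical file names, as called from server.R):
+#   python3 gobert.py -t train.csv -v valid.csv -s mysession -G 1
+# This saves the fine-tuned model under ./mysession and the validation
+# predictions in mysession_predval.csv; create a mysession_stop file to halt
+# training early.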
diff --git a/docker-images-datalab/myactivetigger/activetigger/gobert_infer.py b/docker-images-datalab/myactivetigger/activetigger/gobert_infer.py
new file mode 100644
index 0000000..c2c607a
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/activetigger/gobert_infer.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+## BERT inference to be called by server.R
+
+import argparse
+import json
+import numpy as np
+from os import path, remove
+import pandas as pd
+import pyarrow.feather as feather
+from torch import no_grad
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+
+def chunker(seq, batch_size):
+ return (seq[pos:pos + batch_size] for pos in range(0, len(seq), batch_size))
+
+
+def main(args):
+ print("Importing data")
+ with open(path.expanduser(args.logfile), "w") as progfile:
+ progfile.write("Importing data")
+
+ dat = feather.read_feather(path.expanduser(args.dat))
+
+ with open(path.expanduser(args.logfile), "w") as progfile:
+ progfile.write("Tokenizing")
+
+ ## Tokenizer
+ print("Tokenizing")
+ with open(path.join(path.expanduser(args.model), "config.json"), "r") as jsonfile:
+ modeltype = json.load(jsonfile)["_name_or_path"]
+
+ tokenizer = AutoTokenizer.from_pretrained(modeltype)
+
+ ## Model
+ print("Loading model")
+ model = AutoModelForSequenceClassification.from_pretrained(path.expanduser(args.model))
+ if (args.gpu):
+ model.cuda()
+
+ ## Prediction functions
+
+
+ def get_predprobs(text):
+ inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
+ if (args.gpu):
+ inputs = inputs.to("cuda")
+ with no_grad():
+ outputs = model(**inputs)
+ res = outputs[0]
+ if (args.gpu):
+ res = res.cpu()
+ res = res.softmax(1).detach().numpy()
+ return res
+
+ print("Computing predictions")
+
+ chunks = chunker([str(x) for x in dat[args.txtname]], args.batch)
+ pred = []
+ for i, x in enumerate(chunks):
+ if (i % 5 == 0):
+ percent = round(100 * i * args.batch / len(dat), 1)
+ logmsg = "Computing: " + str(percent) + "% (" + str(i * args.batch) + "/" + str(len(dat)) + ")"
+ with open(path.expanduser(args.logfile), "w") as progfile:
+ progfile.write(logmsg)
+ pred.append(get_predprobs(x))
+
+ pred = np.concatenate(pred)
+ pred = pd.DataFrame(pred)
+ pred.columns = ["bertpred_" + v for i, v in model.config.id2label.items()]
+ pred = pd.concat([dat[args.idname], pred], axis=1)
+ feather.write_feather(pred, path.abspath(args.output))
+ remove(path.expanduser(args.logfile))
+
+
+if __name__ == "__main__":
+ argParser = argparse.ArgumentParser()
+ argParser.add_argument("-m", "--model", help="Trained model path")
+ argParser.add_argument("-d", "--dat", help="Path to data (feather file)")
+ argParser.add_argument("-o", "--output", help="Output path of predictions", default="tiggerbert.feather")
+ argParser.add_argument("-i", "--idname", help="Name of id variable", default="id")
+ argParser.add_argument("-x", "--txtname", help="Name of text variable", default="text")
+ argParser.add_argument("-l", "--logfile", help="Path to log file", default="tiggerbert-progress.txt")
+ argParser.add_argument("-G", "--gpu", help="Use GPU (CUDA)", type=int, choices=[0,1], default=1)
+ argParser.add_argument("-b", "--batch", help="Batch size", type=int, default=128)
+
+ args = argParser.parse_args()
+
+ main(args)
+
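+# Example invocation (hypothetical paths, as called from server.R):
+#   python3 gobert_infer.py -m mysession -d data.feather -o preds.feather -G 0
+# Progress is written to the log file while batches are scored; the output
+# feather holds the id column plus one bertpred_<label> probability per class.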
diff --git a/docker-images-datalab/myactivetigger/activetigger/modelnames.csv b/docker-images-datalab/myactivetigger/activetigger/modelnames.csv
new file mode 100644
index 0000000..a49e67e
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/activetigger/modelnames.csv
@@ -0,0 +1,159 @@
+"short","spacy_name","fasttext_name","fasttext_url","language","short_lang"
+"af","xx_ent_wiki_sm","cc.af.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.bin.gz","Afrikaans","(af) Afrikaans"
+"als","xx_ent_wiki_sm","cc.als.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.bin.gz","Alemannic","(als) Alemannic"
+"am","xx_ent_wiki_sm","cc.am.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.bin.gz","Amharic","(am) Amharic"
+"an","xx_ent_wiki_sm","cc.an.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.bin.gz","Aragonese","(an) Aragonese"
+"ar","xx_ent_wiki_sm","cc.ar.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz","Arabic","(ar) Arabic"
+"arz","xx_ent_wiki_sm","cc.arz.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.bin.gz","Egyptian Arabic","(arz) Egyptian Arabic"
+"as","xx_ent_wiki_sm","cc.as.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.bin.gz","Assamese","(as) Assamese"
+"ast","xx_ent_wiki_sm","cc.ast.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.bin.gz","Asturian","(ast) Asturian"
+"az","xx_ent_wiki_sm","cc.az.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.bin.gz","Azerbaijani","(az) Azerbaijani"
+"azb","xx_ent_wiki_sm","cc.azb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.bin.gz","Southern Azerbaijani","(azb) Southern Azerbaijani"
+"ba","xx_ent_wiki_sm","cc.ba.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.bin.gz","Bashkir","(ba) Bashkir"
+"bar","xx_ent_wiki_sm","cc.bar.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.bin.gz","Bavarian","(bar) Bavarian"
+"bcl","xx_ent_wiki_sm","cc.bcl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.bin.gz","Central Bicolano","(bcl) Central Bicolano"
+"be","xx_ent_wiki_sm","cc.be.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.bin.gz","Belarusian","(be) Belarusian"
+"bg","xx_ent_wiki_sm","cc.bg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.bin.gz","Bulgarian","(bg) Bulgarian"
+"bh","xx_ent_wiki_sm","cc.bh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.bin.gz","Bihari","(bh) Bihari"
+"bn","xx_ent_wiki_sm","cc.bn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz","Bengali","(bn) Bengali"
+"bo","xx_ent_wiki_sm","cc.bo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.bin.gz","Tibetan","(bo) Tibetan"
+"bpy","xx_ent_wiki_sm","cc.bpy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.bin.gz","Bishnupriya Manipuri","(bpy) Bishnupriya Manipuri"
+"br","xx_ent_wiki_sm","cc.br.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.bin.gz","Breton","(br) Breton"
+"bs","xx_ent_wiki_sm","cc.bs.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.bin.gz","Bosnian","(bs) Bosnian"
+"ca","ca_core_news_sm","cc.ca.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.bin.gz","Catalan","(ca) Catalan"
+"ce","xx_ent_wiki_sm","cc.ce.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.bin.gz","Chechen","(ce) Chechen"
+"ceb","xx_ent_wiki_sm","cc.ceb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.bin.gz","Cebuano","(ceb) Cebuano"
+"ckb","xx_ent_wiki_sm","cc.ckb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.bin.gz","Kurdish (Sorani)","(ckb) Kurdish (Sorani)"
+"co","xx_ent_wiki_sm","cc.co.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.bin.gz","Corsican","(co) Corsican"
+"cs","xx_ent_wiki_sm","cc.cs.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.bin.gz","Czech","(cs) Czech"
+"cv","xx_ent_wiki_sm","cc.cv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.bin.gz","Chuvash","(cv) Chuvash"
+"cy","xx_ent_wiki_sm","cc.cy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz","Welsh","(cy) Welsh"
+"da","da_core_news_sm","cc.da.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.bin.gz","Danish","(da) Danish"
+"de","de_core_news_sm","cc.de.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz","German","(de) German"
+"diq","xx_ent_wiki_sm","cc.diq.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.bin.gz","Zazaki","(diq) Zazaki"
+"dv","xx_ent_wiki_sm","cc.dv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.bin.gz","Divehi","(dv) Divehi"
+"el","el_core_news_sm","cc.el.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.bin.gz","Greek","(el) Greek"
+"eml","xx_ent_wiki_sm","cc.eml.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.bin.gz","Emilian-Romagnol","(eml) Emilian-Romagnol"
+"en","en_core_web_sm","cc.en.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz","English","(en) English"
+"eo","xx_ent_wiki_sm","cc.eo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.bin.gz","Esperanto","(eo) Esperanto"
+"es","es_core_news_sm","cc.es.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz","Spanish","(es) Spanish"
+"et","xx_ent_wiki_sm","cc.et.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.bin.gz","Estonian","(et) Estonian"
+"eu","xx_ent_wiki_sm","cc.eu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.bin.gz","Basque","(eu) Basque"
+"fa","xx_ent_wiki_sm","cc.fa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz","Persian","(fa) Persian"
+"fi","fi_core_news_sm","cc.fi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.bin.gz","Finnish","(fi) Finnish"
+"fr","fr_core_news_sm","cc.fr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz","French","(fr) French"
+"frr","xx_ent_wiki_sm","cc.frr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.bin.gz","North Frisian","(frr) North Frisian"
+"fy","xx_ent_wiki_sm","cc.fy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.bin.gz","West Frisian","(fy) West Frisian"
+"ga","xx_ent_wiki_sm","cc.ga.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.bin.gz","Irish","(ga) Irish"
+"gd","xx_ent_wiki_sm","cc.gd.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.bin.gz","Scottish Gaelic","(gd) Scottish Gaelic"
+"gl","xx_ent_wiki_sm","cc.gl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.bin.gz","Galician","(gl) Galician"
+"gom","xx_ent_wiki_sm","cc.gom.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.bin.gz","Goan Konkani","(gom) Goan Konkani"
+"gu","xx_ent_wiki_sm","cc.gu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.bin.gz","Gujarati","(gu) Gujarati"
+"gv","xx_ent_wiki_sm","cc.gv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.bin.gz","Manx","(gv) Manx"
+"he","xx_ent_wiki_sm","cc.he.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.bin.gz","Hebrew","(he) Hebrew"
+"hi","xx_ent_wiki_sm","cc.hi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz","Hindi","(hi) Hindi"
+"hif","xx_ent_wiki_sm","cc.hif.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.bin.gz","Fiji Hindi","(hif) Fiji Hindi"
+"hr","hr_core_news_sm","cc.hr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.bin.gz","Croatian","(hr) Croatian"
+"hsb","xx_ent_wiki_sm","cc.hsb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.bin.gz","Upper Sorbian","(hsb) Upper Sorbian"
+"ht","xx_ent_wiki_sm","cc.ht.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.bin.gz","Haitian","(ht) Haitian"
+"hu","xx_ent_wiki_sm","cc.hu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.bin.gz","Hungarian","(hu) Hungarian"
+"hy","xx_ent_wiki_sm","cc.hy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.bin.gz","Armenian","(hy) Armenian"
+"ia","xx_ent_wiki_sm","cc.ia.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.bin.gz","Interlingua","(ia) Interlingua"
+"id","xx_ent_wiki_sm","cc.id.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz","Indonesian","(id) Indonesian"
+"ilo","xx_ent_wiki_sm","cc.ilo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.bin.gz","Ilokano","(ilo) Ilokano"
+"io","xx_ent_wiki_sm","cc.io.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.bin.gz","Ido","(io) Ido"
+"is","xx_ent_wiki_sm","cc.is.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.bin.gz","Icelandic","(is) Icelandic"
+"it","it_core_news_sm","cc.it.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz","Italian","(it) Italian"
+"ja","ja_core_news_sm","cc.ja.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz","Japanese","(ja) Japanese"
+"jv","xx_ent_wiki_sm","cc.jv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.bin.gz","Javanese","(jv) Javanese"
+"ka","xx_ent_wiki_sm","cc.ka.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.bin.gz","Georgian","(ka) Georgian"
+"kk","xx_ent_wiki_sm","cc.kk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.bin.gz","Kazakh","(kk) Kazakh"
+"km","xx_ent_wiki_sm","cc.km.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.bin.gz","Khmer","(km) Khmer"
+"kn","xx_ent_wiki_sm","cc.kn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.bin.gz","Kannada","(kn) Kannada"
+"ko","ko_core_news_sm","cc.ko.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz","Korean","(ko) Korean"
+"ku","xx_ent_wiki_sm","cc.ku.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.bin.gz","Kurdish (Kurmanji)","(ku) Kurdish (Kurmanji)"
+"ky","xx_ent_wiki_sm","cc.ky.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.bin.gz","Kirghiz","(ky) Kirghiz"
+"la","xx_ent_wiki_sm","cc.la.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz","Latin","(la) Latin"
+"lb","xx_ent_wiki_sm","cc.lb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.bin.gz","Luxembourgish","(lb) Luxembourgish"
+"li","xx_ent_wiki_sm","cc.li.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.bin.gz","Limburgish","(li) Limburgish"
+"lmo","xx_ent_wiki_sm","cc.lmo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.bin.gz","Lombard","(lmo) Lombard"
+"lt","lt_core_news_sm","cc.lt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.bin.gz","Lithuanian","(lt) Lithuanian"
+"lv","xx_ent_wiki_sm","cc.lv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.bin.gz","Latvian","(lv) Latvian"
+"mai","xx_ent_wiki_sm","cc.mai.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.bin.gz","Maithili","(mai) Maithili"
+"mg","xx_ent_wiki_sm","cc.mg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.bin.gz","Malagasy","(mg) Malagasy"
+"mhr","xx_ent_wiki_sm","cc.mhr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.bin.gz","Meadow Mari","(mhr) Meadow Mari"
+"min","xx_ent_wiki_sm","cc.min.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.bin.gz","Minangkabau","(min) Minangkabau"
+"mk","mk_core_news_sm","cc.mk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.bin.gz","Macedonian","(mk) Macedonian"
+"ml","xx_ent_wiki_sm","cc.ml.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.bin.gz","Malayalam","(ml) Malayalam"
+"mn","xx_ent_wiki_sm","cc.mn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.bin.gz","Mongolian","(mn) Mongolian"
+"mr","xx_ent_wiki_sm","cc.mr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.bin.gz","Marathi","(mr) Marathi"
+"mrj","xx_ent_wiki_sm","cc.mrj.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.bin.gz","Hill Mari","(mrj) Hill Mari"
+"ms","xx_ent_wiki_sm","cc.ms.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.bin.gz","Malay","(ms) Malay"
+"mt","xx_ent_wiki_sm","cc.mt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.bin.gz","Maltese","(mt) Maltese"
+"mwl","xx_ent_wiki_sm","cc.mwl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.bin.gz","Mirandese","(mwl) Mirandese"
+"my","xx_ent_wiki_sm","cc.my.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.bin.gz","Burmese","(my) Burmese"
+"myv","xx_ent_wiki_sm","cc.myv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.bin.gz","Erzya","(myv) Erzya"
+"mzn","xx_ent_wiki_sm","cc.mzn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.bin.gz","Mazandarani","(mzn) Mazandarani"
+"nah","xx_ent_wiki_sm","cc.nah.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.bin.gz","Nahuatl","(nah) Nahuatl"
+"nap","xx_ent_wiki_sm","cc.nap.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.bin.gz","Neapolitan","(nap) Neapolitan"
+"nds","xx_ent_wiki_sm","cc.nds.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.bin.gz","Low Saxon","(nds) Low Saxon"
+"ne","xx_ent_wiki_sm","cc.ne.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.bin.gz","Nepali","(ne) Nepali"
+"new","xx_ent_wiki_sm","cc.new.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.bin.gz","Newar","(new) Newar"
+"nl","nl_core_news_sm","cc.nl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.bin.gz","Dutch","(nl) Dutch"
+"nn","xx_ent_wiki_sm","cc.nn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.bin.gz","Norwegian (Nynorsk)","(nn) Norwegian (Nynorsk)"
+"no","xx_ent_wiki_sm","cc.no.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.bin.gz","Norwegian (Bokmål)","(no) Norwegian (Bokmål)"
+"nso","xx_ent_wiki_sm","cc.nso.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.bin.gz","Northern Sotho","(nso) Northern Sotho"
+"oc","xx_ent_wiki_sm","cc.oc.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.bin.gz","Occitan","(oc) Occitan"
+"or","xx_ent_wiki_sm","cc.or.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.bin.gz","Oriya","(or) Oriya"
+"os","xx_ent_wiki_sm","cc.os.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.bin.gz","Ossetian","(os) Ossetian"
+"pa","xx_ent_wiki_sm","cc.pa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.bin.gz","Eastern Punjabi","(pa) Eastern Punjabi"
+"pam","xx_ent_wiki_sm","cc.pam.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.bin.gz","Kapampangan","(pam) Kapampangan"
+"pfl","xx_ent_wiki_sm","cc.pfl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.bin.gz","Palatinate German","(pfl) Palatinate German"
+"pl","pl_core_news_sm","cc.pl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz","Polish","(pl) Polish"
+"pms","xx_ent_wiki_sm","cc.pms.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.bin.gz","Piedmontese","(pms) Piedmontese"
+"pnb","xx_ent_wiki_sm","cc.pnb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.bin.gz","Western Punjabi","(pnb) Western Punjabi"
+"ps","xx_ent_wiki_sm","cc.ps.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.bin.gz","Pashto","(ps) Pashto"
+"pt","pt_core_news_sm","cc.pt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.bin.gz","Portuguese","(pt) Portuguese"
+"qu","xx_ent_wiki_sm","cc.qu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.bin.gz","Quechua","(qu) Quechua"
+"rm","xx_ent_wiki_sm","cc.rm.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.bin.gz","Romansh","(rm) Romansh"
+"ro","ro_core_news_sm","cc.ro.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.bin.gz","Romanian","(ro) Romanian"
+"ru","ru_core_news_sm","cc.ru.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz","Russian","(ru) Russian"
+"sa","xx_ent_wiki_sm","cc.sa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.bin.gz","Sanskrit","(sa) Sanskrit"
+"sah","xx_ent_wiki_sm","cc.sah.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.bin.gz","Sakha","(sah) Sakha"
+"sc","xx_ent_wiki_sm","cc.sc.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.bin.gz","Sardinian","(sc) Sardinian"
+"scn","xx_ent_wiki_sm","cc.scn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.bin.gz","Sicilian","(scn) Sicilian"
+"sco","xx_ent_wiki_sm","cc.sco.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.bin.gz","Scots","(sco) Scots"
+"sd","xx_ent_wiki_sm","cc.sd.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.bin.gz","Sindhi","(sd) Sindhi"
+"sh","xx_ent_wiki_sm","cc.sh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.bin.gz","Serbo-Croatian","(sh) Serbo-Croatian"
+"si","xx_ent_wiki_sm","cc.si.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz","Sinhalese","(si) Sinhalese"
+"sk","xx_ent_wiki_sm","cc.sk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.bin.gz","Slovak","(sk) Slovak"
+"sl","sl_core_news_sm","cc.sl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.bin.gz","Slovenian","(sl) Slovenian"
+"so","xx_ent_wiki_sm","cc.so.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.bin.gz","Somali","(so) Somali"
+"sq","xx_ent_wiki_sm","cc.sq.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.bin.gz","Albanian","(sq) Albanian"
+"sr","xx_ent_wiki_sm","cc.sr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.bin.gz","Serbian","(sr) Serbian"
+"su","xx_ent_wiki_sm","cc.su.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.bin.gz","Sundanese","(su) Sundanese"
+"sv","sv_core_news_sm","cc.sv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.bin.gz","Swedish","(sv) Swedish"
+"sw","xx_ent_wiki_sm","cc.sw.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.bin.gz","Swahili","(sw) Swahili"
+"ta","xx_ent_wiki_sm","cc.ta.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.bin.gz","Tamil","(ta) Tamil"
+"te","xx_ent_wiki_sm","cc.te.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.bin.gz","Telugu","(te) Telugu"
+"tg","xx_ent_wiki_sm","cc.tg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.bin.gz","Tajik","(tg) Tajik"
+"th","xx_ent_wiki_sm","cc.th.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.bin.gz","Thai","(th) Thai"
+"tk","xx_ent_wiki_sm","cc.tk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.bin.gz","Turkmen","(tk) Turkmen"
+"tl","xx_ent_wiki_sm","cc.tl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.bin.gz","Tagalog","(tl) Tagalog"
+"tr","xx_ent_wiki_sm","cc.tr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz","Turkish","(tr) Turkish"
+"tt","xx_ent_wiki_sm","cc.tt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.bin.gz","Tatar","(tt) Tatar"
+"ug","xx_ent_wiki_sm","cc.ug.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz","Uyghur","(ug) Uyghur"
+"uk","uk_core_news_sm","cc.uk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.bin.gz","Ukrainian","(uk) Ukrainian"
+"ur","xx_ent_wiki_sm","cc.ur.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.bin.gz","Urdu","(ur) Urdu"
+"uz","xx_ent_wiki_sm","cc.uz.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.bin.gz","Uzbek","(uz) Uzbek"
+"vec","xx_ent_wiki_sm","cc.vec.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.bin.gz","Venetian","(vec) Venetian"
+"vi","xx_ent_wiki_sm","cc.vi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.bin.gz","Vietnamese","(vi) Vietnamese"
+"vls","xx_ent_wiki_sm","cc.vls.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.bin.gz","West Flemish","(vls) West Flemish"
+"vo","xx_ent_wiki_sm","cc.vo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.bin.gz","Volapük","(vo) Volapük"
+"wa","xx_ent_wiki_sm","cc.wa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.bin.gz","Walloon","(wa) Walloon"
+"war","xx_ent_wiki_sm","cc.war.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.bin.gz","Waray","(war) Waray"
+"xmf","xx_ent_wiki_sm","cc.xmf.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.bin.gz","Mingrelian","(xmf) Mingrelian"
+"yi","xx_ent_wiki_sm","cc.yi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.bin.gz","Yiddish","(yi) Yiddish"
+"yo","xx_ent_wiki_sm","cc.yo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.bin.gz","Yoruba","(yo) Yoruba"
+"zea","xx_ent_wiki_sm","cc.zea.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.bin.gz","Zeelandic","(zea) Zeelandic"
+"zh","zh_core_web_sm","cc.zh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz","Chinese","(zh) Chinese"
diff --git a/docker-images-datalab/myactivetigger/activetigger/server.R b/docker-images-datalab/myactivetigger/activetigger/server.R
new file mode 100644
index 0000000..89a7ae6
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/activetigger/server.R
@@ -0,0 +1,4143 @@
+
+pkgs <- c("arrow", "class", "DT", "future", "ggplot2", "glmnet", "htmlTable",
+ "LiblineaR", "Matrix","Metrics", "promises", "quanteda",
+ "quanteda.textmodels", "ranger", "rlang", "Rtsne",
+ "RJSONIO", "shiny", "SparseM", "stringi", "uwot")
+pkgs <- pkgs[! pkgs %in% rownames(installed.packages())]
+if (length(pkgs)) {
+  warning(paste0("Packages ", paste0(pkgs, collapse = ", "), " not available. ",
+ "App might crash."))
+}
+# for (i in seq_along(pkgs)) install.packages(pkgs[i])
+
+verbose <- TRUE
+# sink("ActiveTigger.log")
+
+options(shiny.maxRequestSize = 30*1024^3) # File upload size
+
+modelnames <- read.csv("modelnames.csv")
+modelnames_labels <- modelnames$short
+names(modelnames_labels) <- modelnames$short_lang
+
+library(quanteda)
+library(ggplot2)
+library(promises)
+library(future)
+plan(multisession)
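+# future/promises run long jobs (such as the data import below) in background
+# R sessions so the UI stays responsive.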
+
+
+################################################################################
+## Global functions
+################################################################################
+
+label2hash <- function(label) rlang::hash(label)
+
+cleanFileName <- function(name)
+ gsub('\\s|[/]|[\\]|[:]|[*]|[?]|["]|[<]|[>]|[+]|[$]|[=]', "_",
+ gsub("^\\s+|\\s+$", "", name))
+
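+# Class-frequency-weighted F1: per-class F1 scores, weighted by class
+# prevalence in the truth vector; NA truths are dropped first.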
+wtdF1 <- function(truth, pred) {
+ pred <- pred[!is.na(truth)]
+ truth <- truth[!is.na(truth)]
+ sum(sapply(unique(truth), function(ilab)
+ Metrics::fbeta_score(truth == ilab, pred == ilab) *
+ sum(truth == ilab) / length(truth)), na.rm = TRUE)
+}
+
+metricsTable <- function(truth, pred) {
+ do.call(rbind, lapply(sort(unique(truth)), function(ilab) {
+ tmptrue <- truth == ilab
+ tmppred <- pred == ilab
+ tmp <- data.frame(
+ "Tag" = ilab,
+ "N.cases" = sum(tmptrue),
+ "N.predict" = sum(tmppred),
+ "Accuracy" = sprintf("%f", round(Metrics::accuracy(tmptrue, tmppred), 6)),
+ "Precision" = sprintf("%f", round(Metrics::precision(tmptrue, tmppred), 6)),
+ "Recall" = sprintf("%f", round(Metrics::recall(tmptrue, tmppred), 6)),
+ "F1" = sprintf("%f", round(Metrics::fbeta_score(tmptrue, tmppred), 6)))
+ tmp$F1[tmp$F1 == "NaN"] <- sprintf("%f", 0)
+ tmp
+ }))
+}
+
+bertPlotFromPath <- function(path) {
+ if (!file.exists(paste0(path, "/trainer_state.json"))) return(NULL)
+ state <- RJSONIO::fromJSON(paste0(path, "/trainer_state.json"))
+ logs <- data.table::rbindlist(lapply(state$log_history, as.list), fill = TRUE)
+ epochmin <- logs$epoch[which.min(logs$eval_loss)]
+
+ gdat <- data.table::rbindlist(
+ use.names = FALSE,
+ list(
+ cbind("train", logs[!is.na(logs$loss), c("epoch", "loss")]),
+ cbind("valid", logs[!is.na(logs$eval_loss), c("epoch", "eval_loss")])
+ ))
+ colnames(gdat) <- c("data", "epoch", "loss")
+
+ ggplot(gdat, aes_string("epoch", "loss", color = "data", shape = "data")) +
+ geom_point(size = 2) + geom_line() +
+ geom_vline(xintercept = epochmin, linetype = 2) +
+ theme_bw() + expand_limits(y=0)
+}
+
+bertHyperToMsg <- function(pars) {
+ if (is.null(pars)) return(NULL)
+ paste0("Scheme : ", pars$bertScheme, "; ",
+ "Model : ", pars$bertModel, "\n* ",
+ "Epochs : ", pars$bertEpochs, " ; ",
+ "Lrate : ", pars$bertLrate, " ; ",
+ "Wdecay : ", pars$bertWdecay, " ; ",
+ "Batch size : ", pars$bertBatchsize, " ; ",
+ "Grad. accum. : ", pars$bertGradacc, "\n* ",
+ "Keep best : ", pars$bertBest, " ; ",
+ "ValidFrac : ", pars$bertValidFrac, " ; ",
+ "ValidSeed : ", pars$bertValidSeed, " \n* ",
+ pars$bertNtrain, " training samples : ",
+ paste0(names(pars$bertTrainTable), " (",
+ as.numeric(pars$bertTrainTable), ")",
+ collapse = " ; "))
+}
+
+get_status <- function(dafile)
+ scan(dafile, what = "character", sep="\n", quiet = TRUE)
+set_status <- function(dafile, msg) write(msg, dafile)
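+# Async jobs report progress through a per-session status file that the UI
+# polls with get_status().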
+
+
+
+################################################################################
+## Main server function
+################################################################################
+
+# options(shiny.maxRequestSize=2^34) # Max filesize
+
+shinyServer(function(input, output, session) {
+
+ values <- reactiveValues()
+ values$visuGo <- FALSE
+ values$dfmGo <- FALSE
+ # queryTrigger <- reactiveVal(0)
+ # predlabTrigger <- reactiveVal(0)
+ initDataTrigger <- reactiveVal(0)
+ queryNext <- reactiveVal(0)
+ trainTrigger <- reactiveVal(0)
+ predTrigger <- reactiveVal(0)
+ diagTrigger <- reactiveVal(0)
+ ready2tag <- reactiveVal(0)
+ trainCountdown <- reactiveVal(0)
+
+ ok.data <- reactiveVal()
+ ok.data.async <- reactiveVal()
+ ok.data.running <- reactiveVal(FALSE)
+
+ bpall_nclicks <- reactiveVal(0)
+ bertpred_nclicks <- reactiveVal(0)
+
+ ## NEW use_regressors list as values$new_use_regressors
+ observeEvent(input$use_regressors, {
+ values$new_use_regressors <- input$use_regressors
+ cat("DEBUG new_use_regressors: ", paste(input$use_regressors, collapse = " "), "\n")
+ })
+
+ ## NEW trainCountdown with input$trainCountdown
+ ## Policy: 0, don't train ; 1, trigger train on tag ; >1, decrement on tag
+ observeEvent(input$trainCountdown, trainCountdown(input$trainCountdown))
+
+ ####################
+ ## Projects and data
+
+ ## Read config files, update UI forms accordingly
+ ## This should happen only once
+ observe({
+ if (verbose)
+ cat("DEBUG read project configs\n")
+ # values$conf <- RJSONIO::fromJSON("conf.json")
+ projfiles <- dir(pattern = "^tigger_.*[.]json$")
+ isolate({
+ if (!length(projfiles)) {
+ values$conf <- NULL
+ return(NULL)
+ }
+ values$confnames <- gsub("^tigger_(.*)[.]json$", "\\1", projfiles)
+
+ values$conf <- RJSONIO::fromJSON(projfiles[1])
+ })
+ })
+
+ ## When projects added, update selector
+ observeEvent(values$confnames, {
+ if (verbose)
+ cat("DEBUG update selectProject from confnames\n")
+ updateSelectInput(session, "selectProject", choices = values$confnames)
+ })
+
+ ## Event from project change
+ observeEvent(input$selectProject, {
+ if (is.null(input$selectProject)) return(NULL)
+ if (!nchar(input$selectProject)) return(NULL)
+ if (verbose)
+ cat("DEBUG project change step 1\n")
+ # values$project <- input$selectProject
+ # values$conf <- values$confs[[values$project]]
+ values$conf <- RJSONIO::fromJSON(
+ paste0("tigger_", input$selectProject, ".json"))
+ ok.data(NULL)
+ ok.visu(NULL)
+ initDataTrigger(initDataTrigger() + 1)
+
+ if (verbose)
+ cat("DEBUG project change step 2\n")
+ if (!is.null(values$conf$dataNrows))
+ updateNumericInput(session, "dataNrows", value = as.numeric(values$conf$dataNrows))
+ if (!is.null(values$conf$dataSkipRows))
+ updateNumericInput(session, "dataSkipRows", value = as.numeric(values$conf$dataSkipRows))
+
+ #### Python ####
+ ## Default use_python if not found in conf file: FALSE
+ if(is.null(values$conf$use_python))
+ values$conf$use_python <- FALSE
+ updateCheckboxInput(session, "sys_use_python", value = values$conf$use_python)
+
+ ## Set default python if not found in conf file
+ if(is.null(values$conf$python))
+ values$conf$python <- "python3"
+ updateTextInput(session, "sys_which_python", value = values$conf$python)
+
+ values$python_ok <- FALSE
+ if (values$conf$use_python) {
+ ## Check whether python is working
+ pytest <- try(system(paste(values$conf$python, "--version"),
+ intern = TRUE))
+ if (inherits(pytest, "try-error")) {
+ showNotification(paste(
+ "Python path `", values$conf$python,
+ "` not valid, try changing it in Project/System tab"),
+ duration = 10, type = "error")
+ } else
+ values$python_ok <- TRUE
+ }
+ if (verbose)
+ cat("Python ok:", values$python_ok, "\n")
+
+
+ #### GPU ####
+ if (!is.null(values$conf$use_gpu))
+ updateCheckboxInput(session, "sys_use_gpu", value = values$conf$use_gpu)
+
+ #### Spacy ####
+ if (!is.null(values$conf$use_spacy))
+ updateCheckboxInput(session, "sys_use_spacy", value = values$conf$use_spacy)
+ updateTextInput(session, "sys_use_spacy_model", value = values$conf$use_spacy_model)
+
+ #### fastText ####
+ if (!is.null(values$conf$use_ft))
+ updateCheckboxInput(session, "sys_use_ft", value = values$conf$use_ft)
+ updateTextInput(session, "sys_use_ft_model", value = values$conf$use_ft_model)
+
+ #### SBERT ####
+ if (!is.null(values$conf$use_sb))
+ updateCheckboxInput(session, "sys_use_sb", value = values$conf$use_sb)
+ updateTextInput(session, "sys_use_sb_model", value = values$conf$use_sb_model)
+
+ })
+
+
+ ##############################################################################
+ ## Reactives : project settings UI
+ ##############################################################################
+
+ output$sys_datadir <- renderText(values$conf$datadir)
+ output$sys_datafile <- renderText(paste0(values$conf$dataname, ".feather"))
+  output$sys_var_id <- renderUI(p(values$conf$idcol))
+  output$sys_var_text <- renderUI(p(paste(values$conf$textcols, collapse = ", ")))
+  output$sys_var_tag <- renderUI(p(paste(values$conf$tagcols, collapse = ", ")))
+ output$sys_var_context_ui <- renderUI(selectInput(
+ "sys_var_context", NULL, values$data_orig_colnames, values$conf$contextcols,
+ multiple = TRUE))
+ output$sys_var_comm_ui <- renderUI(selectInput(
+ "sys_var_comm", NULL, c("(none)", values$data_orig_colnames),
+ values$conf$commcol))
+
+ output$sys_ex_lang_ui <- renderUI(selectInput(
+ "sys_ex_lang", NULL, modelnames_labels, "en"
+ ))
+ output$sys_ex_spacy <- renderText(
+ modelnames$spacy_name[modelnames$short == input$sys_ex_lang])
+ output$sys_ex_spacy_dl <- renderText(paste0(
+ "Download with python module: ", values$conf$python, " -m spacy download ",
+ modelnames$spacy_name[modelnames$short == input$sys_ex_lang]))
+ output$sys_ex_ft <- renderText(
+ modelnames$fasttext_name[modelnames$short == input$sys_ex_lang])
+ output$sys_ex_ft_dl <- renderText(paste0(
+ "Manual download link : ",
+ modelnames$fasttext_url[modelnames$short == input$sys_ex_lang]))
+ output$sys_ex_sb <- renderText(
+ ifelse(input$sys_ex_lang %in% c("ar", "zh", "nl", "en", "fr", "de",
+ "it", "ko", "pl", "pt", "ru", "es", "tr"),
+ "distiluse-base-multilingual-cased-v1",
+ "distiluse-base-multilingual-cased-v2"))
+
+
+ ## Save system changes on button click, with modal
+ output$save_config_msg <- renderUI({
+ req(values$save_config_msg)
+ p(strong(values$save_config_msg))
+ })
+ output$save_embed_msg <- renderUI({
+ tmp <- ""
+ if (input$sys_use_spacy) if (!identical(input$sys_use_spacy_model, values$conf$use_spacy_model))
+      tmp <- paste0(tmp, " Changing the spacy model will delete the current tokenized text and fasttext word embeddings, to prevent conflicts.")
+ if (input$sys_use_ft) if (!identical(input$sys_use_ft_model, values$conf$use_ft_model))
+ tmp <- paste0(tmp, " Changing the fasttext model will delete the current fasttext word embeddings, to prevent conflicts.")
+ if (input$sys_use_sb) if (!identical(input$sys_use_sb_model, values$conf$use_sb_model))
+ tmp <- paste0(tmp, " Changing the SBERT model will delete the current SBERT sentence embeddings, to prevent conflicts.")
+
+ if (nchar(tmp)) {
+      return(HTML(paste0("<p><strong>Warning:</strong>", tmp, "</p>")))
+ } else
+ return(NULL)
+ })
+ observeEvent(input$saveSystem, {
+ values$save_config_msg <- NULL
+ showModal(modalDialog(
+ title = "Save project configuration",
+ paste0("Click 'Save' to check and confirm the new configuration"),
+ uiOutput("save_embed_msg"),
+ uiOutput("save_config_msg"),
+ footer = tagList(actionButton(paste0("save_conf_confirm"), "Save"),
+ modalButton("Cancel"))))
+ })
+ observeEvent(input$save_conf_confirm, {
+ newconf <- values$conf
+ newconf$contextcols <- input$sys_var_context
+ newconf$commcol <- input$sys_var_comm
+
+ if (input$sys_use_python) {
+ pytest <- try(system(paste(input$sys_which_python, "--version"),
+ intern = TRUE))
+ if (inherits(pytest, "try-error")) {
+ values$save_config_msg <- paste0(
+ "Error: python path `", input$sys_which_python, "` not valid")
+ return(NULL)
+ }
+ }
+ newconf$use_python <- input$sys_use_python
+ newconf$python <- input$sys_which_python
+ newconf$use_gpu <- input$sys_use_gpu
+
+ if (input$sys_use_spacy) {
+ sptest <- system(paste0(
+ newconf$python, " -m spacy info ", input$sys_use_spacy_model), intern = TRUE)
+ if (length(sptest) == 0) {
+ values$save_config_msg <- paste(
+ "Error loading spacy, check that it is installed in the specified python env")
+ return(NULL)
+ }
+ if (length(attr(sptest, "status"))) {
+ values$save_config_msg <- paste(
+ "Error loading spacy model, check that it has been downloaded")
+ return(NULL)
+ }
+ newconf$use_spacy_model <- input$sys_use_spacy_model
+ }
+ newconf$use_spacy <- input$sys_use_spacy
+
+ if (input$sys_use_ft) {
+ if (!file.exists(input$sys_use_ft_model)) {
+ values$save_config_msg <- paste(
+ "Error loading fasttext model, check the specified path")
+ return(NULL)
+ }
+ newconf$use_ft_model <- input$sys_use_ft_model
+ }
+ newconf$use_ft <- input$sys_use_ft
+
+ if (input$sys_use_sb)
+ newconf$use_sb_model <- input$sys_use_sb_model
+ newconf$use_sb <- input$sys_use_sb
+
+ # if (newconf$use_spacy) if (newconf$use_spacy_model != values$conf$use_spacy_model) {
+ if (newconf$use_spacy) if (!identical(newconf$use_spacy_model, values$conf$use_spacy_model)) {
+ file.remove(paste0(values$conf$datadir, values$conf$dataname, "_spa.feather"))
+ file.remove(paste0(values$conf$datadir, values$conf$dataname, "_ft.feather"))
+ }
+ # if (newconf$use_ft) if (newconf$use_ft_model != values$conf$use_ft_model)
+ if (newconf$use_ft) if (!identical(newconf$use_ft_model, values$conf$use_ft_model))
+ file.remove(paste0(values$conf$datadir, values$conf$dataname, "_ft.feather"))
+ # if (newconf$use_sb) if (newconf$use_sb_model != values$conf$use_sb_model)
+ if (newconf$use_sb) if (!identical(newconf$use_sb_model, values$conf$use_sb_model))
+ file.remove(paste0(values$conf$datadir, values$conf$dataname, "_sb.feather"))
+
+ writeLines(RJSONIO::toJSON(newconf),
+ paste0("tigger_", newconf$projectid, ".json"))
+ values$conf <- newconf
+ initDataTrigger(initDataTrigger() + 1)
+ removeModal()
+ })
+
+ ##############################################################################
+ ## Reactives : data
+ ##############################################################################
+
+ ok.commentCol <- reactive(values$conf$commcol)
+
+ ## Current imported data (async)
+ observeEvent(input$dataImport, {
+ if (ok.data.running()) return(NULL)
+ ## Save nrows to conf file on change
+ if (!is.numeric(input$dataNrows) | !is.numeric(input$dataSkipRows)) return(NULL)
+ values$conf$dataNrows <- input$dataNrows
+ values$conf$dataSkipRows <- input$dataSkipRows
+ if (!is.null(values$conf$projectid))
+ writeLines(RJSONIO::toJSON(values$conf), paste0(
+ "tigger_", values$conf$projectid, ".json"))
+ initDataTrigger(initDataTrigger() + 1)
+ })
+
+
+ ## Read data event
+ observeEvent(initDataTrigger(), {
+ if (is.null(values$conf)) {
+ ok.data(NULL)
+ return(NULL)
+ }
+ if (initDataTrigger() == 0) return(NULL)
+ if (verbose)
+ cat("DEBUG enter ok.data\n")
+ if (verbose)
+ cat("DEBUG enter ok.data trigger", initDataTrigger(), "\n")
+ ok.data.running(TRUE)
+
+ ## Prepare variables for async launch
+ coco <- values$conf
+ coco$python_ok <- values$python_ok
+
+ da_token <- session$token
+ data_status <- paste0("tigger_", da_token, "_data")
+
+ file_orig_colnames <- paste0(data_status, "_origcols")
+ file_colstok <- paste0(data_status, "_tok")
+ file_colsft <- paste0(data_status, "_ft")
+ file_colssb <- paste0(data_status, "_sb")
+ file_colsbertpred <- paste0(data_status, "_bertpred")
+ da_missing <- paste0(data_status, "_bertpred_missing")
+
+ projpath <- paste0(coco$datadir, coco$projectid)
+ if (!file.exists(projpath))
+ dir.create(projpath)
+
+ ## Read data: async launch
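+ ## Pattern used throughout this file: future() computes off the main
+ ## process and pipes its result into a reactiveVal via %...>%; catch()
+ ## reports errors, and finally() runs finalization back in the session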
+ data_async <- future({
+ #################
+ ## Main data file
+
+ set_status(data_status, "Importing main data file...")
+ res <- arrow::read_feather(
+ paste0(coco$datadir, coco$dataname, ".feather"))
+ if (is.null(coco$dataNrows)) return(NULL)
+ rowselect <-
+ intersect(1:nrow(res), (1:(coco$dataNrows)) + coco$dataSkipRows)
+ if (!length(rowselect)) return(NULL)
+ res <- res[rowselect, , drop = FALSE]
+
+ writeLines(colnames(res), file_orig_colnames)
+
+
+ #################
+ ## Tokenized text
+
+ ## Import data : feather of two cols, one the id, other the tokenized
+ set_status(data_status, "Importing tokenized text...")
+ file_tok <- paste0(coco$datadir, coco$dataname, "_spa.feather")
+ if (file.exists(file_tok)) {
+ tokdat <- arrow::read_feather(file_tok)
+ } else
+ tokdat <- NULL
+ tok_remaining <- res[[coco$idcol]]
+ tok_remaining <-
+ tok_remaining[! tok_remaining %in% tokdat[[coco$idcol]]]
+
+ ## Tokenize remaining texts
+ if (length(tok_remaining)) {
+ if (!coco$python_ok | !coco$use_spacy) {
+ ## If remaining texts to tokenize but no python, fall back on
+ ## untokenized text
+ ## TODO here use quanteda tokenizer with options?
+ ## Fall back on raw (untokenized) text, for the remaining ids only,
+ ## appended to any previously saved tokens
+ tok_new <- data.frame(
+ tok_remaining,
+ do.call(paste, c(res[match(tok_remaining, res[[coco$idcol]]),
+ coco$textcols, drop = FALSE], sep = "\n\n")))
+ colnames(tok_new) <- c(coco$idcol, "text_spa")
+ tokdat <- rbind(tokdat, tok_new)
+ set_status(data_status, "Importing tokenized text: no spacy ok")
+ } else {
+ ## If python present, tokenize with spacy
+ set_status(data_status,
+ paste("Tokenizing", length(tok_remaining), "new texts"))
+ tmpfile <- paste0("tigger_tok_", da_token)
+ arrow::write_feather(
+ data.frame(
+ id = tok_remaining,
+ text = do.call(
+ paste, c(res[match(tok_remaining, res[[coco$idcol]]),
+ coco$textcols, drop = FALSE], sep = "\n\n"))),
+ paste0(tmpfile, ".feather")
+ )
+ system(paste0(
+ coco$python, " tokenize_spacy.py -d ", tmpfile, ".feather",
+ " -m ", coco$use_spacy_model))
+ tok_new <- arrow::read_feather(paste0(tmpfile, "_spa.feather"))
+ colnames(tok_new)[colnames(tok_new) == "id"] <- coco$idcol
+ tokdat <- rbind(tokdat, tok_new)
+ arrow::write_feather(tokdat, file_tok)
+ system(paste0("rm ", tmpfile, "*"))
+ set_status(data_status, "Importing tokenized text: spacy ok")
+ }
+ }
+
+ ## Save colname for tokenized text
+ tmp_cols_tok <- colnames(tokdat)[colnames(tokdat) != coco$idcol]
+ writeLines(tmp_cols_tok, file_colstok)
+
+ set_status(data_status, "Importing tokenized text: wrote colnames")
+
+ ## Merge with current data
+ res <- merge(res, tokdat, by = coco$idcol,
+ all.y = FALSE, sort = FALSE)
+ rm(tokdat)
+
+ set_status(data_status, "Importing tokenized text: merged")
+
+ #################
+ ## FastText embeddings
+
+ set_status(data_status, paste("Loading fastText embeddings"))
+
+ ## FastText embeddings : feather file with id and embeddings
+ file_ft <- paste0(coco$datadir, coco$dataname, "_ft.feather")
+ if (file.exists(file_ft)) {
+ ftdat <- arrow::read_feather(file_ft)
+ } else
+ ftdat <- NULL
+ ft_remaining <- res[[coco$idcol]]
+ ft_remaining <-
+ ft_remaining[! ft_remaining %in% ftdat[[coco$idcol]]]
+
+ ## Embed remaining texts, if FT in use
+ if (length(ft_remaining)) {
+ if (!coco$python_ok | !coco$use_ft) {
+ ## If no python or FT, abort if not all rows already embedded
+ ftdat <- NULL
+ } else {
+ set_status(data_status, paste(
+ "Embedding", length(ft_remaining), "new texts with fastText"))
+ tmpfile <- paste0("tigger_ft_", da_token)
+ arrow::write_feather(
+ data.frame(
+ id = ft_remaining,
+ text = do.call(
+ paste, res[match(ft_remaining, res[[coco$idcol]]),
+ tmp_cols_tok, drop = FALSE])),
+ paste0(tmpfile, ".feather")
+ )
+ system(paste0(
+ coco$python, " embed_fasttext.py -d ", tmpfile, ".feather",
+ " -m ", coco$use_ft_model))
+ ft_new <- arrow::read_feather(paste0(tmpfile, "_ft.feather"))
+ colnames(ft_new)[colnames(ft_new) == "id"] <- coco$idcol
+ ftdat <- rbind(ftdat, ft_new)
+ arrow::write_feather(ftdat, file_ft)
+ system(paste0("rm ", tmpfile, "*"))
+ }
+ }
+
+ if (!is.null(ftdat)) {
+ ## Merge with current data
+ writeLines(colnames(ftdat)[colnames(ftdat) != coco$idcol], file_colsft)
+ res <- merge(res, ftdat, by = coco$idcol, all.y = FALSE, sort = FALSE)
+ rm(ftdat)
+ }
+
+
+ #################
+ ## SBERT embeddings
+
+ set_status(data_status, "Loading SBERT embeddings")
+
+ ## SBERT file : feather of id and embeddings
+ file_sb <- paste0(coco$datadir, coco$dataname, "_sb.feather")
+ if (file.exists(file_sb)) {
+ sbdat <- arrow::read_feather(file_sb)
+ } else
+ sbdat <- NULL
+ sb_remaining <- res[[coco$idcol]]
+ sb_remaining <-
+ sb_remaining[! sb_remaining %in% sbdat[[coco$idcol]]]
+
+ ## Embed remaining texts
+ if (length(sb_remaining)) {
+ if (!coco$python_ok | !coco$use_sb) {
+ ## If no python or SB, abort if not all rows already embedded
+ sbdat <- NULL
+ } else {
+ set_status(data_status, paste(
+ "Embedding", length(sb_remaining), "new texts with SBERT"))
+ tmpfile <- paste0("tigger_sb_", da_token)
+ arrow::write_feather(
+ data.frame(
+ id = sb_remaining,
+ text = do.call(
+ paste, res[match(sb_remaining, res[[coco$idcol]]),
+ coco$textcols, drop = FALSE])),
+ paste0(tmpfile, ".feather")
+ )
+ system(paste0(
+ coco$python, " embed_sbert.py -d ", tmpfile, ".feather",
+ " -m ", coco$use_sb_model))
+ sb_new <- arrow::read_feather(paste0(tmpfile, "_sb.feather"))
+ colnames(sb_new)[colnames(sb_new) == "id"] <- coco$idcol
+ sbdat <- rbind(sbdat, sb_new)
+ arrow::write_feather(sbdat, file_sb)
+ system(paste0("rm ", tmpfile, "*"))
+ }
+ }
+
+ if (!is.null(sbdat)) {
+ ## Merge with current data
+ writeLines(colnames(sbdat)[colnames(sbdat) != coco$idcol], file_colssb)
+ res <- merge(res, sbdat, by = coco$idcol, all.y = FALSE, sort = FALSE)
+ rm(sbdat)
+ }
+
+ #################
+ ## BERT predictions
+
+ set_status(data_status, "Loading BERT predictions")
+
+ ## BERT predictions : feathers of id and probs
+ ## TODO: take full prediction if existing
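+ ## Prediction columns are renamed with the model name as a prefix, so
+ ## predictions from several BERT models can coexist as predictors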
+
+ files_pred <- dir(
+ coco$datadir, paste0("^", coco$dataname, "_bertpred_.*[.]feather$"))
+ # files_predall <- dir(
+ # coco$datadir, paste0("^", coco$dataname, "_bertpredall_.*[.]feather$"))
+ # files_pred <- files_pred[
+ # ! gsub("_bertpred_", "_bertpredall_", files_pred) %in% files_predall]
+ # files_pred <- c(files_predall, files_pred)
+
+ tmp_bertpred_cols <- list()
+ if (length(files_pred)) {
+ for (ipred in files_pred) {
+ preddat <- arrow::read_feather(paste0(coco$datadir, ipred))
+ predname <-
+ gsub(paste0("^", coco$dataname, "_bertpred_(.*)[.]feather$"),
+ "\\1", ipred)
+ # predname <-
+ # gsub(paste0("^", coco$dataname, "_bertpred(all)?_(.*)[.]feather$"),
+ # "\\2", ipred, perl = TRUE)
+ n_missing <- sum(! res[[coco$idcol]] %in% preddat[[coco$idcol]])
+ if (n_missing) {
+ write(
+ paste0("BERT prediction ", predname, " : ", n_missing,
+ " texts missing in prediction."),
+ da_missing, append = TRUE)
+ } else {
+ colnames(preddat) <- gsub(
+ "^bertpred_", paste0("bertpred_", predname), colnames(preddat))
+ tmp_bertpred_cols[[predname]] <-
+ colnames(preddat)[colnames(preddat) != coco$idcol]
+ res <- merge(res, preddat, by = coco$idcol,
+ all.y = FALSE, sort = FALSE)
+ }
+ rm(preddat)
+ }
+ if (length(tmp_bertpred_cols)) {
+ writeLines(names(tmp_bertpred_cols), file_colsbertpred)
+ lapply(names(tmp_bertpred_cols), function(x) {
+ writeLines(tmp_bertpred_cols[[x]], paste0(file_colsbertpred, "_", x))
+ })
+ }
+ }
+
+ ## Comment col
+ set_status(data_status, "Loading comments")
+ tmpfile <- paste0(projpath, "/", coco$commcol, ".csv")
+ if (file.exists(tmpfile)) {
+ tmpcol <- read.csv(tmpfile)
+ colnames(tmpcol)[2] <- coco$commcol
+ ## Remove duplicates, keep newest
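+ ## (reverse the rows so duplicated() keeps the last occurrence of each
+ ## id, then restore the original order)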
+ tmpcol <- tmpcol[nrow(tmpcol):1, , drop = FALSE]
+ tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), , drop = FALSE]
+ tmpcol <- tmpcol[nrow(tmpcol):1, , drop = FALSE]
+ res <- merge(res, tmpcol, by = coco$idcol,
+ all.x = TRUE, all.y = FALSE, sort = FALSE)
+ res[[coco$commcol]][is.na(res[[coco$commcol]])] <- ""
+ } else
+ res[[coco$commcol]] <- ""
+
+ ## Tagging cols
+ set_status(data_status, "Loading tags")
+ for (itag in coco$tagcols) {
+ itaghist <- paste0("hist_", itag)
+ if (is.null(res[[itag]])) res[[itag]] <- NA
+ ## Initialize the history column over all rows before filling it, so
+ ## that partial assignment cannot fail on a missing column
+ res[[itaghist]] <- NA
+ if (!all(is.na(res[[itag]]))) {
+ res[[itaghist]][!is.na(res[[itag]])] <- 1:sum(!is.na(res[[itag]]))
+ }
+ tmpfile <- paste0(projpath, "/", itag, ".csv")
+ if (file.exists(tmpfile)) {
+ tmpcol <- read.csv(tmpfile)
+ tmpcol[[itaghist]] <- 1:nrow(tmpcol) +
+ max(0, suppressWarnings(max(na.omit(res[[itaghist]]))))
+ ## Remove duplicates, keep newest
+ tmpcol <- tmpcol[nrow(tmpcol):1, ]
+ tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), ]
+ tmpcol <- tmpcol[nrow(tmpcol):1, ]
+ tmpcol <- tmpcol[tmpcol[[coco$idcol]] %in% res[[coco$idcol]], ]
+ if (verbose)
+ cat("DEBUG imported", itag, nrow(tmpcol), "\n")
+ if (nrow(tmpcol)) {
+ idmatches <- match(tmpcol[[coco$idcol]], res[[coco$idcol]])
+ res[[itag]][idmatches] <- tmpcol[[itag]]
+ res[[itaghist]][idmatches] <- tmpcol[[itaghist]]
+ }
+ }
+ }
+
+ set_status(data_status, "Finalizing data import")
+ res
+ }) %...>% ok.data.async()
+ data_async <- catch(data_async, function(error) {
+ values$modelDiagno <- paste0("Error in data import: ", error)
+ })
+ data_async <- finally(data_async, function() {
+ #################
+ ## Finalize data importation
+ res <- ok.data.async()
+ if (is.null(res)) return(NULL)
+
+ set_status(data_status, "Finalizing data import: read columns")
+ ## Read column (tok, ft, sb, ...) names from dedicated files
+ values$tagcols <- coco$tagcols
+ values$cols_tok <- readLines(file_colstok)
+ values$cols_ft <- NULL
+ if (file.exists(file_colsft)) {
+ values$cols_ft <- readLines(file_colsft)
+ }
+ values$cols_sb <- NULL
+ if (file.exists(file_colssb)) {
+ values$cols_sb <- readLines(file_colssb)
+ }
+ set_status(data_status, "Finalizing data import: bertpred")
+ if (file.exists(file_colsbertpred)) {
+ tmp_bertpred <- readLines(file_colsbertpred)
+ values$cols_bertpred <- lapply(tmp_bertpred, function(x)
+ readLines(paste0(file_colsbertpred, "_", x)))
+ names(values$cols_bertpred) <- tmp_bertpred
+ } else
+ values$cols_bertpred <- NULL
+ set_status(data_status, "Finalizing data import: bertpred missing")
+ if (file.exists(da_missing)) {
+ lapply(readLines(da_missing), function(x)
+ showNotification(paste(
+ x, "\n",
+ "You may predict on current text in BERT panel, ",
+ "and then reload the data.")))
+ }
+
+ ## Populate regressors list in prediction panel
+ set_status(data_status, "Finalizing data import: populating regressors")
+ regr_choices <- c(
+ "Regex" = "regex",
+ "Word counts (DFM)" = "dfm")
+ if (!is.null(values$cols_ft))
+ regr_choices <- c(regr_choices, "Word embeddings (FastText)" = "ft")
+ if (!is.null(values$cols_sb))
+ regr_choices <- c(regr_choices, "Sentence embeddings (SBERT)" = "sb")
+ regr_choices <- c(regr_choices, "Extra predictors" = "extra")
+
+ if (length(values$cols_bertpred)) {
+ tmp_choices <- paste0("bertpred_", names(values$cols_bertpred))
+ names(tmp_choices) <- paste0("BERT_", names(values$cols_bertpred))
+ regr_choices <- c(regr_choices, tmp_choices)
+ }
+
+ # if (length(regr_choices) > 3) {
+ regr_choices_sel <-
+ regr_choices[regr_choices %in% c("regex", "ft", "sb")]
+ # } else
+ # regr_choices_sel <- c("regex", "dfm")
+
+ updateSelectizeInput(
+ session, "use_regressors", choices = regr_choices,
+ selected = regr_choices_sel)
+ values$new_use_regressors <- regr_choices_sel
+
+ othercols <- colnames(res)
+ othercols <- othercols[! othercols %in% c(
+ values$cols_tok, values$cols_ft, values$cols_sb, values$cols_bertpred,
+ values$tagcols, paste0("hist_", values$tagcols),
+ coco$commcol, coco$textcols)]
+ updateSelectInput(
+ session, "dlTagSelect",
+ choices = c("tags", "comments", "predictions", othercols),
+ selected = c("tags", "comments", "predictions"))
+
+ ## Populate extra regressors lists in prediction panel
+ updateSelectizeInput(
+ session, "use_ootregnum", choices = othercols, selected = NULL)
+ updateSelectizeInput(
+ session, "use_ootregcat", choices = othercols, selected = NULL)
+
+ ## Project path
+ values$projectdir <- paste0(projpath, "/")
+
+ ## Create tagmat: data.frame of all current taggings
+ values$tagmat <- res[, coco$tagcols, drop = FALSE]
+ ## Create tagmathist: data.frame of historical order of taggings
+ values$tagmathist <- res[, paste0("hist_", coco$tagcols), drop = FALSE]
+ colnames(values$tagmathist) <- coco$tagcols
+
+ ## Save data original colnames
+ values$data_orig_colnames <- readLines(file_orig_colnames)
+
+ ok.data(ok.data.async())
+ ok.data.async(NULL)
+ ok.data.running(FALSE)
+ set_status(data_status, "Exit data async")
+ system(paste0("rm ", data_status, "*"))
+ values$modelDiagno <- "No model"
+
+
+ if (trainTrigger() == 0) trainTrigger(1) ## For first run. TODO set trainTrigger to 0 when changing data?
+ if (verbose) cat("DEBUG Exit data async finally\n")
+ })
+ NULL
+ })
+
+ ## Read data import status from file
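+ ## Polls the status file every 100 ms while the async import is running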
+ observe({
+ if (ok.data.running()) {
+ invalidateLater(100)
+ tmpfile <- paste0("tigger_", session$token, "_data")
+ if (file.exists(tmpfile)) {
+ values$modelDiagno <- paste(collapse = "\n", scan(
+ tmpfile, what = "character", sep="\n", quiet = TRUE))
+ } else
+ values$modelDiagno <- ""
+ }
+ })
+
+
+ ###############
+ ## ok.text() reactive
+ ###############
+ ok.text <- eventReactive(ok.data(), {
+ if (is.null(ok.data())) return(NULL)
+ res <- ok.data()[, values$cols_tok[1]]
+ if (length(values$cols_tok) > 1) {
+ for (icol in values$cols_tok[-1]) {
+ res <- paste0(res, "\n\n", ok.data()[, icol])
+ }
+ }
+ # queryTrigger(queryTrigger() + 1)
+ if (verbose)
+ cat("DEBUG ok.text", length(res), "\n")
+ res
+ })
+
+ ok.labelcol <- reactive({
+ if (is.null(ok.data())) return(NULL)
+ if (verbose)
+ cat("DEBUG enter ok.labelcol\n")
+ if (is.null(input$selectScheme)) return(values$conf$tagcols[1])
+ input$selectScheme
+ })
+
+
+ ok.nicetext <- reactive({
+ req(ok.data(), values$conf$textcols)
+ # if (is.null(ok.data())) return(NULL)
+ do.call(paste, c(ok.data()[, values$conf$textcols, drop = FALSE],
+ sep = "\n\n"))
+ })
+
+ ok.regressor.names <- reactive({
+ daregs <- values$new_use_regressors
+ if (verbose)
+ cat("DEBUG enter ok.regressor.names ", paste(daregs, collapse = " "), "\n")
+ res <- NULL
+ if ("ft" %in% daregs)
+ res <- c(res, values$cols_ft)
+ if ("sb" %in% daregs)
+ res <- c(res, values$cols_sb)
+ if (length(values$cols_bertpred)) {
+ for (ipred in 1:length(values$cols_bertpred)) {
+ if (paste0("bertpred_", names(values$cols_bertpred)[ipred]) %in% daregs)
+ res <- c(res, values$cols_bertpred[[ipred]])
+ }
+ }
+ if (verbose)
+ cat("DEBUG exit ok.regressor.names:", length(res), "\n")
+ res
+ })
+
+ ok.contextCol <- reactive({
+ if (verbose)
+ cat("DEBUG enter ok.contextCol\n")
+ # res <- isolate(values$conf$contextcols)
+ res <- values$conf$contextcols
+ names(res) <- res
+ res
+ })
+
+ ## Initiate label and queries history, on ok.text() or labelcol change,
+ observeEvent(list(ok.text(), ok.labelcol()), {
+ if (verbose)
+ cat("DEBUG initiate label\n")
+ if (is.null(ok.labelcol())) {
+ values$label <- NULL
+ values$labelhist <- NULL
+ values$queries <- NULL
+ values$comment <- NULL
+ values$wasTagged <- NULL
+ values$wasRetagged <- NULL
+ } else {
+ values$label <- values$tagmat[, ok.labelcol()]
+ values$labelhist <- values$tagmathist[, ok.labelcol()]
+ values$queries <- which(!is.na(values$label))
+ values$comment <- ok.data()[[values$conf$commcol]]
+ values$wasTagged <- !is.na(values$label)
+ values$wasRetagged <- rep(FALSE, length(values$label))
+ }
+ })
+
+ ## On values$label change, values$tagmat is updated for separate history
+ observeEvent(values$label, {
+ if (is.null(values$label)) return(NULL)
+ if (verbose)
+ cat("DEBUG Update tagmat from label\n")
+ values$tagmat[, ok.labelcol()] <- values$label
+ values$tagmathist[, ok.labelcol()] <- values$labelhist
+ })
+
+
+ ##############################################################################
+ ## Reactives : training
+ ##############################################################################
+
+ ## Message of how many tagged and untagged, for model diagnostics box
+ ok.msgtagged <- reactive({
+ if (verbose)
+ cat("DEBUG enter ok.msgtagged\n")
+ # if (is.null(ok.data()) | ok.data.running()) {
+ if (is.null(ok.data())) {
+ return("No data yet ")
+ }
+
+ tmp <- ""
+ # if (sum(is.na(values$label)) == 0)
+ # tmp <- paste0(tmp, "Data correction mode (0 NA labels left).\n")
+ paste0(tmp, "Tagged: ", sum(!is.na(values$label)),
+ " ; Untagged: ", sum(is.na(values$label)))
+ })
+
+ ## Training possible? At least 3 texts for 2 tags
+ ok.train.possible <- reactive({
+ if (verbose)
+ cat("DEBUG enter ok.train.possible\n")
+ labtab <- table(values$label)
+ if (length(labtab) < 2) return(FALSE)
+ if (sum(labtab > 2) < 2) return(FALSE)
+ TRUE
+ })
+
+ ## Compute DFM (async)
+ ## TODO make this less reactive
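+ ## Pipeline: lowercase -> tokens -> n-grams -> document-feature matrix,
+ ## trimmed by document/term frequency, optionally tf-idf weighted, then
+ ## converted to a sparse Matrix for the model backends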
+ observe({
+ # if ("dfm" %in% input$use_regressors & !values$dfmGo)
+ if ("dfm" %in% values$new_use_regressors & !values$dfmGo)
+ values$dfmGo <- TRUE
+ })
+ ok.dfm <- reactiveVal()
+ observe({
+ if (is.null(ok.text())) return(NULL)
+ if (! values$dfmGo) return(NULL)
+ if (!is.numeric(input$dfmNgrams)) return(NULL)
+ if (verbose)
+ cat("DEBUG enter dfm observer\n")
+
+ datext <- ok.text()
+ dfmNgrams <- input$dfmNgrams
+ dfmTfIdf <- input$dfmTfIdf
+ dfmMinDocfreq <- input$dfmMinDocfreq
+ dfmMinTermfreq <- input$dfmMinTermfreq
+ dfmTfScheme <- input$dfmTfScheme
+ dfmDfScheme <- input$dfmDfScheme
+
+ dfm_async <- future(seed = TRUE, {
+ dadfm <- datext %>%
+ tolower() %>% tokens(remove_punct = F) %>%
+ # tokens_select(pattern = stopwords(language= "fr"), selection = "remove") %>%
+ tokens_ngrams(n= 1:dfmNgrams) %>% dfm() %>%
+ dfm_trim(min_docfreq = dfmMinDocfreq, min_termfreq = dfmMinTermfreq,
+ verbose = TRUE)
+ if (dfmTfIdf) {
+ dadfm <- dadfm %>%
+ dfm_tfidf(scheme_tf = dfmTfScheme, scheme_df = dfmDfScheme)
+ }
+ suppressWarnings({ # TODO : check
+ dadfm <- quanteda::convert(dadfm, to= "tm")
+ })
+ dadfm <- Matrix::sparseMatrix(i= dadfm$i, j= dadfm$j, x= dadfm$v,
+ dimnames = dadfm$dimnames)
+ dadfm
+ }) %...>% ok.dfm()
+ dfm_async <- catch(dfm_async, function(error) {
+ values$modelDiagno <- paste0("Error in DFM: ", error)
+ })
+ })
+
+ ## Predictor : df or matrix of dfm and extra
+ ok.regressor.matrix <- reactive({
+ if (is.null(ok.text())) return(NULL)
+ if (is.null(ok.regressor.names())) return(NULL)
+ if (verbose)
+ cat("DEBUG enter ok.regressor.matrix\n")
+ model.matrix(~., data= ok.data()[, ok.regressor.names()])[, -1]
+ })
+
+ ok.extraregressor.matrix <- reactive({
+ req(ok.text())
+ if (! "extra" %in% values$new_use_regressors) return(NULL)
+ if (verbose)
+ cat("DEBUG enter ok.extraregressor.matrix\n")
+
+ res <- NULL
+ for (inum in input$use_ootregnum) {
+ suppressWarnings({tmp <- as.numeric(ok.data()[[inum]])})
+ if (all(is.na(tmp))) {
+ showNotification(paste0(inum, " is all NAs or non-numeric ; predictor not used"),
+ duration = 5, type = "warning")
+ next
+ }
+ tmp[is.na(tmp)] <- median(tmp, na.rm = TRUE)
+ res <- cbind(res, tmp)
+ }
+ for (icat in input$use_ootregcat) {
+ tmp <- ok.data()[[icat]]
+ if (! length(unique(tmp)) %in% 2:100) {
+ showNotification(paste0(icat, " has ", length(unique(tmp)),
+ " unique values ; predictor not used"),
+ duration = 5, type = "warning")
+ next
+ }
+ tmp <- as.character(tmp)
+ tmp[is.na(tmp)] <- ""
+ tmp <- as.factor(tmp)
+ tmpmat <- model.matrix(~., data.frame(cat = tmp))[, -1, drop = FALSE]
+ res <- cbind(res, tmpmat)
+ }
+ if (verbose)
+ cat("DEBUG exit ok.extraregressor.matrix", ncol(res), "\n")
+ res
+ })
+
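+ ## Regex predictors: binary matrix with one column per active regex,
+ ## 1 when the text matches (case sensitivity toggled per regex)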
+ ok.extraRegex.matrix <- reactive({
+ if (is.null(ok.nicetext())) return(NULL)
+ if (is.null(values$extraRegex)) return(NULL)
+ if (is.null(values$regex_inuse)) return(NULL)
+ if (!any(values$regex_inuse)) return(NULL)
+ if (is.null(values$regex_case)) return(NULL)
+ if (verbose)
+ cat("DEBUG enter ok.extraRegex.matrix\n")
+ tmp <- as.matrix(sapply(which(values$regex_inuse), function(iregex) {
+ 1 * stringi::stri_detect_regex(
+ ok.nicetext(), values$extraRegex[iregex],
+ opts_regex = list(case_insensitive = !values$regex_case[iregex]))
+ }))
+ colnames(tmp) <- values$extraRegex[values$regex_inuse]
+ tmp
+ })
+
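+ ## Assemble the full predictor matrix: DFM word counts, embeddings,
+ ## regex indicators and extra regressors, converted to the format the
+ ## selected model expects (sparse csr for liblinear, dfm for naive bayes)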
+ ok.predictor <- reactive({
+ req(values$new_use_regressors)
+ if (ok.data.running()) return(NULL)
+ if (is.null(ok.text())) return(NULL)
+ if (verbose)
+ cat("DEBUG enter ok.predictor\n")
+ dadfm <- NULL
+ if ("dfm" %in% values$new_use_regressors) {
+ dadfm <- ok.dfm()
+ } else {
+ if (input$predModel == "naive bayes") {
+ values$modelDiagno <-
+ "Error in training: Naive Bayes requires DFM word counts as predictors."
+ return(NULL)
+ }
+ }
+ if (length(ok.regressor.names()) > 0) {
+ if (input$predModel == "random forest") {
+ if (is.null(dadfm)) {
+ dadfm <- ok.data()[, ok.regressor.names()]
+ } else
+ dadfm <- cbind(ok.data()[, ok.regressor.names()], as.matrix(dadfm))
+ }
+ if (input$predModel %in% c("lasso", "linear", "knn")) {
+ dadfm <- cbind(ok.regressor.matrix(), dadfm)
+ }
+ }
+ # if ("regex" %in% input$use_regressors && length(values$regex_inuse) > 0) {
+ if ("regex" %in% values$new_use_regressors & !is.null(ok.extraRegex.matrix())) {
+ dadfm <- cbind(dadfm, ok.extraRegex.matrix())
+ }
+ if ("extra" %in% values$new_use_regressors) {
+ dadfm <- cbind(dadfm, ok.extraregressor.matrix())
+ }
+
+ if (is.null(dadfm)) {
+ values$modelDiagno <- "Error in training: no predictors selected"
+ if (verbose)
+ cat("DEBUG ok.predictor exit: no predictors\n")
+ return(NULL)
+ }
+
+ if (input$predModel == "linear")
+ dadfm <- SparseM::as.matrix.csr(dadfm, nrow(dadfm), ncol(dadfm))
+ if (input$predModel == "naive bayes")
+ dadfm <- as.dfm(dadfm)
+ if (verbose)
+ cat("DEBUG exit ok.predictor: ", ncol(dadfm), "\n")
+ dadfm
+ })
+
+
+ ## Train model async
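+ ## Each model family is mapped to a (train_fun, train_args) pair, fitted
+ ## asynchronously below; for linear, lasso and naive bayes, tags with a
+ ## single example are set to NA before fitting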
+ ok.train <- reactiveVal()
+ ok.train.async <- reactiveVal()
+ observeEvent(
+ list(trainTrigger(), input$modelTrain),
+ {
+ if (is.null(ok.predictor())) return(NULL)
+ if (verbose)
+ cat("DEBUG enter ok.train observer\n")
+ if (!ok.train.possible()) {
+ values$modelDiagno <-
+ "Tag at least 3 texts for 2 tags to train prediction model."
+ ok.train(NULL)
+ # queryTrigger(queryTrigger() + 1)
+ return(NULL)
+ }
+ ready2tag(0)
+ values$modelDiagno <- paste0("Training ", input$predModel)
+
+ tmp_model <- input$predModel
+ if (tmp_model == "random forest") {
+ if (!is.numeric(input$rfNumTrees) | !is.numeric(input$rfMtry) |
+ !is.numeric(input$rfSampleFrac)) {
+ ok.train(NULL)
+ return(NULL)
+ }
+ train_fun <- ranger::ranger
+ train_args <- list(
+ x= ok.predictor()[!is.na(values$label), ],
+ y= as.factor(values$label[!is.na(values$label)]),
+ num.trees = input$rfNumTrees,
+ mtry = if (input$rfMtry > 0) input$rfMtry,
+ sample.fraction = input$rfSampleFrac, probability = TRUE)
+ } else {
+ if (tmp_model == "linear") {
+ tmpvalues <- values$label
+ if (!is.numeric(input$liblinCost)) {
+ ok.train(NULL)
+ return(NULL)
+ }
+ for (ival in na.omit(unique(tmpvalues)))
+ if (sum(tmpvalues == ival, na.rm = T) == 1)
+ tmpvalues[which(tmpvalues == ival)] <- NA
+ train_fun <- LiblineaR::LiblineaR
+ train_args <- list(
+ data= ok.predictor()[!is.na(tmpvalues), ],
+ target= as.factor(tmpvalues[!is.na(tmpvalues)]),
+ type= 0, cost = input$liblinCost)
+ } else if (tmp_model == "lasso") {
+ tmpvalues <- values$label
+ for (ival in na.omit(unique(tmpvalues)))
+ if (sum(tmpvalues == ival, na.rm = T) == 1)
+ tmpvalues[which(tmpvalues == ival)] <- NA
+ train_fun <- glmnet::glmnet
+ train_args <- list(
+ x= ok.predictor()[!is.na(tmpvalues), ],
+ y= as.factor(tmpvalues[!is.na(tmpvalues)]),
+ family= "multinomial")
+ } else if (tmp_model == "naive bayes") {
+ if (!is.numeric(input$naiveSmooth)) {
+ ok.train(NULL)
+ return(NULL)
+ }
+ tmpvalues <- values$label
+ for (ival in na.omit(unique(tmpvalues)))
+ if (sum(tmpvalues == ival, na.rm = T) == 1)
+ tmpvalues[which(tmpvalues == ival)] <- NA
+ train_fun <- quanteda.textmodels::textmodel_nb
+ train_args <- list(
+ x= ok.predictor()[!is.na(tmpvalues), ],
+ y= as.factor(tmpvalues[!is.na(tmpvalues)]),
+ smooth = input$naiveSmooth, prior = input$naivePrior,
+ distribution = input$naiveDistri)
+ } else if (tmp_model == "knn") {
+ train_fun <- function(x) return(TRUE)
+ train_args <- list(x = "knn")
+ }
+ }
+
+ train_async <- future(seed = TRUE, {
+ suppressWarnings(do.call(train_fun, train_args))
+ }) %...>% ok.train.async()
+ train_async <- catch(train_async, function(error) {
+ values$modelDiagno <- paste0("Model training error: ", error)
+ ok.train(NULL)
+ ok.train.async(NULL)
+ })
+ train_async <- finally(train_async, function() {
+ values$modelDiagno <- "Training done, predicting..."
+ ok.train(ok.train.async())
+ ok.train.async(NULL)
+ predTrigger(predTrigger() + 1)
+ })
+ NULL
+ }
+ )
+
+ ## Model prediction on full dataset (async)
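+ ## Each model type takes different predict() arguments, assembled here
+ ## before launching the future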
+ ok.pred <- reactiveVal()
+ ok.pred.async <- reactiveVal()
+ observeEvent(predTrigger(), {
+ if (is.null(ok.train())) {
+ ok.pred(NULL)
+ return(NULL)
+ }
+ if (verbose)
+ cat("DEBUG ok.pred()\n")
+
+ damodel <- input$predModel
+ if (input$predModel == "random forest") {
+ pred_args <- list(ok.train(), data = ok.predictor())
+ dacolnames <- colnames(ok.train()$predictions)
+ } else if (input$predModel == "lasso") {
+ if (!is.numeric(input$glmLambda)) {
+ ok.pred(NULL)
+ return(NULL)
+ }
+ pred_args <- list(ok.train(), newx = ok.predictor(), s = input$glmLambda,
+ type = "response")
+ } else if (input$predModel == "linear") {
+ pred_args <- list(ok.train(), newx = ok.predictor(), proba = TRUE)
+ } else if (input$predModel == "naive bayes") {
+ pred_args <- list(ok.train(), newdata = ok.predictor(),
+ type = "probability")
+ } else if (input$predModel == "knn") {
+ if (!is.numeric(input$knnK)) {
+ ok.pred(NULL)
+ return(NULL)
+ }
+ pred_args <- list(train = ok.predictor()[!is.na(values$label), ],
+ test = ok.predictor(),
+ cl = values$label[!is.na(values$label)],
+ k = input$knnK, prob = TRUE)
+ }
+
+ pred_async <- future(seed = TRUE, {
+ if (damodel == "random forest") {
+ res <- do.call(predict, pred_args)$predictions
+ colnames(res) <- dacolnames
+ res
+ } else if (damodel == "lasso") {
+ do.call(predict, pred_args)[, , 1]
+ } else if (damodel == "linear") {
+ do.call(predict, pred_args)$probabilities
+ } else if (damodel == "naive bayes") {
+ do.call(predict, pred_args)
+ } else if (damodel == "knn") {
+ tmp <- do.call(class::knn, pred_args)
+ ## A bit flaky here: probs of non-best classes uniformly share the rest,
+ ## and distance is euc
+ tmpvalues <- sort(unique(pred_args$cl))
+ res <- matrix(0, nrow(pred_args$test), length(tmpvalues),
+ dimnames = list(NULL, tmpvalues))
+ res[cbind(1:nrow(res), tmp)] <- attr(tmp, "prob") -
+ ((1 - attr(tmp, "prob")) / (ncol(res) - 1))
+ res <- res + (1 - attr(tmp, "prob")) / (ncol(res) - 1)
+ res
+ }
+ }) %...>% ok.pred.async()
+ pred_async <- catch(pred_async, function(error) {
+ values$modelDiagno <- paste0("Error in prediction: ", error)
+ })
+ pred_async <- finally(pred_async, function() {
+ if (verbose)
+ cat("DEBUG ok.pred() out of async\n")
+ ok.pred(ok.pred.async())
+ ok.pred.async(NULL)
+ # predlabTrigger(predlabTrigger() + 1)
+ diagTrigger(diagTrigger() + 1)
+ queryNext(queryNext() + 1)
+ })
+ })
+
+ ## Best predicted label for full dataset
+ ok.predlab1 <- reactive({
+ if (is.null(ok.pred())) return(NULL)
+ if (verbose)
+ cat("DEBUG ok.predlab1\n")
+ colnames(ok.pred())[max.col(ok.pred())]
+ })
+
+ ## On scheme change, forget existing model, queue, query and hist
+ ## load scheme description and predictor-regex
+ observeEvent(ok.labelcol(), {
+ if (verbose)
+ cat("DEBUG event selectScheme", ok.labelcol(), "\n")
+ ok.pred(NULL)
+ ok.train(NULL)
+ values$queryqueue <- NULL
+ values$modelDiagno <- "No model"
+ values$histSaveStack <- NULL
+
+ ## Load description file
+ descr <- ""
+ if (file.exists(paste0(values$conf$datadir, values$conf$projectid,
+ "/", input$selectScheme, ".txt"))) {
+ descr <- paste(collapse = "\n", readLines(paste0(
+ values$conf$datadir, values$conf$projectid,
+ "/", input$selectScheme, ".txt")))
+ }
+ updateTextAreaInput(session, "schemeDescr", value = descr)
+
+ ## Load regex from json
+ regexfile <- paste0(values$projectdir, input$selectScheme, "_regex.json")
+ if (file.exists(regexfile)) {
+ if (verbose)
+ cat("DEBUG enter regex.json\n")
+ tmp <- RJSONIO::fromJSON(paste(collapse = "\n", readLines(regexfile)))
+ if (is.null(tmp$regex)) return(NULL)
+ values$extraRegex <- tmp$regex
+ names(values$extraRegex) <- sapply(tmp$regex, label2hash)
+ for (iregex in 1:length(tmp$regex)) {
+ values[[paste0("useregex_", names(values$extraRegex)[iregex])]] <-
+ tmp$use[iregex]
+ values[[paste0("caseregex_", names(values$extraRegex)[iregex])]] <-
+ tmp$case[iregex]
+ }
+ } else
+ values$extraRegex <- NULL
+
+ })
+
+
+
+ ## queryNext event: update values$newQuery ; called by queryNext(), or by
+ ## queryQueue or regex change
+ ## Requires :
+ ## - queryQueue (reactive: indices of current queue), and
+ ## - wasTagged (boolean vector: has each element been tagged this session? used in "on untagged")
+ ## - wasRetagged (boolean vector: has each element been retagged this session? used in "on tagged")
+ ## TODO: make a scheme-list of wasTagged, now it will be wiped at each scheme change?
+ observeEvent(
+ list(queryNext(), values$queryQueue,
+ input$sampleChoice, input$taggedWhich, input$maxprobWhich,
+ input$regexFilter, input$regexCaseSens,
+ input$visuLock, values$visuZoom$xlim, values$visuZoom$ylim),
+ {
+ if (is.null(values$queryQueue)) return(NULL)
+ if (verbose)
+ cat("DEBUG queryNext\n")
+
+ queue <- values$queryQueue
+
+ if (input$sampleChoice == "untagged") {
+ queue <- queue[!values$wasTagged[queue]]
+ } else if (input$sampleChoice == "tagged") {
+ queue <- queue[!is.na(values$label)[queue] & !values$wasRetagged[queue]]
+ if (input$taggedWhich != "all") {
+ queue <- queue[values$label[queue] == input$taggedWhich]
+ }
+ } else if (input$sampleChoice == "all") {
+ queue <- queue[!values$wasRetagged[queue]]
+ }
+
+ if (input$visuLock) if (!is.null(values$visuZoom$xlim)) {
+ if (!is.null(ok.visu()))
+ queue <- queue[ok.visu()[queue, 1] >= values$visuZoom$xlim[1] &
+ ok.visu()[queue, 1] <= values$visuZoom$xlim[2] &
+ ok.visu()[queue, 2] >= values$visuZoom$ylim[1] &
+ ok.visu()[queue, 2] <= values$visuZoom$ylim[2]]
+ }
+
+ if (length(queue)) if (nchar(input$regexFilter)) {
+ try_regex <- try(silent = TRUE, {
+ queue <- queue[
+ stringi::stri_detect_regex(
+ ok.nicetext()[queue], input$regexFilter,
+ opts_regex = list(case_insensitive = !input$regexCaseSens))]
+ })
+ if (inherits(try_regex, "try-error")) {
+ showNotification("Invalid regex", duration = 2, type = "error")
+ values$newQuery <- NULL
+ return(NULL)
+ }
+ }
+
+ if (!length(queue)) {
+ if (nchar(input$regexFilter)) {
+ showNotification("No texts matching the regex",
+ duration = 2, type = "warning")
+ } else
+ showNotification("No more texts to be tagged in this set",
+ duration = 2, type = "warning")
+ values$newQuery <- NULL
+ return(NULL)
+ }
+
+ values$lastQuery <- queue[1] # ??
+ values$newQuery <- queue[1]
+ ready2tag(1)
+ }
+ )
+
+ ## queryQueue event: fix values$queryQueue, (indices of current queue)
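+ ## Strategies: "sequential" and "random" ignore the model; "entropy"
+ ## ranks texts by prediction uncertainty (active learning); "maxprob"
+ ## ranks by the predicted probability of the chosen tag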
+ observeEvent(
+ list(ok.data(), ok.pred(), input$strategy, input$maxprobWhich,
+ input$selectProject, input$selectScheme), {
+ if (input$strategy %in% c("entropy", "maxprob") & is.null(ok.pred())) {
+ values$queryQueue <- NULL
+ values$lastQuery <- NULL
+ values$newQuery <- NULL
+ return(NULL)
+ }
+ if (is.null(ok.text())) return(NULL)
+ if (verbose)
+ cat("DEBUG queryQueue change\n")
+
+ if (input$strategy == "sequential") {
+ values$queryQueue <- 1:length(ok.text())
+ } else if (input$strategy == "random") {
+ values$queryQueue <- sample(length(ok.text()))
+ } else {
+ dapred <- ok.pred()
+ if (input$strategy == "entropy") {
+ tominimize <- rowSums(dapred * log(dapred))
+ if (any(dapred == 0)) {
+ tominimize[dapred == 0] <- 0
+ }
+ }
+ if (input$strategy == "maxprob") {
+ if (! input$maxprobWhich %in% colnames(dapred)) return(NULL)
+ tominimize <- -dapred[, input$maxprobWhich];
+ }
+ values$queryQueue <- order(tominimize)
+ }
+ }
+ )
+
+
+ ## Update uniqueLabels and uniqueLabelsAll if necessary on values$label change
+ observeEvent(values$label, {
+ if (is.null(values$label)) return(NULL)
+ if (verbose)
+ cat("DEBUG uniqueLabels\n")
+ dalabs <- sort(na.omit(unique(values$label)))
+ # if (!length(dalabs)) return(NULL)
+ if (!identical(dalabs, values$uniqueLabels))
+ values$uniqueLabels <- dalabs
+ if (any(! dalabs %in% values$uniqueLabelsAll))
+ values$uniqueLabelsAll <- c(values$uniqueLabelsAll,
+ dalabs[! dalabs %in% values$uniqueLabelsAll])
+ })
+
+
+
+ ##############################################################################
+ ## Project management
+ ##############################################################################
+
+ output$cpDataMessage <- renderUI({
+ req(values$cpDataMessage)
+ p(values$cpDataMessage)
+ })
+
+ output$cpConfirmMessage <- renderUI({
+ req(values$cpConfirmMessage)
+ p(strong(paste0("Error: ", values$cpConfirmMessage)))
+ })
+
+ output$cpIdcolUI <- renderUI({
+ req(values$cpNewData)
+ if (!is.data.frame(values$cpNewData)) return(NULL)
+ selectInput("cpIdcol", NULL, colnames(values$cpNewData))
+ })
+
+ output$cpTextcolUI <- renderUI({
+ req(values$cpNewData)
+ if (!is.data.frame(values$cpNewData)) return(NULL)
+ selectInput("cpTextcol", NULL, colnames(values$cpNewData), multiple = TRUE)
+ })
+
+ output$cpContextcolUI <- renderUI({
+ req(values$cpNewData)
+ if (!is.data.frame(values$cpNewData)) return(NULL)
+ selectInput("cpContextcol", NULL, colnames(values$cpNewData), multiple = TRUE)
+ })
+
+ output$cpTagcolUI <- renderUI({
+ req(values$cpNewData)
+ if (!is.data.frame(values$cpNewData)) return(NULL)
+ selectInput("cpTagcol", NULL, colnames(values$cpNewData), multiple = TRUE)
+ })
+
+ output$cpCommcolUI <- renderUI({
+ req(values$cpNewData)
+ if (!is.data.frame(values$cpNewData)) return(NULL)
+ selectInput("cpCommcol", NULL, c("none", colnames(values$cpNewData)))
+ })
+
+ output$cp_spacyUI <- renderUI(
+ textInput("cp_use_spacy_model", NULL,
+ modelnames$spacy_name[modelnames$short == input$cp_lang],
+ placeholder = "(spacy model name)")
+ )
+
+ output$cp_spacyDlUI <- renderUI(HTML(paste(
+ "Model must be downloaded once, from python:<br/>",
+ input$cp_which_python, "-m spacy download",
+ modelnames$spacy_name[modelnames$short == input$cp_lang]
+ )))
+
+ output$cp_ftUI <- renderUI(
+ textInput("cp_use_ft_model", NULL,
+ modelnames$fasttext_name[modelnames$short == input$cp_lang],
+ placeholder = "(fasttext model path)")
+ )
+
+ output$cp_ftDlUI <- renderUI(list(HTML(paste(
+ "Model can be downloaded here")),
+ br(), br()))
+
+ output$cp_sbUI <- renderUI(
+ textInput("cp_use_sb_model", NULL, ifelse(
+ input$cp_lang %in% c("ar", "zh", "nl", "en", "fr", "de",
+ "it", "ko", "pl", "pt", "ru", "es", "tr"),
+ "distiluse-base-multilingual-cased-v1",
+ "distiluse-base-multilingual-cased-v2"),
+ placeholder = "(custom sentence_transformers model)")
+ )
+
+
+
+ ## Create new project, with modal dialog
+ observeEvent(input$createProject, {
+ if (verbose)
+ cat("DEBUG create project\n")
+ showModal(modalDialog(
+ title = "New project",
+
+ p("Create a new text tagging project."),
+ p("This mainly involves choosing a dataset, and how to process it."),
+ p("Items marked by a star ", strong("*"),
+ " cannot be changed once the project is created."),
+
+ hr(),
+ h4("Files and folders"),
+
+ fluidRow(
+ column(3, p(strong("*Project name"))),
+ column(5, textInput("cpName", NULL, placeholder = "(Required)")),
+ column(4, p("Tags will be stored in a new directory by this name."))
+ ),
+ fluidRow(
+ column(3, p(strong("*Data directory"))),
+ column(5, textInput("cpDatadir", NULL, placeholder = "(Required)")),
+ column(4, p("Place (on the server) where the data and project are stored"))
+ ),
+ fluidRow(
+ column(3, p(strong("*Data filename"))),
+ column(5, checkboxInput("cpDoImport", "New data", TRUE)),
+ column(4, p("Main file, containing id and text columns"))
+ ),
+
+ conditionalPanel("!input.cpDoImport", list(
+ ## Existing data
+ fluidRow(
+ column(3, p(strong("Existing file"))),
+ column(5, textInput("cpDatafile", NULL, placeholder = "(Required)")),
+ column(4, actionButton("cpDatafileGo", "Import"))
+ )
+ )),
+ conditionalPanel("input.cpDoImport", {
+ ## Import data
+ wellPanel(
+ selectInput('cpNDfiletype', NULL,
+ choices = c("Text file (.csv, .txt)" = "csv_txt",
+ "Feather" = "feather",
+ "Parquet" = "parquet",
+ "OpenDocument (.ods)" = "ods",
+ "Microsoft Excel (.xls, .xlsx)" = "excel",
+ "SPSS (.sav, .por)" = "spss",
+ "SAS (.sas7bdat)" = "sas_data",
+ "Stata v5-v12 (.dta)" = "stata"),
+ selected = "csv_txt"
+ ),
+ fileInput('cpNDfile', 'Choose File'),
+
+ checkboxInput("cpNDparams", "File parameters"),
+
+ conditionalPanel(
+ "input.cpNDparams",
+ conditionalPanel(
+ "input.cpNDfiletype == 'csv_txt'",
+
+ fluidRow(column(4, p('Header')),
+ column(8, selectInput('cpCSVheader', NULL,
+ c("Header" = TRUE,
+ "No Header" = FALSE),
+ "Auto"))),
+ fluidRow(column(4, p('Separator')),
+ column(8, selectInput('cpCSVsep', NULL,
+ c("Comma ','" = ",",
+ "Semicolon ';'" = ";",
+ "Tab" = "\t",
+ "Space" = " "),
+ "Auto"))),
+ fluidRow(column(4, p('Quote')),
+ column(8, selectInput('cpCSVquote', NULL,
+ c("Double Quote \"" = "\"",
+ "Single Quote '" = "'",
+ "None" = ""),
+ "Double Quote \""))),
+ fluidRow(column(4, p('Decimal mark')),
+ column(8, selectInput('cpCSVdec', NULL,
+ c('Period "."' = ".",
+ 'Comma ","' = ","),
+ 'Period "."'))),
+ fluidRow(column(4, p('File Encoding')),
+ column(8, selectInput('cpCSVencoding', NULL,
+ c("unknown", "UTF-8", "Latin-1"),
+ "unknown")))),
+
+ ## Options for ods files
+ conditionalPanel(
+ "input.cpNDfiletype == 'ods'",
+ fluidRow(column(4, p('Sheet')),
+ column(8, numericInput('cpODSsheet', NULL, 1, 1,
+ step = 1))),
+ checkboxInput("cpODScolnames", "Column names", TRUE),
+ fluidRow(column(4, p('NA symbol')),
+ column(8, textInput('cpODSna', NULL, value = ""))),
+ fluidRow(column(4, p('Skip rows')),
+ column(8, numericInput('cpODSskip', NULL, 0, 0,
+ step = 1))),
+ fluidRow(column(4, p('Specify range')),
+ column(8, textInput('cpODSrange', NULL, value = "")))
+ ),
+
+ ## Options for xls / xlsx files
+ conditionalPanel(
+ "input.cpNDfiletype == 'excel'",
+ fluidRow(column(4, p('Column names')),
+ column(8, selectInput('cpXLScolnames', NULL,
+ choices = c("Yes" = "TRUE",
+ "No" = "FALSE"),
+ selected = TRUE ))),
+ fluidRow(column(4, p('Skip rows')),
+ column(8, numericInput('cpXLSskip', NULL, 0, 0,
+ step = 1))),
+ fluidRow(column(4, p('Trim whitespaces')),
+ column(8, selectInput('cpXLStrim', NULL,
+ choices = c("Yes" = "TRUE",
+ "No" = "FALSE"),
+ selected = TRUE ))),
+ checkboxInput("cpXLSsheet",
+ "Specify Worksheet", FALSE),
+ conditionalPanel("input.cpXLSsheet",
+ textInput("cpXLSsheet_specs", NULL, "")),
+ checkboxInput("cpXLSrange", "Specify Range", FALSE),
+ conditionalPanel("input.cpXLSrange",
+ textInput("cpXLSrange_specs", NULL, "")
+ )),
+
+ ## Options for SPSS files
+ conditionalPanel(
+ "input.cpNDfiletype == 'spss'",
+ fluidRow(
+ column(4, p('Skip rows')),
+ column(8, numericInput('cpSPSSskip', NULL, 0, 0, step = 1))),
+ checkboxInput("cpSPSSna", "User-defined NA", FALSE)),
+
+ ## Options for SAS files
+ conditionalPanel(
+ "input.cpNDfiletype == 'sas_data'",
+ fluidRow(
+ column(4, p('Skip rows')),
+ column(8, numericInput("cpSASskip", NULL, 0, 0, step = 1))),
+ checkboxInput("cpSAScatalog", "Use catalog file", FALSE),
+ conditionalPanel(
+ "input.cpSAScatalog",
+ fileInput("cpSAScatalog_file", "Catalog file"))
+ ),
+
+ ## Options for STATA dta files
+ conditionalPanel(
+ "input.cpNDfiletype == 'stata'",
+ checkboxInput("cpSTATAfactors", "Convert factors", FALSE))
+ )
+ )
+ }),
+
+ uiOutput("cpDataMessage"),
+ hr(),
+ h4("Data"),
+
+ fluidRow(
+ column(6, numericInput(
+ "cpNrows", "N. rows for working sample", 500, 1, step = 1)),
+ column(6, numericInput(
+ "cpNskip", "Rows to skip before working sample", 0, 0, step = 1))
+ ),
+
+ fluidRow(
+ column(3, p(strong("*ID"))),
+ column(5, uiOutput("cpIdcolUI")),
+ column(4, p("Name of the id variable, unique identifier of each text"))
+ ),
+ fluidRow(
+ column(3, p(strong("*Text"))),
+ column(5, uiOutput("cpTextcolUI")),
+ column(4, p("Name of the text variables: if more than one, texts are concatenated in the specified order"))
+ ),
+ fluidRow(
+ column(3, p(strong("*Tags"))),
+ column(5, uiOutput("cpTagcolUI")),
+ column(4, p("(optional) Names of variables that are already tagged: each will create a new scheme"))
+ ),
+ fluidRow(
+ column(3, p(strong("Comments"))),
+ column(5, uiOutput("cpCommcolUI")),
+ column(4, p("(optional) Name of the comments variable"))
+ ),
+ fluidRow(
+ column(3, p(strong("Context"))),
+ column(5, uiOutput("cpContextcolUI")),
+ column(4, p("(optional) Names of variables not used in the models, but may be displayed during tagging"))
+ ),
+
+
+ hr(),
+ h4("System"),
+
+ fluidRow(
+ column(3, checkboxInput("cp_use_python", "Python backend", FALSE)),
+ column(5, conditionalPanel(
+ "input.cp_use_python",
+ textInput("cp_which_python", NULL, value = "python3",
+ placeholder = "(custom python path)"))),
+ column(4, conditionalPanel(
+ "input.cp_use_python",
+ p("This must be a working python3 environment, with the required modules installed (see documentation)")))
+ ),
+ conditionalPanel("input.cp_use_python", list(
+ fluidRow(
+ column(3, p("Language")),
+ column(5, selectInput("cp_lang", NULL, modelnames_labels, "en")),
+ column(4, p("Used to preset tokenization and embedding models"))
+ ),
+ fluidRow(
+ column(3, checkboxInput("cp_use_spacy", "SpaCy tokenization", FALSE)),
+ column(5, conditionalPanel("input.cp_use_spacy", uiOutput("cp_spacyUI"))),
+ column(4, p("Name of the spacy tokenizer model, used in DTM and word embeddings"))
+ ),
+ conditionalPanel("input.cp_use_spacy", fluidRow(
+ column(3),
+ column(9, uiOutput("cp_spacyDlUI")))
+ ),
+ fluidRow(
+ column(3, checkboxInput("cp_use_ft", "FastText word embeddings", FALSE)),
+ column(5, conditionalPanel("input.cp_use_ft", uiOutput("cp_ftUI"))),
+ column(4, p("Path to the local fasttext model binary"))
+ ),
+ conditionalPanel("input.cp_use_ft", fluidRow(
+ column(3),
+ column(9, uiOutput("cp_ftDlUI")))),
+ fluidRow(
+ column(3, checkboxInput("cp_use_sb", "SBERT sentence embeddings", FALSE)),
+ column(5, conditionalPanel("input.cp_use_sb", uiOutput("cp_sbUI"))),
+ column(4, p("(GPU recommended) Name or path of the sentence-transformers model"))
+ ),
+
+ conditionalPanel("input.cp_use_python", {
+ checkboxInput("cp_use_gpu", "GPU support (CUDA, for SBERT and BERT)", FALSE)
+ })
+ )),
+
+ uiOutput("cpConfirmMessage"),
+ footer = tagList(actionButton("cpConfirm", "Create"),
+ modalButton("Cancel"))))
+ })
+
+ observeEvent(input$cpDatafileGo, {
+ dataname <- input$cpDatafile
+ dataname <- gsub("[.]feather$", "", dataname)
+ tmpdat <- try(arrow::read_feather(
+ paste0(gsub("/+$", "/", paste0(input$cpDatadir, "/")),
+ dataname, ".feather")))
+ if (inherits(tmpdat, "try-error")) {
+ values$cpDataMessage <- paste0(
+ "Data import error: ", as.character(tmpdat))
+ values$cpNewData <- NULL
+ return(NULL)
+ }
+ if (!is.data.frame(tmpdat)) {
+ values$cpDataMessage <- "Data import error: object not a data.frame"
+ values$cpNewData <- NULL
+ return(NULL)
+ }
+ if (ncol(tmpdat) < 2) {
+ values$cpDataMessage <- "Data import error: only one column detected"
+ values$cpNewData <- NULL
+ return(NULL)
+ }
+
+ values$cpDataMessage <- paste0(
+ "Data import success: ", nrow(tmpdat), " rows, ", ncol(tmpdat), " cols.")
+ values$cpNewData <- tmpdat
+ values$cpNewDataname <- dataname
+
+ })
+
+ ## Data import mechanism
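+ ## Dispatch on file type: each reader is wrapped in try(), and failures
+ ## are reported through values$cpDataMessage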
+ observeEvent(input$cpNDfile, {
+ req(input$cpNDfile)
+ if(input$cpNDfiletype == "csv_txt") {
+ res <- try(read.csv(
+ input$cpNDfile$datapath,
+ header = as.logical(input$cpCSVheader), sep = input$cpCSVsep,
+ quote = input$cpCSVquote, dec = input$cpCSVdec, stringsAsFactors = FALSE,
+ encoding = input$cpCSVencoding, check.names = TRUE))
+
+ } else if(input$cpNDfiletype == "feather") {
+ res <- try(arrow::read_feather(input$cpNDfile$datapath))
+
+ } else if(input$cpNDfiletype == "parquet") {
+ res <- try(arrow::read_parquet(input$cpNDfile$datapath))
+
+ } else if (input$cpNDfiletype == "ods") {
+ range <- input$cpODSrange
+ if (range == "") range <- NULL
+ res <- try(readODS::read_ods(
+ path = input$cpNDfile$datapath, sheet = input$cpODSsheet,
+ col_names = input$cpODScolnames, na = input$cpODSna,
+ skip = input$cpODSskip, range = range))
+
+ } else if(input$cpNDfiletype == "excel"){
+ column_names <- as.logical(input$cpXLScolnames)
+ trim_spaces <- as.logical(input$cpXLStrim)
+ the.range <- NULL
+ if(input$cpXLSrange == TRUE & input$cpXLSrange_specs != "")
+ the.range <- input$cpXLSrange_specs
+ the.sheet <- NULL
+ if(input$cpXLSsheet == TRUE & input$cpXLSsheet_specs != "")
+ the.sheet <- input$cpXLSsheet_specs
+ res <- try(data.frame(readxl::read_excel(
+ dataFile$datapath, col_names = column_names, range = the.range,
+ sheet= the.sheet, trim_ws = trim_spaces, skip = rows_to_skip)))
+
+ } else if(input$cpNDfiletype == "spss"){
+ res <- try(data.frame(haven::read_spss(
+ file = input$cpNDfile$datapath,
+ skip= input$cpSPSSskip, user_na = input$cpSPSSna)))
+ if (!inherits(res, "try-error")) {
+ res <- data.frame(lapply(res, function(x) {
+ attr(x, "format.spss") <- NULL
+ if ("haven_labelled" %in% class(x))
+ x <- haven::as_factor(x, levels= "labels")
+ x
+ }))
+ }
+
+ } else if(input$cpNDfiletype == "sas_data") {
+ res <- try(data.frame(haven::read_sas(
+ data_file = input$cpNDfile$datapath,
+ catalog_file = input$cpSAScatalog_file$datapath,
+ skip = input$cpSASskip)))
+
+ } else if(input$cpNDfiletype == "stata") {
+ res <- try(data.frame(foreign::read.dta(
+ file = input$cpNDfile$datapath, convert.factors = input$cpSTATAfactors)))
+ }
+
+ if(inherits(res, "try-error")) {
+ values$cpDataMessage <- paste0("Data import error: ", res)
+ values$cpNewData <- NULL
+ return(NULL)
+ }
+ if (ncol(res) < 2) {
+ values$cpDataMessage <- "Data import error: only one column detected"
+ values$cpNewData <- NULL
+ return(NULL)
+ }
+ values$cpDataMessage <- paste0(
+ "Data import success: ", nrow(res), " rows, ", ncol(res), " columns.")
+ values$cpNewData <- res
+ })
+
+ ## New project confirm
+ observeEvent(input$cpConfirm, {
+ if (verbose)
+ cat("DEBUG cpConfirm\n")
+ newconf <- list()
+
+ tmpdir <- gsub("/+$", "/", paste0(input$cpDatadir, "/"))
+ if (!dir.exists(tmpdir))
+ dir.create(tmpdir)
+ newconf$datadir <- tmpdir
+
+ tmpname <- cleanFileName(input$cpName)
+ if (!nchar(tmpname)) {
+ values$cpConfirmMessage <- "Project name is required"
+ return(NULL)
+ }
+ if (dir.exists(paste0(newconf$datadir, tmpname))) {
+ values$cpConfirmMessage <- paste0(
+ "A project with the same name (", tmpname,
+ ") already exists in the data directory.")
+ return(NULL)
+ }
+ newconf$projectid <- tmpname
+
+ if (!is.data.frame(values$cpNewData)) {
+ values$cpConfirmMessage <- "Import data first"
+ return(NULL)
+ }
+
+ if (input$cpDoImport) {
+ newconf$dataname <- newconf$projectid
+ } else
+ newconf$dataname <- values$cpNewDataname
+
+ if (any(is.na(values$cpNewData[[input$cpIdcol]]))) {
+ values$cpConfirmMessage <- "ID variable contains missing values"
+ return(NULL)
+ }
+ if (any(duplicated(values$cpNewData[[input$cpIdcol]]))) {
+ values$cpConfirmMessage <- "ID variable contains duplicates"
+ return(NULL)
+ }
+ newconf$idcol <- input$cpIdcol
+
+ if (!length(input$cpTextcol)) {
+ values$cpConfirmMessage <- "Select at least one text variable"
+ return(NULL)
+ }
+ newconf$textcols <- input$cpTextcol
+
+ if (input$cpCommcol == "none") {
+ newconf$commcol <- paste0("comm_", newconf$projectid)
+ } else
+ newconf$commcol <- input$cpCommcol
+
+ if (!length(input$cpTagcol)) {
+ newconf$tagcols <- "scheme0"
+ } else
+ newconf$tagcols <- input$cpTagcol
+
+ newconf$contextcols <- input$cpContextcol
+
+ newconf$dataNrows <- input$cpNrows
+ newconf$dataSkipRows <- input$cpNskip
+
+ if (input$cp_use_python) {
+ pytest <- try(system(paste(input$cp_which_python, "--version"),
+ intern = TRUE))
+ if (inherits(pytest, "try-error")) {
+ values$cpConfirmMessage <- paste(
+ "Python path `", input$cp_which_python, "` not valid")
+ return(NULL)
+ }
+ }
+ newconf$use_python <- input$cp_use_python
+ newconf$python <- input$cp_which_python
+ newconf$use_gpu <- input$cp_use_gpu
+
+ if (input$cp_use_spacy) {
+ sptest <- system(paste0(
+ newconf$python, " -m spacy info ", input$cp_use_spacy_model), intern = TRUE)
+ if (length(sptest) == 0) {
+ values$cpConfirmMessage <- paste(
+ "Error loading spacy, check that it is installed in the specified python env")
+ return(NULL)
+ }
+ if (length(attr(sptest, "status"))) {
+ values$cpConfirmMessage <- paste(
+ "Error loading spacy model, check that it has been downloaded")
+ return(NULL)
+ }
+ }
+ newconf$use_spacy <- input$cp_use_spacy
+ newconf$use_spacy_model <- input$cp_use_spacy_model
+
+ if (input$cp_use_ft) if (!file.exists(input$cp_use_ft_model)) {
+ values$cpConfirmMessage <- paste(
+ "Error loading fasttext model, check the specified path")
+ return(NULL)
+ }
+ newconf$use_ft <- input$cp_use_ft
+ newconf$use_ft_model <- input$cp_use_ft_model
+
+ newconf$use_sb <- input$cp_use_sb
+ newconf$use_sb_model <- input$cp_use_sb_model
+
+ writeLines(RJSONIO::toJSON(newconf),
+ paste0("tigger_", newconf$projectid, ".json"))
+
+ values$confnames <- c(newconf$projectid, values$confnames)
+ values$conf <- newconf
+
+ ## On new data import, write a feather file with the name of the project
+ if (input$cpDoImport)
+ arrow::write_feather(values$cpNewData, paste0(
+ newconf$datadir, newconf$dataname, ".feather"
+ ))
+
+ removeModal()
+ })
+
+ ##############################################################################
+ ## Scheme management
+ ##############################################################################
+
+ ## Print active scheme in scheme management panel
+ output$printScheme <- renderUI(h4(input$selectScheme))
+
+ ## Create new scheme on request, with modal dialog
+ observeEvent(input$createScheme, {
+ if (verbose)
+ cat("DEBUG create scheme\n")
+ showModal(modalDialog(
+ title = "Create tagging scheme",
+ p(paste0("This will create a new scheme for this project")),
+
+ textInput("newScheme", label = NULL, placeholder = "Required: new scheme name"),
+ textAreaInput("newSchemeDescr", label = NULL,
+ placeholder = "Optional: scheme description"),
+ checkboxInput("schemeDuplicate", "Duplicate existing scheme:"),
+ conditionalPanel(
+ "input.schemeDuplicate",
+ selectInput("schemeDuplicateFrom", NULL, rev(values$conf$tagcols))),
+
+ footer = tagList(actionButton("createSchemeConfirm", "Create"),
+ modalButton("Cancel"))))
+ })
+
+ observeEvent(input$createSchemeConfirm, {
+ if (is.null(ok.data())) return(NULL)
+ if (verbose)
+ cat("DEBUG create scheme event\n")
+
+ ## Make sure new scheme name is a correct file name
+ if (!grepl("\\S", input$newScheme)) {
+ showNotification(type = "error",
+ "Cannot create scheme: name empty.")
+ return(NULL)
+ }
+
+ ## Check that new scheme name doesn't exist
+ ## Re-read tagcols from conf file, to prevent scheme disappearances
+ tmpname <- cleanFileName(input$newScheme)
+ tagpath <- paste0(values$projectdir, tmpname, ".csv")
+ confpath <- paste0("tigger_", values$conf$projectid, ".json")
+ if (verbose)
+ cat("DEBUG create scheme check json:", confpath,"\n")
+ tmp_tagcols <- RJSONIO::fromJSON(confpath)$tagcols
+ if (verbose)
+ cat("DEBUG create scheme json OK\n")
+ if (tmpname %in% tmp_tagcols | file.exists(tagpath)) {
+ showNotification(type = "error",
+ "Cannot create scheme: name already exists.")
+ return(NULL)
+ }
+
+ ## Create new scheme description
+ if (nchar(input$newSchemeDescr))
+ writeLines(input$newSchemeDescr,
+ paste0(values$projectdir, tmpname, ".txt"))
+
+ ## Update conf file and reactives
+ values$conf$tagcols <- c(tmp_tagcols, tmpname)
+ # values$confs[[values$conf$projectid]]$tagcols <- c(tmp_tagcols, tmpname)
+ writeLines(RJSONIO::toJSON(values$conf), confpath)
+
+ if (input$schemeDuplicate) {
+ tmpmat <- matrix(values$tagmat[, input$schemeDuplicateFrom])
+ tmpmathist <- matrix(values$tagmathist[, input$schemeDuplicateFrom])
+ } else {
+ tmpmat <- matrix(rep(NA, nrow(ok.data())))
+ tmpmathist <- matrix(rep(NA, nrow(ok.data())))
+ }
+ colnames(tmpmat) <- tmpname
+ colnames(tmpmathist) <- tmpname
+
+ values$tagmat <- cbind(values$tagmat, tmpmat)
+ values$tagmathist <- cbind(values$tagmathist, tmpmathist)
+
+ ## Create new tagging file if duplicating
+ if (input$schemeDuplicate) {
+ export <- data.frame(ok.data()[, values$conf$idcol],
+ values$tagmat[, tmpname])
+ colnames(export) <- c(values$conf$idcol, tmpname)
+ export <- na.omit(export)
+ write.table(export, tagpath, sep = ",", qmethod = "double",
+ fileEncoding = "UTF-8", row.names = FALSE,
+ append = FALSE, col.names = TRUE)
+ }
+
+ cat("DEBUG create scheme exit\n")
+ removeModal()
+ })
+
+ ## Button action: delete scheme
+ observeEvent(input$schemeDelete, {
+ if (verbose)
+ cat("DEBUG delete scheme\n")
+ if (length(values$conf$tagcols) < 2) {
+ showNotification(type = "error",
+ "Cannot delete: at least one scheme must remain.")
+ return(NULL)
+ }
+ showModal(modalDialog(
+ title = "Delete tagging scheme",
+ paste0("Confirm delete '", input$selectScheme,
+ "': ", sum(!is.na(values$label)), " tags will be deleted",
+ ", along with scheme description."),
+ footer = tagList(actionButton("schemeDeleteConfirm", "Delete"),
+ modalButton("Cancel"))))
+ })
+
+ observeEvent(input$schemeDeleteConfirm, {
+ file.remove(paste0(values$projectdir, input$selectScheme, ".csv"))
+ file.remove(paste0(values$projectdir, input$selectScheme, ".txt"))
+ file.remove(paste0(values$projectdir, input$selectScheme, "_regex.json"))
+ values$conf$tagcols <-
+ values$conf$tagcols[values$conf$tagcols != input$selectScheme]
+ writeLines(RJSONIO::toJSON(values$conf),
+ paste0("tigger_", input$selectProject, ".json"))
+ removeModal()
+ })
+
+
+ observeEvent(input$schemeDescrSave, {
+ req(input$selectScheme)
+ writeLines(input$schemeDescr,
+ paste0(values$conf$datadir, values$conf$projectid,
+ "/", input$selectScheme, ".txt"))
+ })
+
+
+ ##############################################################################
+ ##############################################################################
+
+ ## Tag selection : populate
+ observe({
+ values$conf$tagcols
+ updateSelectInput(
+ # session, "selectScheme", choices = c(rev(values$conf$tagcols), ""))
+ session, "selectScheme", choices = rev(values$conf$tagcols))
+ })
+
+
+ ## Populate oracle buttons
+ output$oracleButtons <- renderUI({
+ if (is.null(ok.data())) return(NULL)
+ # if (is.null(values$label)) return(NULL)
+ if (is.null(values$uniqueLabels)) return(NULL)
+ if (verbose)
+ cat("DEBUG populate oracle buttons\n")
+
+ do.call(fluidRow, lapply(values$uniqueLabels, function(ilab)
+ actionButton(paste0("oracle_", label2hash(ilab)),
+ strong(ilab))))
+ })
+
+ ## Populate maxprob variable selector
+ observe({
+ updateSelectInput(session, "maxprobWhich", choices = values$uniqueLabels)
+ updateSelectInput(session, "taggedWhich",
+ choices = c("All" = "all", values$uniqueLabels))
+ })
+
+ ## Data panel message
+ observeEvent(ok.data(), {
+ if (verbose)
+ cat("DEBUG data panel message\n")
+ if (is.null(ok.data())) {
+ output$dataMessage <- renderUI(p("Importing data..."))
+ } else {
+ output$dataMessage <- NULL
+ }
+ })
+
+ ## On config change, update dataView and retag options
+ observeEvent(list(values$conf), {
+
+ if (is.null(values$conf)) return(NULL)
+ output$panelData <- renderUI(list(
+ DT::dataTableOutput("dataView")
+ ))
+
+ output$panelRetag <- renderUI(c(
+ list(
+ br(),
+ h4("Rename tags"),
+ fluidRow(
+ column(4, strong("Current")),
+ column(2, strong("Count")),
+ column(6, strong("New"))
+ ),
+ br()),
+ lapply(values$uniqueLabels, function(iLab) {
+ iHash <- label2hash(iLab)
+ fluidRow(
+ column(4, p(iLab)),
+ column(2, p(sum(values$label == iLab, na.rm = TRUE))),
+ column(4, textInput(
+ paste0("newtag_", iHash), NULL,
+ placeholder = "(new name, leave empty to untag)")),
+ column(2, actionButton(paste0("retag_", iHash), "Rename")))
+ })
+ ))
+ })
+
+ output$panelExtraRegex <- renderUI(list(
+ p(strong("Regex predictors")),
+ fluidRow(
+ column(8, textInput("extraRegexText", NULL,
+ placeholder = "(new regex)", width = "100%")),
+ column(4, actionButton(
+ "extraRegexSearch", strong("+"), width = "100%"))
+ ),
+ fluidRow(
+ style="overflow-y:scroll; max-height: 25vh",
+ fluidRow(
+ column(2, strong("")),
+ column(1, strong("Use")),
+ column(1, strong("Case")),
+ column(1, p(strong("N"))),
+ column(7, p(strong("Regex")))
+ ),
+
+ lapply(values$extraRegex, function(iRegex) {
+ iHash <- label2hash(iRegex)
+ tmp_cased <- values[[paste0("caseregex_", iHash)]]
+ if (is.null(tmp_cased)) tmp_cased <- FALSE
+ fluidRow(
+ column(2, actionButton(paste0("delregex_", iHash), "🗑️", width="100%")),
+ column(1, checkboxInput(
+ paste0("useregex_", iHash), NULL,
+ value = isolate(!isFALSE(values[[paste0("useregex_", iHash)]])))),
+ column(1, checkboxInput(
+ paste0("caseregex_", iHash), NULL,
+ value = isolate(isTRUE(values[[paste0("caseregex_", iHash)]])))),
+ column(1, br(), p(sum(
+ stringi::stri_detect_regex(
+ ok.nicetext(), iRegex,
+ opts_regex = list(case_insensitive = !tmp_cased)),
+ na.rm = TRUE))),
+ column(7, br(), p(iRegex))
+ )
+ })
+ )
+
+ ))
+
+ ## Process new regex
+ observeEvent(input$extraRegexSearch, {
+ if (is.null(ok.nicetext())) return(NULL)
+ req(nchar(input$extraRegexText) > 0)
+ if (input$extraRegexText %in% values$extraRegex) return(NULL)
+ try_regex <- try(stringi::stri_detect_regex(
+ ok.nicetext(), input$extraRegexText,
+ opts_regex = list(case_insensitive = TRUE)))
+ if (inherits(try_regex, "try-error")) return(NULL)
+ new_regex <- input$extraRegexText
+ new_hash <- label2hash(new_regex)
+ values$extraRegex <- c(new_regex, values$extraRegex)
+ names(values$extraRegex)[1] <- new_hash
+ if (verbose)
+ cat("DEBUG regex search", input$extraRegexText, "\n")
+ })
+
+ ## Regex predictors: populate delete and use events on first creation
+ useregex_trigger <- reactiveVal(0)
+ delregex_trigger <- reactiveVal(0)
+ caseregex_trigger <- reactiveVal(0)
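+ ## Each regex row gets its own observers (created below); these counters
+ ## funnel their events into three triggers that the save observer watches.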
+ observeEvent(list(values$extraRegex), {
+ if (!length(values$extraRegex)) return(NULL)
+ if (any(! values$extraRegex %in% values$extraRegexHist)) {
+ if (verbose)
+ cat("DEBUG regex predictor names updates\n")
+ tmpregex <- values$extraRegex[! values$extraRegex %in% values$extraRegexHist]
+ values$extraRegexHist <- c(tmpregex, values$extraRegexHist)
+
+ c(lapply(1:length(tmpregex), function(iregex) {
+ observeEvent(
+ input[[paste0("delregex_", names(tmpregex)[iregex])]], {
+ if (verbose)
+ cat("DEBUG regex delete", names(tmpregex)[iregex], "\n")
+ values$extraRegex <- values$extraRegex[values$extraRegex != tmpregex[iregex]]
+ delregex_trigger(delregex_trigger() + 1)
+ })
+ }),
+ lapply(1:length(tmpregex), function(iregex) {
+ tmpname <- paste0("useregex_", names(tmpregex)[iregex])
+ observeEvent(
+ input[[tmpname]], {
+ values[[tmpname]] <- input[[tmpname]]
+ useregex_trigger(useregex_trigger() + 1)
+ })
+ }),
+ lapply(1:length(tmpregex), function(iregex) {
+ tmpname <- paste0("caseregex_", names(tmpregex)[iregex])
+ observeEvent(
+ input[[tmpname]], {
+ values[[tmpname]] <- input[[tmpname]]
+ caseregex_trigger(caseregex_trigger() + 1)
+ })
+ })
+ )
+ }
+ })
+
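+ ## Persist regex predictor state (pattern, in-use, case sensitivity) to
+ ## the scheme's _regex.json whenever any of the triggers fires.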
+ observeEvent(list(delregex_trigger(), useregex_trigger(), caseregex_trigger()), {
+ req(ok.data(), input$selectScheme)
+ regex_all <- values$extraRegex
+ values$regex_inuse <- sapply(names(regex_all), function(iname)
+ values[[paste0("useregex_", iname)]])
+ values$regex_case <- sapply(names(regex_all), function(iname)
+ values[[paste0("caseregex_", iname)]])
+ if (verbose) if (any(values$regex_inuse))
+ cat("DEBUG regex in use", paste0(collapse = " ", regex_all[values$regex_inuse]), "\n")
+ writeLines(
+ RJSONIO::toJSON(lapply(FUN = unname, list(
+ regex = regex_all, use = values$regex_inuse, case = values$regex_case))),
+ paste0(values$projectdir, input$selectScheme, "_regex.json"))
+ })
+
+ output$dataView <- DT::renderDataTable({
+ # if (is.null(ok.text()) | ok.data.running()) return(NULL)
+ req(ok.text())
+ if (verbose)
+ cat("DEBUG render dataView\n")
+ DT::datatable(
+ rownames = F,
+ data.frame(
+ row = 1:nrow(ok.data()),
+ id = ok.data()[, values$conf$idcol],
+ text = ok.text()),
+ options = list(
+ pageLength = 3,
+ columnDefs = list(list(
+ targets = 2,
+ render = DT::JS(
+ "function(data, type, row, meta) {",
+ "return type === 'display' && data.length > 200 ?",
+ "'' + data.substr(0, 200) + '...' : data;",
+ "}")
+ ))
+ ),
+ callback = DT::JS('table.page(0).draw(false);')
+ )
+ })
+
+ ## Triggers for mass rename, created with each new label, with confirm
+ observeEvent(values$uniqueLabelsAll, {
+ if (is.null(values$uniqueLabelsAll)) return(NULL)
+ notthere <- unique(na.omit(
+ values$uniqueLabelsAll[! values$uniqueLabelsAll %in% values$retagLabels]))
+ lapply(
+ notthere,
+ function(iTag) {
+ iHash <- label2hash(iTag)
+
+ observeEvent(input[[paste0("retag_", iHash)]], {
+ tmp_oldtag <- iTag
+ tmp_retagged <- which(values$label == tmp_oldtag)
+ tmp_newtag <- gsub(
+ "^\\s+|\\s+$", "", input[[paste0("newtag_", iHash)]])
+ showModal(modalDialog(
+ title = "Mass retag",
+ paste0("Confirm retag: ", length(tmp_retagged), " ",
+ tmp_oldtag, " tags will be ",
+ ifelse(nchar(tmp_newtag),
+ paste("renamed to", tmp_newtag) , "deleted")),
+ footer = tagList(actionButton(paste0("retag_confirm_", iTag), "Retag"),
+ modalButton("Cancel"))))
+ })
+
+ observeEvent(input[[paste0("retag_confirm_", iTag)]], {
+ tmp_oldtag <- iTag
+ tmp_retagged <- which(values$label == tmp_oldtag)
+ tmp_newtag <- gsub(
+ "^\\s+|\\s+$", "", input[[paste0("newtag_", iHash)]])
+ if (!nchar(tmp_newtag)) tmp_newtag <- NA
+ values$tagTrigger <- list(
+ id = ok.data()[[values$conf$idcol]][tmp_retagged],
+ tag = rep(tmp_newtag, length(tmp_retagged)))
+ values$queries <- c(values$queries, tmp_retagged)
+
+ removeModal()
+ })
+
+ })
+ values$retagLabels <- c(values$retagLabels, notthere)
+ })
+
+
+
+ ##############################################################################
+ ## UI rendering of Panel Tagging / Visualization
+ ##############################################################################
+
+ ####################
+ ## Tagging left panel
+
+
+ # ## Trigger from Oops
+ # observeEvent(input$oops, {
+ # if (is.null(ok.data()))
+ # return(NULL)
+ # if (verbose)
+ # cat("DEBUG enter Oops\n")
+ # if (!is.null(values$retagged)) if (values$queries[length(values$queries)] == values$retagged[length(values$retagged)])
+ # values$retagged <- values$retagged[-length(values$retagged)]
+ # values$label[values$queries[length(values$queries)]] <- NA
+ # if (ok.commentCol() != "(None)")
+ # values$comment[values$queries[length(values$queries)]] <- ok.data()[values$queries[length(values$queries)], ok.commentCol()]
+ # values$queries <- values$queries[-length(values$queries)]
+ # ## tagTrigger ?
+ #
+ # ready2tag(0)
+ # queryNext(queryNext() + 1)
+ # })
+
+
+ ## Tagging triggers for Oracle buttons, created with each new label
+ observeEvent(values$uniqueLabelsAll, {
+ if (is.null(values$label)) return(NULL)
+ if (is.null(values$uniqueLabelsAll)) return(NULL)
+ if (all(is.na(values$label))) return(NULL)
+ notthere <- unique(na.omit(
+ values$uniqueLabelsAll[! values$uniqueLabelsAll %in% values$oracleLabels]
+ ))
+ if (!length(notthere)) return(NULL)
+ if (verbose)
+ cat("DEBUG triggers for oracle buttons\n")
+ lapply(
+ notthere,
+ function(iLab) {
+ observeEvent(
+ input[[paste0("oracle_", label2hash(iLab))]], {
+ if (is.null(ok.text())) return(NULL)
+ if (is.null(values$newQuery)) return(NULL)
+ if (!ready2tag()) {cat("DEBUG not ready to tag in Button\n"); return(NULL)}
+ # ready2tag(0)
+ # if (input$allowRetag | sum(is.na(values$label)) == 0)
+ if (!is.na(values$label[values$lastQuery]))
+ values$retagged <- c(values$retagged, values$lastQuery)
+ # values$label[values$lastQuery] <- iLab
+ values$comment[values$lastQuery] <- input$currentComment
+ # values$queries <- c(values$queries, values$lastQuery)
+ values$tagTrigger <- list(
+ id = ok.data()[values$lastQuery, values$conf$idcol],
+ tag = iLab,
+ comment = input$currentComment)
+ })
+ })
+ values$oracleLabels <- c(values$oracleLabels, notthere)
+ })
+
+ ## Trigger from Oracle Create value button
+ observeEvent(input$currentAction, {
+ if (is.null(ok.data())) return(NULL)
+ if (is.null(values$newQuery)) return(NULL)
+ if (verbose)
+ cat("DEBUG enter create value\n")
+ # if (input$allowRetag | sum(is.na(values$label)) == 0)
+ if (!is.na(values$label[values$lastQuery]))
+ values$retagged <- c(values$retagged, values$lastQuery)
+ if (input$newLab == "") {
+ return(NULL)
+ } else {
+ # values$label[values$lastQuery] <- input$newLab
+ updateTextInput(session, "newLab", value= "")
+ }
+ values$comment[values$lastQuery] <- input$currentComment
+ # values$queries <- c(values$queries, values$lastQuery)
+ values$tagTrigger <- list(
+ id = ok.data()[values$lastQuery, values$conf$idcol],
+ tag = input$newLab,
+ comment = input$currentComment)
+ })
+
+ ## Trigger from Oracle Confirm button
+ observeEvent(input$oracleConfirm, {
+ req(ok.data(), ok.train(), values$newQuery)
+ # if (is.null(ok.data())) return(NULL)
+ # if (is.null(values$newQuery)) return(NULL)
+ if (!ready2tag()) {cat("DEBUG not ready to tag in OC\n"); return(NULL)}
+ # ready2tag(0)
+ if (!ok.train.possible()) return(NULL)
+ if (verbose)
+ cat("DEBUG oracle confirm\n")
+ # if (input$allowRetag | sum(is.na(values$label)) == 0)
+ if (!is.na(values$label[values$lastQuery]))
+ values$retagged <- c(values$retagged, values$lastQuery)
+ # values$label[values$lastQuery] <- ok.predlab1()[values$lastQuery]
+ # values$queries <- c(values$queries, values$lastQuery)
+ values$comment[values$lastQuery] <- input$currentComment
+ values$tagTrigger <- list(
+ id = ok.data()[values$lastQuery, values$conf$idcol],
+ tag = ok.predlab1()[values$lastQuery],
+ comment = input$currentComment)
+ })
+
+ ## Trigger for CV glm (async)
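+ ## Async pattern used throughout: an nclicks reactiveVal blocks concurrent
+ ## runs, the future pipes its result into a reactiveVal with %...>%, and
+ ## finally() releases the guard and consumes the result.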
+ glmCV_nclicks <- reactiveVal(0)
+ observeEvent(input$glmCV, {
+ if (is.null(ok.predictor())) return(NULL)
+ if (glmCV_nclicks() != 0) {
+ showNotification("LASSO CV already running")
+ return(NULL)
+ }
+ glmCV_nclicks(1)
+ glmCV_future <- reactiveVal()
+
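+ ## cv.glmnet needs enough cases of each class across folds: labels with
+ ## 8 or fewer examples are masked to NA before fitting.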
+ tmpvalues <- values$label
+ tmppred <- ok.predictor()
+ for (ival in na.omit(unique(tmpvalues)))
+ if (sum(tmpvalues == ival, na.rm = T) <= 8)
+ tmpvalues[which(tmpvalues == ival)] <- NA
+ glmCV_async <- future(seed = TRUE, {
+ glmnet::cv.glmnet(
+ x= tmppred[!is.na(tmpvalues), ],
+ y= as.factor(tmpvalues[!is.na(tmpvalues)]),
+ family= "multinomial")
+ }) %...>% glmCV_future()
+
+ glmCV_async <- catch(glmCV_async, function(e) {
+ showNotification(type = "error", paste0("LASSO CV error: ", e))
+ })
+ glmCV_async <- finally(glmCV_async, function() {
+ glmCV_nclicks(0)
+ if (is.null(glmCV_future())) return(NULL)
+ updateNumericInput(session, "glmLambda",
+ value = glmCV_future()$lambda.min)
+ })
+ NULL
+ })
+
+ ## Trigger for CV liblinear (async)
+ liblinCV_nclicks <- reactiveVal(0)
+ observeEvent(input$liblinCV, {
+ if (is.null(ok.predictor())) return(NULL)
+ if (liblinCV_nclicks() != 0) {
+ showNotification("LibLineaR CV already running")
+ return(NULL)
+ }
+ liblinCV_nclicks(1)
+ liblinCV_future <- reactiveVal()
+
+ tmpvalues <- values$label
+ tmppred <- ok.predictor()
+ for (ival in na.omit(unique(tmpvalues)))
+ if (sum(tmpvalues == ival, na.rm = T) <= 3)
+ tmpvalues[which(tmpvalues == ival)] <- NA
+
+ liblinCV_async <- future(seed = TRUE, {
+ LiblineaR::LiblineaR(
+ data= tmppred[!is.na(tmpvalues), ],
+ target= as.factor(tmpvalues[!is.na(tmpvalues)]),
+ type= 0, findC= T)
+ }) %...>% liblinCV_future()
+ liblinCV_async <- catch(liblinCV_async, function(e) {
+ showNotification(type = "error", paste0("LibLineaR CV error: ", e))
+ })
+ liblinCV_async <- finally(liblinCV_async, function() {
+ liblinCV_nclicks(0)
+ if (is.null(liblinCV_future())) return(NULL)
+ updateNumericInput(session, "liblinCost", value= liblinCV_future())
+ })
+ NULL
+ })
+
+
+ output$currentContext <- renderUI({
+ if (verbose)
+ cat("DEBUG 1rentContext\n")
+ if ((! is.null(ok.text())) & (length(ok.contextCol()) > 0))
+ HTML(paste(paste(ok.contextCol(),
+ as.character(ok.data()[values$newQuery, ok.contextCol()]),
+ sep = " : "),
+ collapse= "
"))
+ })
+
+ output$makeOracleConfirm <- renderUI({
+ if (verbose)
+ cat("DEBUG update printPredicted\n")
+ if (is.null(ok.pred())) {
+ tmp <- "No prediction"
+ } else if (is.null(values$newQuery)) {
+ tmp <- "No query"
+ } else {
+ tmp <- paste0(
+ ifelse(is.na(values$label[values$newQuery]), "", "Pred: "),
+ ok.predlab1()[values$newQuery],
+ " (", round(max(ok.pred()[values$newQuery, ]), 3), ")")
+ }
+ res <- list(actionButton("oracleConfirm", label= paste("🤖", tmp), width = "100%"))
+ if (!is.null(values$label) & !is.null(values$newQuery)) {
+ if (!is.na(values$label[values$newQuery]))
+ res <- list(res[[1]], HTML(paste0(
+ " Current: ", values$label[values$newQuery],
+ "")))
+ }
+ res
+ })
+
+
+ ## Show training diagnostics on new model train
+ output$trainDiagno <-
+ renderText(paste0(ok.msgtagged(), "\n", values$modelDiagno))
+
+ ## Diagnostics event
+ observeEvent(diagTrigger(), {
+ if (verbose)
+ cat("DEBUG update modelDiagno\n")
+ if (is.null(ok.data())) {
+ values$modelDiagno <- "No model"
+ return(NULL)
+ }
+ if (!ok.train.possible()) {
+ values$modelDiagno <- "No model (not enough labelled data)."
+ return(NULL)
+ }
+
+ if (input$predModel == "random forest") {
+ tmp <- paste0(
+ "Train Accuracy: ",
+ round(100 * mean(ok.predlab1()[!is.na(values$label)] ==
+ values$label[!is.na(values$label)]), 1),
+ "% ; Wtd. F1: ",
+ sprintf("%f", round(wtdF1(values$label[!is.na(values$label)],
+ ok.predlab1()[!is.na(values$label)]), 3)),
+ "%\nOOB Accuracy : ",
+ round(100 * mean(
+ colnames(ok.train()$predictions)[apply(ok.train()$predictions, 1, which.max)] ==
+ values$label[!is.na(values$label)]), 1),
+ "% ; Wtd. F1: ",
+ sprintf("%.3f", wtdF1(
+ values$label[!is.na(values$label)],
+ colnames(ok.train()$predictions)[apply(ok.train()$predictions, 1, which.max)]))
+ )
+ } else if (input$predModel %in% c("lasso", "linear", "naive bayes", "knn")) {
+ tmp <- paste0(
+ "Train Accuracy: ",
+ round(100 * mean(ok.predlab1()[!is.na(values$label)] ==
+ values$label[!is.na(values$label)]), 1),
+ "% ; Wtd. F1: ",
+ sprintf("%.3f", wtdF1(
+ values$label[!is.na(values$label)],
+ ok.predlab1()[!is.na(values$label)])))
+ }
+ values$modelDiagno <- tmp
+ })
+
+ ## Update comment input field
+ observeEvent(values$newQuery, {
+ if (is.null(ok.text())) return(NULL)
+ if (verbose)
+ cat("DEBUG update comment field\n")
+ updateTextInput(session, "currentComment",
+ value = values$comment[values$newQuery])
+ })
+
+
+ ## Save event: tag + comment in dedicated csv
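+ ## Tags are stored in an append-only CSV per scheme: every save adds rows,
+ ## keeping the full tagging history on disk.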
+ observeEvent(values$tagTrigger, {
+ if (is.null(values$tagTrigger)) return(NULL)
+ if (verbose)
+ cat("DEBUG enter event tagTrigger", length(values$tagTrigger$id), "\n")
+ ready2tag(0) # Prevent new tagging until next query
+ dapath <- paste0(values$projectdir, ok.labelcol(), ".csv")
+ darows <- match(values$tagTrigger$id, ok.data()[, values$conf$idcol])
+
+ export <- data.frame(values$tagTrigger$id, values$tagTrigger$tag)
+ colnames(export) <- c(values$conf$idcol, ok.labelcol())
+ if (verbose)
+ cat("DEBUG tagTrigger tag",
+ paste(export[, 1], export[, 2], collapse = " ; "), "\n")
+ write.table(export, dapath, sep = ",", qmethod = "double",
+ fileEncoding = "UTF-8", row.names = F,
+ append = ifelse(file.exists(dapath), T, F),
+ col.names = ifelse(file.exists(dapath), F, T))
+
+ if (any(nchar(values$tagTrigger$comment))) {
+ tmp_filter <- which(nchar(values$tagTrigger$comment) > 0)
+ dapath <- paste0(values$projectdir, values$conf$commcol, ".csv")
+ export <- data.frame(values$tagTrigger$id[tmp_filter],
+ values$tagTrigger$comment[tmp_filter])
+ colnames(export) <- c(values$conf$idcol, values$conf$commcol)
+ if (verbose)
+ cat("DEBUG tagTrigger comment", export[1,1], export[1, 2])
+ write.table(export, dapath, sep = ",", qmethod = "double",
+ fileEncoding = "UTF-8", row.names = F,
+ append = ifelse(file.exists(dapath), T, F),
+ col.names = ifelse(file.exists(dapath), F, T))
+ }
+
+ ## Update label hist
+ values$labelhist[match(values$tagTrigger$id, ok.data()[, values$conf$idcol])] <-
+ max(values$labelhist, na.rm = T) + 1
+ ## Update wasTagged and wasRetagged
+ itag <- match(values$tagTrigger$id, ok.data()[, values$conf$idcol])
+ if (input$sampleChoice == "untagged") {
+ values$wasTagged[itag] <- TRUE
+ } else if (input$sampleChoice == "tagged") {
+ values$wasRetagged[itag] <- TRUE
+ } else if (input$sampleChoice == "all") {
+ ## itag may hold several rows after a mass retag: split element-wise
+ tmp_na <- is.na(values$label[itag])
+ values$wasTagged[itag[tmp_na]] <- TRUE
+ values$wasRetagged[itag[!tmp_na]] <- TRUE
+ }
+
+ ## Call queryNext event, or launch training
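+ ## trainCountdown counts tags left before an automatic retrain: 1 fires
+ ## the training and resets the counter, 0 disables auto-training.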
+ if (trainCountdown() == 1) {
+ trainCountdown(input$trainCountdown)
+ trainTrigger(trainTrigger() + 1)
+ } else if (trainCountdown() > 1) {
+ trainCountdown(trainCountdown() - 1)
+ queryNext(queryNext() + 1)
+ } else {
+ queryNext(queryNext() + 1)
+ }
+
+ ## Update values$label etc
+ values$label[darows] <- values$tagTrigger$tag
+ values$queries <- c(values$queries, darows)
+
+ if (verbose)
+ cat("DEBUG exit event tagTrigger", nrow(export), "rows\n")
+ })
+
+
+ ####################
+ ## Tagging main panel
+
+ output$textVisuCols <- renderUI({
+ if (verbose)
+ cat("DEBUG render text/visu panel\n")
+ leftsize <- 6
+ if (!input$panelText) leftsize <- 1
+ if (input$panelText & !input$panelVisu) leftsize <- 11
+
+ fluidRow(
+ column(
+ leftsize,
+ conditionalPanel(
+ "input.panelText",
+ wellPanel(htmlOutput("currentText"))
+ )
+ ),
+ column(
+ 12 - leftsize,
+ conditionalPanel(
+ "input.panelVisu",
+ fluidRow(
+ column(2, HTML(paste0('<div align="center">',
+ actionButton("visuCompute", "🎲"),
+ '</div>'))),
+ column(2, HTML(paste0('<div align="center">',
+ actionButton("visuGoZoom", "🔍±"),
+ '</div>'))),
+ column(2, HTML(paste0('<div align="center">',
+ checkboxInput("visuLock", "🔒", value = FALSE),
+ '</div>'))),
+ column(2, HTML(paste0('<div align="center">',
+ checkboxInput("visuOptions", "🔧"),
+ '</div>'))),
+ column(3, HTML(paste0('<div align="center">',
+ numericInput("visuHeight", NULL, 400, 20, 2e12, 10),
+ '</div>')))
+ ),
+ conditionalPanel(
+ 'input.visuOptions',
+ flowLayout(
+ selectInput(
+ "visuAlgo", "Visu. method", c("umap", "t-sne"), "umap", width = "100%"),
+ selectizeInput(
+ "visuSource", "Visu. source",
+ c("Word counts (DFM)" = "dfm",
+ "Word embeddings (FastText)" = "ft",
+ "Sentence embeddings (SBERT)" = "sb",
+ "BERT predictions" = "bertpred",
+ "Regex" = "regex"),
+ c("sb", "ft"), multiple = TRUE)
+ )
+ ),
+ conditionalPanel( # umap options
+ 'input.visuAlgo == "umap" & input.visuOptions',
+ flowLayout(
+ numericInput(
+ "umapNeighb", "Nb. Neighbors", 15, 1, 500, 1),
+ numericInput(
+ "umapSpread", "Spread", 1, 0, 16, 1e-2),
+ numericInput(
+ "umapMindist", "Min dist.", .001, 0, 16, 1e-3),
+ selectInput(
+ "umapMetric", "Metric", selected = "cosine",
+ c("euclidean", "cosine", "manhattan", "hamming",
+ "correlation", "categorical")
+ )
+ )
+ ),
+ conditionalPanel( # t-SNE options
+ 'input.visuAlgo == "t-sne" & input.visuOptions',
+ flowLayout(
+ numericInput(
+ "tsnePerplex", "Perplexity", 15, 1, 500, 1),
+ numericInput(
+ "tsneTheta", "Theta", .5, 0, 1, 1e-2),
+ numericInput(
+ "tsnePcaDim", "PCA dims", 50, 1, 100, 1)
+ )
+ ),
+ uiOutput("visuMsg"),
+ plotOutput(
+ "visuPlot", width = "100%",
+ dblclick = "visuDblclick",
+ brush = brushOpts("visuBrush", resetOnNew = T))
+ )
+ )
+ )
+ })
+
+ ##################
+ ## Subpanel "Text"
+ ##################
+
+ output$currentText <- renderUI({
+ if (verbose)
+ cat("DEBUG render currentText\n")
+ if (is.null(ok.data()) | is.null(ok.nicetext()))
+ return(HTML("Import data to start training."))
+
+ if (input$strategy %in% c("entropy", "maxprob") & is.null(ok.train()))
+ return(HTML("No model: train one, or try 'random' or 'sequential' strategy"))
+
+ if (is.null(values$newQuery))
+ return(HTML("No query"))
+
+ bastext <- ok.nicetext()[values$newQuery]
+ if (nchar(input$regexFilter) > 0)
+ bastext <- gsub(paste0("(", isolate(input$regexFilter), ")"),
+ "\\1", bastext, ignore.case = isolate(!input$regexCaseSens))
+ baslen <- nchar(bastext)
+ if (!length(baslen)) {
+ if (nchar(input$regexFilter) > 0)
+ return(HTML("Regex filter: no match"))
+ return(HTML("No text matches the filters"))
+ }
+ if (is.na(baslen))
+ return(HTML("No query"))
+ if (baslen > 1200) {
+ bastext <- paste0(substring(bastext, 1, 1200),
+ '<span style="opacity: 0.5;">',
+ substring(bastext, 1201), "</span>")
+ }
+ bastext <- gsub("\n", "
", bastext)
+ HTML(bastext)
+ })
+
+
+
+ ##############################################################################
+ ## Panel "Visualize"
+ ##############################################################################
+
+ output$visuMsg <- renderUI({
+ if (!is.null(values$visuMsg))
+ # strong(values$visuMsg)
+ h5(values$visuMsg, style = "text-align:center")
+ })
+
+ ## Compute visualization points (async)
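+ ## The selected sources (DFM, FastText, SBERT, BERT predictions, regex)
+ ## are concatenated column-wise, then projected to 2D with UMAP or t-SNE
+ ## inside a future so the UI stays responsive.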
+ visu_nclicks <- reactiveVal(0)
+ ok.visu <- reactiveVal()
+ observe({
+ if (!input$panelVisu)
+ values$visuGo <- FALSE
+ if (input$panelVisu & !values$visuGo)
+ values$visuGo <- TRUE
+ })
+ observeEvent(list(values$visuGo, input$visuCompute), {
+ if (is.null(input$visuSource)) return(NULL)
+ if (!values$visuGo) return(NULL)
+ if (is.null(ok.data())) {
+ values$visuMsg <- "No data imported."
+ return(NULL)
+ }
+
+ if (visu_nclicks() > 0) {
+ values$visuMsg <- "Already computing visualization"
+ return(NULL)
+ }
+ visu_nclicks(1)
+
+ values$visuMsg <- "Computing visualization..."
+
+ ## Prepare options and objects for async computation
+ dasources <- input$visuSource
+ daalgo <- input$visuAlgo
+
+ if (verbose)
+ cat("DEBUG compute visu embedding: ", daalgo, paste(dasources, collapse = " "), "\n")
+
+ visudat <- NULL
+ if ("dfm" %in% dasources) dadfm <- ok.dfm()
+ if ("ft" %in% dasources) {
+ visudat <- ok.data()[, values$cols_ft]
+ }
+ if ("sb" %in% dasources) {
+ if (is.null(visudat)) {
+ visudat <- ok.data()[, values$cols_sb]
+ } else
+ visudat <- cbind(visudat, ok.data()[, values$cols_sb])
+ }
+ if ("bertpred" %in% dasources) {
+ if (is.null(visudat)) {
+ visudat <- ok.data()[, do.call(c, values$cols_bertpred)]
+ } else
+ visudat <- cbind(visudat, ok.data()[, do.call(c, values$cols_bertpred)])
+ }
+ if ("regex" %in% dasources) {
+ if (is.null(visudat)) {
+ visudat <- ok.extraRegex.matrix()
+ } else
+ visudat <- cbind(visudat, ok.extraRegex.matrix())
+ }
+ if (daalgo == "umap") {
+ visu_args <- list(
+ n_neighbors = input$umapNeighb, metric = input$umapMetric,
+ spread = input$umapSpread, min_dist = input$umapMindist)
+ } else if (daalgo == "t-sne") {
+ visu_args <- list(
+ perplexity = input$tsnePerplex, theta = input$tsneTheta,
+ initial_dims = input$tsnePcaDim, check_duplicates = FALSE)
+ }
+
+ visu_async <- future(seed = TRUE, {
+ if ("dfm" %in% dasources) {
+ ## TODO: CA treatment buggy, switch to SVD?
+ # da_ca <- quanteda.textmodels::textmodel_ca(
+ # quanteda::as.dfm(dadfm), nd = min(100, nrow(dadfm)))
+ # da_ca <- (da_ca$rowcoord %*% diag(da_ca$sv))[, !is.na(da_ca$sv)]
+ da_ca <- as.matrix(dadfm)
+ if (is.null(visudat)) {
+ visudat <- da_ca
+ } else
+ visudat <- cbind(da_ca, visudat)
+ }
+
+ if (daalgo == "umap") {
+ do.call(uwot::umap, c(list(visudat), visu_args))
+ } else if (daalgo == "t-sne") {
+ visudat_dupli <- duplicated(visudat)
+ if (any(visudat_dupli))
+ visudat[visudat_dupli, ] <- jitter(visudat[visudat_dupli, ])
+ do.call(Rtsne::Rtsne, c(list(visudat), visu_args))$Y
+ }
+ }) %...>% ok.visu()
+
+ visu_async <- catch(visu_async, function(error) {
+ values$visuMsg <- paste0("Error: ", error)
+ })
+ visu_async <- finally(visu_async, function() {
+ visu_nclicks(0)
+ if (input$visuGoZoom == 0) {
+ values$visuMsg <- "Double-click to select, drag to select zooming zone."
+ } else
+ values$visuMsg <- NULL
+
+ nice_choices <- c("FastText" = "ft", "SBERT" = "sb", "visu" = daalgo)
+ names(nice_choices)[3] <- ifelse(daalgo == "t-sne", "t-SNE", "UMAP")
+ updateSelectInput(
+ session, "dlEmbedSelect", choices = nice_choices, selected = daalgo)
+
+ if (verbose)
+ cat("DEBUG Visu async out\n")
+ })
+
+ NULL
+ })
+
+ ## Compute visualization plot (async)
+ ok.ggvisu <- reactiveVal()
+ observe({
+ # if (is.null(ok.visu())) return(NULL)
+ if (is.null(ok.visu())) {
+ ok.ggvisu(NULL)
+ return(NULL)
+ }
+ # if (!isTRUE(values$visugo)) return(NULL)
+ if (!input$panelVisu) return(NULL)
+
+ if (verbose)
+ cat("DEBUG compute visu plot\n")
+ ggdat <- data.frame(ok.visu(), tag = values$label)
+ if (!is.null(ok.predlab1()))
+ ggdat$tag[is.na(values$label)] <- ok.predlab1()[is.na(values$label)]
+ da_predlabs <- ok.predlab1()
+ da_tagged <- ! is.na(values$label)
+ da_query <- values$newQuery
+
+ ggvisu_async <- future({
+ ggres <- ggplot(ggdat, aes(X1, X2)) + theme_bw() +
+ xlab("") + ylab("") +
+ theme(legend.position = "bottom") +
+ scale_shape_manual(values = rep(1:8, 40)) # TODO : adapt to tags
+
+ ## Tagged points
+ if (any(da_tagged)) {
+ ggres <- ggres +
+ geom_point(data = ggdat[da_tagged, ],
+ aes(X1, X2, color = tag, shape = tag),
+ size = 1.5, stroke = 1.5, alpha = .8)
+ }
+
+ ## Not tagged points
+ if (!is.null(da_predlabs)) {
+ ggres <- ggres +
+ geom_point(data = ggdat[!da_tagged, ],
+ aes(color = tag, shape = tag), show.legend = FALSE)
+ } else {
+ ggres <- ggres +
+ geom_point(alpha = .8, data = ggdat[is.na(ggdat$tag), ])
+ }
+
+ ## Current query
+ if (!is.null(da_query)) {
+ if (is.null(da_predlabs)) {
+ ggres <- ggres +
+ geom_point(data = ggdat[da_query, ], color = 2,
+ stroke = 3, shape = 3, show.legend = FALSE)
+ } else {
+ ggres <- ggres +
+ geom_point(data = ggdat[da_query, ],
+ stroke = 3, shape = 3, show.legend = FALSE)
+ }
+ }
+ ggres
+ }) %...>% ok.ggvisu()
+
+ ggvisu_async <- catch(ggvisu_async, function(error) {
+ values$visuMsg <- paste0("Error in ggplot: ", error)
+ })
+
+ })
+
+ observe({
+ if (verbose)
+ cat("DEBUG update visuPlot\n")
+ if (is.null(ok.ggvisu())) {
+ values$visuMsg <- "No visualization yet."
+ output$visuPlot <- NULL
+ values$visuZoom$xlim <- NULL
+ values$visuZoom$ylim <- NULL
+ return(NULL)
+ }
+ output$visuPlot <- renderPlot(height = input$visuHeight, {
+ # if (is.null(ok.ggvisu())) return(NULL)
+ ok.ggvisu() +
+ coord_fixed(xlim = values$visuZoom$xlim, ylim = values$visuZoom$ylim)
+ })
+ })
+
+ observeEvent(input$visuDblclick, {
+ if (!is.null(values$visuMsg))
+ values$visuMsg <- NULL
+ tmp <- input$visuDblclick
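+ ## Select the projected point closest to the double-click (squared
+ ## Euclidean distance in the 2D embedding) as the new query.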
+ bmu <- t(ok.visu()) - c(tmp$x, tmp$y)
+ bmu <- which.min(colSums(bmu^2))
+ values$lastQuery <- bmu
+ values$newQuery <- bmu
+ ready2tag(1)
+ })
+
+ observeEvent(input$visuGoZoom, {
+ if (!is.null(values$visuMsg))
+ values$visuMsg <- NULL
+ values$visuZoom$xlim <- c(input$visuBrush$xmin, input$visuBrush$xmax)
+ values$visuZoom$ylim <- c(input$visuBrush$ymin, input$visuBrush$ymax)
+ })
+
+
+ ##############################################################################
+ ## Panel History
+ ##############################################################################
+
+ ## Create history link and retag events as they appear
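+ ## Observers are created lazily, only for rows currently displayed in the
+ ## history table, so listeners accumulate per visited page.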
+ observeEvent(input$histDTable_rows_current, {
+ histrows <- order(values$labelhist)
+ histrows <- rev(histrows[!is.na(values$labelhist[histrows])])
+ histrows <- histrows[!is.na(values$label[histrows])]
+ histrows <- histrows[input$histDTable_rows_current]
+ notthere <- histrows[! histrows %in% values$histQueries]
+
+ if (!length(notthere)) return(NULL)
+ lapply(notthere, function(daquery) {
+ if (verbose) cat("DEBUG create history events", daquery, "\n")
+ observeEvent(input[[paste0("hist", daquery)]], {
+ if (verbose) cat("DEBUG history link", daquery, "\n")
+ values$lastQuery <- daquery
+ values$newQuery <- daquery
+ updateTabsetPanel(session, "mainPanelset", selected = "Tagging")
+ ready2tag(1)
+ })
+
+ observeEvent(input[[paste0("histTag", daquery)]], {
+ if (verbose) cat("DEBUG history retag", daquery, "\n")
+ values$histSaveStack <- rbind(
+ values$histSaveStack,
+ data.frame(id = ok.data()[daquery, values$conf$idcol],
+ tag = input[[paste0("histTag", daquery)]],
+ comment = values$comment[daquery])
+ )
+ })
+ })
+ values$histQueries <- c(values$histQueries, notthere)
+ })
+
+ ## History save action
+ observeEvent(input$histSave, {
+ req(is.data.frame(values$histSaveStack))
+ values$tagTrigger <- as.list(values$histSaveStack)
+ values$label[match(values$histSaveStack$id, ok.data()[[values$conf$idcol]])] <-
+ values$histSaveStack$tag
+ values$histSaveStack <- NULL
+ })
+
+ ## Update save name with number of unsaved changes
+ observeEvent(length(values$histSaveStack$id), {
+ updateActionButton(
+ session, "histSave",
+ label = paste0("Save changes (", length(unique(values$histSaveStack$id)),")"))
+ })
+
+ ## History table, with clickable links and retagging
+ output$histDTable <- DT::renderDataTable({
+ histrows <- order(values$labelhist)
+ histrows <- rev(histrows[!is.na(values$labelhist[histrows])])
+ histrows <- histrows[!is.na(values$label[histrows])]
+
+ # histrows <- rev(values$queries)
+ hist_df <- data.frame(
+ # history = length(histrows):1,
+ history = values$labelhist[histrows],
+ id = sapply(histrows, function(irow) {
+ as.character(actionLink(
+ paste0("hist", irow),
+ label = ok.data()[irow, values$conf$idcol],
+ onclick = paste0('Shiny.onInputChange(\"hist', irow, '\", this.id)')))
+ }),
+ Tag = values$label[histrows],
+ Retag = sapply(histrows, function(irow) {
+ ## Dropdown that reports the chosen tag as input$histTag<row>
+ paste0(
+ '<select onchange="Shiny.onInputChange(\'histTag', irow,
+ '\', this.value)">',
+ paste0('<option', ifelse(values$uniqueLabels == values$label[irow],
+ ' selected', ''), '>',
+ values$uniqueLabels, '</option>', collapse = ''),
+ '</select>'
+ )
+ }),
+ Comment = values$comment[histrows],
+ Text = ok.nicetext()[histrows])
+ if (!is.null(values$conf$contextcols))
+ hist_df <- cbind(hist_df,
+ ok.data()[histrows, values$conf$contextcols])
+
+ DT::datatable(hist_df, escape = FALSE, rownames = FALSE, filter = "top",
+ options = list(
+ columnDefs = list(list(
+ targets = 5,
+ render = DT::JS(
+ "function(data, type, row, meta) {",
+ "return type === 'display' && data.length > 200 ?",
+ "'' + data.substr(0, 200) + '...' : data;",
+ "}")))
+ ))
+ })
+
+ ##############################################################################
+ ## Panel "Stats"
+ ##############################################################################
+
+ output$statsTagTable <- renderTable(rownames = F, {
+ table(tag = values$label)
+ })
+
+
+ ## Model diagnostics by CV
+ output$statsCVoutput <- reactive({
+ if (is.null(values$modelcv))
+ return("Model diagnostics will appear here")
+ values$modelcv
+ })
+
+ output$statsCVtable <- DT::renderDataTable({
+ if (is.null(values$cvtable))
+ return(NULL)
+ DT::datatable(values$cvtable, list(pageLength = 25), rownames = F)
+ })
+
+ ## Button action: compute CV (async)
+ cv_nclicks <- reactiveVal(0)
+ observeEvent(input$statsCVgo, {
+ if (is.null(ok.train())) {
+ values$modelcv <- "Error in model training"
+ return(NULL)
+ }
+ if (cv_nclicks() != 0) {
+ values$modelcv <- "CV already running"
+ return(NULL)
+ }
+ cv_nclicks(cv_nclicks() + 1)
+
+ predmat <- ok.predictor()[!is.na(values$label), ]
+ predlab <- values$label[!is.na(values$label)]
+ cvsamp <- sample(10, nrow(predmat), replace = TRUE)
+ cvres <- rep(NA, nrow(predmat))
+ cvres_future <- reactiveVal()
+
+ tmp_model <- input$predModel
+ if (tmp_model == "random forest") {
+ cv_args <- list(
+ num.trees = input$rfNumTrees, mtry = if (input$rfMtry > 0) input$rfMtry,
+ sample.fraction = input$rfSampleFrac, probability = FALSE)
+ } else if (tmp_model == "linear") {
+ cv_args <- list(type= 0, cost = input$liblinCost)
+ } else if (tmp_model == "lasso") {
+ cv_args <- input$glmLambda
+ } else if (tmp_model == "naive bayes") {
+ cv_args <- list(smooth = input$naiveSmooth, prior = input$naivePrior,
+ distribution = input$naiveDistri)
+ } else if (tmp_model == "knn") {
+ cv_args <- list(k = input$knnK)
+ }
+
+ cv_status_file <- paste0(session$token, "_cv")
+
+ cv_async <- future(seed = TRUE, {
+ for (icv in 1:10) {
+ set_status(cv_status_file, paste0("Computing CV fold ", icv, " / 10"))
+ if (tmp_model == "random forest") {
+ cvmodel <- do.call(ranger::ranger, c(cv_args, list(
+ x= predmat[cvsamp != icv, ],
+ y= as.factor(predlab[cvsamp != icv]))))
+ cvres[cvsamp == icv] <- as.character(
+ predict(cvmodel, data= predmat[cvsamp == icv, ])$predictions)
+ } else {
+ tmpvalues <- predlab[cvsamp != icv]
+ # for (ival in na.omit(unique(tmpvalues)))
+ # if (sum(tmpvalues == ival, na.rm = T) == 1)
+ # tmpvalues[which(tmpvalues == ival)] <- NA
+
+ if (tmp_model == "lasso") {
+ cvmodel <- suppressWarnings(
+ glmnet::glmnet(x= predmat[cvsamp != icv, ][!is.na(tmpvalues), ],
+ y= as.factor(tmpvalues[!is.na(tmpvalues)]),
+ family= "multinomial"))
+ cvres[cvsamp == icv] <- predict(
+ cvmodel, newx = predmat[cvsamp == icv, ],
+ type = "class", s= cv_args)
+ } else if (tmp_model == "linear") {
+ cvmodel <- do.call(LiblineaR::LiblineaR, c(
+ list(data= predmat[cvsamp != icv, ][!is.na(tmpvalues), ],
+ target= as.factor(tmpvalues[!is.na(tmpvalues)])),
+ cv_args))
+ cvres[cvsamp == icv] <- as.character(
+ predict(cvmodel, newx= predmat[cvsamp == icv, ])$predictions)
+ } else if (tmp_model == "naive bayes") {
+ cvmodel <- do.call(quanteda.textmodels::textmodel_nb, c(
+ list(x= predmat[cvsamp != icv, ][!is.na(tmpvalues), ],
+ y= as.factor(tmpvalues[!is.na(tmpvalues)])),
+ cv_args))
+ cvres[cvsamp == icv] <- as.character(
+ predict(
+ cvmodel, newdata= predmat[cvsamp == icv, ], type= "class"))
+ } else if (isolate(tmp_model) == "knn") {
+ cvres[cvsamp == icv] <-
+ as.character(do.call(class::knn, c(cv_args, list(
+ train = predmat[cvsamp != icv, ],
+ test = predmat[cvsamp == icv, ],
+ cl = predlab[cvsamp != icv]))))
+ }
+ }
+ }
+ cvres
+ }) %...>% cvres_future()
+
+ cv_async <- catch(cv_async, function(e) {
+ cvres_future(NULL)
+ values$modelcv <- e$message
+ })
+
+ cv_async <- finally(cv_async, function() {
+ cv_nclicks(0)
+ if (file.exists(cv_status_file)) unlink(cv_status_file)
+ if (is.null(cvres_future())) return(NULL)
+ cvres <- cvres_future()
+ predlab <- values$label[!is.na(values$label)]
+ dametrics <- metricsTable(predlab, cvres)
+ values$cvtable <- dametrics
+ res <- "10-CV OOB evaluation:\n"
+ # if (any(grepl("^bertpred", input$use_regressors)))
+ if (any(grepl("^bertpred", values$new_use_regressors)))
+ res <- paste0(
+ res,
+ "⚠️ Scores are biased upwards if BERT predictions were trained on current data ⚠️ \n")
+ res <- paste0(
+ res,
+ "* Accuracy : ", round(100 * mean(cvres == predlab), 2), "%\n",
+ "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n",
+ "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1),
+ dametrics$N.cases)), "\n"
+ )
+ values$modelcv <- res
+ })
+
+ NULL
+ })
+
+ ## Automatic message update during CV computation
+ observe({
+ if (cv_nclicks() == 0) return(NULL)
+ invalidateLater(200)
+ if (!file.exists(paste0(session$token, "_cv"))) return(NULL)
+ values$modelcv <- get_status(paste0(session$token, "_cv"))
+ values$cvtable <- NULL
+ })
+
+
+ ##############################################################################
+ ## Panel BERT
+ ##############################################################################
+
+ ##########
+ ## Bert UI left panel
+
+ ## Update list of saved models and dlSelect on init and on saveTrigger
+
+ observeEvent(list(ok.data(), values$bertSaveTrigger), {
+ damodels <- dir(pattern = "^tiggerbert_saved_")
+ if (!length(damodels)) return(NULL)
+ damodels <- gsub("^tiggerbert_saved_", "", damodels)
+ values$bertSaved <- damodels
+ updateSelectInput(session, "dlBPSelect", choices = damodels)
+ })
+
+ ## Populate saved models list, with buttons
+
+ output$bertSaved <- renderUI({
+ if (verbose)
+ cat("DEBUG bert update saved list\n")
+ HTML(paste0(
+ "<div>",
+ paste0(sapply(values$bertSaved, function(dabert) {
+ bertshort <- dabert
+ if (nchar(bertshort) > 23)
+ bertshort <- paste0(substr(bertshort, 1, 20), "...")
+ paste0(
+ "<span style='white-space: nowrap;'>",
+ actionLink(paste0("saved_", dabert), bertshort, title = dabert,
+ style="margin-right: 5px;"), " | ",
+ actionButton(paste0("bert_copypars_", dabert), "📋",
+ title = "Copy parameters"), " | ",
+ actionButton(paste0("bert_infer_", dabert), "🤖",
+ title = "Infer predictions on current data"), " | ",
+ actionButton(paste0("bert_delete_", dabert), "🗑",
+ title = "Delete model"),
+ " |</span><br/>")
+ }), collapse = "\n"),
+ "</div>"
+ ))
+ })
+
+ ##########
+ ## Bert UI main panel
+
+ ## UI outputs : message and stats table and plot
+ output$bertMsg <- reactive(
+ ifelse(is.null(values$bertMsg),
+ "Train new BERT, or select saved model.",
+ values$bertMsg))
+
+ output$bertValstats <- DT::renderDataTable(
+ DT::datatable(values$bertValstats, list(pageLength = 25)))
+
+ output$bertValPlot <- renderPlot(values$bertValPlot)
+
+ output$bertMsgHyperpar <- reactive(values$bertMsgHyperpar)
+
+ ##################################
+ ## Async handling of BERT training
+
+ ## Status File
+ bert_status_file <- paste0("tiggerbert_", session$token, "_trainstatus")
+
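+ ## File-based signalling between the Shiny session and the training
+ ## future: each helper writes or reads a one-line status message.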
+ fire_interrupt <- function(dafile) set_status(dafile, "interrupt")
+ fire_ready <- function(dafile) set_status(dafile, "Ready")
+ fire_running <- function(dafile, perc_complete = NULL){
+ if(is.null(perc_complete))
+ msg <- "Training..."
+ else
+ msg <- paste0("Training... ", perc_complete, "% complete")
+ set_status(dafile, msg)
+ }
+ interrupted <- function(dafile) get_status(dafile) == "interrupt"
+
+ ## Create Status File
+ fire_ready(bert_status_file)
+
+
+ ## Button action preparation
+ bert_nclicks <- reactiveVal(0)
+ bert_result_val <- reactiveVal()
+ bert_hyperpars <- reactiveVal()
+
+ ## Button appearance: write "Stop" if training is ongoing
+ observeEvent(bert_nclicks(), {
+ updateActionButton(session, "bertTrain",
+ label = ifelse(bert_nclicks() > 0, "Stop", "Train BERT"))
+ })
+
+ ## Button action: Train new BERT
+ observeEvent(input$bertTrain,{
+
+ ## Don't do anything if analysis is already being run
+ # if(bert_nclicks() != 0){
+ # values$bertMsg <- "BERT is already training"
+ # return(NULL)
+ # }
+ tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token)
+ if(bert_nclicks() != 0){
+ write("stop", paste0(tmpsession, "_stop"))
+ bert_nclicks(0)
+ return(NULL)
+ }
+ if (file.exists(paste0(tmpsession, "_stop")))
+ file.remove(paste0(tmpsession, "_stop"))
+
+ ## Increment clicks and prevent concurrent analyses
+ bert_nclicks(bert_nclicks() + 1)
+
+ bert_result_val(data.frame(Status="Running..."))
+ fire_running(bert_status_file)
+
+ ## Prepare async training
+ values$bertLoadSaved <- NULL
+ # tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token)
+ tmpmodel <- input$bertModel
+ system(paste0("rm ", tmpsession, "_done"))
+
+ tmpdat <- data.frame(
+ id = ok.data()[, values$conf$idcol],
+ text = ok.nicetext(),
+ label = values$label
+ )
+ tmpdat <- tmpdat[!is.na(tmpdat$label), ]
+ tmpdat <- tmpdat[grepl("\\S", tmpdat$text), ]
+
+ ## Filter data: min occurrences, balance classes, validation split
+ tmplabtable <- table(tmpdat$label)
+ if (is.numeric(input$bertMinOccur)) if (input$bertMinOccur > 1) {
+ tmpdat <- tmpdat[tmpdat$label %in%
+ names(which(tmplabtable >= input$bertMinOccur)), ]
+ tmplabtable <- table(tmpdat$label)
+ }
+ set.seed(input$bertValidSeed)
+ if (input$bertBalance) {
+ tmpminocc <- min(tmplabtable)
+ tmptrain <- sample(do.call(c, lapply(unique(tmpdat$label), function(dalabel)
+ sample(which(tmpdat$label == dalabel), round((1-input$bertValidFrac) * tmpminocc)))))
+ tmpvalid <- sample(do.call(c, lapply(unique(tmpdat$label), function(dalabel) {
+ dasamp <- which(tmpdat$label == dalabel)
+ dasamp <- dasamp[! dasamp %in% tmptrain]
+ sample(dasamp, round(input$bertValidFrac * tmpminocc))
+ })))
+ } else {
+ tmptrain <- sample(1:nrow(tmpdat), round((1 - input$bertValidFrac) * nrow(tmpdat)))
+ tmpvalid <- (1:nrow(tmpdat))[-tmptrain]
+ }
+
+ write.csv(tmpdat[tmptrain, ], paste0(tmpsession, "_dat_train.csv"), row.names = FALSE)
+ write.csv(tmpdat[tmpvalid, ], paste0(tmpsession, "_dat_valid.csv"), row.names = FALSE)
+
+ ## Wait up to 5 seconds for the validation file to be written
+ iwait <- 0
+ while (!file.exists(paste0(tmpsession, "_dat_valid.csv")) && iwait < 10) {
+ Sys.sleep(0.5)
+ iwait <- iwait + 1
+ }
+
+ tmp_cmd <- paste0(
+ values$conf$python, " gobert.py -m ", tmpmodel,
+ " -t ", tmpsession, "_dat_train.csv",
+ " -v ", tmpsession, "_dat_valid.csv",
+ " -b ", input$bertBatchsize,
+ " -g ", input$bertGradacc,
+ " -e ", input$bertEpochs,
+ " -l ", input$bertLrate,
+ " -w ", input$bertWdecay,
+ " -B ", ifelse(input$bertBest, 1, 0),
+ " -E ", input$bertNeval,
+ " -s ", tmpsession,
+ " -G ", as.numeric(values$conf$use_gpu),
+ " -A ", as.numeric(input$bertAdapt),
+ " 2>&1 ; touch ", tmpsession, "_done"
+ )
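+ ## Training runs out of process through gobert.py; the "_done" sentinel
+ ## file lets the status observer detect completion or failure.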
+
+ ## Save hyperpars
+ bert_hyperpars(list(bertModel = input$bertModel,
+ bertEpochs = input$bertEpochs,
+ bertLrate = input$bertLrate,
+ bertWdecay = input$bertWdecay,
+ bertBatchsize = input$bertBatchsize,
+ bertGradacc = input$bertGradacc,
+ bertNeval = input$bertNeval,
+ bertBest = input$bertBest,
+ bertValidFrac = input$bertValidFrac,
+ bertValidSeed = input$bertValidSeed,
+ bertNtrain = length(tmptrain),
+ bertTrainTable = as.list(table(tmpdat$label[tmptrain])),
+ bertScheme = ok.labelcol()))
+
+ ## Launch async training
+ bert_async <- future({
+ if (file.exists(tmpsession))
+ system(paste0("rm -R ", tmpsession, "*"))
+
+ # system(tmp_cmd, wait = FALSE, intern = FALSE)
+ tmp_exec <- system(tmp_cmd, wait = FALSE, intern = TRUE)
+ writeLines(tmp_exec, paste0(tmpsession, "_log"))
+
+ ## After training, get trainer state from last checkpoint for diagnostics
+ chkpt <- dir(paste0(tmpsession, "_train"), pattern = "^checkpoint")
+ chkpt <-
+ chkpt[order(as.numeric(gsub("\\D", "", chkpt)), decreasing = T)[1]]
+ system(paste0(
+ "cp ", tmpsession, "_train/", chkpt, "/trainer_state.json ",
+ tmpsession, "/"))
+ }) %...>% bert_result_val()
+
+ ## Catch interrupt (or any other error) and notify user
+ bert_async <- catch(bert_async, function(e){
+ bert_result_val(NULL)
+ showNotification(e$message)
+ values$bertMsg <- e$message
+ })
+
+ ## After the promise has been evaluated set nclicks to 0 to allow another Run
+ bert_async <- finally(bert_async, function() {
+ system(paste0("rm -R ", tmpsession, "_train"))
+ system(paste0("rm ", tmpsession, "_done"))
+
+ fire_ready(bert_status_file)
+ bert_nclicks(0)
+
+ if (is.null(bert_result_val())) return(NULL)
+
+ bertpreds <- try(read.csv(paste0(tmpsession, "_predval.csv")))
+ if (inherits(bertpreds, "try-error")) {
+ # values$bertMsg <- "BERT error - try again"
+ values$bertMsg <- try(paste(readLines(paste0(tmpsession, "_log")), collapse = "\n"))
+ values$bertValstats <- NULL
+ values$bertValPlot <- NULL
+ values$bertLastUnsaved <- NULL
+ values$bertMsgHyperpar <- NULL
+ values$bertLastHyperpar <- NULL
+ return(NULL)
+ }
+
+ datruth <- tmpdat[tmpvalid, "label"]
+ dapred <- bertpreds$bertpred
+ dametrics <- metricsTable(datruth, dapred)
+ damsg <- paste0(
+ "Unsaved model (", tmpmodel, ")\n\n",
+ "BERT validation stats:\n",
+ "* Accuracy : ", round(100 * mean(datruth == dapred), 2), "%\n",
+ "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n",
+ "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1),
+ dametrics$N.cases)), "\n")
+ hypermsg <- bertHyperToMsg(bert_hyperpars())
+ writeLines(RJSONIO::toJSON(bert_hyperpars()),
+ paste0(tmpsession, "/tiggertrain.json"))
+
+ values$bertValstats <- dametrics
+ values$bertMsg <- damsg
+ values$bertValPlot <- bertPlotFromPath(tmpsession)
+ values$bertLastUnsaved <- list(
+ metrics = dametrics, msg = damsg, session = tmpsession)
+ values$bertMsgHyperpar <- hypermsg
+ values$bertLastHyperpar <- bert_hyperpars()
+ })
+
+ values$bertMsg <- "BERT is training"
+ values$bertValstats <- NULL
+ values$bertValPlot <- NULL
+ values$bertLastUnsaved <- NULL
+ values$bertMsgHyperpar <- NULL
+ values$bertLastHyperpar <- NULL
+
+ ## Return something other than the promise so shiny remains responsive
+ NULL
+ })
+
+ ## Automatic check of BERT training status
+ observe({
+ if (bert_nclicks() == 0) return(NULL)
+ isolate({
+ tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token)
+ ## Check for checkpoint dirs
+ checkdirs <-
+ length(dir(paste0(tmpsession, "_train"), pattern = "^checkpoint"))
+ # fire_running(bert_status_file,
+ # min(100, round(100 * checkdirs / input$bertNeval, 1)))
+ # dastate <- try(RJSONIO::fromJSON(paste0(tmpsession, "_train/trainer_state.json")))
+ # if (!inherits(dastate, "try-error"))
+ # fire_running(bert_status_file,
+ # round(100 * as.numeric(dastate$epoch) / input$bertEpochs, 1))
+ try({
+ dapoints <- dir(paste0(tmpsession, "_train"),
+ pattern = "trainer_state[.]json$", recursive = T)
+ dapoints_num <- as.numeric(gsub("^checkpoint-(\\d+)/.*$", "\\1", dapoints))
+ statefile <- paste0(
+ tmpsession, "_train/", dapoints[which.max(dapoints_num)])
+ if (file.exists(statefile)) {
+ state <- RJSONIO::fromJSON(statefile)
+ fire_running(bert_status_file,
+ round(100 * as.numeric(state$epoch) / input$bertEpochs, 1))
+ }
+ })
+ })
+ if (checkdirs < isolate(input$bertNeval) & file.exists(paste0(tmpsession, "_done"))) {
+ values$bertMsg <- "BERT Error - try again"
+ # values$bertMsg <- paste(readLines(paste0(tmpsession, "_log")), collapse = "\n")
+ } else {
+ invalidateLater(1e3)
+ loss_progress <- ""
+ if (checkdirs > 0) {
+ ## Get training diagnostics
+ try({
+ dapoints <- dir(paste0(tmpsession, "_train"),
+ pattern = "trainer_state[.]json$", recursive = T)
+ dapoints_num <- as.numeric(gsub("^checkpoint-(\\d+)/.*$", "\\1", dapoints))
+ statefile <- paste0(
+ tmpsession, "_train/", dapoints[which.max(dapoints_num)])
+ if (file.exists(statefile)) {
+ state <- RJSONIO::fromJSON(statefile)
+ logs <- data.table::rbindlist(lapply(state$log_history, as.list),
+ fill = TRUE)
+ logs <- logs[, .(loss = max(loss, na.rm = T),
+ eval_loss = max(eval_loss, na.rm = T)), by = epoch]
+ loss_progress <- paste0(
+ "\n* Epoch : ", sprintf("%.02f", logs$epoch),
+ "\ttrain loss : ", sprintf("%.05f",logs$loss),
+ "\teval loss : ", sprintf("%.05f", logs$eval_loss), collapse = "")
+ }
+ })
+ }
+ values$bertMsg <-
+ ifelse(checkdirs == 0, "Pre-processing...",
+ paste0(get_status(bert_status_file), loss_progress))
+ }
+ values$bertValstats <- NULL
+ values$bertValPlot <- NULL
+ values$bertLoadSaved <- NULL
+ values$bertMsgHyperpar <- NULL
+ })
+
+
+ ## Action: show last BERT model
+ observeEvent(input$bertLast, {
+ values$bertMsg <- values$bertLastUnsaved$msg
+ values$bertValstats <- values$bertLastUnsaved$metrics
+ values$bertValPlot <- bertPlotFromPath(values$bertLastUnsaved$session)
+ values$bertMsgHyperpar <- bertHyperToMsg(values$bertLastHyperpar)
+ values$bertLoadSaved <- NULL
+ })
+
+
+ ## Action: save BERT model
+ observeEvent(input$bertSave, {
+ tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token)
+ if (!file.exists(paste0(tmpsession)))
+ return(NULL)
+
+ if (!grepl("\\S", input$bertSaveName)) {
+ tmpname <- paste0(
+ ok.labelcol(), "_", gsub("\\D+$", "", gsub("\\D", "-", Sys.time())))
+ } else
+ tmpname <- cleanFileName(input$bertSaveName)
+
+ savedir <- paste0("tiggerbert_saved_", tmpname)
+ system(paste("cp -R", tmpsession, savedir))
+ system(paste0("cp ", tmpsession, "_dat_valid.csv ",
+ savedir, "/tiggerbert_dat_valid.csv"))
+ system(paste0("cp ", tmpsession, "_predval.csv ",
+ savedir, "/tiggerbert_pred_valid.csv"))
+
+ system(paste0("rm -R ", tmpsession, "*"))
+ values$bertMsg <- paste0("BERT model successfully saved: ", tmpname)
+ values$bertValstats <- NULL
+ values$bertValPlot <- NULL
+ values$bertMsgHyperpar <- NULL
+
+ if (is.null(values$bertSaveTrigger)) {
+ values$bertSaveTrigger <- 0
+ } else
+ values$bertSaveTrigger <- values$bertSaveTrigger + 1
+ values$bertLastUnsaved <- NULL
+ updateTextInput(session, "bertSaveName", value = "")
+ })
+
+
+ ## Actions: load saved model (list updated only with new models)
+ observeEvent(values$bertSaved, {
+ if (!length(values$bertSaved)) return(NULL)
+ tmpsaved <- values$bertSaved[! values$bertSaved %in% values$bertSavedAll]
+
+ lapply(tmpsaved, function(dabert) {
+ ############
+ ## Actions to load saved model
+ observeEvent(input[[paste0("saved_", dabert)]], {
+ bertconf <- RJSONIO::fromJSON(paste0(
+ "tiggerbert_saved_", dabert, "/config.json"))
+ truth <- read.csv(paste0(
+ "tiggerbert_saved_", dabert, "/tiggerbert_dat_valid.csv"))$label
+ pred <- read.csv(paste0(
+ "tiggerbert_saved_", dabert, "/tiggerbert_pred_valid.csv"))$bertpred
+
+ dametrics <- metricsTable(truth, pred)
+ values$bertValstats <- dametrics
+ values$bertMsg <- paste0(
+ "Saved model ", dabert, " (", bertconf[["_name_or_path"]], ")\n\n",
+ "BERT validation stats:\n",
+ "* Accuracy : ", round(100 * mean(truth == pred), 2), "%\n",
+ "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n",
+ "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1),
+ dametrics$N.cases)), "\n")
+
+ ## Hyperparameters
+ tiggertrain_file <- paste0("tiggerbert_saved_", dabert, "/tiggertrain.json")
+ if (file.exists(tiggertrain_file)) {
+ try({
+ values$bertMsgHyperpar <- bertHyperToMsg(RJSONIO::fromJSON(tiggertrain_file))
+ })
+ } else
+ values$bertMsgHyperpar <- "Training parameters not saved."
+
+ values$bertValPlot <-
+ bertPlotFromPath(paste0("tiggerbert_saved_", dabert))
+ values$bertLoadSaved <- dabert
+ })
+
+ ############
+ ## Actions to copy hyperparameters
+ observeEvent(input[[paste0("bert_copypars_", dabert)]], {
+ tiggertrain_file <- paste0("tiggerbert_saved_", dabert, "/tiggertrain.json")
+ if (!file.exists(tiggertrain_file)) return(NULL)
+ try({
+ tmppars <- RJSONIO::fromJSON(tiggertrain_file)
+ updateSelectInput(session, "bertModel", selected = tmppars$bertModel)
+ updateCheckboxInput(session, "bertBest", value = tmppars$bertBest)
+ numinputs <- c("bertEpochs", "bertLrate", "bertWdecay",
+ "bertBatchsize", "bertGradacc")
+ for (iinput in numinputs)
+ updateNumericInput(session, iinput, value = tmppars[[iinput]])
+ })
+ })
+
+ ############
+ ## Actions to delete saved model (with confirm)
+ observeEvent(input[[paste0("bert_delete_", dabert)]], {
+ showModal(modalDialog(
+ title = "Delete BERT model",
+ paste0("Confirm delete: ", dabert,
+ " (model and predictions will be deleted)."),
+ footer = tagList(actionButton(paste0("bert_delete_confirm_", dabert), "Delete"),
+ modalButton("Cancel"))))
+ })
+
+ observeEvent(input[[paste0("bert_delete_confirm_", dabert)]], {
+ system(paste0("rm -R tiggerbert_saved_", dabert))
+ system(paste0(
+ "rm ", values$conf$datadir, values$conf$dataname, "_bertpred_",
+ dabert, ".feather"))
+ # values$bertLoadSaved <- NULL
+ if (is.null(values$bertSaveTrigger)) {
+ values$bertSaveTrigger <- 0
+ } else
+ values$bertSaveTrigger <- values$bertSaveTrigger + 1
+ removeModal()
+ values$bertMsg <- NULL
+ values$bertValstats <- NULL
+ values$bertMsgHyperpar <- NULL
+ })
+
+ ############
+ ## Actions to infer on current data from saved BERT (async)
+ observeEvent(input[[paste0("bert_infer_", dabert)]], {
+ if (bertpred_nclicks() > 0 | bpall_nclicks() > 0) {
+ values$bertMsg <- "Inference already running"
+ values$bertMsgHyperpar <- NULL
+ values$bertValPlot <- NULL
+ values$bertValstats <- NULL
+ return(NULL)
+ }
+ bertpred_nclicks(1)
+
+ saved_model <- paste0("tiggerbert_saved_", dabert)
+ out_path <- paste0(
+ values$conf$datadir, values$conf$dataname, "_bertpred_",
+ dabert, ".feather")
+
+ if (file.exists(out_path)) {
+ old_infer <- arrow::read_feather(out_path)
+ } else
+ old_infer <- NULL
+
+ to_infer <- ok.data()[[values$conf$idcol]]
+ to_infer <- to_infer[! to_infer %in% old_infer[[values$conf$idcol]]]
+ to_infer_row <- match(to_infer, ok.data()[[values$conf$idcol]])
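+ ## Incremental inference: only ids absent from the cached feather are
+ ## sent to the model; new predictions are appended to the cache afterwards.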
+
+ if (!length(to_infer)) {
+ values$bertMsg <- paste0(
+ "All current ", nrow(ok.data()),
+ " texts already infered by ", dabert)
+ values$bertMsgHyperpar <- NULL
+ values$bertValPlot <- NULL
+ values$bertValstats <- NULL
+ bertpred_nclicks(0)
+ return(NULL)
+ }
+
+ values$bertValPlot <- NULL
+ values$bertValstats <- NULL
+ values$bertMsgHyperpar <- NULL
+ values$bertMsg <- paste0(
+ "Preparing inference on ", length(to_infer), " new texts")
+ tmp_dat <- paste0("tiggerbert_infer_", session$token, ".feather")
+ arrow::write_feather(
+ data.frame(
+ ok.data()[to_infer_row, values$conf$idcol, drop = FALSE],
+ text = ok.nicetext()[to_infer_row]),
+ tmp_dat)
+
+ values$bertMsg <- paste0(
+ "Infering ", dabert, " predictions on ",
+ length(to_infer), " new texts")
+ infer_cmd <- paste0(
+ values$conf$python, " gobert_infer.py -m ", saved_model,
+ " -o ", out_path,
+ " -d ", tmp_dat,
+ " -i ", values$conf$idcol,
+ " -l ", paste0("tiggerbert_infer_", session$token, ".log"),
+ " -G ", as.numeric(values$conf$use_gpu),
+ " -b ", 128)
+ bertpred_res <- reactiveVal()
+ bertpred_async <- future({
+ system(infer_cmd)
+ system(paste0("rm ", tmp_dat))
+
+ if (!is.null(old_infer)) {
+ new_infer <- arrow::read_feather(out_path)
+ arrow::write_feather(rbind(old_infer, new_infer), out_path)
+ }
+ }) %...>% bertpred_res
+ bertpred_async <- catch(bertpred_async, function(error) {
+ bertpred_nclicks(0)
+ values$bertMsg <- paste0("BERT inference error: ", error)
+ system(paste0("rm ", tmp_dat))
+ })
+ bertpred_async <- finally(bertpred_async, function() {
+ bertpred_nclicks(0)
+ values$bertMsg <-
+ "Inference complete!\nRe-import data in Project/Sample panel to use the predictions for tagging."
+ values$bertValPlot <- NULL
+ values$bertValstats <- NULL
+ values$bertMsgHyperpar <- NULL
+ })
+ NULL
+ })
+ })
+ values$bertSavedAll <- c(values$bertSavedAll, tmpsaved)
+ })
+
+ ## Automatic check of BERT inference progress
+ observe({
+ if (bertpred_nclicks() == 0) return(NULL)
+ invalidateLater(1e3)
+ logfile <- paste0("tiggerbert_infer_", isolate(session$token), ".log")
+ if (file.exists(logfile)) {
+ values$bertMsg <- readLines(logfile)[1]
+ values$bertValPlot <- NULL
+ values$bertValstats <- NULL
+ values$bertMsgHyperpar <- NULL
+ }
+ })
+
+
+
+
+ ##############################################################################
+ ## Export panel
+ ##############################################################################
+
+ ## Export tags and full model predictions
+ output$dlTagSave <-
+ downloadHandler(
+ function()
+ paste0(Sys.Date(), "-activeTigger-", ok.labelcol(),".", input$dlTagFormat),
+ function(con) {
+ if (is.null(ok.data())) return(NULL)
+ dadat <- ok.data()[, values$conf$idcol, drop = FALSE]
+ if ("tags" %in% input$dlTagSelect)
+ dadat[[ok.labelcol()]] <- values$label
+ if ("comments" %in% input$dlTagSelect) if (ok.commentCol() != "(None)")
+ dadat[, ok.commentCol()] <- values$comment
+ if ("predictions" %in% input$dlTagSelect) if (ok.train.possible()) {
+ dadat$tigger.predict <- ok.predlab1()
+ dadat$tigger.predprob <- apply(ok.pred(), 1, max)
+ tmp_pred <- ok.pred()
+ colnames(tmp_pred) <- paste0("tiggerpred_", colnames(tmp_pred))
+ dadat <- cbind(dadat, tmp_pred)
+ }
+ othercols <- input$dlTagSelect
+ othercols <- othercols[! othercols %in% c("tags", "comments", "predictions")]
+ if (length(othercols))
+ dadat <- cbind(dadat, ok.data()[, othercols, drop = FALSE])
+
+ if (input$dlTagFormat == "csv") {
+ write.csv(dadat, row.names = F, con)
+ } else
+ arrow::write_feather(dadat, con)
+ })
+
+
+ ## Export embeddings (ft, sb, umap, t-sne)
+ output$dlEmbedSave <- downloadHandler(
+ function()
+ paste0(Sys.Date(), "-activeTigger-emb-",
+ paste(input$dlEmbedSelect, collapse = "-"),
+ ".", input$dlEmbedFormat),
+ function (file) {
+ if (is.null(input$dlEmbedSelect)) return(NULL)
+ export <- ok.data()[, values$conf$idcol, drop = FALSE]
+ if ("ft" %in% input$dlEmbedSelect)
+ export <- data.frame(export, ok.data()[, values$cols_ft])
+ if ("sb" %in% input$dlEmbedSelect)
+ export <- data.frame(export, ok.data()[, values$cols_sb])
+ if (any(c("t-sne", "umap") %in% input$dlEmbedSelect)) {
+ tmpvisu <- ok.visu()
+ tmpalgo <- input$dlEmbedSelect[input$dlEmbedSelect %in% c("t-sne", "umap")]
+ colnames(tmpvisu) <- paste0(gsub("-", "", tmpalgo), 1:2)
+ export <- data.frame(export, tmpvisu)
+ }
+ if (input$dlEmbedFormat == "csv") {
+ write.csv(export, file, row.names = FALSE)
+ } else if (input$dlEmbedFormat == "feather") {
+ arrow::write_feather(export, file)
+ }
+ })
+
+ ## Export BERT predictions
+
+ output$dlBPMsg <- reactive(values$dlBPMsg)
+
+
+
+ # output$dlBertPredCsv <-
+ # downloadHandler(paste0(Sys.Date(), "-bertpred-", ok.labelcol(),".csv"),
+ # content= function(con) {
+ # if (is.null(ok.data())) return(NULL)
+ # dadat <- ok.data()[, c(values$conf$idcol, ok.labelcol())]
+ # dadat[[ok.labelcol()]] <- values$label
+ # if (ok.commentCol() != "(None)") {
+ # dadat[, ok.commentCol()] <- values$comment
+ # }
+ # if (ok.train.possible()) {
+ # dadat$tigger.predict <- ok.predlab1()
+ # dadat$tigger.predprob <- apply(ok.pred(), 1, max)
+ # }
+ # dadat <- cbind(dadat, ok.pred())
+ # write.csv(dadat, row.names = F, con)
+ # })
+
+ ############
+ ## Action to infer on ALL data from saved BERT (async)
+ bpall_res <- reactiveVal()
+ observeEvent(input$dlBPInfer, {
+ if (bpall_nclicks() > 0 || bertpred_nclicks() > 0) {
+ values$dlBPMsg <- "Inference already running"
+ bpall_res(NULL)
+ return(NULL)
+ }
+ bpall_nclicks(1)
+ bpall_res_async <- reactiveVal()
+ dabert <- input$dlBPSelect
+
+ saved_model <- paste0("tiggerbert_saved_", dabert)
+ out_path <- paste0(
+ values$conf$datadir, values$conf$dataname, "_bertpredall_",
+ dabert, ".feather")
+ tmp_dat <- paste0("tiggerbert_inferall_", session$token, ".feather")
+ tmp_log <- paste0("tiggerbert_inferall_", session$token, ".log")
+ tmp_done <- paste0("tiggerbert_inferall_", session$token, "_done")
+ if (file.exists(tmp_done))
+ unlink(tmp_done)
+
+ if (file.exists(out_path)) {
+ values$dlBPMsg <- "Inference already complete, ready for download"
+ bpall_nclicks(0)
+ bpall_res(out_path)
+ return(NULL)
+ # old_infer <- arrow::read_feather(out_path)
+ }
+
+ fulldat <- arrow::read_feather(paste0(
+ values$conf$datadir, values$conf$dataname, ".feather"))
+ values$dlBPMsg <- paste0(
+ "Preparing inference on ", nrow(fulldat), " texts")
+ if (length(values$conf$textcols) > 1) {
+ fulldat$text <- do.call(paste, c(
+ fulldat[, values$conf$textcols, drop = FALSE], sep = "\n\n"))
+ } else {
+ fulldat$text <- fulldat[[values$conf$textcols]]
+ }
+ arrow::write_feather(fulldat[, c(values$conf$idcol, "text")], tmp_dat)
+
+ infer_cmd <- paste0(
+ values$conf$python, " gobert_infer.py -m ", saved_model,
+ " -o ", out_path,
+ " -d ", tmp_dat,
+ " -i ", values$conf$idcol,
+ " -l ", tmp_log,
+ " -G ", as.numeric(values$conf$use_gpu),
+ " -b ", 128,
+ "; touch ", tmp_done)
+
+ bpall_async <- future({
+ system(infer_cmd)
+ system(paste0("rm ", tmp_dat))
+ }) %...>% bpall_res_async
+
+ bpall_async <- catch(bpall_async, function(error) {
+ values$dlBPMsg <- paste0("BERT inference error: ", conditionMessage(error))
+ unlink(tmp_dat)
+ unlink(tmp_done)
+ })
+
+ bpall_async <- finally(bpall_async, function() {
+ bpall_nclicks(0)
+ values$dlBPMsg <-
+ "Inference complete, ready for download"
+ bpall_res(out_path)
+ })
+ NULL
+ })
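+ ## For reference, the assembled call has the form (values are illustrative):
+ ## python3 gobert_infer.py -m tiggerbert_saved_<name> \
+ ## -o <datadir><dataname>_bertpredall_<name>.feather \
+ ## -d tiggerbert_inferall_<token>.feather -i <idcol> \
+ ## -l tiggerbert_inferall_<token>.log -G <0|1> -b 128; touch <done-file>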
+
+ ## Automatic check of BERT all prediction progress
+ observe({
+ if (bpall_nclicks() == 0) return(NULL)
+ invalidateLater(1e3)
+ logfile <- paste0("tiggerbert_inferall_", isolate(session$token), ".log")
+ if (file.exists(logfile))
+ values$dlBPMsg <- readLines(logfile)[1]
+ })
+
+ ## Show BERTpred download button when data is ready
+ output$dlBPDlButton <- renderUI({
+ if (is.null(bpall_res())) return(NULL)
+ downloadButton("dlBPDl", NULL)
+ })
+
+ output$dlBPDl <- downloadHandler(
+ function()
+ paste0(Sys.Date(), "-tiggerbert-", values$conf$dataname,
+ input$dlBPSelect, ".", input$dlBPFormat),
+ function(file) {
+ if (is.null(bpall_res())) return(NULL)
+ tmp_bert <- gsub("^.*_bertpredall_(.*)[.]feather$", "\\1", bpall_res())
+ if (tmp_bert != input$dlBPSelect) {
+ values$dlBPMsg <- "BERT model changed - predict again or select current model"
+ return(NULL)
+ }
+ tmp <- arrow::read_feather(bpall_res())
+ if (input$dlBPFormat == "csv") {
+ write.csv(tmp, file, row.names = FALSE)
+ } else if (input$dlBPFormat == "feather") {
+ arrow::write_feather(tmp, file)
+ }
+ }
+ )
+
+
+ ##############################################################################
+ ## Delete files at end of session
+ onStop(function() {
+ if (file.exists(bert_status_file)) unlink(bert_status_file)
+ session_files <- dir(pattern = session$token)
+ if (length(session_files))
+ unlink(session_files, recursive = TRUE)
+ })
+
+})
diff --git a/docker-images-datalab/myactivetigger/activetigger/tokenize_spacy.py b/docker-images-datalab/myactivetigger/activetigger/tokenize_spacy.py
new file mode 100644
index 0000000..a7143ce
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/activetigger/tokenize_spacy.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+## Spacy tokenize texts
+## Requires data file with columns id and text
+
+import argparse
+from os.path import expanduser
+import pandas as pd
+import pyarrow.feather as feather
+import spacy
+import re
+
+
+def main(args):
+    print("Tokenizer: Importing data")
+    datapath = expanduser(args.data)
+    dat = feather.read_feather(datapath)
+    outfile = re.sub("[.]feather$", "_spa.feather", datapath)
+
+    print("Tokenizer: Loading model")
+    spa = spacy.load(expanduser(args.model))
+    print("Tokenizer: Tokenizing sentences")
+    tok = [" ".join(str(token) for token in spa.tokenizer(text))
+           for text in dat["text"]]
+
+    print("Tokenizer: Exporting")
+    tok = pd.concat([dat["id"], pd.DataFrame(tok)], axis=1)
+    tok.columns = ["id", "text_spa"]
+    feather.write_feather(tok, outfile)
+    print("Tokenizer: Done")
+
+
+if __name__ == "__main__":
+ argParser = argparse.ArgumentParser()
+ argParser.add_argument("-m", "--model", help="Model name", default="fr_core_news_sm")
+ argParser.add_argument("-d", "--data", help="Path to data (feather)")
+ args = argParser.parse_args()
+ main(args)
+
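+# Example invocation (assumes corpus.feather contains columns id and text):
+#   python tokenize_spacy.py -d corpus.feather -m fr_core_news_sm
+# The tokenized output is written next to the input as corpus_spa.feather.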
diff --git a/docker-images-datalab/myactivetigger/activetigger/ui.R b/docker-images-datalab/myactivetigger/activetigger/ui.R
new file mode 100644
index 0000000..5f788a2
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/activetigger/ui.R
@@ -0,0 +1,631 @@
+## 21/04/2020: Shiny app for active-learning tagging of texts
+
+shinyUI(fluidPage(
+ title = "Active Tigger",
+
+ ############################################################################
+ ## Top panel: Title, strategy, model options
+ ############################################################################
+ fluidRow(
+ column(
+ 4,
+ fluidRow(
+ column(4, br(),
+ # Stripped HTML block; presumably the app logo shipped in www/ (reconstructed)
+ img(src = "active_tigger.png", width = "100%")),
+ column(
+ 8, br(),
+ p(strong("Project / Scheme")),
+ fluidRow(
+ column(2, actionButton("createProject", "+")),
+ column(10, selectInput("selectProject", NULL, NULL, NULL))
+ ),
+ fluidRow(
+ column(2, actionButton("createScheme", "+")),
+ column(10, selectInput("selectScheme", NULL, NULL, NULL))
+ )
+ ))
+ ),
+ column(
+ 2, br(),
+
+ p(strong("Strategy")),
+ selectInput("strategy", NULL,
+ choices = c("Active" = "entropy",
+ "MaxProb" = "maxprob",
+ "Random" = "random",
+ "Deterministic" = "sequential"),
+ selected = "sequential"),
+ conditionalPanel("input.strategy == 'maxprob'",
+ selectInput("maxprobWhich", NULL, NULL))
+ ),
+ column(
+ 2, br(),
+ p(strong("On")),
+ selectInput("sampleChoice", NULL,
+ choices = c("Untagged" = "untagged",
+ "Tagged" = "tagged",
+ "All" = "all")),
+ conditionalPanel("input.sampleChoice == 'tagged'",
+ selectInput("taggedWhich", NULL, NULL))
+ ),
+
+ column(4, p(br()),
+ verbatimTextOutput("trainDiagno"),
+ fluidRow(
+ column(8, actionButton("modelTrain", "🤖 Train", width = "100%")),
+ column(4, checkboxInput("showTrainOptions", "🔧"))
+ ))
+ ),
+
+
+ ## General training options
+ conditionalPanel(
+ "input.showTrainOptions",
+ hr(),
+ fluidRow(
+ column(
+ 4,
+ conditionalPanel(
+ "input.showTrainOptions & input.use_regressors.includes('regex')",
+ uiOutput("panelExtraRegex")
+ )
+ ),
+ column(
+ 4,
+ p(strong("Predictors")),
+ selectizeInput("use_regressors", NULL, "",
+ multiple = TRUE, width = "100%"),
+ conditionalPanel(
+ "input.showTrainOptions & input.use_regressors.includes('extra')",
+ selectizeInput("use_ootregnum", "Extra predictors: continuous", "",
+ multiple = TRUE, width = "100%"),
+ selectizeInput("use_ootregcat", "Extra predictors: categorical", "",
+ multiple = TRUE, width = "100%")
+ ),
+
+ conditionalPanel(
+ "input.showTrainOptions & input.use_regressors.includes('dfm')",
+ hr(),
+ fluidRow(
+ column(
+ 6, numericInput("dfmMinTermfreq", "DFM : Min Termfreq",
+ min= 1, max= 1e3, value= 5, step= 1)),
+ column(
+ 6, numericInput("dfmMinDocfreq", "DFM : Min Docfreq",
+ min= 1, max= 1e3, value= 5, step= 1))),
+ fluidRow(
+ column(
+ 4, checkboxInput("dfmTfIdf", "Tf-Idf", TRUE)),
+ column(
+ 4, selectInput("dfmTfScheme", label = NULL,
+ choices= c("logcount", "count", "prop", "propmax",
+ "boolean", "augmented", "logave"),
+ selected= "logcount")),
+ column(
+ 4, selectInput("dfmDfScheme", label = NULL,
+ choices= c("inverse", "count",
+ "inversemax", "inverseprob", "unary"),
+ selected= "inverse"))
+ ),
+ numericInput("dfmNgrams", "N-grams", value= 1,
+ min= 1, max= 10, step = 1)
+ )
+ ),
+ column(
+ 4,
+ fluidRow(
+ column(6, strong("Auto train every")),
+ column(6, numericInput("trainCountdown", NULL, 0, 0, 1e6, 1))),
+ fluidRow(
+ column(6, strong("Model")),
+ column(6, selectInput("predModel", NULL, selected = "linear",
+ choices = c("Naive Bayes" = "naive bayes",
+ "KNN" = "knn",
+ "Liblinear" = "linear",
+ "LASSO" = "lasso",
+ "Random Forest" = "random forest")))
+ ),
+
+ ## Model-specific training options
+ ### Random forest options
+ conditionalPanel(
+ "input.showTrainOptions & input.predModel == 'random forest'",
+ fluidRow(
+ numericInput("rfNumTrees", label = "Num. trees",
+ min = 1, max = 2e3, value = 500, step = 1),
+ numericInput("rfMtry", label = "mtry",
+ min = 0, max = 1e5, value = 0, step = 1),
+ numericInput("rfSampleFrac", label = "Sample fraction",
+ min = 0, max = 1, value = 1, step = .01)
+ )
+ ),
+ ### Naive Bayes options
+ conditionalPanel(
+ "input.showTrainOptions & input.predModel == 'naive bayes'",
+ flowLayout(
+ numericInput("naiveSmooth", label = "Smooth",
+ min = 0, max = 2e3,
+ value = 1, step = 1e-3),
+ selectInput("naivePrior", "Prior",
+ c("uniform", "docfreq", "termfreq")),
+ selectInput("naiveDistri", "Distribution",
+ c("multinomial", "Bernoulli"))
+ )
+ ),
+ ### Lasso options
+ conditionalPanel(
+ "input.showTrainOptions & input.predModel == 'lasso'",
+ strong("Lasso penalty"),
+ fluidRow(
+ column(
+ 6, numericInput("glmLambda", label = NULL, min = 0, max = 2e3,
+ value = 0, step = 1e-6)),
+ column(
+ 6, actionButton("glmCV", label= "Find best (CV)")))
+ ),
+ ### Linear options
+ conditionalPanel(
+ "input.showTrainOptions & input.predModel == 'linear'",
+ strong("Liblinear Cost"),
+ fluidRow(
+ column(
+ 6, numericInput("liblinCost", label= NULL, min= 0, max= 2e10,
+ value= 32, step= 1)),
+ column(
+ 6, actionButton("liblinCV", label= "Find best (CV)")))
+ ),
+ ### KNN options
+ conditionalPanel(
+ "input.showTrainOptions & input.predModel == 'knn'",
+ flowLayout(
+ strong("N. Neighbours"),
+ numericInput("knnK", label = NULL, min = 1, max = 1e2,
+ value = 3, step = 1)
+ )
+ )
+ )
+ ),
+ hr()
+ ),
+
+
+ ############################################################################
+ ## Main panel set
+ ############################################################################
+ tabsetPanel(
+ id = "mainPanelset",
+ selected = "Tagging",
+
+ ########################################################################
+ ## Project panel
+ ########################################################################
+ tabPanel(
+ "Project",
+ br(),
+ tabsetPanel(
+ id = "tabsetProject",
+ selected = "Sample",
+
+ tabPanel(
+ "Settings",
+ br(),
+ actionButton("saveSystem", "Save changes"),
+
+ h4("Files"),
+ fluidRow(
+ column(2, p("Data directory")),
+ column(4, uiOutput("sys_datadir")),
+ column(6, p("Place (on the server) where the data and project are stored"))
+ ),
+ fluidRow(
+ column(2, p("Data filename")),
+ column(4, uiOutput("sys_datafile")),
+ column(6, p("Main file, containing id and text columns"))
+ ),
+
+ h4("Variables"),
+ fluidRow(
+ column(2, p("ID")),
+ column(4, uiOutput("sys_var_id")),
+ column(6, p("Name of the id variable, unique identifier of each text"))
+ ),
+ fluidRow(
+ column(2, p("Text")),
+ column(4, uiOutput("sys_var_text")),
+ column(6, p("Name of the text variables: if more than one, texts are concatenated in the specified order"))
+ ),
+ fluidRow(
+ column(2, p("Tags")),
+ column(4, uiOutput("sys_var_tag")),
+ column(6, p("Names of scheme variables"))
+ ),
+ fluidRow(
+ column(2, p("Comments")),
+ column(4, uiOutput("sys_var_comm_ui")),
+ column(6, p("Name of the comments variable"))
+ ),
+ fluidRow(
+ column(2, p("Context")),
+ column(4, uiOutput("sys_var_context_ui")),
+ column(6, p("Names of variables not used in the models, but may be displayed during tagging"))
+ ),
+
+ h4("System"),
+ fluidRow(
+ column(2, checkboxInput("sys_use_python", "Python backend", FALSE)),
+ column(4, conditionalPanel(
+ "input.sys_use_python",
+ textInput("sys_which_python", NULL, value = "python3",
+ placeholder = "(custom python path)"))),
+ column(6, conditionalPanel(
+ "input.sys_use_python",
+ p("This must be a working python3 environment, with the required modules installed (see documentation)")))
+ ),
+
+ conditionalPanel("input.sys_use_python", list(
+ fluidRow(
+ column(2, checkboxInput("sys_use_spacy", "SpaCy tokenization", FALSE)),
+ column(4, conditionalPanel("input.sys_use_spacy", textInput(
+ "sys_use_spacy_model", NULL, NULL, placeholder = "(spacy model name)"))),
+ column(6, p("Name of the spacy tokenizer model, used in DTM and word embeddings"))
+ ),
+ conditionalPanel("input.sys_use_spacy", fluidRow(
+ column(2),
+ column(9, uiOutput("sys_spacyDlUI")))
+ ),
+ fluidRow(
+ column(2, checkboxInput("sys_use_ft", "FastText word embeddings", FALSE)),
+ column(4, conditionalPanel("input.sys_use_ft", textInput(
+ "sys_use_ft_model", NULL, NULL, placeholder = "(fasttext model path)"))),
+ column(6, p("Path to the local fasttext model binary"))
+ ),
+ conditionalPanel("input.sys_use_ft", fluidRow(
+ column(2),
+ column(9, uiOutput("sys_ftDlUI")))
+ ),
+ fluidRow(
+ column(2, checkboxInput("sys_use_sb", "SBERT sentence embeddings", FALSE)),
+ column(4, conditionalPanel("input.sys_use_sb", textInput(
+ "sys_use_sb_model", NULL, NULL,
+ placeholder = "(custom sentence_transformers model)"))),
+ column(6, p("(GPU recommended) Name or path of the sentence-transformers model"))
+ ),
+
+ conditionalPanel("input.sys_use_python", list(
+ checkboxInput("sys_use_gpu", "GPU support (CUDA, for SBERT and BERT)", FALSE),
+
+ br(),
+ wellPanel(
+ h4("Model picker"),
+ fluidRow(
+ column(2, p("Language")),
+ column(4, uiOutput("sys_ex_lang_ui")),
+ column(6, p("Used to preset tokenization and embedding models"))
+ ),
+ fluidRow(
+ column(2),
+ column(4, strong("Recommended model")),
+ column(6, strong("Download instructions"))
+ ),
+ fluidRow(
+ column(2, p("SpaCy tokenization")),
+ column(4, uiOutput("sys_ex_spacy")),
+ column(6, uiOutput("sys_ex_spacy_dl"))
+ ),
+ fluidRow(
+ column(2, p("FastText word embeddings")),
+ column(4, uiOutput("sys_ex_ft")),
+ column(6, uiOutput("sys_ex_ft_dl"))
+ ),
+ fluidRow(
+ column(2, p("SBERT sentence embeddings")),
+ column(4, uiOutput("sys_ex_sb")),
+ column(6, p("(Auto download by python module)"))
+ )
+ )
+ ))
+ ))
+ ),
+
+ tabPanel(
+ "Sample",
+ br(),
+ fluidRow(
+ column(
+ 4,
+ wellPanel(
+ fluidRow(
+ column(8, h4("Sample")),
+ column(4, actionButton("dataImport", "Import", width = "100%"))),
+ fluidRow(
+ column(6, numericInput("dataNrows", "N. rows", 500, 10, 1e4, 1)),
+ column(6, numericInput("dataSkipRows", "Skip rows", 0, 0, step = 1))
+ )
+ )
+ ),
+ column(8, uiOutput("dataMessage"), uiOutput("panelData"))
+ )
+ ),
+
+ tabPanel(
+ "Scheme",
+ br(),
+ fluidRow(
+ # column(4, uiOutput("panelScheme")),
+ column(
+ 4,
+ wellPanel(
+ h4("Current scheme"),
+ fluidRow(
+ column(2, actionButton("schemeDelete", "🗑", width = "100%")),
+ column(6, uiOutput("printScheme")),
+ column(4, actionButton("schemeDescrSave", "Save", width = "100%"))
+ ),
+ br(),
+ textAreaInput("schemeDescr", NULL, width = "100%", rows = 10,
+ placeholder = "Write scheme description here"),
+ hr()
+ )
+ ),
+ column(8, uiOutput("panelRetag"))
+ )
+ )
+ )
+ ),
+
+
+ ########################################################################
+ ## Text / visualization panel
+ ########################################################################
+ tabPanel(
+ "Tagging",
+ fluidRow(
+ column(
+ 3,
+ br(),
+ fluidRow(
+ column(8, textInput("regexFilter", label = NULL,
+ placeholder = "(Regex filter)")),
+ column(4, checkboxInput("regexCaseSens", "Case"))),
+
+ wellPanel(
+ ## Tagging buttons
+ fluidRow(
+ column(8, textInput("newLab", label = NULL,
+ placeholder = "(New label)")),
+ column(4, actionButton("currentAction", "Create"))
+ ),
+
+ # fluidRow(uiOutput("oracleButtons")),
+ uiOutput("oracleButtons"),
+
+ br(),
+ textInput("currentComment", NULL, "", width = "100%",
+ placeholder = "(Comment)"),
+ br(),
+ uiOutput("makeOracleConfirm")
+ ),
+
+ # fluidRow(
+ # column(6, checkboxInput("showContext", "Context")),
+ # column(6, actionButton("oops", strong("Oops")))
+ # ),
+ checkboxInput("showContext", "Context"),
+ conditionalPanel("input.showContext", htmlOutput("currentContext"))
+ ),
+ column(
+ 9,
+ fluidRow(
+ column(2, checkboxInput("panelText", "Text", TRUE)),
+ column(2, checkboxInput("panelVisu", "Visualization", FALSE),
+ offset = 8)
+ ),
+ uiOutput("textVisuCols") # Handled in server.R for adaptive columns
+ )
+ )
+ ),
+
+ ########################################################################
+ ## History panel
+ ########################################################################
+ tabPanel(
+ "History",
+ br(),
+ actionButton("histSave", "Save changes"),
+ br(),
+ br(),
+ DT::dataTableOutput("histDTable")
+ ),
+
+
+ ########################################################################
+ ## Stats panel
+ ########################################################################
+ tabPanel(
+ "Stats",
+ br(),
+ fluidRow(
+ column(
+ 3,
+ h3("Counts"),
+ tableOutput("statsTagTable")
+ ),
+ column(
+ 9,
+ h3("10-CV diagnostics"),
+ actionButton("statsCVgo", "Compute 10-CV"),
+ br(),
+ verbatimTextOutput("statsCVoutput"),
+ DT::dataTableOutput("statsCVtable")
+ )
+ ),
+ hr(),
+ h3("Gold Standard")
+ ),
+
+ ########################################################################
+ ## BERT panel
+ ########################################################################
+ tabPanel(
+ "BERT",
+
+ fluidRow(
+ column(
+ 3,
+ br(),
+ h3("Train new BERT"),
+ fluidRow(
+ column(6, actionButton("bertTrain", "Train BERT", width = "100%")),
+ column(6, checkboxInput("bertOptions", "Options"))),
+ fluidRow(
+ column(6, textInput(
+ "bertSaveName", NULL, placeholder = "(save name)")),
+ column(6, actionButton("bertSave", "Save", width = "100%"))),
+ actionLink("bertLast", "Last trained model"),
+ h3("Saved models"),
+ uiOutput("bertSaved")
+ ),
+ column(
+ 9,
+ br(),
+ conditionalPanel(
+ "input.bertOptions",
+ fluidRow(
+ column(6, selectInput(
+ "bertModel", "Model",
+ c("(Fr) CamemBERT-base" = "camembert/camembert-base",
+ "(Fr) CamemBERT-large" = "camembert/camembert-large",
+ "(Fr) FlauBERT-small" = "flaubert/flaubert_small_cased",
+ "(Fr) FlauBERT-base" = "flaubert/flaubert_base_cased",
+ "(Fr) FlauBERT-large" = "flaubert/flaubert_large_cased",
+ "(En) DistilBERT-base" = "distilbert-base-cased",
+ "(En) RoBERTa-base" = "roberta-base",
+ "(En) DeBERTa-base" = "microsoft/deberta-base",
+ "(Multi) DistilmBERT-base" = "distilbert-base-multilingual-cased",
+ "(Multi) MiniLM" = "microsoft/Multilingual-MiniLM-L12-H384",
+ "(Multi) XLM-RoBERTa-base" = "xlm-roberta-base"))),
+ column(6)
+ ),
+ fluidRow(
+ column(3, numericInput("bertEpochs", "Epochs", 3, 1, 20, 1)),
+ column(3, numericInput("bertLrate", "Learning rate", 2e-5, 1e-6, 1, 1e-6)),
+ column(3, numericInput("bertWdecay", "Weight decay", 0.01, 0, 10, 1e-6)),
+ column(3)
+ ),
+ fluidRow(
+ column(3, numericInput("bertBatchsize", "Batch size", 4, 1, 32, 1)),
+ column(3, numericInput("bertGradacc", "Gradient accum.", 4, 1, 32, 1)),
+ column(3, br(), checkboxInput("bertAdapt", "Adapt token length to batch", TRUE)),
+ column(3)
+ ),
+ fluidRow(
+ column(3, numericInput("bertValidFrac", "Validation fraction", .2, 0, .9)),
+ column(3, numericInput("bertValidSeed", "Validation seed", 1234, 1, 9e8)),
+ column(3, numericInput("bertNeval", "N. validation evals", 10, 1, 100, 1)),
+ column(3, br(), checkboxInput("bertBest", "Keep best", TRUE))
+ ),
+ fluidRow(
+ column(3, numericInput("bertMinOccur", "Min. class occurences", 1, 1, 1e4, 1)),
+ column(3, br(), checkboxInput("bertBalance", "Balance classes", FALSE)),
+ column(3),
+ column(3)
+ )
+ ),
+
+
+ fluidRow(
+ column(
+ 6,
+ # flowLayout(
+ # actionButton(
+ # "bertGoPred", "Infer on current data", width = "100%"),
+ # actionButton(
+ # "bertDelete", "Delete saved model", width = "100%")),
+ verbatimTextOutput("bertMsg")),
+ column(6, plotOutput("bertValPlot", height = 200))),
+
+ verbatimTextOutput("bertMsgHyperpar"),
+
+ DT::dataTableOutput("bertValstats")
+ )
+ )
+ ),
+
+ ########################################################################
+ ## Export panel
+ ########################################################################
+ tabPanel(
+ "Export",
+ h4("Export tagged data"),
+ p("Download the tags and predicted probabilities from the complete model, on the current data sample."),
+ # downloadButton("downloadCsv", "Save csv"),
+ flowLayout(
+ selectInput(
+ "dlTagSelect", NULL, c("tags", "comments", "predictions"),
+ c("tags", "comments", "predictions"), multiple = TRUE),
+ selectInput("dlTagFormat", NULL, c("csv", "feather"), "csv"),
+ downloadButton("dlTagSave", NULL, title = "Save tags")
+ ),
+
+ hr(),
+ h4("Export embeddings"),
+ p("Download the embeddings (incl. from visualization if present), on the current data sample."),
+ flowLayout(
+ selectInput(
+ "dlEmbedSelect", NULL, c("FastText" = "ft", "SBERT" = "sb"),
+ selected = "sb", multiple = TRUE),
+ selectInput("dlEmbedFormat", NULL, c("csv", "feather"), "feather"),
+ downloadButton("dlEmbedSave", NULL, title = "Save embeddings")
+ ),
+
+ hr(),
+ h4("Export BERT predictions"),
+ p("Download the predicted probabilities from the chosen BERT model, on the complete dataset."),
+ flowLayout(
+ selectInput("dlBPSelect", NULL, NULL, NULL),
+ selectInput("dlBPFormat", NULL, c("csv", "feather"), "feather"),
+ actionButton("dlBPInfer", "Predict"),
+ verbatimTextOutput("dlBPMsg"),
+ uiOutput("dlBPDlButton")
+ ),
+
+ hr(),
+ h4("Export BERT models")
+ )
+
+ ),
+ br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(),
+ br(), br(), br(), br(), br(), br()
+))
diff --git a/docker-images-datalab/myactivetigger/activetigger/www/active_tigger.jpeg b/docker-images-datalab/myactivetigger/activetigger/www/active_tigger.jpeg
new file mode 100644
index 0000000..c2e913e
Binary files /dev/null and b/docker-images-datalab/myactivetigger/activetigger/www/active_tigger.jpeg differ
diff --git a/docker-images-datalab/myactivetigger/activetigger/www/active_tigger.png b/docker-images-datalab/myactivetigger/activetigger/www/active_tigger.png
new file mode 100644
index 0000000..587d4ee
Binary files /dev/null and b/docker-images-datalab/myactivetigger/activetigger/www/active_tigger.png differ
diff --git a/docker-images-datalab/myactivetigger/activetigger/www/favicon.ico b/docker-images-datalab/myactivetigger/activetigger/www/favicon.ico
new file mode 100644
index 0000000..84e8c0f
Binary files /dev/null and b/docker-images-datalab/myactivetigger/activetigger/www/favicon.ico differ
diff --git a/docker-images-datalab/myactivetigger/requirements.py b/docker-images-datalab/myactivetigger/requirements.py
new file mode 100644
index 0000000..3e44557
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/requirements.py
@@ -0,0 +1,13 @@
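+# Python dependencies (pip requirement specifiers). Install with:
+#   pip install -r requirements.py
+# Note: argparse also ships with the standard library; the PyPI entry is a legacy backport.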
+argparse
+datasets
+fasttext
+numpy
+pandas
+pyarrow
+scikit-learn
+torch
+transformers[torch]
+sentence_transformers
+typing-inspect==0.8.0
+typing_extensions==4.6.1
+spacy
diff --git a/docker-images-datalab/myactivetigger/requirements.r b/docker-images-datalab/myactivetigger/requirements.r
new file mode 100644
index 0000000..fee3fab
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/requirements.r
@@ -0,0 +1,3 @@
+packages=c("arrow", "class", "data.table", "DT", "foreign", "glmnet", "haven", "LiblineaR", "Matrix", "Metrics", "quanteda", "quanteda.textmodels", "ranger", "readODS", "readxl", "RJSONIO", "rlang", "Rtsne", "shiny", "SparseM", "stringi", "uwot", "future","htmlTable","ggplot2")
+
+install.packages(setdiff(packages, rownames(installed.packages())))
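+# Usage: Rscript requirements.r  (only installs the packages not already present)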
diff --git a/docker-images-datalab/myactivetigger/requirementspython.txt b/docker-images-datalab/myactivetigger/requirementspython.txt
new file mode 100644
index 0000000..6bbb052
--- /dev/null
+++ b/docker-images-datalab/myactivetigger/requirementspython.txt
@@ -0,0 +1,6 @@
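+# Shell commands, not a pip requirements file: run each line in the target environment.
+# The cu118 index serves PyTorch wheels built against CUDA 11.8.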
+pip install argparse datasets fasttext numpy pandas pyarrow scikit-learn
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+pip install transformers[torch]
+pip install sentence_transformers
+pip install -U typing-inspect==0.8.0 typing_extensions==4.6.1
+pip install spacy
\ No newline at end of file