diff --git a/docker-images-datalab/activetigger/.drone.yml b/docker-images-datalab/activetigger/.drone.yml
new file mode 100644
index 0000000..81a0799
--- /dev/null
+++ b/docker-images-datalab/activetigger/.drone.yml
@@ -0,0 +1,48 @@
+kind: pipeline
+name: Build & publish main
+
+steps:
+  - name: publish-image
+    pull: always
+    image: plugins/kaniko:1.7.1-kaniko1.9.1
+    settings:
+      auto_tag: true
+      auto_tag_suffix: latest
+      registry: code.groupe-genes.fr
+      repo: code.groupe-genes.fr/datalab/docker-images-datalab/activetigger
+      username:
+        from_secret: docker_username
+      password:
+        from_secret: docker_password
+    when:
+      event:
+        exclude:
+          - pull_request
+  - name: deploy
+    image: alpine
+    environment:
+      kubernetes_server:
+        from_secret: kubernetes_server
+      kubernetes_cert:
+        from_secret: kubernetes_cert
+      kubernetes_token:
+        from_secret: kubernetes_token
+    commands:
+      - apk add --no-cache curl
+      - curl -L -o /usr/bin/kubectl "https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl"
+      - curl -L -o helm.tar.gz "https://get.helm.sh/helm-v3.14.0-linux-amd64.tar.gz"
+      - tar xf "helm.tar.gz" && mv ./linux-amd64/helm /usr/bin/helm
+      - chmod +x /usr/bin/kubectl
+      - chmod +x /usr/bin/helm
+      - kubectl config set-cluster default --server=$kubernetes_server --insecure-skip-tls-verify=true
+      - kubectl config set-credentials user --token=$kubernetes_token
+      - kubectl config set-context default --user=user --cluster=default --namespace=activetigger
+      - kubectl config use-context default
+      - kubectl get pods
+      - helm ls -n activetigger --debug
+      - helm dependency build ./helm-chart
+      - helm upgrade activetigger ./helm-chart -f ./helm-chart/values.yaml -n activetigger
+    when:
+      event:
+        exclude:
+          - pull_request
\ No newline at end of file
diff --git a/docker-images-datalab/activetigger/Dockerfile b/docker-images-datalab/activetigger/Dockerfile
new file mode 100644
index 0000000..651fe7d
--- /dev/null
+++ b/docker-images-datalab/activetigger/Dockerfile
@@ -0,0 +1,111 @@
+FROM ubuntu:22.04
+ARG DEBIAN_FRONTEND=noninteractive
+
+COPY requirements.r /requirements.r
+COPY requirementspython.txt /requirementspython.txt
+
+
+# Install Python
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3.10-distutils \
+    python3.10-venv \
+    python3-pip \
+    r-base \
+    wget \
+    && apt-get clean
+
+# Install R + shiny server
+RUN apt-get update && \
+    apt-get install -y r-base
+RUN R -e "install.packages('shiny', repos='https://cran.rstudio.com/')"
+
+# Install gdebi-core and shiny-server
+RUN apt-get update
+RUN apt-get install -y gdebi-core
+RUN wget https://download3.rstudio.org/ubuntu-18.04/x86_64/shiny-server-1.5.21.1012-amd64.deb
+RUN gdebi --non-interactive shiny-server-1.5.21.1012-amd64.deb
+
+
+## R packages (to be installed with the R executable used by shiny server)
+RUN Rscript /requirements.r
+
+## Python environment | check against requirementspython.txt that the cu118 builds are installed this way
+
+
+
+# Install Miniconda
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
+    rm Miniconda3-latest-Linux-x86_64.sh
+
+# Add Conda binaries to PATH
+ENV PATH="/opt/conda/bin:${PATH}"
+
+# Create a Conda environment and activate it
+RUN conda create -n tigger python==3.10 && \
+    echo "conda activate tigger" >> ~/.bashrc
+
+# Clean the apt package lists to reduce image size
+RUN rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --no-cache-dir \
+    torch torchvision torchaudio \
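+    # The find-links index on the next line selects CUDA 11.8 (cu118) builds of
+    # torch/torchvision/torchaudio, as the note above about requirementspython.txt
+    # indicates; adjust it if the host targets a different CUDA runtime.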
+    -f https://download.pytorch.org/whl/cu118/torch_stable.html
+
+RUN apt-get update && apt-get install -y curl build-essential
+RUN pip3 install --no-cache-dir six
+
+# Install Rust using rustup
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+
+# Add Cargo's bin directory to the PATH environment variable
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+RUN pip3 install --no-cache-dir --upgrade setuptools
+
+
+# Other Python library installs
+RUN pip3 install argparse
+RUN pip3 install datasets
+RUN pip3 install fasttext
+RUN pip3 install numpy
+RUN pip3 install pandas
+RUN pip3 install pyarrow
+RUN pip3 install scikit-learn
+RUN pip3 install sentence-transformers
+RUN pip3 install transformers
+RUN pip3 install typing-inspect==0.8.0
+RUN pip3 install typing-extensions==4.6.1
+RUN pip3 install spacy
+
+# Add any further configuration here if needed
+
+# Default command executed when the container starts
+CMD ["/bin/bash"]
+
+## Download the spacy and fasttext models
+
+### French
+# Use an absolute path: Docker does not expand "~" in WORKDIR or COPY
+WORKDIR /root
+RUN python -m spacy download fr_core_news_sm
+
+RUN wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz \
+    && gunzip cc.fr.300.bin.gz
+
+
+# For each instance creation
+
+## Git clone to create the new instance (replace "tigger-name" with the name the instance will take, i.e. https://analytics.huma-num.fr/Prenom.Nom/tigger-name/)
+RUN mkdir -p ~/zPublish/shiny/tigger-name
+COPY activetigger/ /root/zPublish/shiny/tigger-name
+
+## In the application
+
+## At the very top left, use the "+" button to "create project". Then in the fields:
+
+## - data directory: I always use ~/tagging/domain (e.g. ~/tagging/radio or ~/tagging/journaux), but it is up to you where the data and tags should be stored on your server
+## - I advise ticking all the boxes: python, spacy, fasttext, sbert, gpu
+## - python: "~/conda/envs/tigger/bin/python"
+## - fasttext: "~/cc.fr.300.bin" (i.e. give the path of the model on your server, not just its name)
+## - spacy and SBERT: keep the default values for the chosen language
diff --git a/docker-images-datalab/activetigger/activetigger/embed_fasttext.py b/docker-images-datalab/activetigger/activetigger/embed_fasttext.py
new file mode 100644
index 0000000..5c72316
--- /dev/null
+++ b/docker-images-datalab/activetigger/activetigger/embed_fasttext.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+## FastText embed sentences
+## Requires data file with columns id and text
+
+import argparse
+import fasttext
+from os.path import expanduser
+import pandas as pd
+import pyarrow as pa
+import pyarrow.feather as feather
+import re
+
+
+def main(args):
+    print("FastText: Importing data")
+    datapath = expanduser(args.data)
+    dat = feather.read_feather(datapath)
+    outfile = re.sub("[.]feather$", "_ft.feather", datapath)
+
+    print("FastText: Loading model")
+    ft = fasttext.load_model(expanduser(args.model))
+    print("FastText: Embedding sentences")
+    emb = [ft.get_sentence_vector(re.sub("\n", " ", x)) for x in dat["text"]]
+
+    print("FastText: Exporting")
+    emb = pd.DataFrame(emb)
+    emb.columns = ["ft%03d" % (x + 1) for x in range(len(emb.columns))]
+    emb = pd.concat([dat["id"], emb], axis=1)
+    feather.write_feather(emb, outfile)
+    print("FastText: Done")
+
+
+if __name__ == "__main__":
+    argParser = argparse.ArgumentParser()
+    argParser.add_argument("-m", "--model", help="Model path", default="/data/user/b/jboelaert/cc.fr.100.bin")
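+    # The default model path above is machine-specific; in practice pass -m
+    # explicitly, e.g. (hypothetical paths):
+    #   python embed_fasttext.py -m ~/cc.fr.300.bin -d ~/tagging/radio/corpus.feather
+    # which writes the embeddings next to the input as corpus_ft.feather.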
argParser.add_argument("-d", "--data", help="Path to data (feather)") + args = argParser.parse_args() + main(args) + diff --git a/docker-images-datalab/activetigger/activetigger/embed_sbert.py b/docker-images-datalab/activetigger/activetigger/embed_sbert.py new file mode 100644 index 0000000..d944fd9 --- /dev/null +++ b/docker-images-datalab/activetigger/activetigger/embed_sbert.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# coding: utf-8 + +## SBERT embed sentences +## Requires data file with columns id and text + +import argparse +from os.path import expanduser +import pandas as pd +import pyarrow as pa +import pyarrow.feather as feather +import re +from sentence_transformers import SentenceTransformer + + +def main(args): + print("SBERT: Importing data") + datapath = expanduser(args.data) + dat = feather.read_feather(datapath) + outfile = re.sub("[.]feather$", "_sb.feather", datapath) + + print("SBERT: Loading model") + sbert = SentenceTransformer(expanduser(args.model)) + sbert.max_seq_length = 512 + print("SBERT: Embedding sentences") + emb = sbert.encode(dat["text"]) + + print("SBERT: Exporting") + emb = pd.DataFrame(emb) + emb.columns = ["sb%03d" % (x + 1) for x in range(len(emb.columns))] + emb = pd.concat([dat["id"], emb], axis=1) + feather.write_feather(emb, outfile) + print("SBERT: Done") + + +if __name__ == "__main__": + argParser = argparse.ArgumentParser() + argParser.add_argument("-m", "--model", help="Model name or path", default="distiluse-base-multilingual-cased-v1") + argParser.add_argument("-d", "--data", help="Path to data (feather)") + args = argParser.parse_args() + main(args) + diff --git a/docker-images-datalab/activetigger/activetigger/gobert.py b/docker-images-datalab/activetigger/activetigger/gobert.py new file mode 100644 index 0000000..32a9f04 --- /dev/null +++ b/docker-images-datalab/activetigger/activetigger/gobert.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +# coding: utf-8 + +## BERT trainer to be called by server.R +## Requires two data files with columns id, label and text + +import argparse +import datasets +from datasets import load_metric +import numpy as np +from os.path import expanduser +import os +import pandas as pd +import re +from sklearn import metrics +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from transformers import Trainer, TrainingArguments, TrainerCallback + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +def main(args): + print("Importing data") + dattrain = pd.read_csv(expanduser(args.traindat)) + datval = pd.read_csv(expanduser(args.valdat)) + datval_id = datval["id"] + classcolname = "label" + + ## Make class_names + class_names = [x for x in dattrain[classcolname].unique()] + + ## Labels to class number + dattrain[classcolname] = [class_names.index(x) for x in dattrain[classcolname].to_list()] + datval[classcolname] = [class_names.index(x) for x in datval[classcolname].to_list()] + + ## Transform to datasets + dattrain = datasets.Dataset.from_pandas(dattrain[['text', 'label']]) + datval = datasets.Dataset.from_pandas(datval[['text', 'label']]) + + # Model choice + modelname = expanduser(args.model) + + ## Tokenizer + print("Tokenizing") + + tokenizer = AutoTokenizer.from_pretrained(modelname) + + # toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length"), batched=True) + # toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length"), batched=True) + if args.adapt: + toktrain = dattrain.map(lambda e: tokenizer(e['text'], 
truncation=True, padding=True, max_length=512), batched=True) + toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding=True, max_length=512), batched=True) + else: + toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length", max_length=512), batched=True) + toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length", max_length=512), batched=True) + + del(dattrain) + + ## Model + print("Loading model") + model = AutoModelForSequenceClassification.from_pretrained(modelname, num_labels = len(class_names)) + if (args.gpu): + model.cuda() + + ## Train using Trainer interface + print("Training...") + BATCH_SIZE = args.batchsize + GRAD_ACC = args.gradacc + epochs = args.epochs + + total_steps = (epochs * len(toktrain)) // (BATCH_SIZE * GRAD_ACC) + warmup_steps = (total_steps) // 10 + eval_steps = total_steps // args.eval + + training_args = TrainingArguments( + output_dir=args.session + "_train", + learning_rate=args.lrate, + weight_decay=args.wdecay, + num_train_epochs=epochs, + gradient_accumulation_steps=GRAD_ACC, + per_device_train_batch_size=BATCH_SIZE, + # per_device_eval_batch_size=BATCH_SIZE, + per_device_eval_batch_size=32, + warmup_steps=warmup_steps, + + eval_steps=eval_steps, + evaluation_strategy="steps", + save_strategy="steps", + save_steps=eval_steps, + logging_steps=eval_steps, + do_eval=True, + greater_is_better=False, + load_best_model_at_end=bool(args.best), + metric_for_best_model="eval_loss" + ) + + trainer = Trainer(model=model, args=training_args, + train_dataset=toktrain, eval_dataset=toktest) + + the_session = args.session + class HaltCallback(TrainerCallback): + "A callback that checks for _stop file to interrupt training" + + def on_step_begin(self, args, state, control, **kwargs): + if os.path.exists(the_session + "_stop"): + print("\nHalted by user.\n") + control.should_training_stop = True + return(control) + else: + print("\nNot halted by user.\n") + + trainer.add_callback(HaltCallback) + + trainer.train() + + ## Add class names to model + label_to_id = {v: i for i, v in enumerate(class_names)} + model.config.label2id = label_to_id + model.config.id2label = {id: label for label, id in model.config.label2id.items()} + + ## Save model + model.save_pretrained(args.session) + + + ## Prediction functions + + + def get_predprobs(text): + # inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt") + inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt") + if (args.gpu): + inputs = inputs.to("cuda") + outputs = model(**inputs) + res = outputs[0] + if (args.gpu): + res = res.cpu() + res = res.softmax(1).detach().numpy() + return res + + + def get_prediction(text): + return class_names[get_predprobs(text).argmax()] + + ## Metrics on validation set + print("Computing predictions") + testpred = [get_prediction(txt) for txt in datval["text"]] + testtruth = [class_names[x] for x in datval["label"]] + + exportpred = pd.DataFrame(datval_id) + exportpred.columns = ["id"] + exportpred["bertpred"] = testpred + exportpred.to_csv(args.session + "_predval.csv", index=False) + + +if __name__ == "__main__": + argParser = argparse.ArgumentParser() + argParser.add_argument("-m", "--model", help="Model name or path", default="microsoft/Multilingual-MiniLM-L12-H384") + argParser.add_argument("-t", "--traindat", help="Path to training data") + argParser.add_argument("-v", "--valdat", help="Path to validation data") + 
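+    # The optimizer steps on an effective batch of batchsize * gradacc examples
+    # (4 * 1 = 4 with the defaults below); when GPU memory is tight, raise
+    # gradacc rather than batchsize.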
argParser.add_argument("-b", "--batchsize", help="Batch size for training", type=int, default=4) + argParser.add_argument("-g", "--gradacc", help="Gradient accumulation for training", type=int, default=1) + argParser.add_argument("-e", "--epochs", help="Number of training epochs", type=float, default=3) + argParser.add_argument("-l", "--lrate", help="Learning rate", type=float, default=5e-05) + argParser.add_argument("-w", "--wdecay", help="Weight decay", type=float, default=.01) + argParser.add_argument("-B", "--best", help="Load best model instead of last", type=int, choices=[0,1], default=1) + argParser.add_argument("-E", "--eval", help="Number of intermediary evaluations", type=int, default=10) + argParser.add_argument("-s", "--session", help="Session name (used to save results)") + argParser.add_argument("-G", "--gpu", help="Use GPU (CUDA)", type=int, choices=[0,1], default=0) + argParser.add_argument("-A", "--adapt", help="Adapt token length to batch", type=int, choices=[0,1], default=1) + + + args = argParser.parse_args() + + main(args) + diff --git a/docker-images-datalab/activetigger/activetigger/gobert_infer.py b/docker-images-datalab/activetigger/activetigger/gobert_infer.py new file mode 100644 index 0000000..c2c607a --- /dev/null +++ b/docker-images-datalab/activetigger/activetigger/gobert_infer.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# coding: utf-8 + +## BERT inference to be called by server.R + +import argparse +import datasets +import json +import numpy as np +from os import path, remove +import pandas as pd +import pyarrow.feather as feather +import re +from torch import no_grad +from transformers import AutoModelForSequenceClassification, AutoTokenizer + + +def chunker(seq, batch_size): + return (seq[pos:pos + batch_size] for pos in range(0, len(seq), batch_size)) + + +def main(args): + print("Importing data") + with open(path.expanduser(args.logfile), "w") as progfile: + progfile.write("Importing data") + + dat = feather.read_feather(path.expanduser(args.dat)) + + with open(path.expanduser(args.logfile), "w") as progfile: + progfile.write("Tokenizing") + + ## Tokenizer + print("Tokenizing") + with open(path.join(path.expanduser(args.model), "config.json"), "r") as jsonfile: + modeltype = json.load(jsonfile)["_name_or_path"] + + tokenizer = AutoTokenizer.from_pretrained(modeltype) + + ## Model + print("Loading model") + model = AutoModelForSequenceClassification.from_pretrained(path.expanduser(args.model)) + if (args.gpu): + model.cuda() + + ## Prediction functions + + + def get_predprobs(text): + inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt") + if (args.gpu): + inputs = inputs.to("cuda") + with no_grad(): + outputs = model(**inputs) + res = outputs[0] + if (args.gpu): + res = res.cpu() + res = res.softmax(1).detach().numpy() + return res + + print("Computing predictions") + + chunks = chunker([str(x) for x in dat[args.txtname]], args.batch) + pred = [] + for i, x in enumerate(chunks): + if (i % 5 == 0): + percent = round(100 * i * args.batch / len(dat), 1) + logmsg = "Computing: " + str(percent) + "% (" + str(i * args.batch) + "/" + str(len(dat)) + ")" + with open(path.expanduser(args.logfile), "w") as progfile: + progfile.write(logmsg) + pred.append(get_predprobs(x)) + + pred = np.concatenate(pred) + pred = pd.DataFrame(pred) + pred.columns = ["bertpred_" + v for i, v in model.config.id2label.items()] + pred = pd.concat([dat[args.idname], pred], axis=1) + feather.write_feather(pred, path.abspath(args.output)) + 
remove(path.expanduser(args.logfile)) + + +if __name__ == "__main__": + argParser = argparse.ArgumentParser() + argParser.add_argument("-m", "--model", help="Trained model path") + argParser.add_argument("-d", "--dat", help="Path to data (feather file)") + argParser.add_argument("-o", "--output", help="Output path of predictions", default="tiggerbert.feather") + argParser.add_argument("-i", "--idname", help="Name of id variable", default="id") + argParser.add_argument("-x", "--txtname", help="Name of text variable", default="text") + argParser.add_argument("-l", "--logfile", help="Path to log file", default="tiggerbert-progress.txt") + argParser.add_argument("-G", "--gpu", help="Use GPU (CUDA)", type=int, choices=[0,1], default=1) + argParser.add_argument("-b", "--batch", help="Batch size", type=int, default=128) + + args = argParser.parse_args() + + main(args) + diff --git a/docker-images-datalab/activetigger/activetigger/modelnames.csv b/docker-images-datalab/activetigger/activetigger/modelnames.csv new file mode 100644 index 0000000..a49e67e --- /dev/null +++ b/docker-images-datalab/activetigger/activetigger/modelnames.csv @@ -0,0 +1,159 @@ +"short","spacy_name","fasttext_name","fasttext_url","language","short_lang" +"af","xx_ent_wiki_sm","cc.af.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.bin.gz","Afrikaans","(af) Afrikaans" +"als","xx_ent_wiki_sm","cc.als.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.bin.gz","Alemannic","(als) Alemannic" +"am","xx_ent_wiki_sm","cc.am.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.bin.gz","Amharic","(am) Amharic" +"an","xx_ent_wiki_sm","cc.an.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.bin.gz","Aragonese","(an) Aragonese" +"ar","xx_ent_wiki_sm","cc.ar.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz","Arabic","(ar) Arabic" +"arz","xx_ent_wiki_sm","cc.arz.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.bin.gz","Egyptian Arabic","(arz) Egyptian Arabic" +"as","xx_ent_wiki_sm","cc.as.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.bin.gz","Assamese","(as) Assamese" +"ast","xx_ent_wiki_sm","cc.ast.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.bin.gz","Asturian","(ast) Asturian" +"az","xx_ent_wiki_sm","cc.az.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.bin.gz","Azerbaijani","(az) Azerbaijani" +"azb","xx_ent_wiki_sm","cc.azb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.bin.gz","Southern Azerbaijani","(azb) Southern Azerbaijani" +"ba","xx_ent_wiki_sm","cc.ba.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.bin.gz","Bashkir","(ba) Bashkir" +"bar","xx_ent_wiki_sm","cc.bar.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.bin.gz","Bavarian","(bar) Bavarian" +"bcl","xx_ent_wiki_sm","cc.bcl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.bin.gz","Central Bicolano","(bcl) Central Bicolano" +"be","xx_ent_wiki_sm","cc.be.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.bin.gz","Belarusian","(be) Belarusian" +"bg","xx_ent_wiki_sm","cc.bg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.bin.gz","Bulgarian","(bg) Bulgarian" +"bh","xx_ent_wiki_sm","cc.bh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.bin.gz","Bihari","(bh) Bihari" 
+"bn","xx_ent_wiki_sm","cc.bn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz","Bengali","(bn) Bengali" +"bo","xx_ent_wiki_sm","cc.bo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.bin.gz","Tibetan","(bo) Tibetan" +"bpy","xx_ent_wiki_sm","cc.bpy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.bin.gz","Bishnupriya Manipuri","(bpy) Bishnupriya Manipuri" +"br","xx_ent_wiki_sm","cc.br.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.bin.gz","Breton","(br) Breton" +"bs","xx_ent_wiki_sm","cc.bs.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.bin.gz","Bosnian","(bs) Bosnian" +"ca","ca_core_news_sm","cc.ca.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.bin.gz","Catalan","(ca) Catalan" +"ce","xx_ent_wiki_sm","cc.ce.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.bin.gz","Chechen","(ce) Chechen" +"ceb","xx_ent_wiki_sm","cc.ceb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.bin.gz","Cebuano","(ceb) Cebuano" +"ckb","xx_ent_wiki_sm","cc.ckb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.bin.gz","Kurdish (Sorani)","(ckb) Kurdish (Sorani)" +"co","xx_ent_wiki_sm","cc.co.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.bin.gz","Corsican","(co) Corsican" +"cs","xx_ent_wiki_sm","cc.cs.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.bin.gz","Czech","(cs) Czech" +"cv","xx_ent_wiki_sm","cc.cv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.bin.gz","Chuvash","(cv) Chuvash" +"cy","xx_ent_wiki_sm","cc.cy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz","Welsh","(cy) Welsh" +"da","da_core_news_sm","cc.da.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.bin.gz","Danish","(da) Danish" +"de","de_core_news_sm","cc.de.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz","German","(de) German" +"diq","xx_ent_wiki_sm","cc.diq.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.bin.gz","Zazaki","(diq) Zazaki" +"dv","xx_ent_wiki_sm","cc.dv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.bin.gz","Divehi","(dv) Divehi" +"el","el_core_news_sm","cc.el.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.bin.gz","Greek","(el) Greek" +"eml","xx_ent_wiki_sm","cc.eml.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.bin.gz","Emilian-Romagnol","(eml) Emilian-Romagnol" +"en","en_core_web_sm","cc.en.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz","English","(en) English" +"eo","xx_ent_wiki_sm","cc.eo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.bin.gz","Esperanto","(eo) Esperanto" +"es","es_core_news_sm","cc.es.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz","Spanish","(es) Spanish" +"et","xx_ent_wiki_sm","cc.et.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.bin.gz","Estonian","(et) Estonian" +"eu","xx_ent_wiki_sm","cc.eu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.bin.gz","Basque","(eu) Basque" +"fa","xx_ent_wiki_sm","cc.fa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz","Persian","(fa) Persian" 
+"fi","fi_core_news_sm","cc.fi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.bin.gz","Finnish","(fi) Finnish" +"fr","fr_core_news_sm","cc.fr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz","French","(fr) French" +"frr","xx_ent_wiki_sm","cc.frr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.bin.gz","North Frisian","(frr) North Frisian" +"fy","xx_ent_wiki_sm","cc.fy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.bin.gz","West Frisian","(fy) West Frisian" +"ga","xx_ent_wiki_sm","cc.ga.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.bin.gz","Irish","(ga) Irish" +"gd","xx_ent_wiki_sm","cc.gd.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.bin.gz","Scottish Gaelic","(gd) Scottish Gaelic" +"gl","xx_ent_wiki_sm","cc.gl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.bin.gz","Galician","(gl) Galician" +"gom","xx_ent_wiki_sm","cc.gom.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.bin.gz","Goan Konkani","(gom) Goan Konkani" +"gu","xx_ent_wiki_sm","cc.gu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.bin.gz","Gujarati","(gu) Gujarati" +"gv","xx_ent_wiki_sm","cc.gv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.bin.gz","Manx","(gv) Manx" +"he","xx_ent_wiki_sm","cc.he.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.bin.gz","Hebrew","(he) Hebrew" +"hi","xx_ent_wiki_sm","cc.hi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz","Hindi","(hi) Hindi" +"hif","xx_ent_wiki_sm","cc.hif.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.bin.gz","Fiji Hindi","(hif) Fiji Hindi" +"hr","hr_core_news_sm","cc.hr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.bin.gz","Croatian","(hr) Croatian" +"hsb","xx_ent_wiki_sm","cc.hsb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.bin.gz","Upper Sorbian","(hsb) Upper Sorbian" +"ht","xx_ent_wiki_sm","cc.ht.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.bin.gz","Haitian","(ht) Haitian" +"hu","xx_ent_wiki_sm","cc.hu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.bin.gz","Hungarian","(hu) Hungarian" +"hy","xx_ent_wiki_sm","cc.hy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.bin.gz","Armenian","(hy) Armenian" +"ia","xx_ent_wiki_sm","cc.ia.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.bin.gz","Interlingua","(ia) Interlingua" +"id","xx_ent_wiki_sm","cc.id.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz","Indonesian","(id) Indonesian" +"ilo","xx_ent_wiki_sm","cc.ilo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.bin.gz","Ilokano","(ilo) Ilokano" +"io","xx_ent_wiki_sm","cc.io.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.bin.gz","Ido","(io) Ido" +"is","xx_ent_wiki_sm","cc.is.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.bin.gz","Icelandic","(is) Icelandic" +"it","it_core_news_sm","cc.it.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz","Italian","(it) Italian" +"ja","ja_core_news_sm","cc.ja.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz","Japanese","(ja) Japanese" 
+"jv","xx_ent_wiki_sm","cc.jv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.bin.gz","Javanese","(jv) Javanese" +"ka","xx_ent_wiki_sm","cc.ka.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.bin.gz","Georgian","(ka) Georgian" +"kk","xx_ent_wiki_sm","cc.kk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.bin.gz","Kazakh","(kk) Kazakh" +"km","xx_ent_wiki_sm","cc.km.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.bin.gz","Khmer","(km) Khmer" +"kn","xx_ent_wiki_sm","cc.kn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.bin.gz","Kannada","(kn) Kannada" +"ko","ko_core_news_sm","cc.ko.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz","Korean","(ko) Korean" +"ku","xx_ent_wiki_sm","cc.ku.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.bin.gz","Kurdish (Kurmanji)","(ku) Kurdish (Kurmanji)" +"ky","xx_ent_wiki_sm","cc.ky.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.bin.gz","Kirghiz","(ky) Kirghiz" +"la","xx_ent_wiki_sm","cc.la.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz","Latin","(la) Latin" +"lb","xx_ent_wiki_sm","cc.lb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.bin.gz","Luxembourgish","(lb) Luxembourgish" +"li","xx_ent_wiki_sm","cc.li.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.bin.gz","Limburgish","(li) Limburgish" +"lmo","xx_ent_wiki_sm","cc.lmo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.bin.gz","Lombard","(lmo) Lombard" +"lt","lt_core_news_sm","cc.lt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.bin.gz","Lithuanian","(lt) Lithuanian" +"lv","xx_ent_wiki_sm","cc.lv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.bin.gz","Latvian","(lv) Latvian" +"mai","xx_ent_wiki_sm","cc.mai.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.bin.gz","Maithili","(mai) Maithili" +"mg","xx_ent_wiki_sm","cc.mg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.bin.gz","Malagasy","(mg) Malagasy" +"mhr","xx_ent_wiki_sm","cc.mhr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.bin.gz","Meadow Mari","(mhr) Meadow Mari" +"min","xx_ent_wiki_sm","cc.min.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.bin.gz","Minangkabau","(min) Minangkabau" +"mk","mk_core_news_sm","cc.mk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.bin.gz","Macedonian","(mk) Macedonian" +"ml","xx_ent_wiki_sm","cc.ml.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.bin.gz","Malayalam","(ml) Malayalam" +"mn","xx_ent_wiki_sm","cc.mn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.bin.gz","Mongolian","(mn) Mongolian" +"mr","xx_ent_wiki_sm","cc.mr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.bin.gz","Marathi","(mr) Marathi" +"mrj","xx_ent_wiki_sm","cc.mrj.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.bin.gz","Hill Mari","(mrj) Hill Mari" +"ms","xx_ent_wiki_sm","cc.ms.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.bin.gz","Malay","(ms) Malay" +"mt","xx_ent_wiki_sm","cc.mt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.bin.gz","Maltese","(mt) Maltese" 
+"mwl","xx_ent_wiki_sm","cc.mwl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.bin.gz","Mirandese","(mwl) Mirandese" +"my","xx_ent_wiki_sm","cc.my.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.bin.gz","Burmese","(my) Burmese" +"myv","xx_ent_wiki_sm","cc.myv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.bin.gz","Erzya","(myv) Erzya" +"mzn","xx_ent_wiki_sm","cc.mzn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.bin.gz","Mazandarani","(mzn) Mazandarani" +"nah","xx_ent_wiki_sm","cc.nah.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.bin.gz","Nahuatl","(nah) Nahuatl" +"nap","xx_ent_wiki_sm","cc.nap.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.bin.gz","Neapolitan","(nap) Neapolitan" +"nds","xx_ent_wiki_sm","cc.nds.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.bin.gz","Low Saxon","(nds) Low Saxon" +"ne","xx_ent_wiki_sm","cc.ne.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.bin.gz","Nepali","(ne) Nepali" +"new","xx_ent_wiki_sm","cc.new.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.bin.gz","Newar","(new) Newar" +"nl","nl_core_news_sm","cc.nl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.bin.gz","Dutch","(nl) Dutch" +"nn","xx_ent_wiki_sm","cc.nn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.bin.gz","Norwegian (Nynorsk)","(nn) Norwegian (Nynorsk)" +"no","xx_ent_wiki_sm","cc.no.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.bin.gz","Norwegian (Bokmål)","(no) Norwegian (Bokmål)" +"nso","xx_ent_wiki_sm","cc.nso.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.bin.gz","Northern Sotho","(nso) Northern Sotho" +"oc","xx_ent_wiki_sm","cc.oc.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.bin.gz","Occitan","(oc) Occitan" +"or","xx_ent_wiki_sm","cc.or.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.bin.gz","Oriya","(or) Oriya" +"os","xx_ent_wiki_sm","cc.os.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.bin.gz","Ossetian","(os) Ossetian" +"pa","xx_ent_wiki_sm","cc.pa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.bin.gz","Eastern Punjabi","(pa) Eastern Punjabi" +"pam","xx_ent_wiki_sm","cc.pam.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.bin.gz","Kapampangan","(pam) Kapampangan" +"pfl","xx_ent_wiki_sm","cc.pfl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.bin.gz","Palatinate German","(pfl) Palatinate German" +"pl","pl_core_news_sm","cc.pl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz","Polish","(pl) Polish" +"pms","xx_ent_wiki_sm","cc.pms.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.bin.gz","Piedmontese","(pms) Piedmontese" +"pnb","xx_ent_wiki_sm","cc.pnb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.bin.gz","Western Punjabi","(pnb) Western Punjabi" +"ps","xx_ent_wiki_sm","cc.ps.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.bin.gz","Pashto","(ps) Pashto" +"pt","pt_core_news_sm","cc.pt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.bin.gz","Portuguese","(pt) Portuguese" 
+"qu","xx_ent_wiki_sm","cc.qu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.bin.gz","Quechua","(qu) Quechua" +"rm","xx_ent_wiki_sm","cc.rm.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.bin.gz","Romansh","(rm) Romansh" +"ro","ro_core_news_sm","cc.ro.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.bin.gz","Romanian","(ro) Romanian" +"ru","ru_core_news_sm","cc.ru.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz","Russian","(ru) Russian" +"sa","xx_ent_wiki_sm","cc.sa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.bin.gz","Sanskrit","(sa) Sanskrit" +"sah","xx_ent_wiki_sm","cc.sah.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.bin.gz","Sakha","(sah) Sakha" +"sc","xx_ent_wiki_sm","cc.sc.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.bin.gz","Sardinian","(sc) Sardinian" +"scn","xx_ent_wiki_sm","cc.scn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.bin.gz","Sicilian","(scn) Sicilian" +"sco","xx_ent_wiki_sm","cc.sco.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.bin.gz","Scots","(sco) Scots" +"sd","xx_ent_wiki_sm","cc.sd.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.bin.gz","Sindhi","(sd) Sindhi" +"sh","xx_ent_wiki_sm","cc.sh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.bin.gz","Serbo-Croatian","(sh) Serbo-Croatian" +"si","xx_ent_wiki_sm","cc.si.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz","Sinhalese","(si) Sinhalese" +"sk","xx_ent_wiki_sm","cc.sk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.bin.gz","Slovak","(sk) Slovak" +"sl","sl_core_news_sm","cc.sl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.bin.gz","Slovenian","(sl) Slovenian" +"so","xx_ent_wiki_sm","cc.so.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.bin.gz","Somali","(so) Somali" +"sq","xx_ent_wiki_sm","cc.sq.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.bin.gz","Albanian","(sq) Albanian" +"sr","xx_ent_wiki_sm","cc.sr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.bin.gz","Serbian","(sr) Serbian" +"su","xx_ent_wiki_sm","cc.su.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.bin.gz","Sundanese","(su) Sundanese" +"sv","sv_core_news_sm","cc.sv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.bin.gz","Swedish","(sv) Swedish" +"sw","xx_ent_wiki_sm","cc.sw.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.bin.gz","Swahili","(sw) Swahili" +"ta","xx_ent_wiki_sm","cc.ta.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.bin.gz","Tamil","(ta) Tamil" +"te","xx_ent_wiki_sm","cc.te.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.bin.gz","Telugu","(te) Telugu" +"tg","xx_ent_wiki_sm","cc.tg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.bin.gz","Tajik","(tg) Tajik" +"th","xx_ent_wiki_sm","cc.th.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.bin.gz","Thai","(th) Thai" +"tk","xx_ent_wiki_sm","cc.tk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.bin.gz","Turkmen","(tk) Turkmen" 
+"tl","xx_ent_wiki_sm","cc.tl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.bin.gz","Tagalog","(tl) Tagalog" +"tr","xx_ent_wiki_sm","cc.tr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz","Turkish","(tr) Turkish" +"tt","xx_ent_wiki_sm","cc.tt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.bin.gz","Tatar","(tt) Tatar" +"ug","xx_ent_wiki_sm","cc.ug.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz","Uyghur","(ug) Uyghur" +"uk","uk_core_news_sm","cc.uk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.bin.gz","Ukrainian","(uk) Ukrainian" +"ur","xx_ent_wiki_sm","cc.ur.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.bin.gz","Urdu","(ur) Urdu" +"uz","xx_ent_wiki_sm","cc.uz.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.bin.gz","Uzbek","(uz) Uzbek" +"vec","xx_ent_wiki_sm","cc.vec.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.bin.gz","Venetian","(vec) Venetian" +"vi","xx_ent_wiki_sm","cc.vi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.bin.gz","Vietnamese","(vi) Vietnamese" +"vls","xx_ent_wiki_sm","cc.vls.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.bin.gz","West Flemish","(vls) West Flemish" +"vo","xx_ent_wiki_sm","cc.vo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.bin.gz","Volapük","(vo) Volapük" +"wa","xx_ent_wiki_sm","cc.wa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.bin.gz","Walloon","(wa) Walloon" +"war","xx_ent_wiki_sm","cc.war.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.bin.gz","Waray","(war) Waray" +"xmf","xx_ent_wiki_sm","cc.xmf.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.bin.gz","Mingrelian","(xmf) Mingrelian" +"yi","xx_ent_wiki_sm","cc.yi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.bin.gz","Yiddish","(yi) Yiddish" +"yo","xx_ent_wiki_sm","cc.yo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.bin.gz","Yoruba","(yo) Yoruba" +"zea","xx_ent_wiki_sm","cc.zea.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.bin.gz","Zeelandic","(zea) Zeelandic" +"zh","zh_core_web_sm","cc.zh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz","Chinese","(zh) Chinese" diff --git a/docker-images-datalab/activetigger/activetigger/server.R b/docker-images-datalab/activetigger/activetigger/server.R new file mode 100644 index 0000000..89a7ae6 --- /dev/null +++ b/docker-images-datalab/activetigger/activetigger/server.R @@ -0,0 +1,4143 @@ + +pkgs <- c("arrow", "class", "DT", "future", "ggplot2", "glmnet", "htmlTable", + "LiblineaR", "Matrix","Metrics", "promises", "quanteda", + "quanteda.textmodels", "ranger", "rlang", "Rtsne", + "RJSONIO", "shiny", "SparseM", "stringi", "uwot") +pkgs <- pkgs[! pkgs %in% rownames(installed.packages())] +if (length(pkgs)) { + warning(paste0("Packages ", paste0(pkgs, collapse = " , "), " not available. 
", + "App might crash.")) +} +# for (i in seq_along(pkgs)) install.packages(pkgs[i]) + +verbose <- TRUE +# sink("ActiveTigger.log") + +options(shiny.maxRequestSize = 30*1024^3) # File upload size + +modelnames <- read.csv("modelnames.csv") +modelnames_labels <- modelnames$short +names(modelnames_labels) <- modelnames$short_lang + +library(quanteda) +library(ggplot2) +library(promises) +library(future) +plan(multisession) + + +################################################################################ +## Global functions +################################################################################ + +label2hash <- function(label) rlang::hash(label) + +cleanFileName <- function(name) + gsub('\\s|[/]|[\\]|[:]|[*]|[?]|["]|[<]|[>]|[+]|[$]|[=]', "_", + gsub("^\\s+|\\s+$", "", name)) + +wtdF1 <- function(truth, pred) { + pred <- pred[!is.na(truth)] + truth <- truth[!is.na(truth)] + sum(sapply(unique(truth), function(ilab) + Metrics::fbeta_score(truth == ilab, pred == ilab) * + sum(truth == ilab) / length(truth)), na.rm = TRUE) +} + +metricsTable <- function(truth, pred) { + do.call(rbind, lapply(sort(unique(truth)), function(ilab) { + tmptrue <- truth == ilab + tmppred <- pred == ilab + tmp <- data.frame( + "Tag" = ilab, + "N.cases" = sum(tmptrue), + "N.predict" = sum(tmppred), + "Accuracy" = sprintf("%f", round(Metrics::accuracy(tmptrue, tmppred), 6)), + "Precision" = sprintf("%f", round(Metrics::precision(tmptrue, tmppred), 6)), + "Recall" = sprintf("%f", round(Metrics::recall(tmptrue, tmppred), 6)), + "F1" = sprintf("%f", round(Metrics::fbeta_score(tmptrue, tmppred), 6))) + tmp$F1[tmp$F1 == "NaN"] <- sprintf("%f", 0) + tmp + })) +} + +bertPlotFromPath <- function(path) { + if (!file.exists(paste0(path, "/trainer_state.json"))) return(NULL) + state <- RJSONIO::fromJSON(paste0(path, "/trainer_state.json")) + logs <- data.table::rbindlist(lapply(state$log_history, as.list), fill = TRUE) + epochmin <- logs$epoch[which.min(logs$eval_loss)] + + gdat <- data.table::rbindlist( + use.names = FALSE, + list( + cbind("train", logs[!is.na(logs$loss), c("epoch", "loss")]), + cbind("valid", logs[!is.na(logs$eval_loss), c("epoch", "eval_loss")]) + )) + colnames(gdat) <- c("data", "epoch", "loss") + + ggplot(gdat, aes_string("epoch", "loss", color = "data", shape = "data")) + + geom_point(size = 2) + geom_line() + + geom_vline(xintercept = epochmin, linetype = 2) + + theme_bw() + expand_limits(y=0) +} + +bertHyperToMsg <- function(pars) { + if (is.null(pars)) return(NULL) + paste0("Scheme : ", pars$bertScheme, "; ", + "Model : ", pars$bertModel, "\n* ", + "Epochs : ", pars$bertEpochs, " ; ", + "Lrate : ", pars$bertLrate, " ; ", + "Wdecay : ", pars$bertWdecay, " ; ", + "Batch size : ", pars$bertBatchsize, " ; ", + "Grad. accum. 
: ", pars$bertGradacc, "\n* ", + "Keep best : ", pars$bertBest, " ; ", + "ValidFrac : ", pars$bertValidFrac, " ; ", + "ValidSeed : ", pars$bertValidSeed, " \n* ", + pars$bertNtrain, " training samples : ", + paste0(names(pars$bertTrainTable), " (", + as.numeric(pars$bertTrainTable), ")", + collapse = " ; ")) +} + +get_status <- function(dafile) + scan(dafile, what = "character", sep="\n", quiet = TRUE) +set_status <- function(dafile, msg) write(msg, dafile) + + + +################################################################################ +## Main server function +################################################################################ + +# options(shiny.maxRequestSize=2^34) # Max filesize + +shinyServer(function(input, output, session) { + + values <- reactiveValues() + values$visuGo <- FALSE + values$dfmGo <- FALSE + # queryTrigger <- reactiveVal(0) + # predlabTrigger <- reactiveVal(0) + initDataTrigger <- reactiveVal(0) + queryNext <- reactiveVal(0) + trainTrigger <- reactiveVal(0) + predTrigger <- reactiveVal(0) + diagTrigger <- reactiveVal(0) + ready2tag <- reactiveVal(0) + trainCountdown <- reactiveVal(0) + + ok.data <- reactiveVal() + ok.data.async <- reactiveVal() + ok.data.running <- reactiveVal(FALSE) + + bpall_nclicks <- reactiveVal(0) + bertpred_nclicks <- reactiveVal(0) + + ## NEW use_regressors list as values$new_use_regressors + observeEvent(input$use_regressors, { + values$new_use_regressors <- input$use_regressors + cat("DEBUG new_use_regressors: ", paste(input$use_regressors, collapse = " "), "\n") + }) + + ## NEW trainCountdown with input$trainCountdown + ## Policy: 0, don't train ; 1, trigger train on tag ; >1, decrement on tag + observeEvent(input$trainCountdown, trainCountdown(input$trainCountdown)) + + #################### + ## Projects and data + + ## Read config files, update UI forms accordingly + ## This should happen only once + observe({ + if (verbose) + cat("DEBUG read project configs\n") + # values$conf <- RJSONIO::fromJSON("conf.json") + projfiles <- dir(pattern = "^tigger_.*[.]json$") + isolate({ + if (!length(projfiles)) { + values$conf <- NULL + return(NULL) + } + values$confnames <- gsub("^tigger_(.*)[.]json$", "\\1", projfiles) + + values$conf <- RJSONIO::fromJSON(projfiles[1]) + }) + }) + + ## When projects added, update selector + observeEvent(values$confnames, { + if (verbose) + cat("DEBUG update selectProject from confnames\n") + updateSelectInput(session, "selectProject", choices = values$confnames) + }) + + ## Event from project change + observeEvent(input$selectProject, { + if (is.null(input$selectProject)) return(NULL) + if (!nchar(input$selectProject)) return(NULL) + if (verbose) + cat("DEBUG project change step 1\n") + # values$project <- input$selectProject + # values$conf <- values$confs[[values$project]] + values$conf <- RJSONIO::fromJSON( + paste0("tigger_", input$selectProject, ".json")) + ok.data(NULL) + ok.visu(NULL) + initDataTrigger(initDataTrigger() + 1) + + if (verbose) + cat("DEBUG project change step 2\n") + if (!is.null(values$conf$dataNrows)) + updateNumericInput(session, "dataNrows", value = as.numeric(values$conf$dataNrows)) + if (!is.null(values$conf$dataSkipRows)) + updateNumericInput(session, "dataSkipRows", value = as.numeric(values$conf$dataSkipRows)) + + #### Python #### + ## Default use_python if not found in conf file: FALSE + if(is.null(values$conf$use_python)) + values$conf$use_python <- FALSE + updateCheckboxInput(session, "sys_use_python", value = values$conf$use_python) + + ## Set default python if 
not found in conf file + if(is.null(values$conf$python)) + values$conf$python <- "python3" + updateTextInput(session, "sys_which_python", value = values$conf$python) + + values$python_ok <- FALSE + if (values$conf$use_python) { + ## Check whether python is working + pytest <- try(system(paste(values$conf$python, "--version"), + intern = TRUE)) + if (inherits(pytest, "try-error")) { + showNotification(paste( + "Python path `", values$conf$python, + "` not valid, try changing it in Project/System tab"), + duration = 10, type = "error") + } else + values$python_ok <- TRUE + } + if (verbose) + cat("Python ok:", values$python_ok, "\n") + + + #### GPU #### + if (!is.null(values$conf$use_gpu)) + updateCheckboxInput(session, "sys_use_gpu", value = values$conf$use_gpu) + + #### Spacy #### + if (!is.null(values$conf$use_spacy)) + updateCheckboxInput(session, "sys_use_spacy", value = values$conf$use_spacy) + updateTextInput(session, "sys_use_spacy_model", value = values$conf$use_spacy_model) + + #### fastText #### + if (!is.null(values$conf$use_ft)) + updateCheckboxInput(session, "sys_use_ft", value = values$conf$use_ft) + updateTextInput(session, "sys_use_ft_model", value = values$conf$use_ft_model) + + #### SBERT #### + if (!is.null(values$conf$use_sb)) + updateCheckboxInput(session, "sys_use_sb", value = values$conf$use_sb) + updateTextInput(session, "sys_use_sb_model", value = values$conf$use_sb_model) + + }) + + + ############################################################################## + ## Reactives : project settings UI + ############################################################################## + + output$sys_datadir <- renderText(values$conf$datadir) + output$sys_datafile <- renderText(paste0(values$conf$dataname, ".feather")) + output$sys_var_id <- renderUI(renderText(values$conf$idcol)) + output$sys_var_text <- renderUI(renderText(values$conf$textcols)) + output$sys_var_tag <- renderUI(renderText(values$conf$tagcols)) + output$sys_var_context_ui <- renderUI(selectInput( + "sys_var_context", NULL, values$data_orig_colnames, values$conf$contextcols, + multiple = TRUE)) + output$sys_var_comm_ui <- renderUI(selectInput( + "sys_var_comm", NULL, c("(none)", values$data_orig_colnames), + values$conf$commcol)) + + output$sys_ex_lang_ui <- renderUI(selectInput( + "sys_ex_lang", NULL, modelnames_labels, "en" + )) + output$sys_ex_spacy <- renderText( + modelnames$spacy_name[modelnames$short == input$sys_ex_lang]) + output$sys_ex_spacy_dl <- renderText(paste0( + "Download with python module: ", values$conf$python, " -m spacy download ", + modelnames$spacy_name[modelnames$short == input$sys_ex_lang])) + output$sys_ex_ft <- renderText( + modelnames$fasttext_name[modelnames$short == input$sys_ex_lang]) + output$sys_ex_ft_dl <- renderText(paste0( + "Manual download link : ", + modelnames$fasttext_url[modelnames$short == input$sys_ex_lang])) + output$sys_ex_sb <- renderText( + ifelse(input$sys_ex_lang %in% c("ar", "zh", "nl", "en", "fr", "de", + "it", "ko", "pl", "pt", "ru", "es", "tr"), + "distiluse-base-multilingual-cased-v1", + "distiluse-base-multilingual-cased-v2")) + + + ## Save system changes on button click, with modal + output$save_config_msg <- renderUI({ + req(values$save_config_msg) + p(strong(values$save_config_msg)) + }) + output$save_embed_msg <- renderUI({ + tmp <- "" + if (input$sys_use_spacy) if (!identical(input$sys_use_spacy_model, values$conf$use_spacy_model)) + tmp <- paste0(tmp, "
<br>• Changing the spacy model will delete the current tokenized text and fasttext word embeddings, to prevent conflicts.")
+    if (input$sys_use_ft) if (!identical(input$sys_use_ft_model, values$conf$use_ft_model))
+      tmp <- paste0(tmp, "<br>• Changing the fasttext model will delete the current fasttext word embeddings, to prevent conflicts.")
+    if (input$sys_use_sb) if (!identical(input$sys_use_sb_model, values$conf$use_sb_model))
+      tmp <- paste0(tmp, "<br>• Changing the SBERT model will delete the current SBERT sentence embeddings, to prevent conflicts.")
+
+    if (nchar(tmp)) {
+      return(HTML(paste("<br><strong>Warning:</strong>", tmp, "
    "))) + } else + return(NULL) + }) + observeEvent(input$saveSystem, { + values$save_config_msg <- NULL + showModal(modalDialog( + title = "Save project configuration", + paste0("Click 'Save' to check and confirm the new configuration"), + uiOutput("save_embed_msg"), + uiOutput("save_config_msg"), + footer = tagList(actionButton(paste0("save_conf_confirm"), "Save"), + modalButton("Cancel")))) + }) + observeEvent(input$save_conf_confirm, { + newconf <- values$conf + newconf$contextcols <- input$sys_var_context + newconf$commcol <- input$sys_var_comm + + if (input$sys_use_python) { + pytest <- try(system(paste(input$sys_which_python, "--version"), + intern = TRUE)) + if (inherits(pytest, "try-error")) { + values$save_config_msg <- paste0( + "Error: python path `", input$sys_which_python, "` not valid") + return(NULL) + } + } + newconf$use_python <- input$sys_use_python + newconf$python <- input$sys_which_python + newconf$use_gpu <- input$sys_use_gpu + + if (input$sys_use_spacy) { + sptest <- system(paste0( + newconf$python, " -m spacy info ", input$sys_use_spacy_model), intern = TRUE) + if (length(sptest) == 0) { + values$save_config_msg <- paste( + "Error loading spacy, check that it is installed in the specified python env") + return(NULL) + } + if (length(attr(sptest, "status"))) { + values$save_config_msg <- paste( + "Error loading spacy model, check that it has been downloaded") + return(NULL) + } + newconf$use_spacy_model <- input$sys_use_spacy_model + } + newconf$use_spacy <- input$sys_use_spacy + + if (input$sys_use_ft) { + if (!file.exists(input$sys_use_ft_model)) { + values$save_config_msg <- paste( + "Error loading fasttext model, check the specified path") + return(NULL) + } + newconf$use_ft_model <- input$sys_use_ft_model + } + newconf$use_ft <- input$sys_use_ft + + if (input$sys_use_sb) + newconf$use_sb_model <- input$sys_use_sb_model + newconf$use_sb <- input$sys_use_sb + + # if (newconf$use_spacy) if (newconf$use_spacy_model != values$conf$use_spacy_model) { + if (newconf$use_spacy) if (!identical(newconf$use_spacy_model, values$conf$use_spacy_model)) { + file.remove(paste0(values$conf$datadir, values$conf$dataname, "_spa.feather")) + file.remove(paste0(values$conf$datadir, values$conf$dataname, "_ft.feather")) + } + # if (newconf$use_ft) if (newconf$use_ft_model != values$conf$use_ft_model) + if (newconf$use_ft) if (!identical(newconf$use_ft_model, values$conf$use_ft_model)) + file.remove(paste0(values$conf$datadir, values$conf$dataname, "_ft.feather")) + # if (newconf$use_sb) if (newconf$use_sb_model != values$conf$use_sb_model) + if (newconf$use_sb) if (!identical(newconf$use_sb_model, values$conf$use_sb_model)) + file.remove(paste0(values$conf$datadir, values$conf$dataname, "_sb.feather")) + + writeLines(RJSONIO::toJSON(newconf), + paste0("tigger_", newconf$projectid, ".json")) + values$conf <- newconf + initDataTrigger(initDataTrigger() + 1) + removeModal() + }) + + ############################################################################## + ## Reactives : data + ############################################################################## + + ok.commentCol <- reactive(values$conf$commcol) + + ## Current imported data (async) + observeEvent(input$dataImport, { + if (ok.data.running()) return(NULL) + ## Save nrows to conf file on change + if (!is.numeric(input$dataNrows) | !is.numeric(input$dataSkipRows)) return(NULL) + values$conf$dataNrows <- input$dataNrows + values$conf$dataSkipRows <- input$dataSkipRows + if (!is.null(values$conf$projectid)) + 
writeLines(RJSONIO::toJSON(values$conf), paste0( + "tigger_", values$conf$projectid, ".json")) + initDataTrigger(initDataTrigger() + 1) + }) + + + ## Read data event + observeEvent(initDataTrigger(), { + if (is.null(values$conf)) { + ok.data(NULL) + return(NULL) + } + if (initDataTrigger() == 0) return(NULL) + if (verbose) + cat("DEBUG enter ok.data\n") + if (verbose) + cat("DEBUG enter ok.data trigger", initDataTrigger(), "\n") + ok.data.running(TRUE) + + ## Prepare variables for async launch + coco <- values$conf + coco$python_ok <- values$python_ok + + da_token <- session$token + data_status <- paste0("tigger_", da_token, "_data") + + file_orig_colnames <- paste0(data_status, "_origcols") + file_colstok <- paste0(data_status, "_tok") + file_colsft <- paste0(data_status, "_ft") + file_colssb <- paste0(data_status, "_sb") + file_colsbertpred <- paste0(data_status, "_bertpred") + da_missing <- paste0(data_status, "_bertpred_missing") + + projpath <- paste0(coco$datadir, coco$projectid) + if (!file.exists(projpath)) + dir.create(projpath) + + ## Read data: async launch + data_async <- future({ + ################# + ## Main data file + + set_status(data_status, "Importing main data file...") + res <- arrow::read_feather( + paste0(coco$datadir, coco$dataname, ".feather")) + if (is.null(coco$dataNrows)) return(NULL) + rowselect <- + intersect(1:nrow(res), (1:(coco$dataNrows)) + coco$dataSkipRows) + if (!length(rowselect)) return(NULL) + res <- res[rowselect, , drop = FALSE] + + writeLines(colnames(res), file_orig_colnames) + + + ################# + ## Tokenized text + + ## Import data : feather of two cols, one the id, other the tokenized + set_status(data_status, "Importing tokenized text...") + file_tok <- paste0(coco$datadir, coco$dataname, "_spa.feather") + if (file.exists(file_tok)) { + tokdat <- arrow::read_feather(file_tok) + } else + tokdat <- NULL + tok_remaining <- res[[coco$idcol]] + tok_remaining <- + tok_remaining[! tok_remaining %in% tokdat[[coco$idcol]]] + + ## Tokenize remaining texts + if (length(tok_remaining)) { + if (!coco$python_ok | !coco$use_spacy) { + ## If remaining texts to tokenize but no python, fall back on + ## untokenized text + ## TODO here use quanteda tokenizer with options? 
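+          ## A minimal quanteda-based sketch for that TODO (assuming the
+          ## default tokenizer options are acceptable) could be:
+          ##   toks <- quanteda::tokens(txt, remove_punct = FALSE)
+          ##   txt_spa <- vapply(toks, paste, character(1), collapse = " ")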
+ tokdat <- data.frame( + res[[coco$idcol]], + do.call(paste, c(res[match(tok_remaining, res[[coco$idcol]]), + coco$textcols, drop = FALSE], sep = "\n\n"))) + colnames(tokdat) <- c(coco$idcol, "text_spa") + set_status(data_status, "Importing tokenized text: no spacy ok") + } else { + ## If python present, tokenize with spacy + set_status(data_status, + paste("Tokenizing", length(tok_remaining), "new texts")) + tmpfile <- paste0("tigger_tok_", da_token) + arrow::write_feather( + data.frame( + id = tok_remaining, + text = do.call( + paste, c(res[match(tok_remaining, res[[coco$idcol]]), + coco$textcols, drop = FALSE], sep = "\n\n"))), + paste0(tmpfile, ".feather") + ) + system(paste0( + coco$python, " tokenize_spacy.py -d ", tmpfile, ".feather", + " -m ", coco$use_spacy_model)) + tok_new <- arrow::read_feather(paste0(tmpfile, "_spa.feather")) + colnames(tok_new)[colnames(tok_new) == "id"] <- coco$idcol + tokdat <- rbind(tokdat, tok_new) + arrow::write_feather(tokdat, file_tok) + system(paste0("rm ", tmpfile, "*")) + set_status(data_status, "Importing tokenized text: spacy ok") + } + } + + ## Save colname for tokenized text + tmp_cols_tok <- colnames(tokdat)[colnames(tokdat) != coco$idcol] + writeLines(tmp_cols_tok, file_colstok) + + set_status(data_status, "Importing tokenized text: wrote colnames") + + ## Merge with current data + res <- merge(res, tokdat, by = coco$idcol, + all.y = FALSE, sort = FALSE) + rm(tokdat) + + set_status(data_status, "Importing tokenized text: merged") + + ################# + ## FastText embeddings + + set_status(data_status, paste("Loading fastText embeddings")) + + ## FastText embeddings : feather file with id and embeddings + file_ft <- paste0(coco$datadir, coco$dataname, "_ft.feather") + if (file.exists(file_ft)) { + ftdat <- arrow::read_feather(file_ft) + } else + ftdat <- NULL + ft_remaining <- res[[coco$idcol]] + ft_remaining <- + ft_remaining[! ft_remaining %in% ftdat[[coco$idcol]]] + + ## Embed remaining texts, if FT in use + if (length(ft_remaining)) { + if (!coco$python_ok | !coco$use_ft) { + ## If no python or FT, abort if not all rows already embedded + ftdat <- NULL + } else { + set_status(data_status, paste( + "Embedding", length(ft_remaining), "new texts with fastText")) + tmpfile <- paste0("tigger_ft_", da_token) + arrow::write_feather( + data.frame( + id = ft_remaining, + text = do.call( + paste, res[match(ft_remaining, res[[coco$idcol]]), + tmp_cols_tok, drop = FALSE])), + paste0(tmpfile, ".feather") + ) + system(paste0( + coco$python, " embed_fasttext.py -d ", tmpfile, ".feather", + " -m ", coco$use_ft_model)) + ft_new <- arrow::read_feather(paste0(tmpfile, "_ft.feather")) + colnames(ft_new)[colnames(ft_new) == "id"] <- coco$idcol + ftdat <- rbind(ftdat, ft_new) + arrow::write_feather(ftdat, file_ft) + system(paste0("rm ", tmpfile, "*")) + } + } + + if (!is.null(ftdat)) { + ## Merge with current data + writeLines(colnames(ftdat)[colnames(ftdat) != coco$idcol], file_colsft) + res <- merge(res, ftdat, by = coco$idcol, all.y = FALSE, sort = FALSE) + rm(ftdat) + } + + + ################# + ## SBERT embeddings + + set_status(data_status, "Loading SBERT embeddings") + + ## SBERT file : feather of id and embeddings + file_sb <- paste0(coco$datadir, coco$dataname, "_sb.feather") + if (file.exists(file_sb)) { + sbdat <- arrow::read_feather(file_sb) + }else + sbdat <- NULL + sb_remaining <- res[[coco$idcol]] + sb_remaining <- + sb_remaining[! 
sb_remaining %in% sbdat[[coco$idcol]]] + + ## Embed remaining texts + if (length(sb_remaining)) { + if (!coco$python_ok | !coco$use_sb) { + ## If no python or SB, abort if not all rows already embedded + sbdat <- NULL + } else { + set_status(data_status, paste( + "Embedding", length(sb_remaining), "new texts with SBERT")) + tmpfile <- paste0("tigger_sb_", da_token) + arrow::write_feather( + data.frame( + id = sb_remaining, + text = do.call( + paste, res[match(sb_remaining, res[[coco$idcol]]), + coco$textcols, drop = FALSE])), + paste0(tmpfile, ".feather") + ) + system(paste0( + coco$python, " embed_sbert.py -d ", tmpfile, ".feather", + " -m ", coco$use_sb_model)) + sb_new <- arrow::read_feather(paste0(tmpfile, "_sb.feather")) + colnames(sb_new)[colnames(sb_new) == "id"] <- coco$idcol + sbdat <- rbind(sbdat, sb_new) + arrow::write_feather(sbdat, file_sb) + system(paste0("rm ", tmpfile, "*")) + } + } + + if (!is.null(sbdat)) { + ## Merge with current data + writeLines(colnames(sbdat)[colnames(sbdat) != coco$idcol], file_colssb) + res <- merge(res, sbdat, by = coco$idcol, all.y = FALSE, sort = FALSE) + rm(sbdat) + } + + ################# + ## BERT predictions + + set_status(data_status, "Loading BERT predictions") + + ## BERT predictions : feathers of id and probs + ## TODO: take full prediction if existing + + files_pred <- dir( + coco$datadir, paste0("^", coco$dataname, "_bertpred_.*[.]feather$")) + # files_predall <- dir( + # coco$datadir, paste0("^", coco$dataname, "_bertpredall_.*[.]feather$")) + # files_pred <- files_pred[ + # ! gsub("_bertpred_", "_bertpredall_", files_pred) %in% files_predall] + # files_pred <- c(files_predall, files_pred) + + tmp_bertpred_cols <- list() + if (length(files_pred)) { + for (ipred in files_pred) { + preddat <- arrow::read_feather(paste0(coco$datadir, ipred)) + predname <- + gsub(paste0("^", coco$dataname, "_bertpred_(.*)[.]feather$"), + "\\1", ipred) + # predname <- + # gsub(paste0("^", coco$dataname, "_bertpred(all)?_(.*)[.]feather$"), + # "\\2", ipred, perl = TRUE) + n_missing <- sum(! 
res[[coco$idcol]] %in% preddat[[coco$idcol]]) + if (n_missing) { + write( + paste0("BERT prediction ", predname, " : ", n_missing, + " texts missing in prediction."), + da_missing, append = TRUE) + } else { + colnames(preddat) <- gsub( + "^bertpred_", paste0("bertpred_", predname), colnames(preddat)) + tmp_bertpred_cols[[predname]] <- + colnames(preddat)[colnames(preddat) != coco$idcol] + res <- merge(res, preddat, by = coco$idcol, + all.y = FALSE, sort = FALSE) + } + rm(preddat) + } + if (length(tmp_bertpred_cols)) { + writeLines(names(tmp_bertpred_cols), file_colsbertpred) + lapply(names(tmp_bertpred_cols), function(x) { + writeLines(tmp_bertpred_cols[[x]], paste0(file_colsbertpred, "_", x)) + }) + } + } + + ## Comment col + set_status(data_status, "Loading comments") + tmpfile <- paste0(projpath, "/", coco$commcol, ".csv") + if (file.exists(tmpfile)) { + tmpcol <- read.csv(tmpfile) + colnames(tmpcol)[2] <- coco$commcol + ## Remove duplicates, keep newest + tmpcol <- tmpcol[nrow(tmpcol):1, , drop = FALSE] + tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), , drop = FALSE] + tmpcol <- tmpcol[nrow(tmpcol):1, , drop = FALSE] + res <- merge(res, tmpcol, by = coco$idcol, + all.x = TRUE, all.y = FALSE, sort = FALSE) + res[[coco$commcol]][is.na(res[[coco$commcol]])] <- "" + } else + res[[coco$commcol]] <- "" + + # ## Tagging cols + # set_status(data_status, "Loading tags") + # for (itag in coco$tagcols) { + # tmpfile <- paste0(projpath, "/", itag, ".csv") + # if (file.exists(tmpfile)) { + # tmpcol <- read.csv(tmpfile) + # tmpcol[[paste0("hist_", itag)]] <- 1:nrow(tmpcol) + # ## Remove duplicates, keep newest + # tmpcol <- tmpcol[nrow(tmpcol):1, ] + # tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), ] + # tmpcol <- tmpcol[nrow(tmpcol):1, ] + # if (verbose) + # cat("DEBUG imported ", itag, nrow(tmpcol), "\n") + # res <- merge(res, tmpcol, by = coco$idcol, + # all.x = TRUE, all.y = FALSE, sort = FALSE) + # } else { + # res[[itag]] <- NA + # res[[paste0("hist_", itag)]] <- NA + # } + # } + ## Tagging cols + set_status(data_status, "Loading tags") + for (itag in coco$tagcols) { + itaghist <- paste0("hist_", itag) + if (is.null(res[[itag]]) | all(is.na(res[[itag]]))) { + res[[itag]] <- NA + res[[itaghist]] <- NA + } else { + res[[itaghist]][!is.na(res[[itag]])] <- 1:sum(!is.na(res[[itag]])) + } + tmpfile <- paste0(projpath, "/", itag, ".csv") + if (file.exists(tmpfile)) { + tmpcol <- read.csv(tmpfile) + tmpcol[[itaghist]] <- 1:nrow(tmpcol) + + max(0, suppressWarnings(max(na.omit(res[[itaghist]])))) + ## Remove duplicates, keep newest + tmpcol <- tmpcol[nrow(tmpcol):1, ] + tmpcol <- tmpcol[!duplicated(tmpcol[, 1]), ] + tmpcol <- tmpcol[nrow(tmpcol):1, ] + tmpcol <- tmpcol[tmpcol[[coco$idcol]] %in% res[[coco$idcol]], ] + if (verbose) + cat("DEBUG imported", itag, nrow(tmpcol), "\n") + # res <- merge(res, tmpcol, by = coco$idcol, + # all.x = TRUE, all.y = FALSE, sort = FALSE) + if (nrow(tmpcol)) { + idmatches <- match(tmpcol[[coco$idcol]], res[[coco$idcol]]) + # idmatches <- idmatches[!is.na(idmatches)] + res[[itag]][idmatches] <- tmpcol[[itag]] + res[[itaghist]][idmatches] <- tmpcol[[itaghist]] + } + } + } + + set_status(data_status, "Finalizing data import") + res + }) %...>% ok.data.async() + data_async <- catch(data_async, function(error) { + values$modelDiagno <- paste0("Error in data import: ", error) + }) + data_async <- finally(data_async, function() { + ################# + ## Finalize data importation + res <- ok.data.async() + if (is.null(res)) return(NULL) + + set_status(data_status, "Finalizing data 
import: read columns") + ## Read column (tok, ft, sb, ...) names from dedicated files + values$tagcols <- coco$tagcols + values$cols_tok <- readLines(file_colstok) + values$cols_ft <- NULL + if (file.exists(file_colsft)) { + values$cols_ft <- readLines(file_colsft) + } + values$cols_sb <- NULL + if (file.exists(file_colssb)) { + values$cols_sb <- readLines(file_colssb) + } + set_status(data_status, "Finalizing data import: bertpred") + if (file.exists(file_colsbertpred)) { + tmp_bertpred <- readLines(file_colsbertpred) + values$cols_bertpred <- lapply(tmp_bertpred, function(x) + readLines(paste0(file_colsbertpred, "_", x))) + names(values$cols_bertpred) <- tmp_bertpred + } else + values$cols_bertpred <- NULL + set_status(data_status, "Finalizing data import: bertpred missing") + if (file.exists(da_missing)) { + lapply(readLines(da_missing), function(x) + showNotification(paste( + x, "\n", + "You may predict on current text in BERT panel, ", + "and then reload the data."))) + } + + ## Populate regressors list in prediction panel + set_status(data_status, "Finalizing data import: populating regressors") + regr_choices <- c( + "Regex" = "regex", + "Word counts (DFM)" = "dfm") + if (!is.null(values$cols_ft)) + regr_choices <- c(regr_choices, "Word embeddings (FastText)" = "ft") + if (!is.null(values$cols_sb)) + regr_choices <- c(regr_choices, "Sentence embeddings (SBERT)" = "sb") + regr_choices <- c(regr_choices, "Extra predictors" = "extra") + + if (length(values$cols_bertpred)) { + tmp_choices <- paste0("bertpred_", names(values$cols_bertpred)) + names(tmp_choices) <- paste0("BERT_", names(values$cols_bertpred)) + regr_choices <- c(regr_choices, tmp_choices) + } + + # if (length(regr_choices) > 3) { + regr_choices_sel <- + regr_choices[regr_choices %in% c("regex", "ft", "sb")] + # } else + # regr_choices_sel <- c("regex", "dfm") + + updateSelectizeInput( + session, "use_regressors", choices = regr_choices, + selected = regr_choices_sel) + values$new_use_regressors <- regr_choices_sel + + othercols <- colnames(res) + othercols <- othercols[! othercols %in% c( + values$cols_tok, values$cols_ft, values$cols_sb, values$cols_bertpred, + values$tagcols, paste0("hist_", values$tagcols), + coco$commcol, coco$textcols)] + updateSelectInput( + session, "dlTagSelect", + choices = c("tags", "comments", "predictions", othercols), + selected = c("tags", "comments", "predictions")) + + ## Populate extra regressors lists in prediction panel + updateSelectizeInput( + session, "use_ootregnum", choices = othercols, selected = NULL) + updateSelectizeInput( + session, "use_ootregcat", choices = othercols, selected = NULL) + + ## Project path + values$projectdir <- paste0(projpath, "/") + + ## Create tagmat: data.frame of all current taggings + values$tagmat <- res[, coco$tagcols, drop = FALSE] + ## Create tagmathist: data.frame of historical order of taggings + values$tagmathist <- res[, paste0("hist_", coco$tagcols), drop = FALSE] + colnames(values$tagmathist) <- coco$tagcols + + ## Save data original colnames + values$data_orig_colnames <- readLines(file_orig_colnames) + + ok.data(ok.data.async()) + ok.data.async(NULL) + ok.data.running(FALSE) + set_status(data_status, "Exit data async") + system(paste0("rm ", data_status, "*")) + values$modelDiagno <- "No model" + + + if(trainTrigger() == 0) trainTrigger(1) ## For first run. TODO set trainTrigger to 0 when changing data? 
+ if (verbose) cat("DEBUG Exit data async finally\n") + }) + NULL + }) + + ## Read data import status from file + observe({ + if (ok.data.running()) { + invalidateLater(100) + tmpfile <- paste0("tigger_", session$token, "_data") + if (file.exists(tmpfile)) { + values$modelDiagno <- paste(collapse = "\n", scan( + tmpfile, what = "character", sep="\n", quiet = TRUE)) + } else + values$modelDiagno <- "" + } + }) + + + ############### + ## ok.text() reactive + ############### + ok.text <- eventReactive(ok.data(), { + if (is.null(ok.data())) return(NULL) + res <- ok.data()[, values$cols_tok[1]] + if (length(values$cols_tok) > 1) { + for (icol in values$cols_tok[-1]) { + res <- paste0(res, "\n\n", ok.data()[, icol]) + } + } + # queryTrigger(queryTrigger() + 1) + if (verbose) + cat("DEBUG ok.text", length(res), "\n") + res + }) + + ok.labelcol <- reactive({ + if (is.null(ok.data())) return(NULL) + if (verbose) + # cat("DEBUG enter ok.labelcol\n") + if (is.null(input$selectScheme)) return(values$conf$tagcols[1]) + input$selectScheme + }) + + + ok.nicetext <- reactive({ + req(ok.data(), values$conf$textcols) + # if (is.null(ok.data())) return(NULL) + do.call(paste, c(ok.data()[, values$conf$textcols, drop = FALSE], + sep = "\n\n")) + }) + + ok.regressor.names <- reactive({ + daregs <- values$new_use_regressors + if (verbose) + cat("DEBUG enter ok.regressor.names ", paste(daregs, collapse = " "), "\n") + res <- NULL + if ("ft" %in% daregs) + res <- c(res, values$cols_ft) + if ("sb" %in% daregs) + res <- c(res, values$cols_sb) + if (length(values$cols_bertpred)) { + for (ipred in 1:length(values$cols_bertpred)) { + if (paste0("bertpred_", names(values$cols_bertpred)[ipred]) %in% daregs) + res <- c(res, values$cols_bertpred[[ipred]]) + } + } + if (verbose) + cat("DEBUG exit ok.regressor.names:", length(res), "\n") + res + }) + + ok.contextCol <- reactive({ + if (verbose) + cat("DEBUG enter ok.contextCol\n") + # res <- isolate(values$conf$contextcols) + res <- values$conf$contextcols + names(res) <- res + res + }) + + ## Initiate label and queries history, on ok.text() or labelcol change, + observeEvent(list(ok.text(), ok.labelcol()), { + if (verbose) + cat("DEBUG initiate label\n") + if (is.null(ok.labelcol())) { + values$label <- NULL + values$labelhist <- NULL + values$queries <- NULL + values$comment <- NULL + values$wasTagged <- NULL + values$wasRetagged <- NULL + } else { + values$label <- values$tagmat[, ok.labelcol()] + values$labelhist <- values$tagmathist[, ok.labelcol()] + values$queries <- which(!is.na(values$label)) + values$comment <- ok.data()[[values$conf$commcol]] + values$wasTagged <- !is.na(values$label) + values$wasRetagged <- rep(FALSE, length(values$label)) + } + }) + + ## On values$label change, values$tagmat is updated for separate history + observeEvent(values$label, { + if (is.null(values$label)) return(NULL) + if (verbose) + cat("DEBUG Update tagmat from label\n") + values$tagmat[, ok.labelcol()] <- values$label + values$tagmathist[, ok.labelcol()] <- values$labelhist + }) + + + ############################################################################## + ## Reactives : training + ############################################################################## + + ## Message of how many tagged and untagged, for model diagnostics box + ok.msgtagged <- reactive({ + if (verbose) + cat("DEBUG enter ok.msgtagged\n") + # if (is.null(ok.data()) | ok.data.running()) { + if (is.null(ok.data())) { + return("No data yet ") + } + + tmp <- "" + # if (sum(is.na(values$label)) == 0) + # 
tmp <- paste0(tmp, "Data correction mode (0 NA labels left).\n") + paste0(tmp, "Tagged: ", sum(!is.na(values$label)), + " ; Untagged: ", sum(is.na(values$label))) + }) + + ## Training possible? At least 3 texts for 2 tags + ok.train.possible <- reactive({ + if (verbose) + cat("DEBUG enter ok.train.possible\n") + labtab <- table(values$label) + if (length(labtab) < 2) return(FALSE) + if (sum(labtab > 2) < 2) return(FALSE) + TRUE + }) + + ## Compute DFM (async) + ## TODO make this less reactive + observe({ + # if ("dfm" %in% input$use_regressors & !values$dfmGo) + if ("dfm" %in% values$new_use_regressors & !values$dfmGo) + values$dfmGo <- TRUE + }) + ok.dfm <- reactiveVal() + observe({ + if (is.null(ok.text())) return(NULL) + if (! values$dfmGo) return(NULL) + if (!is.numeric(input$dfmNgrams)) return(NULL) + if (verbose) + cat("DEBUG enter dfm observer\n") + + datext <- ok.text() + dfmNgrams <- input$dfmNgrams + dfmTfIdf <- input$dfmTfIdf + dfmMinDocfreq <- input$dfmMinDocfreq + dfmMinTermfreq <- input$dfmMinTermfreq + dfmTfScheme <- input$dfmTfScheme + dfmDfScheme <- input$dfmDfScheme + + dfm_async <- future(seed = TRUE, { + dadfm <- datext %>% + tolower() %>% tokens(remove_punct = F) %>% + # tokens_select(pattern = stopwords(language= "fr"), selection = "remove") %>% + tokens_ngrams(n= 1:dfmNgrams) %>% dfm() %>% + dfm_trim(min_docfreq = dfmMinDocfreq, min_termfreq = dfmMinTermfreq, + verbose = TRUE) + if (dfmTfIdf) { + dadfm <- dadfm %>% + dfm_tfidf(scheme_tf = dfmTfScheme, scheme_df = dfmDfScheme) + } + suppressWarnings({ # TODO : check + dadfm <- quanteda::convert(dadfm, to= "tm") + }) + dadfm <- Matrix::sparseMatrix(i= dadfm$i, j= dadfm$j, x= dadfm$v, + dimnames = dadfm$dimnames) + dadfm + }) %...>% ok.dfm() + dfm_async <- catch(dfm_async, function(error) { + values$modelDiagno <- paste0("Error in DFM: ", error) + }) + }) + + ## Predictor : df or matrix of dfm and extra + ok.regressor.matrix <- reactive({ + if (is.null(ok.text())) return(NULL) + if (is.null(ok.regressor.names())) return(NULL) + if (verbose) + cat("DEBUG enter ok.regressor.matrix\n") + model.matrix(~., data= ok.data()[, ok.regressor.names()])[, -1] + }) + + ok.extraregressor.matrix <- reactive({ + req(ok.text()) + if (! "extra" %in% values$new_use_regressors) return(NULL) + if (verbose) + cat("DEBUG enter ok.extraregressor.matrix\n") + + res <- NULL + for (inum in input$use_ootregnum) { + suppressWarnings({tmp <- as.numeric(ok.data()[[inum]])}) + if (all(is.na(tmp))) { + showNotification(paste0(inum, " is all NAs or non-numeric ; predictor not used"), + duration = 5, type = "warning") + next + } + tmp[is.na(tmp)] <- median(tmp, na.rm = TRUE) + res <- cbind(res, tmp) + } + for (icat in input$use_ootregcat) { + tmp <- ok.data()[[icat]] + if (! 
length(unique(tmp)) %in% 2:100) { + showNotification(paste0(icat, " has ", length(unique(tmp)), + " unique values ; predictor not used"), + duration = 5, type = "warning") + next + } + tmp <- as.character(tmp) + tmp[is.na(tmp)] <- "" + tmp <- as.factor(tmp) + tmpmat <- model.matrix(~., data.frame(cat = tmp))[, -1, drop = FALSE] + res <- cbind(res, tmpmat) + } + if (verbose) + cat("DEBUG exit ok.extraregressor.matrix", ncol(res), "\n") + res + }) + + ok.extraRegex.matrix <- reactive({ + if (is.null(ok.nicetext())) return(NULL) + if (is.null(values$extraRegex)) return(NULL) + if (is.null(values$regex_inuse)) return(NULL) + if (!any(values$regex_inuse)) return(NULL) + if (is.null(values$regex_case)) return(NULL) + if (verbose) + cat("DEBUG enter ok.extraRegex.matrix\n") + tmp <- as.matrix(sapply(which(values$regex_inuse), function(iregex) { + 1 * stringi::stri_detect_regex( + ok.nicetext(), values$extraRegex[iregex], + opts_regex = list(case_insensitive = !values$regex_case[iregex])) + })) + colnames(tmp) <- values$extraRegex[values$regex_inuse] + tmp + }) + + ok.predictor <- reactive({ + req(values$new_use_regressors) + if (ok.data.running()) return(NULL) + if (is.null(ok.text())) return(NULL) + if (verbose) + cat("DEBUG enter ok.predictor\n") + dadfm <- NULL + if ("dfm" %in% values$new_use_regressors) { + dadfm <- ok.dfm() + } else { + if (input$predModel == "naive bayes") { + values$modelDiagno <- + "Error in training: Naive Bayes requires DFM word counts as predictors." + return(NULL) + } + } + if (length(ok.regressor.names()) > 0) { + if (input$predModel == "random forest") { + if (is.null(dadfm)) { + dadfm <- ok.data()[, ok.regressor.names()] + } else + dadfm <- cbind(ok.data()[, ok.regressor.names()], as.matrix(dadfm)) + } + if (input$predModel %in% c("lasso", "linear", "knn")) { + dadfm <- cbind(ok.regressor.matrix(), dadfm) + } + } + # if ("regex" %in% input$use_regressors && length(values$regex_inuse) > 0) { + if ("regex" %in% values$new_use_regressors & !is.null(ok.extraRegex.matrix())) { + dadfm <- cbind(dadfm, ok.extraRegex.matrix()) + } + if ("extra" %in% values$new_use_regressors) { + dadfm <- cbind(dadfm, ok.extraregressor.matrix()) + } + + if (is.null(dadfm)) { + values$modelDiagno <- "Error in training: no predictors selected" + if (verbose) + cat("DEBUG ok.predictor exit: no predictors\n") + return(NULL) + } + + if (input$predModel == "linear") + dadfm <- SparseM::as.matrix.csr(dadfm, nrow(dadfm), ncol(dadfm)) + if (input$predModel == "naive bayes") + dadfm <- as.dfm(dadfm) + if (verbose) + cat("DEBUG exit ok.predictor: ", ncol(dadfm), "\n") + dadfm + }) + + + ## Train model async + ok.train <- reactiveVal() + ok.train.async <- reactiveVal() + observeEvent( + list(trainTrigger(), input$modelTrain), + { + if (is.null(ok.predictor())) return(NULL) + if (verbose) + cat("DEBUG enter ok.train observer\n") + if (!ok.train.possible()) { + values$modelDiagno <- + "Tag at least 3 texts for 2 tags to train prediction model." 
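+        ## Worked example (illustrative, never executed) of the
+        ## ok.train.possible() rule above: training needs at least two tags
+        ## that each have more than 2 texts.
+        if (FALSE) {
+          labtab <- table(c("a", "a", "a", "b", "b", "b", NA))  # a = 3, b = 3
+          length(labtab) >= 2 && sum(labtab > 2) >= 2           # TRUE
+        }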
+ ok.train(NULL) + # queryTrigger(queryTrigger() + 1) + return(NULL) + } + ready2tag(0) + values$modelDiagno <- paste0("Training ", input$predModel) + + tmp_model <- input$predModel + if (tmp_model == "random forest") { + if (!is.numeric(input$rfNumTrees) | !is.numeric(input$rfMtry) | + !is.numeric(input$rfSampleFrac)) { + ok.train(NULL) + return(NULL) + } + train_fun <- ranger::ranger + train_args <- list( + x= ok.predictor()[!is.na(values$label), ], + y= as.factor(values$label[!is.na(values$label)]), + num.trees = input$rfNumTrees, + mtry = if (input$rfMtry > 0) input$rfMtry, + sample.fraction = input$rfSampleFrac, probability = TRUE) + } else { + if (tmp_model == "linear") { + tmpvalues <- values$label + if (!is.numeric(input$liblinCost)) { + ok.train(NULL) + return(NULL) + } + for (ival in na.omit(unique(tmpvalues))) + if (sum(tmpvalues == ival, na.rm = T) == 1) + tmpvalues[which(tmpvalues == ival)] <- NA + train_fun <- LiblineaR::LiblineaR + train_args <- list( + data= ok.predictor()[!is.na(tmpvalues), ], + target= as.factor(tmpvalues[!is.na(tmpvalues)]), + type= 0, cost = input$liblinCost) + } else if (tmp_model == "lasso") { + tmpvalues <- values$label + for (ival in na.omit(unique(tmpvalues))) + if (sum(tmpvalues == ival, na.rm = T) == 1) + tmpvalues[which(tmpvalues == ival)] <- NA + train_fun <- glmnet::glmnet + train_args <- list( + x= ok.predictor()[!is.na(tmpvalues), ], + y= as.factor(tmpvalues[!is.na(tmpvalues)]), + family= "multinomial") + } else if (tmp_model == "naive bayes") { + if (!is.numeric(input$naiveSmooth)) { + ok.train(NULL) + return(NULL) + } + tmpvalues <- values$label + for (ival in na.omit(unique(tmpvalues))) + if (sum(tmpvalues == ival, na.rm = T) == 1) + tmpvalues[which(tmpvalues == ival)] <- NA + train_fun <- quanteda.textmodels::textmodel_nb + train_args <- list( + x= ok.predictor()[!is.na(tmpvalues), ], + y= as.factor(tmpvalues[!is.na(tmpvalues)]), + smooth = input$naiveSmooth, prior = input$naivePrior, + distribution = input$naiveDistri) + } else if (tmp_model == "knn") { + train_fun <- function(x) return(TRUE) + train_args <- list(x = "knn") + } + } + + train_async <- future(seed = TRUE, { + suppressWarnings(do.call(train_fun, train_args)) + }) %...>% ok.train.async() + train_async <- catch(train_async, function(error) { + values$modelDiagno <- paste0("Model training error: ", error) + ok.train(NULL) + ok.train.async(NULL) + }) + train_async <- finally(train_async, function() { + values$modelDiagno <- "Training done, predicting..." 
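+        ## The training above uses a generic do.call(train_fun, train_args)
+        ## dispatch inside a future; minimal illustration (hypothetical data,
+        ## never executed):
+        if (FALSE) {
+          train_fun <- ranger::ranger
+          train_args <- list(x = data.frame(x1 = rnorm(20)),
+                             y = factor(rep(c("a", "b"), 10)),
+                             num.trees = 50, probability = TRUE)
+          fit <- do.call(train_fun, train_args)
+        }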
+ ok.train(ok.train.async()) + ok.train.async(NULL) + predTrigger(predTrigger() + 1) + }) + NULL + } + ) + + ## Model prediction on full dataset (async) + ok.pred <- reactiveVal() + ok.pred.async <- reactiveVal() + # observeEvent(list(ok.train(), input$glmLambda, input$knnK, values$retagged), { + observeEvent(predTrigger(), { + if (is.null(ok.train())) { + ok.pred(NULL) + return(NULL) + } + if (verbose) + cat("DEBUG ok.pred()\n") + + damodel <- input$predModel + if (input$predModel == "random forest") { + pred_args <- list(ok.train(), data = ok.predictor()) + dacolnames <- colnames(ok.train()$predictions) + } else if (input$predModel == "lasso") { + if (!is.numeric(input$glmLambda)) { + ok.pred(NULL) + return(NULL) + } + pred_args <- list(ok.train(), newx = ok.predictor(), s = input$glmLambda, + type = "response") + } else if (input$predModel == "linear") { + pred_args <- list(ok.train(), newx = ok.predictor(), proba = TRUE) + } else if (input$predModel == "naive bayes") { + pred_args <- list(ok.train(), newdata = ok.predictor(), + type = "probability") + } else if (input$predModel == "knn") { + if (!is.numeric(input$knnK)) { + ok.pred(NULL) + return(NULL) + } + pred_args <- list(train = ok.predictor()[!is.na(values$label), ], + test = ok.predictor(), + cl = values$label[!is.na(values$label)], + k = input$knnK, prob = TRUE) + } + + pred_async <- future(seed = TRUE, { + if (damodel == "random forest") { + res <- do.call(predict, pred_args)$predictions + colnames(res) <- dacolnames + res + } else if (damodel == "lasso") { + do.call(predict, pred_args)[, , 1] + } else if (damodel == "linear") { + do.call(predict, pred_args)$probabilities + } else if (damodel == "naive bayes") { + do.call(predict, pred_args) + } else if (damodel == "knn") { + tmp <- do.call(class::knn, pred_args) + ## A bit flaky here: probs of non-best classes uniformly share the rest, + ## and distance is euc + tmpvalues <- sort(unique(pred_args$cl)) + res <- matrix(0, nrow(pred_args$test), length(tmpvalues), + dimnames = list(NULL, tmpvalues)) + res[cbind(1:nrow(res), tmp)] <- attr(tmp, "prob") - + ((1 - attr(tmp, "prob")) / (ncol(res) - 1)) + res <- res + (1 - attr(tmp, "prob")) / (ncol(res) - 1) + res + } + }) %...>% ok.pred.async() + pred_async <- catch(pred_async, function(error) { + values$modelDiagno <- paste0("Error in prediction: ", error) + }) + pred_async <- finally(pred_async, function() { + if (verbose) + cat("DEBUG ok.pred() out of async\n") + ok.pred(ok.pred.async()) + ok.pred.async(NULL) + # predlabTrigger(predlabTrigger() + 1) + diagTrigger(diagTrigger() + 1) + queryNext(queryNext() + 1) + }) + }) + + ## Best predicted label for full dataset + # ok.predlab1 <- eventReactive(predlabTrigger(), { + # if (is.null(ok.pred())) return(NULL) + # if (verbose) + # cat("DEBUG ok.predlab1\n") + # # queryTrigger(queryTrigger() + 1) + # colnames(ok.pred())[max.col(ok.pred())] + # }) + ok.predlab1 <- reactive({ + if (is.null(ok.pred())) return(NULL) + if (verbose) + cat("DEBUG ok.predlab1\n") + colnames(ok.pred())[max.col(ok.pred())] + }) + + ## On scheme change, forget existing model, queue, query and hist + ## load scheme description and predictor-regex + observeEvent(ok.labelcol(), { + if (verbose) + cat("DEBUG event selectScheme", ok.labelcol(), "\n") + ok.pred(NULL) + ok.train(NULL) + values$queryqueue <- NULL + values$modelDiagno <- "No model" + values$histSaveStack <- NULL + + ## Load description file + descr <- "" + if (file.exists(paste0(values$conf$datadir, values$conf$projectid, + "/", input$selectScheme, 
".txt"))) { + descr <- paste(collapse = "\n", readLines(paste0( + values$conf$datadir, values$conf$projectid, + "/", input$selectScheme, ".txt"))) + } + updateTextAreaInput(session, "schemeDescr", value = descr) + + ## Load regex from json + regexfile <- paste0(values$projectdir, input$selectScheme, "_regex.json") + if (file.exists(regexfile)) { + if (verbose) + cat("DEBUG enter regex.json\n") + tmp <- RJSONIO::fromJSON(paste(collapse = "\n", readLines(regexfile))) + if (is.null(tmp$regex)) return(NULL) + values$extraRegex <- tmp$regex + names(values$extraRegex) <- sapply(tmp$regex, label2hash) + for (iregex in 1:length(tmp$regex)) { + values[[paste0("useregex_", names(values$extraRegex)[iregex])]] <- + tmp$use[iregex] + values[[paste0("caseregex_", names(values$extraRegex)[iregex])]] <- + tmp$case[iregex] + } + } else + values$extraRegex <- NULL + + }) + + + + ## queryNext event: update values$newQuery ; called by queryNext(), or by + ## queryQueue or regex change + ## Requires : + ## - queryQueue (reactive: indices of current queue), and + ## - wasTagged (boolean vector indicating whether each element has been tagged this session, used in "on untagged") + ## - wasRetagged (boolean vector indicating whether each element has been tagged this session, used in "on tagged") + ## TODO: make a scheme-list of wasTagged, now it will be wiped at each scheme change? + observeEvent( + list(queryNext(), values$queryQueue, + input$sampleChoice, input$taggedWhich, input$maxprobWhich, + input$regexFilter, input$regexCaseSens, + input$visuLock, values$visuZoom$xlim, values$visuZoom$ylim), + { + if (is.null(values$queryQueue)) return(NULL) + if (verbose) + cat("DEBUG queryNext\n") + + queue <- values$queryQueue + + if (input$sampleChoice == "untagged") { + queue <- queue[!values$wasTagged[queue]] + } else if (input$sampleChoice == "tagged") { + queue <- queue[!is.na(values$label)[queue] & !values$wasRetagged[queue]] + if (input$taggedWhich != "all") { + queue <- queue[values$label[queue] == input$taggedWhich] + } + } else if (input$sampleChoice == "all") { + queue <- queue[!values$wasRetagged[queue]] + } + + if (input$visuLock) if (!is.null(values$visuZoom$xlim)) { + if (!is.null(ok.visu())) + queue <- queue[ok.visu()[queue, 1] >= values$visuZoom$xlim[1] & + ok.visu()[queue, 1] <= values$visuZoom$xlim[2] & + ok.visu()[queue, 2] >= values$visuZoom$ylim[1] & + ok.visu()[queue, 2] <= values$visuZoom$ylim[2]] + } + + if (length(queue)) if (nchar(input$regexFilter)) { + try_regex <- try(silent = TRUE, { + queue <- queue[ + stringi::stri_detect_regex( + ok.nicetext()[queue], input$regexFilter, + opts_regex = list(case_insensitive = !input$regexCaseSens))] + }) + if (inherits(try_regex, "try-error")) { + showNotification("Invalid regex", duration = 2, type = "error") + values$newQuery <- NULL + return(NULL) + } + } + + if (!length(queue)) { + if (nchar(input$regexFilter)) { + showNotification("No texts matching the regex", + duration = 2, type = "warning") + } else + showNotification("No more texts to be tagged in this set", + duration = 2, type = "warning") + values$newQuery <- NULL + return(NULL) + } + + values$lastQuery <- queue[1] # ?? 
+          values$newQuery <- queue[1]
+          ready2tag(1)
+        }
+      )
+ 
+      ## queryQueue event: set values$queryQueue (indices of the current queue)
+      observeEvent(
+        list(ok.data(), ok.pred(), input$strategy, input$maxprobWhich,
+             input$selectProject, input$selectScheme), {
+          if (input$strategy %in% c("entropy", "maxprob") && is.null(ok.pred())) {
+            values$queryQueue <- NULL
+            values$lastQuery <- NULL
+            values$newQuery <- NULL
+            return(NULL)
+          }
+          if (is.null(ok.text())) return(NULL)
+          if (verbose)
+            cat("DEBUG queryQueue change\n")
+ 
+          if (input$strategy == "sequential") {
+            values$queryQueue <- 1:length(ok.text())
+          } else if (input$strategy == "random") {
+            values$queryQueue <- sample(length(ok.text()))
+          } else {
+            dapred <- ok.pred()
+            if (input$strategy == "entropy") {
+              ## Minimize sum(p * log(p)) = -entropy, so highest-entropy texts
+              ## come first; 0 * log(0) gives NaN, set those terms to 0 first
+              tmpterms <- dapred * log(dapred)
+              tmpterms[dapred == 0] <- 0
+              tominimize <- rowSums(tmpterms)
+            }
+            if (input$strategy == "maxprob") {
+              if (! input$maxprobWhich %in% colnames(dapred)) return(NULL)
+              tominimize <- -dapred[, input$maxprobWhich]
+            }
+            values$queryQueue <- order(tominimize)
+          }
+        }
+      )
+ 
+ 
+      ## Update uniqueLabels and uniqueLabelsAll if necessary on values$label change
+      observeEvent(values$label, {
+        if (is.null(values$label)) return(NULL)
+        if (verbose)
+          cat("DEBUG uniqueLabels\n")
+        dalabs <- sort(na.omit(unique(values$label)))
+        # if (!length(dalabs)) return(NULL)
+        if (!identical(dalabs, values$uniqueLabels))
+          values$uniqueLabels <- dalabs
+        if (any(! dalabs %in% values$uniqueLabelsAll))
+          values$uniqueLabelsAll <- c(values$uniqueLabelsAll,
+                                      dalabs[! dalabs %in% values$uniqueLabelsAll])
+      })
+ 
+ 
+ 
+      ##############################################################################
+      ## Project management
+      ##############################################################################
+ 
+      output$cpDataMessage <- renderUI({
+        req(values$cpDataMessage)
+        p(values$cpDataMessage)
+      })
+ 
+      output$cpConfirmMessage <- renderUI({
+        req(values$cpConfirmMessage)
+        p(strong(paste0("Error: ", values$cpConfirmMessage)))
+      })
+ 
+      output$cpIdcolUI <- renderUI({
+        req(values$cpNewData)
+        if (!is.data.frame(values$cpNewData)) return(NULL)
+        selectInput("cpIdcol", NULL, colnames(values$cpNewData))
+      })
+ 
+      output$cpTextcolUI <- renderUI({
+        req(values$cpNewData)
+        if (!is.data.frame(values$cpNewData)) return(NULL)
+        selectInput("cpTextcol", NULL, colnames(values$cpNewData), multiple = TRUE)
+      })
+ 
+      output$cpContextcolUI <- renderUI({
+        req(values$cpNewData)
+        if (!is.data.frame(values$cpNewData)) return(NULL)
+        selectInput("cpContextcol", NULL, colnames(values$cpNewData), multiple = TRUE)
+      })
+ 
+      output$cpTagcolUI <- renderUI({
+        req(values$cpNewData)
+        if (!is.data.frame(values$cpNewData)) return(NULL)
+        selectInput("cpTagcol", NULL, colnames(values$cpNewData), multiple = TRUE)
+      })
+ 
+      output$cpCommcolUI <- renderUI({
+        req(values$cpNewData)
+        if (!is.data.frame(values$cpNewData)) return(NULL)
+        selectInput("cpCommcol", NULL, c("none", colnames(values$cpNewData)))
+      })
+ 
+      output$cp_spacyUI <- renderUI(
+        textInput("cp_use_spacy_model", NULL,
+                  modelnames$spacy_name[modelnames$short == input$cp_lang],
+                  placeholder = "(spacy model name)")
+      )
+ 
+      output$cp_spacyDlUI <- renderUI(HTML(paste(
+        "Model must be downloaded once, from python:
    ", + input$cp_which_python, "-m spacy download", + modelnames$spacy_name[modelnames$short == input$cp_lang], "

    " + ))) + + output$cp_ftUI <- renderUI( + textInput("cp_use_ft_model", NULL, + modelnames$fasttext_name[modelnames$short == input$cp_lang], + placeholder = "(fasttext model path)") + ) + + output$cp_ftDlUI <- renderUI(list(HTML(paste( + "Model can be downloaded here")), + br(), br())) + + output$cp_sbUI <- renderUI( + textInput("cp_use_sb_model", NULL, ifelse( + input$cp_lang %in% c("ar", "zh", "nl", "en", "fr", "de", + "it", "ko", "pl", "pt", "ru", "es", "tr"), + "distiluse-base-multilingual-cased-v1", + "distiluse-base-multilingual-cased-v2"), + placeholder = "(custom sentence_transformers model)") + ) + + + + ## Create new project, with modal dialog + observeEvent(input$createProject, { + if (verbose) + cat("DEBUG create project\n") + showModal(modalDialog( + title = "New project", + + p("Create a new text tagging project."), + p("This mainly involves choosing a dataset, and how to process it."), + p("Items marked by a star ", strong("*"), + " cannot be changed once the project is created."), + + hr(), + h4("Files and folders"), + + fluidRow( + column(3, p(strong("*Project name"))), + column(5, textInput("cpName", NULL, placeholder = "(Required)")), + column(4, p("Tags will be stored in a new directory by this name.")) + ), + fluidRow( + column(3, p(strong("*Data directory"))), + column(5, textInput("cpDatadir", NULL, placeholder = "(Required)")), + column(4, p("Place (on the server) where the data and project are stored")) + ), + fluidRow( + column(3, p(strong("*Data filename"))), + column(5, checkboxInput("cpDoImport", "New data", TRUE)), + column(4, p("Main file, containing id and text columns")) + ), + + conditionalPanel("!input.cpDoImport", list( + ## Existing data + fluidRow( + column(3, p(strong("Existing file"))), + column(5, textInput("cpDatafile", NULL, placeholder = "(Required)")), + column(4, actionButton("cpDatafileGo", "Import")) + ) + )), + conditionalPanel("input.cpDoImport", { + ## Import data + wellPanel( + selectInput('cpNDfiletype', NULL, + choices = c("Text file (.csv, .txt)" = "csv_txt", + "Feather" = "feather", + "Parquet" = "parquet", + "OpenDocument (.ods)" = "ods", + "Microsoft Excel (.xls, .xlsx)" = "excel", + "SPSS (.sav, .por)" = "spss", + "SAS (.sas7bdat)" = "sas_data", + "Stata v5-v12 (.dta)" = "stata"), + selected = "csv_txt" + ), + fileInput('cpNDfile', 'Choose File'), + + checkboxInput("cpNDparams", "File parameters"), + + conditionalPanel( + "input.cpNDparams", + conditionalPanel( + "input.cpNDfiletype == 'csv_txt'", + + fluidRow(column(4, p('Header')), + column(8, selectInput('cpCSVheader', NULL, + c("Header" = TRUE, + "No Header" = FALSE), + "Auto"))), + fluidRow(column(4, p('Separator')), + column(8, selectInput('cpCSVsep', NULL, + c("Comma ','" = ",", + "Semicolon ';'" = ";", + "Tab" = "\t", + "Space" = " "), + "Auto"))), + fluidRow(column(4, p('Quote')), + column(8, selectInput('cpCSVquote', NULL, + c("Double Quote \"" = "\"", + "Single Quote '" = "'", + "None" = ""), + "Double Quote \""))), + fluidRow(column(4, p('Decimal mark')), + column(8, selectInput('cpCSVdec', NULL, + c('Period "."' = ".", + 'Comma ","' = ","), + 'Period "."'))), + fluidRow(column(4, p('File Encoding')), + column(8, selectInput('cpCSVencoding', NULL, + c("unknown", "UTF-8", "Latin-1"), + "unknown")))), + + ## Options for ods files + conditionalPanel( + "input.cpNDfiletype == 'ods'", + fluidRow(column(4, p('Sheet')), + column(8, numericInput('cpODSsheet', NULL, 1, 1, + step = 1))), + checkboxInput("cpODScolnames", "Column names", TRUE), + fluidRow(column(4, p('NA 
symbol')), + column(8, textInput('cpODSna', NULL, value = ""))), + fluidRow(column(4, p('Skip rows')), + column(8, numericInput('cpODSskip', NULL, 0, 0, + step = 1))), + fluidRow(column(4, p('Specify range')), + column(8, textInput('cpODSrange', NULL, value = ""))) + ), + + ## Options for xls / xlsx files + conditionalPanel( + "input.cpNDfiletype == 'excel'", + fluidRow(column(4, p('Column names')), + column(8, selectInput('cpXLScolnames', NULL, + choices = c("Yes" = "TRUE", + "No" = "FALSE"), + selected = TRUE ))), + fluidRow(column(4, p('Skip rows')), + column(8, numericInput('cpXLSskip', NULL, 0, 0, + step = 1))), + fluidRow(column(4, p('Trim whitespaces')), + column(8, selectInput('cpXLStrim', NULL, + choices = c("Yes" = "TRUE", + "No" = "FALSE"), + selected = TRUE ))), + checkboxInput("cpXLSsheet", + "Specify Worksheet", FALSE), + conditionalPanel("input.cpXLSsheet", + textInput("cpXLSsheet_specs", NULL, "")), + checkboxInput("cpXLSrange", "Specify Range", FALSE), + conditionalPanel("input.cpXLSrange", + textInput("cpXLSrange_specs", NULL, "") + )), + + ## Options for SPSS files + conditionalPanel( + "input.cpNDfiletype == 'spss'", + fluidRow( + column(4, p('Skip rows')), + column(8, numericInput('cpSPSSskip', NULL, 0, 0, step = 1))), + checkboxInput("cpSPSSna", "User-defined NA", FALSE)), + + ## Options for SAS files + conditionalPanel( + "input.cpNDfiletype == 'sas_data'", + fluidRow( + column(4, p('Skip rows')), + column(8, numericInput("cpSASskip", NULL, 0, 0, step = 1))), + checkboxInput("cpSAScatalog", "Use catalog file", FALSE), + conditionalPanel( + "input.cpSAScatalog", + fileInput("cpSAScatalog_file", "Catalog file")) + ), + + ## Options for STATA dta files + conditionalPanel( + "input.cpNDfiletype == 'stata'", + checkboxInput("cpSTATAfactors", "Convert factors", FALSE)) + ) + ) + }), + + uiOutput("cpDataMessage"), + hr(), + h4("Data"), + + fluidRow( + column(6, numericInput( + "cpNrows", "N. 
rows for working sample", 500, 1, step = 1)), + column(6, numericInput( + "cpNskip", "Rows to skip before working sample", 0, 0, step = 1)) + ), + + fluidRow( + column(3, p(strong("*ID"))), + column(5, uiOutput("cpIdcolUI")), + column(4, p("Name of the id variable, unique identifier of each text")) + ), + fluidRow( + column(3, p(strong("*Text"))), + column(5, uiOutput("cpTextcolUI")), + column(4, p("Name of the text variables: if more than one, texts are concatenated in the specified order")) + ), + fluidRow( + column(3, p(strong("*Tags"))), + column(5, uiOutput("cpTagcolUI")), + column(4, p("(optional) Names of variables that are already tagged: each will create a new scheme")) + ), + fluidRow( + column(3, p(strong("Comments"))), + column(5, uiOutput("cpCommcolUI")), + column(4, p("(optional) Name of the comments variable")) + ), + fluidRow( + column(3, p(strong("Context"))), + column(5, uiOutput("cpContextcolUI")), + column(4, p("(optional) Names of variables not used in the models, but may be displayed during tagging")) + ), + + + hr(), + h4("System"), + + fluidRow( + column(3, checkboxInput("cp_use_python", "Python backend", FALSE)), + column(5, conditionalPanel( + "input.cp_use_python", + textInput("cp_which_python", NULL, value = "python3", + placeholder = "(custom python path)"))), + column(4, conditionalPanel( + "input.cp_use_python", + p("This must be a working python3 environment, with the required modules installed (see documentation)"))) + ), + conditionalPanel("input.cp_use_python", list( + fluidRow( + column(3, p("Language")), + column(5, selectInput("cp_lang", NULL, modelnames_labels, "en")), + column(4, p("Used to preset tokenization and embedding models")) + ), + fluidRow( + column(3, checkboxInput("cp_use_spacy", "SpaCy tokenization", FALSE)), + column(5, conditionalPanel("input.cp_use_spacy", uiOutput("cp_spacyUI"))), + column(4, p("Name of the spacy tokenizer model, used in DTM and word embeddings")) + ), + conditionalPanel("input.cp_use_spacy", fluidRow( + column(3), + column(9, uiOutput("cp_spacyDlUI"))) + ), + fluidRow( + column(3, checkboxInput("cp_use_ft", "FastText word embeddings", FALSE)), + column(5, conditionalPanel("input.cp_use_ft", uiOutput("cp_ftUI"))), + column(4, p("Path to the local fasttext model binary")) + ), + conditionalPanel("input.cp_use_ft", fluidRow( + column(3), + column(9, uiOutput("cp_ftDlUI")))), + fluidRow( + column(3, checkboxInput("cp_use_sb", "SBERT sentence embeddings", FALSE)), + column(5, conditionalPanel("input.cp_use_sb", uiOutput("cp_sbUI"))), + column(4, p("(GPU recommended) Name or path of the sentence-transformers model")) + ), + + conditionalPanel("input.cp_use_python", { + checkboxInput("cp_use_gpu", "GPU support (CUDA, for SBERT and BERT)", FALSE) + }) + )), + + uiOutput("cpConfirmMessage"), + footer = tagList(actionButton("cpConfirm", "Create"), + modalButton("Cancel")))) + }) + + observeEvent(input$cpDatafileGo, { + dataname <- input$cpDatafile + dataname <- gsub("[.]feather$", "", dataname) + tmpdat <- try(arrow::read_feather( + paste0(gsub("/+$", "/", paste0(input$cpDatadir, "/")), + dataname, ".feather"))) + if (inherits(tmpdat, "try-error")) { + values$cpDataMessage <- paste0( + "Data import error: ", as.character(tmpdat)) + values$cpNewData <- NULL + return(NULL) + } + if (!is.data.frame(tmpdat)) { + values$cpDataMessage <- "Data import error: object not a data.frame" + values$cpNewData <- NULL + return(NULL) + } + if (ncol(tmpdat) < 2) { + values$cpDataMessage <- "Data import error: only one column detected" + 
values$cpNewData <- NULL
+        return(NULL)
+      }
+ 
+      values$cpDataMessage <- paste0(
+        "Data import success: ", nrow(tmpdat), " rows, ", ncol(tmpdat), " cols.")
+      values$cpNewData <- tmpdat
+      values$cpNewDataname <- dataname
+ 
+    })
+ 
+    ## Data import mechanism
+    observeEvent(input$cpNDfile, {
+      req(input$cpNDfile)
+      if (input$cpNDfiletype == "csv_txt") {
+        res <- try(read.csv(
+          input$cpNDfile$datapath,
+          header = as.logical(input$cpCSVheader), sep = input$cpCSVsep,
+          quote = input$cpCSVquote, dec = input$cpCSVdec, stringsAsFactors = FALSE,
+          encoding = input$cpCSVencoding, check.names = TRUE))
+ 
+      } else if (input$cpNDfiletype == "feather") {
+        res <- try(arrow::read_feather(input$cpNDfile$datapath))
+ 
+      } else if (input$cpNDfiletype == "parquet") {
+        res <- try(arrow::read_parquet(input$cpNDfile$datapath))
+ 
+      } else if (input$cpNDfiletype == "ods") {
+        range <- input$cpODSrange
+        if (range == "") range <- NULL
+        res <- try(readODS::read_ods(
+          path = input$cpNDfile$datapath, sheet = input$cpODSsheet,
+          col_names = input$cpODScolnames, na = input$cpODSna,
+          skip = input$cpODSskip, range = range))
+ 
+      } else if (input$cpNDfiletype == "excel") {
+        column_names <- as.logical(input$cpXLScolnames)
+        trim_spaces <- as.logical(input$cpXLStrim)
+        the.range <- NULL
+        if (input$cpXLSrange && input$cpXLSrange_specs != "")
+          the.range <- input$cpXLSrange_specs
+        the.sheet <- NULL
+        if (input$cpXLSsheet && input$cpXLSsheet_specs != "")
+          the.sheet <- input$cpXLSsheet_specs
+        res <- try(data.frame(readxl::read_excel(
+          input$cpNDfile$datapath, col_names = column_names, range = the.range,
+          sheet = the.sheet, trim_ws = trim_spaces, skip = input$cpXLSskip)))
+ 
+      } else if (input$cpNDfiletype == "spss") {
+        res <- try(data.frame(haven::read_spss(
+          file = input$cpNDfile$datapath,
+          skip = input$cpSPSSskip, user_na = input$cpSPSSna)))
+        if (!inherits(res, "try-error")) {
+          res <- data.frame(lapply(res, function(x) {
+            attr(x, "format.spss") <- NULL
+            if ("haven_labelled" %in% class(x))
+              x <- haven::as_factor(x, levels = "labels")
+            x
+          }))
+        }
+ 
+      } else if (input$cpNDfiletype == "sas_data") {
+        res <- try(data.frame(haven::read_sas(
+          data_file = input$cpNDfile$datapath,
+          catalog_file = input$cpSAScatalog_file$datapath,
+          skip = input$cpSASskip)))
+ 
+      } else if (input$cpNDfiletype == "stata") {
+        res <- try(data.frame(foreign::read.dta(
+          file = input$cpNDfile$datapath, convert.factors = input$cpSTATAfactors)))
+      }
+ 
+      if (inherits(res, "try-error")) {
+        values$cpDataMessage <- paste0("Data import error: ", res)
+        values$cpNewData <- NULL
+        return(NULL)
+      }
+      if (ncol(res) < 2) {
+        values$cpDataMessage <- "Data import error: only one column detected"
+        values$cpNewData <- NULL
+        return(NULL)
+      }
+      values$cpDataMessage <- paste0(
+        "Data import success: ", nrow(res), " rows, ", ncol(res), " columns.")
+      values$cpNewData <- res
+    })
+ 
+    ## New project confirm
+    observeEvent(input$cpConfirm, {
+      if (verbose)
+        cat("DEBUG cpConfirm\n")
+      newconf <- list()
+ 
+      tmpdir <- gsub("/+$", "/", paste0(input$cpDatadir, "/"))
+      if (!dir.exists(tmpdir))
+        dir.create(tmpdir)
+      newconf$datadir <- tmpdir
+ 
+      tmpname <- cleanFileName(input$cpName)
+      if (!nchar(tmpname)) {
+        values$cpConfirmMessage <- "Project name is required"
+        return(NULL)
+      }
+      if (dir.exists(paste0(newconf$datadir, tmpname))) {
+        values$cpConfirmMessage <- paste0(
+          "A project with the same name (", tmpname,
+          ") already exists in the data directory.")
+        return(NULL)
+      }
+      newconf$projectid <- tmpname
+ 
+      if (!is.data.frame(values$cpNewData)) { +
values$cpConfirmMessage <- "Import data first" + return(NULL) + } + + if (input$cpDoImport) { + newconf$dataname <- newconf$projectid + } else + newconf$dataname <- values$cpNewDataname + + if (any(is.na(values$cpNewData[[input$cpIdcol]]))) { + values$cpConfirmMessage <- "ID variable contains missing values" + return(NULL) + } + if (any(duplicated(values$cpNewData[[input$cpIdcol]]))) { + values$cpConfirmMessage <- "ID variable contains duplicates" + return(NULL) + } + newconf$idcol <- input$cpIdcol + + if (!length(input$cpTextcol)) { + values$cpConfirmMessage <- "Select at least one text variable" + return(NULL) + } + newconf$textcols <- input$cpTextcol + + if (input$cpCommcol == "none") { + newconf$commcol <- paste0("comm_", newconf$projectid) + } else + newconf$commcol <- input$cpCommcol + + if (!length(input$cpTagcol)) { + newconf$tagcols <- "scheme0" + } else + newconf$tagcols <- input$cpTagcol + + newconf$contextcols <- input$cpContextcol + + newconf$dataNrows <- input$cpNrows + newconf$dataSkipRows <- input$cpNskip + + if (input$cp_use_python) { + pytest <- try(system(paste(input$cp_which_python, "--version"), + intern = TRUE)) + if (inherits(pytest, "try-error")) { + values$cpConfirmMessage <- paste( + "Python path `", input$cp_which_python, "` not valid") + return(NULL) + } + } + newconf$use_python <- input$cp_use_python + newconf$python <- input$cp_which_python + newconf$use_gpu <- input$cp_use_gpu + + if (input$cp_use_spacy) { + sptest <- system(paste0( + newconf$python, " -m spacy info ", input$cp_use_spacy_model), intern = TRUE) + if (length(sptest) == 0) { + values$cpConfirmMessage <- paste( + "Error loading spacy, check that it is installed in the specified python env") + return(NULL) + } + if (length(attr(sptest, "status"))) { + values$cpConfirmMessage <- paste( + "Error loading spacy model, check that it has been downloaded") + return(NULL) + } + } + newconf$use_spacy <- input$cp_use_spacy + newconf$use_spacy_model <- input$cp_use_spacy_model + + if (input$cp_use_ft) if (!file.exists(input$cp_use_ft_model)) { + values$cpConfirmMessage <- paste( + "Error loading fasttext model, check the specified path") + return(NULL) + } + newconf$use_ft <- input$cp_use_ft + newconf$use_ft_model <- input$cp_use_ft_model + + newconf$use_sb <- input$cp_use_sb + newconf$use_sb_model <- input$cp_use_sb_model + + writeLines(RJSONIO::toJSON(newconf), + paste0("tigger_", newconf$projectid, ".json")) + + values$confnames <- c(newconf$projectid, values$confnames) + values$conf <- newconf + + ## On new data import, write a feather file with the name of the project + if (input$cpDoImport) + arrow::write_feather(values$cpNewData, paste0( + newconf$datadir, newconf$dataname, ".feather" + )) + + removeModal() + }) + + ############################################################################## + ## Scheme management + ############################################################################## + + ## Print active scheme in scheme management panel + output$printScheme <- renderUI(h4(input$selectScheme)) + + ## Create new scheme on request, with modal dialog + observeEvent(input$createScheme, { + if (verbose) + cat("DEBUG create scheme\n") + showModal(modalDialog( + title = "Create tagging scheme", + p(paste0("This will create a new scheme for this project")), + + textInput("newScheme", label = NULL, placeholder = "Required: new scheme name"), + textAreaInput("newSchemeDescr", label = NULL, + placeholder = "Optional: scheme description"), + checkboxInput("schemeDuplicate", "Duplicate existing 
scheme:"), + conditionalPanel( + "input.schemeDuplicate", + selectInput("schemeDuplicateFrom", NULL, rev(values$conf$tagcols))), + + footer = tagList(actionButton("createSchemeConfirm", "Create"), + modalButton("Cancel")))) + }) + + observeEvent(input$createSchemeConfirm, { + if (is.null(ok.data())) return(NULL) + if (verbose) + cat("DEBUG create scheme event\n") + + ## Make sure new scheme name is a correct file name + if (!grepl("\\S", input$newScheme)) { + showNotification(type = "error", + "Cannot create scheme: name empty.") + return(NULL) + } + + ## Check that new scheme name doesn't exist + ## Re-read tagcols from conf file, to prevent scheme disappearances + tmpname <- cleanFileName(input$newScheme) + tagpath <- paste0(values$projectdir, tmpname, ".csv") + confpath <- paste0("tigger_", values$conf$projectid, ".json") + if (verbose) + cat("DEBUG create scheme check json:", confpath,"\n") + tmp_tagcols <- RJSONIO::fromJSON(confpath)$tagcols + if (verbose) + cat("DEBUG create scheme json OK\n") + if (tmpname %in% tmp_tagcols | file.exists(tagpath)) { + showNotification(type = "error", + "Cannot create scheme: name already exists.") + return(NULL) + } + + ## Create new scheme description + if (nchar(input$newSchemeDescr)) + writeLines(input$newSchemeDescr, + paste0(values$projectdir, tmpname, ".txt")) + + ## Update conf file and reactives + values$conf$tagcols <- c(tmp_tagcols, tmpname) + # values$confs[[values$conf$projectid]]$tagcols <- c(tmp_tagcols, tmpname) + writeLines(RJSONIO::toJSON(values$conf), confpath) + + if (input$schemeDuplicate) { + tmpmat <- matrix(values$tagmat[, input$schemeDuplicateFrom]) + tmpmathist <- matrix(values$tagmathist[, input$schemeDuplicateFrom]) + } else { + tmpmat <- matrix(rep(NA, nrow(ok.data()))) + tmpmathist <- matrix(rep(NA, nrow(ok.data()))) + } + colnames(tmpmat) <- tmpname + colnames(tmpmathist) <- tmpname + + values$tagmat <- cbind(values$tagmat, tmpmat) + values$tagmathist <- cbind(values$tagmat, tmpmathist) + + ## Create new tagging file if duplicating + if (input$schemeDuplicate) { + export <- data.frame(ok.data()[, values$conf$idcol], + values$tagmat[, tmpname]) + colnames(export) <- c(values$conf$idcol, tmpname) + export <- na.omit(export) + write.table(export, tagpath, sep = ",", qmethod = "double", + fileEncoding = "UTF-8", row.names = FALSE, + append = FALSE, col.names = TRUE) + } + + cat("DEBUG create scheme exit\n") + removeModal() + }) + + ## Button action: delete scheme + observeEvent(input$schemeDelete, { + if (verbose) + cat("DEBUG delete scheme\n") + if (length(values$conf$tagcols) < 2) { + showNotification(type = "error", + "Cannot delete: at least one scheme must remain.") + return(NULL) + } + showModal(modalDialog( + title = "Delete tagging scheme", + paste0("Confirm delete '", input$selectScheme, + "': ", sum(!is.na(values$label)), " tags will be deleted", + ", along with scheme description."), + footer = tagList(actionButton("schemeDeleteConfirm", "Delete"), + modalButton("Cancel")))) + }) + + observeEvent(input$schemeDeleteConfirm, { + file.remove(paste0(values$projectdir, input$selectScheme, ".csv")) + file.remove(paste0(values$projectdir, input$selectScheme, ".txt")) + file.remove(paste0(values$projectdir, input$selectScheme, "_regex.json")) + values$conf$tagcols <- + values$conf$tagcols[values$conf$tagcols != input$selectScheme] + writeLines(RJSONIO::toJSON(values$conf), + paste0("tigger_", input$selectProject, ".json")) + removeModal() + }) + + + observeEvent(input$schemeDescrSave, { + req(input$selectScheme) + 
writeLines(input$schemeDescr, + paste0(values$conf$datadir, values$conf$projectid, + "/", input$selectScheme, ".txt")) + }) + + + ############################################################################## + ############################################################################## + + ## Tag selection : populate + observe({ + values$conf$tagcols + updateSelectInput( + # session, "selectScheme", choices = c(rev(values$conf$tagcols), "")) + session, "selectScheme", choices = rev(values$conf$tagcols)) + }) + + + ## Populate oracle buttons + output$oracleButtons <- renderUI({ + if (is.null(ok.data())) return(NULL) + # if (is.null(values$label)) return(NULL) + if (is.null(values$uniqueLabels)) return(NULL) + if (verbose) + cat("DEBUG populate oracle buttons\n") + + do.call(fluidRow, lapply(values$uniqueLabels, function(ilab) + actionButton(paste0("oracle_", label2hash(ilab)), + strong(ilab)))) + }) + + ## Populate maxprob variable selector + observe({ + updateSelectInput(session, "maxprobWhich", choices = values$uniqueLabels) + updateSelectInput(session, "taggedWhich", + choices = c("All" = "all", values$uniqueLabels)) + }) + + ## Data panel message + observeEvent(ok.data(), { + if (verbose) + cat("DEBUG data panel message\n") + if (is.null(ok.data())) { + output$dataMessage <- renderUI(p("Importing data...")) + } + output$dataMessage <- NULL + }) + + ## On config change, update dataView and retag options + observeEvent(list(values$conf), { + + if (is.null(values$conf)) return(NULL) + output$panelData <- renderUI(list( + DT::dataTableOutput("dataView") + )) + + output$panelRetag <- renderUI(c( + list( + br(), + h4("Rename tags"), + fluidRow( + column(4, strong("Current")), + column(2, strong("Count")), + column(6, strong("New")) + ), + br()), + lapply(values$uniqueLabels, function(iLab) { + iHash <- label2hash(iLab) + fluidRow( + column(4, p(iLab)), + column(2, p(sum(values$label == iLab, na.rm = TRUE))), + column(4, textInput( + paste0("newtag_", iHash), NULL, + placeholder = "(new name, leave empty to untag)")), + column(2, actionButton(paste0("retag_", iHash), "Rename"))) + }) + )) + }) + + output$panelExtraRegex <- renderUI(list( + p(strong("Regex predictors")), + fluidRow( + column(8, textInput("extraRegexText", NULL, + placeholder = "(new regex)", width = "100%")), + column(4, actionButton( + "extraRegexSearch", strong("+"), width = "100%")) + ), + fluidRow( + style="overflow-y:scroll; max-height: 25vh", + fluidRow( + column(2, strong("")), + column(1, strong("Use")), + column(1, strong("Case")), + column(1, p(strong("N"))), + column(7, p(strong("Regex"))) + ), + + lapply(values$extraRegex, function(iRegex) { + iHash <- label2hash(iRegex) + tmp_cased <- values[[paste0("caseregex_", iHash)]] + if (is.null(tmp_cased)) tmp_cased <- FALSE + fluidRow( + column(2, actionButton(paste0("delregex_", iHash), "🗑️", width="100%")), + column(1, checkboxInput( + paste0("useregex_", iHash), NULL, + value = isolate(!isFALSE(values[[paste0("useregex_", iHash)]])))), + column(1, checkboxInput( + paste0("caseregex_", iHash), NULL, + value = isolate(isTRUE(values[[paste0("caseregex_", iHash)]])))), + column(1, br(), p(sum( + stringi::stri_detect_regex( + ok.nicetext(), iRegex, + opts_regex = list(case_insensitive = !tmp_cased)), + na.rm = TRUE))), + column(7, br(), p(iRegex)) + ) + }) + ) + + )) + + ## Process new regex + observeEvent(input$extraRegexSearch, { + if (is.null(ok.nicetext())) return(NULL) + req(nchar(input$extraRegexText) > 0) + if (input$extraRegexText %in% values$extraRegex) 
return(NULL) + try_regex <- try(stringi::stri_detect_regex( + ok.nicetext(), input$extraRegexText, + opts_regex = list(case_insensitive = TRUE))) + if (inherits(try_regex, "try-error")) return(NULL) + new_regex <- input$extraRegexText + new_hash <- label2hash(new_regex) + values$extraRegex <- c(new_regex, values$extraRegex) + names(values$extraRegex)[1] <- new_hash + if (verbose) + cat("DEBUG regex search", input$extraRegexText, "\n") + }) + + ## Regex predictors: populate delete and use events on first creation + useregex_trigger <- reactiveVal(0) + delregex_trigger <- reactiveVal(0) + caseregex_trigger <- reactiveVal(0) + observeEvent(list(values$extraRegex), { + if (!length(values$extraRegex)) return(NULL) + if (any(! values$extraRegex %in% values$extraRegexHist)) { + if (verbose) + cat("DEBUG regex predictor names updates\n") + tmpregex <- values$extraRegex[! values$extraRegex %in% values$extraRegexHist] + values$extraRegexHist <- c(tmpregex, values$extraRegexHist) + + c(lapply(1:length(tmpregex), function(iregex) { + observeEvent( + input[[paste0("delregex_", names(tmpregex)[iregex])]], { + if (verbose) + cat("DEBUG regex delete", names(tmpregex)[iregex], "\n") + values$extraRegex <- values$extraRegex[values$extraRegex != tmpregex[iregex]] + delregex_trigger(delregex_trigger() + 1) + }) + }), + lapply(1:length(tmpregex), function(iregex) { + tmpname <- paste0("useregex_", names(tmpregex)[iregex]) + observeEvent( + input[[tmpname]], { + values[[tmpname]] <- input[[tmpname]] + useregex_trigger(useregex_trigger() + 1) + }) + }), + lapply(1:length(tmpregex), function(iregex) { + tmpname <- paste0("caseregex_", names(tmpregex)[iregex]) + observeEvent( + input[[tmpname]], { + values[[tmpname]] <- input[[tmpname]] + caseregex_trigger(caseregex_trigger() + 1) + }) + }) + ) + } + }) + + observeEvent(list(delregex_trigger(), useregex_trigger(), caseregex_trigger()), { + req(ok.data(), input$selectScheme) + regex_all <- values$extraRegex + values$regex_inuse <- sapply(names(regex_all), function(iname) + values[[paste0("useregex_", iname)]]) + values$regex_case <- sapply(names(regex_all), function(iname) + values[[paste0("caseregex_", iname)]]) + if (verbose) if (any(values$regex_inuse)) + cat("DEBUG regex in use", paste0(collapse = " ", regex_all[values$regex_inuse]), "\n") + writeLines( + RJSONIO::toJSON(lapply(FUN = unname, list( + regex = regex_all, use = values$regex_inuse, case = values$regex_case))), + paste0(values$projectdir, input$selectScheme, "_regex.json")) + }) + + output$dataView <- DT::renderDataTable({ + # if (is.null(ok.text()) | ok.data.running()) return(NULL) + req(ok.text()) + if (verbose) + cat("DEBUG enter dataView\n") + if (verbose) + cat("DEBUG render dataView\n") + DT::datatable( + rownames = F, + data.frame( + row = 1:nrow(ok.data()), + id = ok.data()[, values$conf$idcol], + text = ok.text()), + options = list( + pageLength = 3, + columnDefs = list(list( + targets = 2, + render = DT::JS( + "function(data, type, row, meta) {", + "return type === 'display' && data.length > 200 ?", + "'' + data.substr(0, 200) + '...' : data;", + "}") + )) + ), + callback = DT::JS('table.page(0).draw(false);') + ) + }) + + ## Triggers for mass rename, created with each new label, with confirm + observeEvent(values$uniqueLabelsAll, { + if (is.null(values$uniqueLabelsAll)) return(NULL) + notthere <- unique(na.omit( + values$uniqueLabelsAll[! 
values$uniqueLabelsAll %in% values$retagLabels])) + lapply( + notthere, + function(iTag) { + iHash <- label2hash(iTag) + + observeEvent(input[[paste0("retag_", iHash)]], { + tmp_oldtag <- iTag + tmp_retagged <- which(values$label == tmp_oldtag) + tmp_newtag <- gsub( + "^\\s+|\\s+$", "", input[[paste0("newtag_", iHash)]]) + showModal(modalDialog( + title = "Mass retag", + paste0("Confirm retag: ", length(tmp_retagged), " ", + tmp_oldtag, " tags will be ", + ifelse(nchar(tmp_newtag), + paste("renamed to", tmp_newtag) , "deleted")), + footer = tagList(actionButton(paste0("retag_confirm_", iTag), "Retag"), + modalButton("Cancel")))) + }) + + observeEvent(input[[paste0("retag_confirm_", iTag)]], { + tmp_oldtag <- iTag + tmp_retagged <- which(values$label == tmp_oldtag) + tmp_newtag <- gsub( + "^\\s+|\\s+$", "", input[[paste0("newtag_", iHash)]]) + if (!nchar(tmp_newtag)) tmp_newtag <- NA + values$tagTrigger <- list( + id = ok.data()[[values$conf$idcol]][tmp_retagged], + tag = rep(tmp_newtag, length(tmp_retagged))) + values$queries <- c(values$queries, tmp_retagged) + + removeModal() + }) + + }) + values$retagLabels <- c(values$retagLabels, notthere) + }) + + + + ############################################################################## + ## UI rendering of Panel Tagging / Visualization + ############################################################################## + + #################### + ## Tagging left panel + + + # ## Trigger from Oops + # observeEvent(input$oops, { + # if (is.null(ok.data())) + # return(NULL) + # if (verbose) + # cat("DEBUG enter Oops\n") + # if (!is.null(values$retagged)) if (values$queries[length(values$queries)] == values$retagged[length(values$retagged)]) + # values$retagged <- values$retagged[-length(values$retagged)] + # values$label[values$queries[length(values$queries)]] <- NA + # if (ok.commentCol() != "(None)") + # values$comment[values$queries[length(values$queries)]] <- ok.data()[values$queries[length(values$queries)], ok.commentCol()] + # values$queries <- values$queries[-length(values$queries)] + # ## tagTrigger ? + # + # ready2tag(0) + # queryNext(queryNext() + 1) + # }) + + + ## Tagging triggers for Oracle buttons, created with each new label + observeEvent(values$uniqueLabelsAll, { + if (is.null(values$label)) return(NULL) + if (is.null(values$uniqueLabelsAll)) return(NULL) + if (all(is.na(values$label))) return(NULL) + notthere <- unique(na.omit( + values$uniqueLabelsAll[! 
values$uniqueLabelsAll %in% values$oracleLabels] + )) + if (!length(notthere)) return(NULL) + if (verbose) + cat("DEBUG triggers for oracle buttons\n") + lapply( + notthere, + function(iLab) { + observeEvent( + input[[paste0("oracle_", label2hash(iLab))]], { + if (is.null(ok.text())) return(NULL) + if (is.null(values$newQuery)) return(NULL) + if (!ready2tag()) {cat("DEBUG not ready to tag in Button\n"); return(NULL)} + # ready2tag(0) + # if (input$allowRetag | sum(is.na(values$label)) == 0) + if (!is.na(values$label[values$lastQuery])) + values$retagged <- c(values$retagged, values$lastQuery) + # values$label[values$lastQuery] <- iLab + values$comment[values$lastQuery] <- input$currentComment + # values$queries <- c(values$queries, values$lastQuery) + values$tagTrigger <- list( + id = ok.data()[values$lastQuery, values$conf$idcol], + tag = iLab, + comment = input$currentComment) + }) + }) + values$oracleLabels <- c(values$oracleLabels, notthere) + }) + + ## Trigger from Oracle Create value button + observeEvent(input$currentAction, { + if (is.null(ok.data())) return(NULL) + if (is.null(values$newQuery)) return(NULL) + if (verbose) + cat("DEBUG enter create value\n") + # if (input$allowRetag | sum(is.na(values$label)) == 0) + if (!is.na(values$label[values$lastQuery])) + values$retagged <- c(values$retagged, values$lastQuery) + if (input$newLab == "") { + return(NULL) + } else { + # values$label[values$lastQuery] <- input$newLab + updateTextInput(session, "newLab", value= "") + } + values$comment[values$lastQuery] <- input$currentComment + # values$queries <- c(values$queries, values$lastQuery) + values$tagTrigger <- list( + id = ok.data()[values$lastQuery, values$conf$idcol], + tag = input$newLab, + comment = input$currentComment) + }) + + ## Trigger from Oracle Confirm button + observeEvent(input$oracleConfirm, { + req(ok.data(), ok.train(), values$newQuery) + # if (is.null(ok.data())) return(NULL) + # if (is.null(values$newQuery)) return(NULL) + if (!ready2tag()) {cat("DEBUG not ready to tag in OC\n"); return(NULL)} + # ready2tag(0) + if (!ok.train.possible()) return(NULL) + if (verbose) + cat("DEBUG oracle confirm\n") + # if (input$allowRetag | sum(is.na(values$label)) == 0) + if (!is.na(values$label[values$lastQuery])) + values$retagged <- c(values$retagged, values$lastQuery) + # values$label[values$lastQuery] <- ok.predlab1()[values$lastQuery] + # values$queries <- c(values$queries, values$lastQuery) + values$comment[values$lastQuery] <- input$currentComment + values$tagTrigger <- list( + id = ok.data()[values$lastQuery, values$conf$idcol], + tag = ok.predlab1()[values$lastQuery], + comment = input$currentComment) + }) + + ## Trigger for CV glm (async) + glmCV_nclicks <- reactiveVal(0) + observeEvent(input$glmCV, { + if (is.null(ok.predictor())) return(NULL) + if (glmCV_nclicks() != 0) { + showNotification("LASSO CV already running") + return(NULL) + } + glmCV_nclicks(1) + glmCV_future <- reactiveVal() + + tmpvalues <- values$label + tmppred <- ok.predictor() + for (ival in na.omit(unique(tmpvalues))) + if (sum(tmpvalues == ival, na.rm = T) <= 8) + tmpvalues[which(tmpvalues == ival)] <- NA + glmCV_async <- future(seed = TRUE, { + glmnet::cv.glmnet( + x= tmppred[!is.na(tmpvalues), ], + y= as.factor(tmpvalues[!is.na(tmpvalues)]), + family= "multinomial") + }) %...>% glmCV_future() + + glmCV_async <- catch(glmCV_async, function(e) { + showNotification(type = "error", paste0("LASSO CV error: ", e)) + }) + glmCV_async <- finally(glmCV_async, function() { + glmCV_nclicks(0) + if 
(is.null(glmCV_future())) return(NULL)
+    updateNumericInput(session, "glmLambda",
+                       value = glmCV_future()$lambda.min)
+  })
+  NULL
+ })
+
+ ## Trigger for CV liblinear (async)
+ liblinCV_nclicks <- reactiveVal(0)
+ observeEvent(input$liblinCV, {
+  if (is.null(ok.predictor())) return(NULL)
+  if (liblinCV_nclicks() != 0) {
+    showNotification("LibLineaR CV already running")
+    return(NULL)
+  }
+  liblinCV_nclicks(1)
+  liblinCV_future <- reactiveVal()
+
+  tmpvalues <- values$label
+  tmppred <- ok.predictor()
+  for (ival in na.omit(unique(tmpvalues)))
+    if (sum(tmpvalues == ival, na.rm = T) <= 3)
+      tmpvalues[which(tmpvalues == ival)] <- NA
+
+  liblinCV_async <- future(seed = TRUE, {
+    LiblineaR::LiblineaR(
+      data= tmppred[!is.na(tmpvalues), ],
+      target= as.factor(tmpvalues[!is.na(tmpvalues)]),
+      type= 0, findC= T)
+  }) %...>% liblinCV_future()
+  liblinCV_async <- catch(liblinCV_async, function(e) {
+    showNotification(type = "error", paste0("LibLineaR CV error: ", e))
+  })
+  liblinCV_async <- finally(liblinCV_async, function() {
+    liblinCV_nclicks(0)
+    if (is.null(liblinCV_future())) return(NULL)
+    updateNumericInput(session, "liblinCost", value= liblinCV_future())
+  })
+  NULL
+ })
+
+
+ output$currentContext <- renderUI({
+  if (verbose)
+    cat("DEBUG currentContext\n")
+  if ((! is.null(ok.text())) & (length(ok.contextCol()) > 0))
+    HTML(paste(paste(ok.contextCol(),
+                     as.character(ok.data()[values$newQuery, ok.contextCol()]),
+                     sep = " : "),
+               collapse = "<br/><br/>"))
+ })
+
+ output$makeOracleConfirm <- renderUI({
+  if (verbose)
+    cat("DEBUG update printPredicted\n")
+  if (is.null(ok.pred())) {
+    tmp <- "No prediction"
+  } else if (is.null(values$newQuery)) {
+    tmp <- "No query"
+  } else {
+    tmp <- paste0(
+      ifelse(is.na(values$label[values$newQuery]), "", "Pred: "),
+      ok.predlab1()[values$newQuery],
+      " (", round(max(ok.pred()[values$newQuery, ]), 3), ")")
+  }
+  res <- list(actionButton("oracleConfirm", label= paste("🤖", tmp), width = "100%"))
+  if (!is.null(values$label) & !is.null(values$newQuery)) {
+    if (!is.na(values$label[values$newQuery]))
+      res <- list(res[[1]], HTML(paste0(
+        "<br/>Current: ", values$label[values$newQuery], "<br/>")))
+  }
+  res
+ })
+
+
+ ## Show training diagnostics on new model train
+ output$trainDiagno <-
+  renderText(paste0(ok.msgtagged(), "\n", values$modelDiagno))
+
+ ## Diagnostics event
+ observeEvent(diagTrigger(), {
+  if (verbose)
+    cat("DEBUG update modelDiagno\n")
+  if (is.null(ok.data())) {
+    values$modelDiagno <- "No model"
+    return(NULL)
+  }
+  if (!ok.train.possible()) {
+    values$modelDiagno <- "No model (not enough labelled data)."
+    return(NULL)
+  }
+
+  if (input$predModel == "random forest") {
+    tmp <- paste0(
+      "Train Accuracy: ",
+      round(100 * mean(ok.predlab1()[!is.na(values$label)] ==
+                         values$label[!is.na(values$label)]), 1),
+      "% ; Wtd. F1: ",
+      sprintf("%.3f", wtdF1(values$label[!is.na(values$label)],
+                            ok.predlab1()[!is.na(values$label)])),
+      "\nOOB Accuracy : ",
+      round(100 * mean(
+        colnames(ok.train()$predictions)[apply(ok.train()$predictions, 1, which.max)] ==
+          values$label[!is.na(values$label)]), 1),
+      "% ; Wtd. F1: ",
+      sprintf("%.3f", wtdF1(
+        values$label[!is.na(values$label)],
+        colnames(ok.train()$predictions)[apply(ok.train()$predictions, 1, which.max)]))
+    )
+  } else if (input$predModel %in% c("lasso", "linear", "naive bayes", "knn")) {
+    tmp <- paste0(
+      "Train Accuracy: ",
+      round(100 * mean(ok.predlab1()[!is.na(values$label)] ==
+                         values$label[!is.na(values$label)]), 1),
+      "% ; Wtd. F1: ",
+      sprintf("%.3f", wtdF1(
+        values$label[!is.na(values$label)],
+        ok.predlab1()[!is.na(values$label)])))
+  }
+  values$modelDiagno <- tmp
+ })
+
+ ## Update comment input field
+ observeEvent(values$newQuery, {
+  if (is.null(ok.text())) return(NULL)
+  if (verbose)
+    cat("DEBUG update comment field\n")
+  updateTextInput(session, "currentComment",
+                  value = values$comment[values$newQuery])
+ })
+
+
+ ## Save event: tag + comment in dedicated csv
+ observeEvent(values$tagTrigger, {
+  if (is.null(values$tagTrigger)) return(NULL)
+  if (verbose)
+    cat("DEBUG enter event tagTrigger", length(values$tagTrigger$id), "\n")
+  ready2tag(0) # Prevent new tagging until next query
+  dapath <- paste0(values$projectdir, ok.labelcol(), ".csv")
+  darows <- match(values$tagTrigger$id, ok.data()[, values$conf$idcol])
+
+  export <- data.frame(values$tagTrigger$id, values$tagTrigger$tag)
+  colnames(export) <- c(values$conf$idcol, ok.labelcol())
+  if (verbose)
+    cat("DEBUG tagTrigger tag",
+        paste(export[, 1], export[, 2], collapse = " ; "), "\n")
+  write.table(export, dapath, sep = ",", qmethod = "double",
+              fileEncoding = "UTF-8", row.names = F,
+              append = ifelse(file.exists(dapath), T, F),
+              col.names = ifelse(file.exists(dapath), F, T))
+
+  if (any(nchar(values$tagTrigger$comment))) {
+    tmp_filter <- which(nchar(values$tagTrigger$comment) > 0)
+    dapath <- paste0(values$projectdir, values$conf$commcol, ".csv")
+    export <- data.frame(values$tagTrigger$id[tmp_filter],
+                         values$tagTrigger$comment[tmp_filter])
+    colnames(export) <- c(values$conf$idcol, values$conf$commcol)
+    if (verbose)
+      cat("DEBUG tagTrigger comment", export[1, 1], export[1, 2], "\n")
+    write.table(export, dapath, sep = ",", qmethod = "double",
+                fileEncoding = "UTF-8", row.names = F,
+                append = ifelse(file.exists(dapath), T, F),
+                col.names = ifelse(file.exists(dapath), F, T))
+  }
+
+  ## Update label hist
+  values$labelhist[match(values$tagTrigger$id, ok.data()[, values$conf$idcol])] <-
+    max(values$labelhist, na.rm = T) + 1
+  ## Update wasTagged and wasRetagged
+  itag <- match(values$tagTrigger$id, ok.data()[, values$conf$idcol])
+  if (input$sampleChoice == "untagged") {
+    values$wasTagged[itag] <- TRUE
+  } else if
(input$sampleChoice == "tagged") { + values$wasRetagged[itag] <- TRUE + } else if (input$sampleChoice == "all") { + if (is.na(values$label[itag])) { + values$wasTagged[itag] <- TRUE + } else + values$wasRetagged[itag] <- TRUE + } + + ## Call queryNext event, or launch training + if (trainCountdown() == 1) { + trainCountdown(input$trainCountdown) + trainTrigger(trainTrigger() + 1) + } else if (trainCountdown() > 1) { + trainCountdown(trainCountdown() - 1) + queryNext(queryNext() + 1) + } else { + queryNext(queryNext() + 1) + } + + ## Update values$label etc + values$label[darows] <- values$tagTrigger$tag + values$queries <- c(values$queries, darows) + + if (verbose) + cat("DEBUG exit event tagTrigger", nrow(export), "rows\n") + }) + + + #################### + ## Tagging main panel + + output$textVisuCols <- renderUI({ + if (verbose) + cat("DEBUG render text/visu panel\n") + leftsize <- 6 + if (!input$panelText) leftsize <- 1 + if (input$panelText & !input$panelVisu) leftsize <- 11 + + fluidRow( + column( + leftsize, + conditionalPanel( + "input.panelText", + wellPanel(htmlOutput("currentText")) + ) + ), + column( + 12 - leftsize, + conditionalPanel( + "input.panelVisu", + fluidRow( + column(2, HTML(paste0('
    ', + actionButton("visuCompute", "🎲"), + '
    '))), + column(2, HTML(paste0('
    ', + actionButton("visuGoZoom", "🔍±"), + '
    '))), + column(2, HTML(paste0('
    ', + checkboxInput("visuLock", "🔒", value = FALSE), + '
    '))), + column(2, HTML(paste0('
    ', + checkboxInput("visuOptions", "🔧"), + '
    '))), + column(3, HTML(paste0('
    ', + numericInput("visuHeight", NULL, 400, 20, 2e12, 10), + '
    '))) + ), + conditionalPanel( + 'input.visuOptions', + flowLayout( + selectInput( + "visuAlgo", "Visu. method", c("umap", "t-sne"), "umap", width = "100%"), + selectizeInput( + "visuSource", "Visu. source", + c("Word counts (DFM)" = "dfm", + "Word embeddings (FastText)" = "ft", + "Sentence embeddings (SBERT)" = "sb", + "BERT predictions" = "bertpred", + "Regex" = "regex"), + c("sb", "ft"), multiple = TRUE) + ) + ), + conditionalPanel( # umap options + 'input.visuAlgo == "umap" & input.visuOptions', + flowLayout( + numericInput( + "umapNeighb", "Nb. Neighbors", 15, 1, 500, 1), + numericInput( + "umapSpread", "Spread", 1, 0, 16, 1e-2), + numericInput( + "umapMindist", "Min dist.", .001, 0, 16, 1e-3), + selectInput( + "umapMetric", "Metric", selected = "cosine", + c("euclidean", "cosine", "manhattan", "hamming", + "correlation", "categorical") + ) + ) + ), + conditionalPanel( # t-SNE options + 'input.visuAlgo == "t-sne" & input.visuOptions', + flowLayout( + numericInput( + "tsnePerplex", "Perplexity", 15, 1, 500, 1), + numericInput( + "tsneTheta", "Theta", .5, 0, 1, 1e-2), + numericInput( + "tsnePcaDim", "PCA dims", 50, 1, 100, 1) + ) + ), + uiOutput("visuMsg"), + plotOutput( + "visuPlot", width = "100%", + dblclick = "visuDblclick", + brush = brushOpts("visuBrush", resetOnNew = T)) + ) + ) + ) + }) + + ################## + ## Subpanel "Text" + ################## + + output$currentText <- renderUI({ + if (verbose) + cat("DEBUG render currentText\n") + if (is.null(ok.data()) | is.null(ok.nicetext())) + return(HTML("Import data to start training.")) + + if (input$strategy %in% c("entropy", "maxprob") & is.null(ok.train())) + return(HTML("No model: train one, or try 'random' or 'sequential' strategy")) + + if (is.null(values$newQuery)) + return(HTML("No query")) + + bastext <- ok.nicetext()[values$newQuery] + if (nchar(input$regexFilter) > 0) + bastext <- gsub(paste0("(", isolate(input$regexFilter), ")"), + "\\1", bastext, ignore.case = isolate(!input$regexCaseSens)) + baslen <- nchar(bastext) + if (!length(baslen)) { + if (nchar(input$regexFilter) > 0) + return(HTML("Regex filter: no match")) + return(HTML("No text matches the filters")) + } + if (is.na(baslen)) + return(HTML("No query")) + if (baslen > 1200) { + bastext <- paste0(substring(bastext, 1, 1200), '', + substring(bastext, 1201), "") + } + bastext <- gsub("\n", "
    ", bastext) + HTML(bastext) + }) + + + + ############################################################################## + ## Panel "Visualize" + ############################################################################## + + output$visuMsg <- renderUI({ + if (!is.null(values$visuMsg)) + # strong(values$visuMsg) + h5(values$visuMsg, style = "text-align:center") + }) + + ## Compute visualization points (async) + visu_nclicks <- reactiveVal(0) + ok.visu <- reactiveVal() + observe({ + if (!input$panelVisu) + values$visuGo <- FALSE + if (input$panelVisu & !values$visuGo) + values$visuGo <- TRUE + }) + observeEvent(list(values$visuGo, input$visuCompute), { + if (is.null(input$visuSource)) return(NULL) + if (!values$visuGo) return(NULL) + if (is.null(ok.data())) { + values$visuMsg <- "No data imported." + return(NULL) + } + + if (visu_nclicks() > 0) { + values$visuMsg <- "Already computing visualization" + return(NULL) + } + visu_nclicks(1) + + values$visuMsg <- "Computing visualization..." + + ## Prepare options and objects for async computation + dasources <- input$visuSource + daalgo <- input$visuAlgo + + if (verbose) + cat("DEBUG compute visu embedding: ", daalgo, paste(dasources, collapse = " "), "\n") + + visudat <- NULL + if ("dfm" %in% dasources) dadfm <- ok.dfm() + if ("ft" %in% dasources) { + visudat <- ok.data()[, values$cols_ft] + } + if ("sb" %in% dasources) { + if (is.null(visudat)) { + visudat <- ok.data()[, values$cols_sb] + } else + visudat <- cbind(visudat, ok.data()[, values$cols_sb]) + } + if ("bertpred" %in% dasources) { + if (is.null(visudat)) { + visudat <- ok.data()[, do.call(c, values$cols_bertpred)] + } else + visudat <- cbind(visudat, ok.data()[, do.call(c, values$cols_bertpred)]) + } + if ("regex" %in% dasources) { + if (is.null(visudat)) { + visudat <- ok.extraRegex.matrix() + } else + visudat <- cbind(visudat, ok.extraRegex.matrix()) + } + if (daalgo == "umap") { + visu_args <- list( + n_neighbors = input$umapNeighb, metric = input$umapMetric, + spread = input$umapSpread, min_dist = input$umapMindist) + } else if (daalgo == "t-sne") { + visu_args <- list( + perplexity = input$tsnePerplex, theta = input$tsneTheta, + initial_dims = input$tsnePcaDim, check_duplicates = FALSE) + } + + visu_async <- future(seed = TRUE, { + if ("dfm" %in% dasources) { + ## TODO: CA treatment buggy, switch to SVD? + # da_ca <- quanteda.textmodels::textmodel_ca( + # quanteda::as.dfm(dadfm), nd = min(100, nrow(dadfm))) + # da_ca <- (da_ca$rowcoord %*% diag(da_ca$sv))[, !is.na(da_ca$sv)] + da_ca <- as.matrix(dadfm) + if (is.null(visudat)) { + visudat <- da_ca + } else + visudat <- cbind(da_ca, visudat) + } + + if (daalgo == "umap") { + do.call(uwot::umap, c(list(visudat), visu_args)) + } else if (daalgo == "t-sne") { + visudat_dupli <- duplicated(visudat) + if (any(visudat_dupli)) + visudat[visudat_dupli, ] <- jitter(visudat[visudat_dupli, ]) + do.call(Rtsne::Rtsne, c(list(visudat), visu_args))$Y + } + }) %...>% ok.visu() + + visu_async <- catch(visu_async, function(error) { + values$visuMsg <- paste0("Error: ", error) + }) + visu_async <- finally(visu_async, function() { + visu_nclicks(0) + if (input$visuGoZoom == 0) { + values$visuMsg <- "Double-click to select, drag to select zooming zone." 
+ } else + values$visuMsg <- NULL + + nice_choices <- c("FastText" = "ft", "SBERT" = "sb", "visu" = daalgo) + names(nice_choices)[3] <- ifelse(daalgo == "t-sne", "t-SNE", "UMAP") + updateSelectInput( + session, "dlEmbedSelect", choices = nice_choices, selected = daalgo) + + if (verbose) + cat("DEBUG Visu async out\n") + }) + + NULL + }) + + ## Compute visualization plot (async) + ok.ggvisu <- reactiveVal() + observe({ + # if (is.null(ok.visu())) return(NULL) + if (is.null(ok.visu())) { + ok.ggvisu(NULL) + return(NULL) + } + # if (!isTRUE(values$visugo)) return(NULL) + if (!input$panelVisu) return(NULL) + + if (verbose) + cat("DEBUG compute visu plot\n") + ggdat <- data.frame(ok.visu(), tag = values$label) + if (!is.null(ok.predlab1())) + ggdat$tag[is.na(values$label)] <- ok.predlab1()[is.na(values$label)] + da_predlabs <- ok.predlab1() + da_tagged <- ! is.na(values$label) + da_query <- values$newQuery + + ggvisu_async <- future({ + ggres <- ggplot(ggdat, aes(X1, X2)) + theme_bw() + + xlab("") + ylab("") + + theme(legend.position = "bottom") + + scale_shape_manual(values = rep(1:8, 40)) # TODO : adapt to tags + + ## Tagged points + if (any(da_tagged)) { + ggres <- ggres + + geom_point(data = ggdat[da_tagged, ], + aes(X1, X2, color = tag, shape = tag), + size = 1.5, stroke = 1.5, alpha = .8) + } + + ## Not tagged points + if (!is.null(da_predlabs)) { + ggres <- ggres + + geom_point(data = ggdat[!da_tagged, ], + aes(color = tag, shape = tag), show.legend = FALSE) + } else { + ggres <- ggres + + geom_point(alpha = .8, data = ggdat[is.na(ggdat$tag), ]) + } + + ## Current query + if (!is.null(da_query)) { + if (is.null(da_predlabs)) { + ggres <- ggres + + geom_point(data = ggdat[da_query, ], color = 2, + stroke = 3, shape = 3, show.legend = FALSE) + } else { + ggres <- ggres + + geom_point(data = ggdat[da_query, ], + stroke = 3, shape = 3, show.legend = FALSE) + } + } + ggres + }) %...>% ok.ggvisu() + + ggvisu_async <- catch(ggvisu_async, function(error) { + values$visuMsg <- paste0("Error in ggplot: ", error) + }) + + }) + + observe({ + if (verbose) + cat("DEBUG update visuPlot\n") + if (is.null(ok.ggvisu())) { + values$visuMsg <- "No visualization yet." + output$visuPlot <- NULL + values$visuZoom$xlim <- NULL + values$visuZoom$ylim <- NULL + return(NULL) + } + output$visuPlot <- renderPlot(height = input$visuHeight, { + # if (is.null(ok.ggvisu())) return(NULL) + ok.ggvisu() + + coord_fixed(xlim = values$visuZoom$xlim, ylim = values$visuZoom$ylim) + }) + }) + + observeEvent(input$visuDblclick, { + if (!is.null(values$visuMsg)) + values$visuMsg <- NULL + tmp <- input$visuDblclick + bmu <- t(ok.visu()) - c(tmp$x, tmp$y) + bmu <- which.min(colSums(bmu^2)) + values$lastQuery <- bmu + values$newQuery <- bmu + ready2tag(1) + }) + + observeEvent(input$visuGoZoom, { + if (!is.null(values$visuMsg)) + values$visuMsg <- NULL + values$visuZoom$xlim <- c(input$visuBrush$xmin, input$visuBrush$xmax) + values$visuZoom$ylim <- c(input$visuBrush$ymin, input$visuBrush$ymax) + }) + + + ############################################################################## + ## Panel History + ############################################################################## + + ## Create history link and retag events as they appear + observeEvent(input$histDTable_rows_current, { + histrows <- order(values$labelhist) + histrows <- rev(histrows[!is.na(values$labelhist[histrows])]) + histrows <- histrows[!is.na(values$label[histrows])] + histrows <- histrows[input$histDTable_rows_current] + notthere <- histrows[! 
histrows %in% values$histQueries] + + if (!length(notthere)) return(NULL) + lapply(notthere, function(daquery) { + if (verbose) cat("DEBUG create history events", daquery, "\n") + observeEvent(input[[paste0("hist", daquery)]], { + if (verbose) cat("DEBUG history link", daquery, "\n") + values$lastQuery <- daquery + values$newQuery <- daquery + updateTabsetPanel(session, "mainPanelset", selected = "Tagging") + ready2tag(1) + }) + + observeEvent(input[[paste0("histTag", daquery)]], { + if (verbose) cat("DEBUG history retag", daquery, "\n") + values$histSaveStack <- rbind( + values$histSaveStack, + data.frame(id = ok.data()[daquery, values$conf$idcol], + tag = input[[paste0("histTag", daquery)]], + comment = values$comment[daquery]) + ) + }) + }) + values$histQueries <- c(values$histQueries, notthere) + }) + + ## History save action + observeEvent(input$histSave, { + req(is.data.frame(values$histSaveStack)) + values$tagTrigger <- as.list(values$histSaveStack) + values$label[match(values$histSaveStack$id, ok.data()[[values$conf$idcol]])] <- + values$histSaveStack$tag + values$histSaveStack <- NULL + }) + + ## Update save name with number of unsaved changes + observeEvent(length(values$histSaveStack$id), { + updateActionButton( + session, "histSave", + label = paste0("Save changes (", length(unique(values$histSaveStack$id)),")")) + }) + + ## History table, with clickable links and retagging + output$histDTable <- DT::renderDataTable({ + histrows <- order(values$labelhist) + histrows <- rev(histrows[!is.na(values$labelhist[histrows])]) + histrows <- histrows[!is.na(values$label[histrows])] + + # histrows <- rev(values$queries) + hist_df <- data.frame( + # history = length(histrows):1, + history = values$labelhist[histrows], + id = sapply(histrows, function(irow) { + as.character(actionLink( + paste0("hist", irow), + label = ok.data()[irow, values$conf$idcol], + onclick = paste0('Shiny.onInputChange(\"hist', irow, '\", this.id)'))) + }), + Tag = values$label[histrows], + Retag = sapply(histrows, function(irow) { + paste0( + "" + ) + }), + Comment = values$comment[histrows], + Text = ok.nicetext()[histrows]) + if (!is.null(values$conf$contextcols)) + hist_df <- cbind(hist_df, + ok.data()[histrows, values$conf$contextcols]) + + DT::datatable(hist_df, escape = FALSE, rownames = FALSE, filter = "top", + options = list( + columnDefs = list(list( + targets = 5, + render = DT::JS( + "function(data, type, row, meta) {", + "return type === 'display' && data.length > 200 ?", + "'' + data.substr(0, 200) + '...' 
: data;", + "}"))) + )) + }) + + ############################################################################## + ## Panel "Stats" + ############################################################################## + + output$statsTagTable <- renderTable(rownames = F, { + table(tag = values$label) + }) + + + ## Model diagnostics by CV + output$statsCVoutput <- reactive({ + if (is.null(values$modelcv)) + return("Model diagnostics will appear here") + values$modelcv + }) + + output$statsCVtable <- DT::renderDataTable({ + if (is.null(values$cvtable)) + return(NULL) + DT::datatable(values$cvtable, list(pageLength = 25), rownames = F) + }) + + ## Button action: compute CV (async) + cv_nclicks <- reactiveVal(0) + observeEvent(input$statsCVgo, { + if (is.null(ok.train())) { + values$modelcv <- "Error in model training" + return(NULL) + } + if (cv_nclicks() != 0) { + values$modelcv <- "CV already running" + return(NULL) + } + cv_nclicks(cv_nclicks() + 1) + + predmat <- ok.predictor()[!is.na(values$label), ] + predlab <- values$label[!is.na(values$label)] + cvsamp <- sample(10, nrow(predmat), replace = TRUE) + cvres <- rep(NA, nrow(predmat)) + cvres_future <- reactiveVal() + + tmp_model <- input$predModel + if (tmp_model == "random forest") { + cv_args <- list( + num.trees = input$rfNumTrees, mtry = if (input$rfMtry > 0) input$rfMtry, + sample.fraction = input$rfSampleFrac, probability = FALSE) + } else if (tmp_model == "linear") { + cv_args <- list(type= 0, cost = input$liblinCost) + } else if (tmp_model == "lasso") { + cv_args <- input$glmLambda + } else if (tmp_model == "naive bayes") { + cv_args <- list(smooth = input$naiveSmooth, prior = input$naivePrior, + distribution = input$naiveDistri) + } else if (tmp_model == "knn") { + cv_args <- list(k = input$knnK) + } + + cv_status_file <- paste0(session$token, "_cv") + + cv_async <- future(seed = TRUE, { + for (icv in 1:10) { + set_status(cv_status_file, paste0("Computing CV fold ", icv, " / 10")) + if (tmp_model == "random forest") { + cvmodel <- do.call(ranger::ranger, c(cv_args, list( + x= predmat[cvsamp != icv, ], + y= as.factor(predlab[cvsamp != icv])))) + cvres[cvsamp == icv] <- as.character( + predict(cvmodel, data= predmat[cvsamp == icv, ])$predictions) + } else { + tmpvalues <- predlab[cvsamp != icv] + # for (ival in na.omit(unique(tmpvalues))) + # if (sum(tmpvalues == ival, na.rm = T) == 1) + # tmpvalues[which(tmpvalues == ival)] <- NA + + if (tmp_model == "lasso") { + cvmodel <- suppressWarnings( + glmnet::glmnet(x= predmat[cvsamp != icv, ][!is.na(tmpvalues), ], + y= as.factor(tmpvalues[!is.na(tmpvalues)]), + family= "multinomial")) + cvres[cvsamp == icv] <- predict( + cvmodel, newx = predmat[cvsamp == icv, ], + type = "class", s= cv_args) + } else if (tmp_model == "linear") { + cvmodel <- do.call(LiblineaR::LiblineaR, c( + list(data= predmat[cvsamp != icv, ][!is.na(tmpvalues), ], + target= as.factor(tmpvalues[!is.na(tmpvalues)])), + cv_args)) + cvres[cvsamp == icv] <- as.character( + predict(cvmodel, newx= predmat[cvsamp == icv, ])$predictions) + } else if (tmp_model == "naive bayes") { + cvmodel <- do.call(quanteda.textmodels::textmodel_nb, c( + list(x= predmat[cvsamp != icv, ][!is.na(tmpvalues), ], + y= as.factor(tmpvalues[!is.na(tmpvalues)])), + cv_args)) + cvres[cvsamp == icv] <- as.character( + predict( + cvmodel, newdata= predmat[cvsamp == icv, ], type= "class")) + } else if (isolate(tmp_model) == "knn") { + cvres[cvsamp == icv] <- + as.character(do.call(class::knn, c(cv_args, list( + train = predmat[cvsamp != icv, ], + test = 
predmat[cvsamp == icv, ], + cl = predlab[cvsamp != icv])))) + } + } + } + cvres + }) %...>% cvres_future() + + cv_async <- catch(cv_async, function(e) { + cvres_future(NULL) + values$modelcv <- e$message + }) + + cv_async <- finally(cv_async, function() { + cv_nclicks(0) + if (file.exists(cv_status_file)) unlink(cv_status_file) + if (is.null(cvres_future())) return(NULL) + cvres <- cvres_future() + predlab <- values$label[!is.na(values$label)] + dametrics <- metricsTable(predlab, cvres) + values$cvtable <- dametrics + res <- "10-CV OOB evaluation:\n" + # if (any(grepl("^bertpred", input$use_regressors))) + if (any(grepl("^bertpred", values$new_use_regressors))) + res <- paste0( + res, + "⚠️ Scores are biased upwards if BERT predictions were trained on current data ⚠️ \n") + res <- paste0( + res, + "* Accuracy : ", round(100 * mean(cvres == predlab), 2), "%\n", + "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n", + "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1), + dametrics$N.cases)), "\n" + ) + values$modelcv <- res + }) + + NULL + }) + + ## Automatic message update during CV computation + observe({ + if (cv_nclicks() == 0) return(NULL) + invalidateLater(200) + if (!file.exists(paste0(session$token, "_cv"))) return(NULL) + values$modelcv <- get_status(paste0(session$token, "_cv")) + values$cvtable <- NULL + }) + + + ############################################################################## + ## Panel BERT + ############################################################################## + + ########## + ## Bert UI left panel + + ## Update list of saved models and dlSelect on init and on saveTrigger + + observeEvent(list(ok.data(), values$bertSaveTrigger), { + damodels <- dir(pattern = "^tiggerbert_saved_") + if (!length(damodels)) return(NULL) + damodels <- gsub("^tiggerbert_saved_", "", damodels) + values$bertSaved <- damodels + updateSelectInput(session, "dlBPSelect", choices = damodels) + }) + + ## Populate saved models list, with buttons + + output$bertSaved <- renderUI({ + if (verbose) + cat("DEBUG bert update saved list\n") + HTML(paste0( + "", + paste0(sapply(values$bertSaved, function(dabert) { + bertshort <- dabert + if (nchar(bertshort) > 23) + bertshort <- paste0(substr(bertshort, 1, 20), "...") + paste0( + "") + }), collapse = "\n"), + "
    ", + actionLink(paste0("saved_", dabert), bertshort, title = dabert, + style="margin-right: 5px;"), "", + actionButton(paste0("bert_copypars_", dabert), "📋", + title = "Copy parameters"), "", + actionButton(paste0("bert_infer_", dabert), "🤖", + title = "Infer predictions on current data"), "", + actionButton(paste0("bert_delete_", dabert), "🗑", + title = "Delete model"), + "
    " + )) + }) + + ########## + ## Bert UI main panel + + ## UI outputs : message and stats table and plot + output$bertMsg <- reactive( + ifelse(is.null(values$bertMsg), + "Train new BERT, or select saved model.", + values$bertMsg)) + + output$bertValstats <- DT::renderDataTable( + DT::datatable(values$bertValstats, list(pageLength = 25))) + + output$bertValPlot <- renderPlot(values$bertValPlot) + + output$bertMsgHyperpar <- reactive(values$bertMsgHyperpar) + + ################################## + ## Async handling of BERT training + + ## Status File + bert_status_file <- paste0("tiggerbert_", session$token, "_trainstatus") + + fire_interrupt <- function(dafile) set_status(dafile, "interrupt") + fire_ready <- function(dafile) set_status(dafile, "Ready") + fire_running <- function(dafile, perc_complete = NULL){ + if(is.null(perc_complete)) + msg <- "Training..." + else + msg <- paste0("Training... ", perc_complete, "% complete") + set_status(dafile, msg) + } + interrupted <- function(dafile) get_status(dafile) == "interrupt" + + ## Create Status File + fire_ready(bert_status_file) + + + ## Button action preparation + bert_nclicks <- reactiveVal(0) + bert_result_val <- reactiveVal() + bert_hyperpars <- reactiveVal() + + ## Button appearance: write "Stop" if training is ongoing + observeEvent(bert_nclicks(), { + updateActionButton(session, "bertTrain", + label = ifelse(bert_nclicks() > 0, "Stop", "Train BERT")) + }) + + ## Button action: Train new BERT + observeEvent(input$bertTrain,{ + + ## Don't do anything if analysis is already being run + # if(bert_nclicks() != 0){ + # values$bertMsg <- "BERT is already training" + # return(NULL) + # } + tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token) + if(bert_nclicks() != 0){ + write("stop", paste0(tmpsession, "_stop")) + bert_nclicks(0) + return(NULL) + } + if (file.exists(paste0(tmpsession, "_stop"))) + file.remove(paste0(tmpsession, "_stop")) + + ## Increment clicks and prevent concurrent analyses + bert_nclicks(bert_nclicks() + 1) + + bert_result_val(data.frame(Status="Running...")) + fire_running(bert_status_file) + + ## Prepare async training + values$bertLoadSaved <- NULL + # tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token) + tmpmodel <- input$bertModel + system(paste0("rm ", tmpsession, "_done")) + + tmpdat <- data.frame( + id = ok.data()[, values$conf$idcol], + text = ok.nicetext(), + label = values$label + ) + tmpdat <- tmpdat[!is.na(tmpdat$label), ] + tmpdat <- tmpdat[grepl("\\S", tmpdat$text), ] + + ## Filter data: min occurences, balance classes, validation split + tmplabtable <- table(tmpdat$label) + if (is.numeric(input$bertMinOccur)) if (input$bertMinOccur > 1) { + tmpdat <- tmpdat[tmpdat$label %in% + names(which(tmplabtable >= input$bertMinOccur)), ] + tmplabtable <- table(tmpdat$label) + } + set.seed(input$bertValidSeed) + if (input$bertBalance) { + tmpminocc <- min(tmplabtable) + tmptrain <- sample(do.call(c, lapply(unique(tmpdat$label), function(dalabel) + sample(which(tmpdat$label == dalabel), round((1-input$bertValidFrac) * tmpminocc))))) + tmpvalid <- sample(do.call(c, lapply(unique(tmpdat$label), function(dalabel) { + dasamp <- which(tmpdat$label == dalabel) + dasamp <- dasamp[! 
dasamp %in% tmptrain] + sample(dasamp, round(input$bertValidFrac * tmpminocc)) + }))) + } else { + tmptrain <- sample(1:nrow(tmpdat), round((1 - input$bertValidFrac) * nrow(tmpdat))) + tmpvalid <- (1:nrow(tmpdat))[-tmptrain] + } + + write.csv(tmpdat[tmptrain, ], paste0(tmpsession, "_dat_train.csv"), row.names = FALSE) + write.csv(tmpdat[tmpvalid, ], paste0(tmpsession, "_dat_valid.csv"), row.names = FALSE) + + iwait <- 0 + while (iwait < 10) { + if (!file.exists(paste0(tmpsession, "_dat_valid.csv"))) Sys.sleep(500) + iwait <- iwait + 1 + } + + tmp_cmd <- paste0( + values$conf$python, " gobert.py -m ", tmpmodel, + " -t ", tmpsession, "_dat_train.csv", + " -v ", tmpsession, "_dat_valid.csv", + " -b ", input$bertBatchsize, + " -g ", input$bertGradacc, + " -e ", input$bertEpochs, + " -l ", input$bertLrate, + " -w ", input$bertWdecay, + " -B ", ifelse(input$bertBest, 1, 0), + " -E ", input$bertNeval, + " -s ", tmpsession, + " -G ", as.numeric(values$conf$use_gpu), + " -A ", as.numeric(input$bertAdapt), + " 2>&1 ; touch ", tmpsession, "_done" + ) + + ## Save hyperpars + bert_hyperpars(list(bertModel = input$bertModel, + bertEpochs = input$bertEpochs, + bertLrate = input$bertLrate, + bertWdecay = input$bertWdecay, + bertBatchsize = input$bertBatchsize, + bertGradacc = input$bertGradacc, + bertNeval = input$bertNeval, + bertBest = input$bertBest, + bertValidFrac = input$bertValidFrac, + bertValidSeed = input$bertValidSeed, + bertNtrain = length(tmptrain), + bertTrainTable = as.list(table(tmpdat$label[tmptrain])), + bertScheme = ok.labelcol())) + + ## Launch async training + bert_async <- future({ + if (file.exists(tmpsession)) + system(paste0("rm -R ", tmpsession, "*")) + + # system(tmp_cmd, wait = FALSE, intern = FALSE) + tmp_exec <- system(tmp_cmd, wait = FALSE, intern = TRUE) + writeLines(tmp_exec, paste0(tmpsession, "_log")) + + ## After training, get trainer state from last checkpoint for diagnostics + chkpt <- dir(paste0(tmpsession, "_train"), pattern = "^checkpoint") + chkpt <- + chkpt[order(as.numeric(gsub("\\D", "", chkpt)), decreasing = T)[1]] + system(paste0( + "cp ", tmpsession, "_train/", chkpt, "/trainer_state.json ", + tmpsession, "/")) + }) %...>% bert_result_val() + + ## Catch interrupt (or any other error) and notify user + bert_async <- catch(bert_async, function(e){ + bert_result_val(NULL) + showNotification(e$message) + values$bertMsg <- e$message + }) + + ## After the promise has been evaluated set nclicks to 0 to allow another Run + bert_async <- finally(bert_async, function() { + system(paste0("rm -R ", tmpsession, "_train")) + system(paste0("rm ", tmpsession, "_done")) + + fire_ready(bert_status_file) + bert_nclicks(0) + + if (is.null(bert_result_val())) return(NULL) + + bertpreds <- try(read.csv(paste0(tmpsession, "_predval.csv"))) + if (inherits(bertpreds, "try-error")) { + # values$bertMsg <- "BERT error - try again" + values$bertMsg <- try(paste(readLines(paste0(tmpsession, "_log")), collapse = "\n")) + values$bertValstats <- NULL + values$bertValPlot <- NULL + values$bertLastUnsaved <- NULL + values$bertMsgHyperpar <- NULL + values$bertLastHyperpar <- NULL + return(NULL) + } + + datruth <- tmpdat[tmpvalid, "label"] + dapred <- bertpreds$bertpred + dametrics <- metricsTable(datruth, dapred) + damsg <- paste0( + "Unsaved model (", tmpmodel, ")\n\n", + "BERT validation stats:\n", + "* Accuracy : ", round(100 * mean(datruth == dapred), 2), "%\n", + "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n", + "* Weighted F1 : ", sprintf("%f", 
weighted.mean(as.numeric(dametrics$F1), + dametrics$N.cases)), "\n") + hypermsg <- bertHyperToMsg(bert_hyperpars()) + writeLines(RJSONIO::toJSON(bert_hyperpars()), + paste0(tmpsession, "/tiggertrain.json")) + + values$bertValstats <- dametrics + values$bertMsg <- damsg + values$bertValPlot <- bertPlotFromPath(tmpsession) + values$bertLastUnsaved <- list( + metrics = dametrics, msg = damsg, session = tmpsession) + values$bertMsgHyperpar <- hypermsg + values$bertLastHyperpar <- bert_hyperpars() + }) + + values$bertMsg <- "BERT is training" + values$bertValstats <- NULL + values$bertValPlot <- NULL + values$bertLastUnsaved <- NULL + values$bertMsgHyperpar <- NULL + values$bertLastHyperpar <- NULL + + ## Return something other than the promise so shiny remains responsive + NULL + }) + + ## Automatic check of BERT training status + observe({ + if (bert_nclicks() == 0) return(NULL) + isolate({ + tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token) + ## Check for checkpoint dirs + checkdirs <- + length(dir(paste0(tmpsession, "_train"), pattern = "^checkpoint")) + # fire_running(bert_status_file, + # min(100, round(100 * checkdirs / input$bertNeval, 1))) + # dastate <- try(RJSONIO::fromJSON(paste0(tmpsession, "_train/trainer_state.json"))) + # if (!inherits(dastate, "try-error")) + # fire_running(bert_status_file, + # round(100 * as.numeric(dastate$epoch) / input$bertEpochs, 1)) + try({ + dapoints <- dir(paste0(tmpsession, "_train"), + pattern = "trainer_state[.]json$", recursive = T) + dapoints_num <- as.numeric(gsub("^checkpoint-(\\d+)/.*$", "\\1", dapoints)) + statefile <- paste0( + tmpsession, "_train/", dapoints[which.max(dapoints_num)]) + if (file.exists(statefile)) { + state <- RJSONIO::fromJSON(statefile) + fire_running(bert_status_file, + round(100 * as.numeric(state$epoch) / input$bertEpochs, 1)) + } + }) + }) + if (checkdirs < isolate(input$bertNeval) & file.exists(paste0(tmpsession, "_done"))) { + values$bertMsg <- "BERT Error - try again" + # values$bertMsg <- paste(readLines(paste0(tmpsession, "_log")), collapse = "\n") + } else { + invalidateLater(1e3) + loss_progress <- "" + if (checkdirs > 0) { + ## Get training diagnostics + try({ + dapoints <- dir(paste0(tmpsession, "_train"), + pattern = "trainer_state[.]json$", recursive = T) + dapoints_num <- as.numeric(gsub("^checkpoint-(\\d+)/.*$", "\\1", dapoints)) + statefile <- paste0( + tmpsession, "_train/", dapoints[which.max(dapoints_num)]) + if (file.exists(statefile)) { + state <- RJSONIO::fromJSON(statefile) + logs <- data.table::rbindlist(lapply(state$log_history, as.list), + fill = TRUE) + logs <- logs[, .(loss = max(loss, na.rm = T), + eval_loss = max(eval_loss, na.rm = T)), by = epoch] + loss_progress <- paste0( + "\n* Epoch : ", sprintf("%.02f", logs$epoch), + "\ttrain loss : ", sprintf("%.05f",logs$loss), + "\teval loss : ", sprintf("%.05f", logs$eval_loss), collapse = "") + } + }) + } + values$bertMsg <- + ifelse(checkdirs == 0, "Pre-processing...", + paste0(get_status(bert_status_file), loss_progress)) + } + values$bertValstats <- NULL + values$bertValPlot <- NULL + values$bertLoadSaved <- NULL + values$bertMsgHyperpar <- NULL + }) + + + ## Action: show last BERT model + observeEvent(input$bertLast, { + values$bertMsg <- values$bertLastUnsaved$msg + values$bertValstats <- values$bertLastUnsaved$metrics + values$bertValPlot <- bertPlotFromPath(values$bertLastUnsaved$session) + values$bertMsgHyperpar <- bertHyperToMsg(values$bertLastHyperpar) + values$bertLoadSaved <- NULL + }) + + + ## Action: save BERT 
model + observeEvent(input$bertSave, { + tmpsession <- paste0("tiggerbert_", ok.labelcol(), "_", session$token) + if (!file.exists(paste0(tmpsession))) + return(NULL) + + if (!grepl("\\S", input$bertSaveName)) { + tmpname <- paste0( + ok.labelcol(), "_", gsub("\\D+$", "", gsub("\\D", "-", Sys.time()))) + } else + tmpname <- cleanFileName(input$bertSaveName) + + savedir <- paste0("tiggerbert_saved_", tmpname) + system(paste("cp -R", tmpsession, savedir)) + system(paste0("cp ", tmpsession, "_dat_valid.csv ", + savedir, "/tiggerbert_dat_valid.csv")) + system(paste0("cp ", tmpsession, "_predval.csv ", + savedir, "/tiggerbert_pred_valid.csv")) + + system(paste0("rm -R ", tmpsession, "*")) + values$bertMsg <- paste0("BERT model successfully saved: ", tmpname) + values$bertValstats <- NULL + values$bertValPlot <- NULL + values$bertMsgHyperpar <- NULL + + if (is.null(values$bertSaveTrigger)) { + values$bertSaveTrigger <- 0 + } else + values$bertSaveTrigger <- values$bertSaveTrigger + 1 + values$bertLastUnsaved <- NULL + updateTextInput(session, "bertSaveName", value = "") + }) + + + ## Actions: load saved model (list updated only with new models) + observeEvent(values$bertSaved, { + if (!length(values$bertSaved)) return(NULL) + tmpsaved <- values$bertSaved[! values$bertSaved %in% values$bertSavedAll] + + lapply(tmpsaved, function(dabert) { + ############ + ## Actions to load saved model + observeEvent(input[[paste0("saved_", dabert)]], { + bertconf <- RJSONIO::fromJSON(paste0( + "tiggerbert_saved_", dabert, "/config.json")) + truth <- read.csv(paste0( + "tiggerbert_saved_", dabert, "/tiggerbert_dat_valid.csv"))$label + pred <- read.csv(paste0( + "tiggerbert_saved_", dabert, "/tiggerbert_pred_valid.csv"))$bertpred + + dametrics <- metricsTable(truth, pred) + values$bertValstats <- dametrics + values$bertMsg <- paste0( + "Saved model ", dabert, " (", bertconf[["_name_or_path"]], ")\n\n", + "BERT validation stats:\n", + "* Accuracy : ", round(100 * mean(truth == pred), 2), "%\n", + "* Macro F1 : ", sprintf("%f", mean(as.numeric(dametrics$F1))), "\n", + "* Weighted F1 : ", sprintf("%f", weighted.mean(as.numeric(dametrics$F1), + dametrics$N.cases)), "\n") + + ## Hyperparameters + tiggertrain_file <- paste0("tiggerbert_saved_", dabert, "/tiggertrain.json") + if (file.exists(tiggertrain_file)) { + try({ + values$bertMsgHyperpar <- bertHyperToMsg(RJSONIO::fromJSON(tiggertrain_file)) + }) + } else + values$bertMsgHyperpar <- "Training parameters not saved." 
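+        ## (tiggertrain.json is written at training time from bert_hyperpars();
+        ## a representative payload, with purely illustrative values:
+        ##   {"bertModel": "...", "bertEpochs": 3, "bertLrate": 2e-05,
+        ##    "bertWdecay": 0.01, "bertBatchsize": 4, "bertGradacc": 4,
+        ##    "bertNtrain": 800, "bertScheme": "<label column>", ...}
+        ## Models saved before this file existed fall back to the message above.)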
+ + values$bertValPlot <- + bertPlotFromPath(paste0("tiggerbert_saved_", dabert)) + values$bertLoadSaved <- dabert + }) + + ############ + ## Actions to copy hyperparameters + observeEvent(input[[paste0("bert_copypars_", dabert)]], { + tiggertrain_file <- paste0("tiggerbert_saved_", dabert, "/tiggertrain.json") + if (!file.exists(tiggertrain_file)) return(NULL) + try({ + tmppars <- RJSONIO::fromJSON(tiggertrain_file) + updateSelectInput(session, "bertModel", selected = tmppars$bertModel) + updateCheckboxInput(session, "bertBest", value = tmppars$bertBest) + numinputs <- c("bertEpochs", "bertLrate", "bertWdecay", + "bertBatchsize", "bertGradacc") + for (iinput in numinputs) + updateNumericInput(session, iinput, value = tmppars[[iinput]]) + }) + }) + + ############ + ## Actions to delete saved model (with confirm) + observeEvent(input[[paste0("bert_delete_", dabert)]], { + showModal(modalDialog( + title = "Delete BERT model", + paste0("Confirm delete: ", dabert, + " (model and predictions will be deleted)."), + footer = tagList(actionButton(paste0("bert_delete_confirm_", dabert), "Delete"), + modalButton("Cancel")))) + }) + + observeEvent(input[[paste0("bert_delete_confirm_", dabert)]], { + system(paste0("rm -R tiggerbert_saved_", dabert)) + system(paste0( + "rm ", values$conf$datadir, values$conf$dataname, "_bertpred_", + dabert, ".feather")) + # values$bertLoadSaved <- NULL + if (is.null(values$bertSaveTrigger)) { + values$bertSaveTrigger <- 0 + } else + values$bertSaveTrigger <- values$bertSaveTrigger + 1 + removeModal() + values$bertMsg <- NULL + values$bertValstats <- NULL + values$bertMsgHyperpar <- NULL + }) + + ############ + ## Actions to infer on current data from saved BERT (async) + observeEvent(input[[paste0("bert_infer_", dabert)]], { + if (bertpred_nclicks() > 0 | bpall_nclicks() > 0) { + values$bertMsg <- "Inference already running" + values$bertMsgHyperpar <- NULL + values$bertValPlot <- NULL + values$bertValstats <- NULL + return(NULL) + } + bertpred_nclicks(1) + + saved_model <- paste0("tiggerbert_saved_", dabert) + out_path <- paste0( + values$conf$datadir, values$conf$dataname, "_bertpred_", + dabert, ".feather") + + if (file.exists(out_path)) { + old_infer <- arrow::read_feather(out_path) + } else + old_infer <- NULL + + to_infer <- ok.data()[[values$conf$idcol]] + to_infer <- to_infer[! 
to_infer %in% old_infer[[values$conf$idcol]]] + to_infer_row <- match(to_infer, ok.data()[[values$conf$idcol]]) + + if (!length(to_infer)) { + values$bertMsg <- paste0( + "All current ", nrow(ok.data()), + " texts already infered by ", dabert) + values$bertMsgHyperpar <- NULL + values$bertValPlot <- NULL + values$bertValstats <- NULL + bertpred_nclicks(0) + return(NULL) + } + + values$bertValPlot <- NULL + values$bertValstats <- NULL + values$bertMsgHyperpar <- NULL + values$bertMsg <- paste0( + "Preparing inference on ", length(to_infer), " new texts") + tmp_dat <- paste0("tiggerbert_infer_", session$token, ".feather") + arrow::write_feather( + data.frame( + ok.data()[to_infer_row, values$conf$idcol, drop = FALSE], + text = ok.nicetext()[to_infer_row]), + tmp_dat) + + values$bertMsg <- paste0( + "Infering ", dabert, " predictions on ", + length(to_infer), " new texts") + infer_cmd <- paste0( + values$conf$python, " gobert_infer.py -m ", saved_model, + " -o ", out_path, + " -d ", tmp_dat, + " -i ", values$conf$idcol, + " -l ", paste0("tiggerbert_infer_", session$token, ".log"), + " -G ", as.numeric(values$conf$use_gpu), + " -b ", 128) + bertpred_res <- reactiveVal() + bertpred_async <- future({ + system(infer_cmd) + system(paste0("rm ", tmp_dat)) + + if (!is.null(old_infer)) { + new_infer <- arrow::read_feather(out_path) + arrow::write_feather(rbind(old_infer, new_infer), out_path) + } + }) %...>% bertpred_res + bertpred_async <- catch(bertpred_async, function(error) { + bertpred_nclicks(0) + values$bertMsg <- paste0("BERT inference error: ", error) + system(paste0("rm ", tmp_dat)) + }) + bertpred_async <- finally(bertpred_async, function() { + bertpred_nclicks(0) + values$bertMsg <- + "Inference complete!\nRe-import data in Project/Sample panel to use the predictions for tagging." + values$bertValPlot <- NULL + values$bertValstats <- NULL + values$bertMsgHyperpar <- NULL + }) + NULL + }) + }) + values$bertSavedAll <- c(values$bertSavedAll, tmpsaved) + }) + + ## Automatic check of BERT inference progress + observe({ + if (bertpred_nclicks() == 0) return(NULL) + invalidateLater(1e3) + logfile <- paste0("tiggerbert_infer_", isolate(session$token), ".log") + if (file.exists(logfile)) { + values$bertMsg <- readLines(logfile)[1] + values$bertValPlot <- NULL + values$bertValstats <- NULL + values$bertMsgHyperpar <- NULL + } + }) + + + + + ############################################################################## + ## Export panel + ############################################################################## + + ## Export tags and full model predictions + output$dlTagSave <- + downloadHandler( + function() + paste0(Sys.Date(), "-activeTigger-", ok.labelcol(),".", input$dlTagFormat), + function(con) { + if (is.null(ok.data())) return(NULL) + dadat <- ok.data()[, values$conf$idcol, drop = FALSE] + if ("tags" %in% input$dlTagSelect) + dadat[[ok.labelcol()]] <- values$label + if ("comments" %in% input$dlTagSelect) if (ok.commentCol() != "(None)") + dadat[, ok.commentCol()] <- values$comment + if ("predictions" %in% input$dlTagSelect) if (ok.train.possible()) { + dadat$tigger.predict <- ok.predlab1() + dadat$tigger.predprob <- apply(ok.pred(), 1, max) + tmp_pred <- ok.pred() + colnames(tmp_pred) <- paste0("tiggerpred_", colnames(tmp_pred)) + dadat <- cbind(dadat, tmp_pred) + } + othercols <- input$dlTagSelect + othercols <- othercols[! 
othercols %in% c("tags", "comments", "predictions")] + if (length(othercols)) + dadat <- cbind(dadat, ok.data()[, othercols, drop = FALSE]) + + if (input$dlTagFormat == "csv") { + write.csv(dadat, row.names = F, con) + } else + arrow::write_feather(dadat, con) + }) + + + ## Export embeddings (ft, sb, umap, t-sne) + output$dlEmbedSave <- downloadHandler( + function() + paste0(Sys.Date(), "-activeTigger-emb-", + paste(input$dlEmbedSelect, collapse = "-"), + ".", input$dlEmbedFormat), + function (file) { + if (is.null(input$dlEmbedSelect)) return(NULL) + export <- ok.data()[, values$conf$idcol, drop = FALSE] + if ("ft" %in% input$dlEmbedSelect) + export <- data.frame(export, ok.data()[, values$cols_ft]) + if ("sb" %in% input$dlEmbedSelect) + export <- data.frame(export, ok.data()[, values$cols_sb]) + if (any(c("t-sne", "umap") %in% input$dlEmbedSelect)) { + tmpvisu <- ok.visu() + tmpalgo <- input$dlEmbedSelect[input$dlEmbedSelect %in% c("t-sne", "umap")] + colnames(tmpvisu) <- paste0(gsub("-", "", tmpalgo), 1:2) + export <- data.frame(export, tmpvisu) + } + if (input$dlEmbedFormat == "csv") { + write.csv(export, file, row.names = FALSE) + } else if (input$dlEmbedFormat == "feather") { + arrow::write_feather(export, file) + } + }) + + ## Export BERT predictions + + output$dlBPMsg <- reactive(values$dlBPMsg) + + + + # output$dlBertPredCsv <- + # downloadHandler(paste0(Sys.Date(), "-bertpred-", ok.labelcol(),".csv"), + # content= function(con) { + # if (is.null(ok.data())) return(NULL) + # dadat <- ok.data()[, c(values$conf$idcol, ok.labelcol())] + # dadat[[ok.labelcol()]] <- values$label + # if (ok.commentCol() != "(None)") { + # dadat[, ok.commentCol()] <- values$comment + # } + # if (ok.train.possible()) { + # dadat$tigger.predict <- ok.predlab1() + # dadat$tigger.predprob <- apply(ok.pred(), 1, max) + # } + # dadat <- cbind(dadat, ok.pred()) + # write.csv(dadat, row.names = F, con) + # }) + + ############ + ## Action to infer on ALL data from saved BERT (async) + bpall_res <- reactiveVal() + observeEvent(input$dlBPInfer, { + if (bpall_nclicks() > 0 | bertpred_nclicks() > 0) { + values$dlBPMsg <- "Inference already running" + bpall_res(NULL) + return(NULL) + } + bpall_nclicks(1) + bpall_res_async <- reactiveVal() + dabert <- input$dlBPSelect + + saved_model <- paste0("tiggerbert_saved_", dabert) + out_path <- paste0( + values$conf$datadir, values$conf$dataname, "_bertpredall_", + dabert, ".feather") + tmp_dat <- paste0("tiggerbert_inferall_", session$token, ".feather") + tmp_log <- paste0("tiggerbert_inferall_", session$token, ".log") + tmp_done <- paste0("tiggerbert_inferall_", session$token, "_done") + if (file.exists(tmp_done)) + system(paste0("rm ", tmp_done)) + + if (file.exists(out_path)) { + values$dlBPMsg <- "Inference already complete, ready for download" + bpall_nclicks(0) + bpall_res(out_path) + return(NULL) + # old_infer <- arrow::read_feather(out_path) + } + + fulldat <- arrow::read_feather(paste0( + values$conf$datadir, values$conf$dataname, ".feather")) + values$dlBPMsg <- paste0( + "Preparing inference on ", nrow(fulldat), " texts") + if (length(values$conf$textcols) > 1) { + fulldat$text <- do.call(paste, c( + fulldat[, values$conf$textcols, drop = FALSE], sep = "\n\n")) + } else { + fulldat$text <- fulldat[, values$conf$textcols] + } + arrow::write_feather(fulldat[, c(values$conf$idcol, "text")], tmp_dat) + + infer_cmd <- paste0( + values$conf$python, " gobert_infer.py -m ", saved_model, + " -o ", out_path, + " -d ", tmp_dat, + " -i ", values$conf$idcol, + " -l ", tmp_log, + 
" -G ", as.numeric(values$conf$use_gpu), + " -b ", 128, + "; touch ", tmp_done) + + bpall_async <- future({ + system(infer_cmd) + system(paste0("rm ", tmp_dat)) + }) %...>% bpall_res_async + + bpall_async <- catch(bpall_async, function(error) { + values$dlBPMsg <- paste0("BERT inference error: ", error) + system(paste0("rm ", tmp_dat)) + system(paste0("rm ", tmp_done)) + }) + + bpall_async <- finally(bpall_async, function() { + bpall_nclicks(0) + values$dlBPMsg <- + "Inference complete, ready for download" + bpall_res(out_path) + }) + NULL + }) + + ## Automatic check of BERT all prediction progress + observe({ + if (bpall_nclicks() == 0) return(NULL) + invalidateLater(1e3) + logfile <- paste0("tiggerbert_inferall_", isolate(session$token), ".log") + if (file.exists(logfile)) + values$dlBPMsg <- readLines(logfile)[1] + }) + + ## Show BERTpred download button when data is ready + output$dlBPDlButton <- renderUI({ + if (is.null(bpall_res())) return(NULL) + downloadButton("dlBPDl", NULL) + }) + + output$dlBPDl <- downloadHandler( + function() + paste0(Sys.Date(), "-tiggerbert-", values$conf$dataname, + input$dlBPSelect, ".", input$dlBPFormat), + function(file) { + if (is.null(bpall_res())) return(NULL) + tmp_bert <- gsub("^.*_bertpredall_(.*)[.]feather$", "\\1", bpall_res()) + if (tmp_bert != input$dlBPSelect) { + values$dlBPMsg <- "BERT model changed - predict again or select current model" + return(NULL) + } + tmp <- arrow::read_feather(bpall_res()) + if (input$dlBPFormat == "csv") { + write.csv(tmp, file, row.names = FALSE) + } else if (input$dlBPFormat == "feather") { + arrow::write_feather(tmp, file) + } + } + ) + + + ############################################################################## + ## Delete files at end of session + onStop(function() { + if(file.exists(bert_status_file)) unlink(bert_status_file) + if(length(dir(pattern = paste(session$token)))) + system(paste0("rm -R *", session$token, "*")) + }) + +}) diff --git a/docker-images-datalab/activetigger/activetigger/tokenize_spacy.py b/docker-images-datalab/activetigger/activetigger/tokenize_spacy.py new file mode 100644 index 0000000..a7143ce --- /dev/null +++ b/docker-images-datalab/activetigger/activetigger/tokenize_spacy.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# coding: utf-8 + +## Spacy tokenize texts +## Requires data file with columns id and text + +import argparse +from os.path import expanduser +import pandas as pd +import pyarrow as pa +import pyarrow.feather as feather +import spacy +import re + + +def main(args): + print("Tokenizer: Importing data") + datapath = expanduser(args.data) + dat = feather.read_feather(datapath) + outfile = re.sub("[.]feather$", "_spa.feather", datapath) + + print("Tokenizer: Loading model") + spa = spacy.load(expanduser(args.model)) + print("Tokenizer: Tokenizing sentences") + tok = [" ".join([str(token) for token in spa.tokenizer(text)]) for text in dat["text"]] + + print("Tokenizer: Exporting") + tok = pd.concat([dat["id"], pd.DataFrame(tok)], axis=1) + tok.columns = ["id", "text_spa"] + feather.write_feather(tok, outfile) + print("Tokenizer: Done") + + +if __name__ == "__main__": + argParser = argparse.ArgumentParser() + argParser.add_argument("-m", "--model", help="Model name", default="fr_core_news_sm") + argParser.add_argument("-d", "--data", help="Path to data (feather)") + args = argParser.parse_args() + main(args) + diff --git a/docker-images-datalab/activetigger/activetigger/ui.R b/docker-images-datalab/activetigger/activetigger/ui.R new file mode 100644 index 
0000000..5f788a2 --- /dev/null +++ b/docker-images-datalab/activetigger/activetigger/ui.R @@ -0,0 +1,631 @@ +## 21/04/2020: shiny app for active-learning tagging of texts + +shinyUI(fluidPage( + title = "Active Tigger", + + ############################################################################ + ## Top panel: Title, strategy, model options + ############################################################################ + fluidRow( + column( + 4, + fluidRow( + column(4, br(), + HTML('')), + column( + 8, br(), + p(strong("Project / Scheme")), + fluidRow( + column(2, HTML(paste0( + '
    ', + actionButton("createProject", "+"), + '
    '))), + column(10, selectInput("selectProject", NULL, NULL, NULL)) + ), + fluidRow( + column(2, HTML(paste0( + '
    ', + actionButton("createScheme", "+"), + '
    '))), + column(10, selectInput("selectScheme", NULL, NULL, NULL)) + ) + )) + ), + column( + 2, br(), + + HTML(paste0( + '
    ', + p(strong("Strategy")), + selectInput("strategy", NULL, + choices= c("Active" = "entropy", + "MaxProb" = "maxprob", + "Random" = "random", + "Deterministic" = "sequential"), + selected = "sequential"), + conditionalPanel("input.strategy == 'maxprob'", + selectInput("maxprobWhich", NULL, NULL)), + '
    ')) + ), + column( + 2, br(), + HTML(paste0( + '
    ', + p(strong("On")), + selectInput("sampleChoice", NULL, + choices = c("Untagged" = "untagged", + "Tagged" = "tagged", + "All" = "all")), + conditionalPanel("input.sampleChoice == 'tagged'", + selectInput("taggedWhich", NULL, NULL)), + '
    ')) + ), + + column(4, p(br()), + verbatimTextOutput("trainDiagno"), + fluidRow( + column(8, HTML(paste0( + '
    ', + actionButton("modelTrain", "🤖 Train", width = "100%"), + '
    '))), + column(4, HTML(paste0( + '
    ', + checkboxInput("showTrainOptions", "🔧"), + '
    '))) + )) + ), + + + ## General training options + conditionalPanel( + "input.showTrainOptions", + hr(), + fluidRow( + column( + 4, + conditionalPanel( + "input.showTrainOptions & input.use_regressors.includes('regex')", + uiOutput("panelExtraRegex") + ) + ), + column( + 4, + p(strong("Predictors")), + selectizeInput("use_regressors", NULL, "", + multiple = TRUE, width = "100%"), + conditionalPanel( + "input.showTrainOptions & input.use_regressors.includes('extra')", + selectizeInput("use_ootregnum", "Extra predictors: continuous", "", + multiple = TRUE, width = "100%"), + selectizeInput("use_ootregcat", "Extra predictors: categorical", "", + multiple = TRUE, width = "100%") + ), + + conditionalPanel( + "input.showTrainOptions & input.use_regressors.includes('dfm')", + hr(), + fluidRow( + column( + 6, numericInput("dfmMinTermfreq", "DFM : Min Termfreq", + min= 1, max= 1e3, value= 5, step= 1)), + column( + 6, numericInput("dfmMinDocfreq", "DFM : Min Docfreq", + min= 1, max= 1e3, value= 5, step= 1))), + fluidRow( + column( + 4, checkboxInput("dfmTfIdf", "Tf-Idf", TRUE)), + column( + 4, selectInput("dfmTfScheme", label = NULL, + choices= c("logcount", "count", "prop", "propmax", + "boolean", "augmented", "logave"), + selected= "logcount")), + column( + 4, selectInput("dfmDfScheme", label = NULL, + choices= c("inverse", "count", + "inversemax", "inverseprob", "unary"), + selected= "inverse")) + ), + numericInput("dfmNgrams", "N-grams", value= 1, + min= 1, max= 10, step = 1) + ) + ), + column( + 4, + HTML(paste0( + '
    ', + fluidRow( + column(6, strong("Auto train every")), + column(6, numericInput("trainCountdown", NULL, 0, 0, 1e6, 1))), + '
    ' + )), + HTML(paste0( + '
    ', + fluidRow( + column(6, strong("Model")), + column(6, selectInput("predModel", NULL, selected = "linear", + choices = c("Naive Bayes" = "naive bayes", + "KNN" = "knn", + "Liblinear" = "linear", + "LASSO" = "lasso", + "Random Forest" = "random forest"))) + ), + '
    ' + )), + + ## Model-specific training options + ### Random forest options + conditionalPanel( + "input.showTrainOptions & input.predModel == 'random forest'", + fluidRow( + numericInput("rfNumTrees", label = "Num. trees", + min = 1, max = 2e3, value = 500, step = 1), + numericInput("rfMtry", label = "mtry", + min = 0, max = 1e5, value = 0, step = 1), + numericInput("rfSampleFrac", label = "Sample fraction", + min = 0, max = 1, value = 1, step = .01) + ) + ), + ### Naive Bayes options + conditionalPanel( + "input.showTrainOptions & input.predModel == 'naive bayes'", + flowLayout( + numericInput("naiveSmooth", label = "Smooth", + min = 0, max = 2e3, + value = 1, step = 1e-3), + selectInput("naivePrior", "Prior", + c("uniform", "docfreq", "termfreq")), + selectInput("naiveDistri", "Distribution", + c("multinomial", "Bernoulli")) + ) + ), + ### Lasso options + conditionalPanel( + "input.showTrainOptions & input.predModel == 'lasso'", + strong("Lasso penalty"), + fluidRow( + column( + 6, numericInput("glmLambda", label = NULL, min = 0, max = 2e3, + value = 0, step = 1e-6)), + column( + 6, actionButton("glmCV", label= "Find best (CV)"))) + ), + ### Linear options + conditionalPanel( + "input.showTrainOptions & input.predModel == 'linear'", + strong("Liblinear Cost"), + fluidRow( + column( + 6, numericInput("liblinCost", label= NULL, min= 0, max= 2e10, + value= 32, step= 1)), + column( + 6, actionButton("liblinCV", label= "Find best (CV)"))) + ), + ### KNN options + conditionalPanel( + "input.showTrainOptions & input.predModel == 'knn'", + flowLayout( + strong("N. Neighbours"), + numericInput("knnK", label = NULL, min = 1, max = 1e2, + value = 3, step = 1) + ) + ) + ) + ), + hr() + ), + + + ############################################################################ + ## Main panel set + ############################################################################ + tabsetPanel( + id = "mainPanelset", + selected = "Tagging", + + ######################################################################## + ## Project panel + ######################################################################## + tabPanel( + "Project", + br(), + tabsetPanel( + id = "tabsetProject", + selected = "Sample", + + tabPanel( + "Settings", + br(), + actionButton("saveSystem", "Save changes"), + + h4("Files"), + fluidRow( + column(2, p("Data directory")), + column(4, uiOutput("sys_datadir")), + column(6, p("Place (on the server) where the data and project are stored")) + ), + fluidRow( + column(2, p("Data filename")), + column(4, uiOutput("sys_datafile")), + column(6, p("Main file, containing id and text columns")) + ), + + h4("Variables"), + fluidRow( + column(2, p("ID")), + column(4, uiOutput("sys_var_id")), + column(6, p("Name of the id variable, unique identifier of each text")) + ), + fluidRow( + column(2, p("Text")), + column(4, uiOutput("sys_var_text")), + column(6, p("Name of the text variables: if more than one, texts are concatenated in the specified order")) + ), + fluidRow( + column(2, p("Tags")), + column(4, uiOutput("sys_var_tag")), + column(6, p("Names of scheme variables")) + ), + fluidRow( + column(2, p("Comments")), + column(4, uiOutput("sys_var_comm_ui")), + column(6, p("Name of the comments variable")) + ), + fluidRow( + column(2, p("Context")), + column(4, uiOutput("sys_var_context_ui")), + column(6, p("Names of variables not used in the models, but may be displayed during tagging")) + ), + + h4("System"), + fluidRow( + column(2, checkboxInput("sys_use_python", "Python backend", FALSE)), + column(4, 
conditionalPanel( + "input.sys_use_python", + textInput("sys_which_python", NULL, value = "python3", + placeholder = "(custom python path)"))), + column(6, conditionalPanel( + "input.sys_use_python", + p("This must be a working python3 environment, with the required modules installed (see documentation)"))) + ), + + conditionalPanel("input.sys_use_python", list( + fluidRow( + column(2, checkboxInput("sys_use_spacy", "SpaCy tokenization", FALSE)), + column(4, conditionalPanel("input.sys_use_spacy", textInput( + "sys_use_spacy_model", NULL, NULL, placeholder = "(spacy model name)"))), + column(6, p("Name of the spacy tokenizer model, used in DTM and word embeddings")) + ), + conditionalPanel("input.sys_use_spacy", fluidRow( + column(2), + column(9, uiOutput("sys_spacyDlUI"))) + ), + fluidRow( + column(2, checkboxInput("sys_use_ft", "FastText word embeddings", FALSE)), + column(4, conditionalPanel("input.sys_use_ft", textInput( + "sys_use_ft_model", NULL, NULL, placeholder = "(fasttext model path)"))), + column(6, p("Path to the local fasttext model binary")) + ), + conditionalPanel("input.sys_use_ft", fluidRow( + column(2), + column(9, uiOutput("sys_ftDlUI"))) + ), + fluidRow( + column(2, checkboxInput("sys_use_sb", "SBERT sentence embeddings", FALSE)), + column(4, conditionalPanel("input.sys_use_sb", textInput( + "sys_use_sb_model", NULL, NULL, + placeholder = "(custom sentence_transformers model)"))), + column(6, p("(GPU recommended) Name or path of the sentence-transformers model")) + ), + + conditionalPanel("input.sys_use_python", list( + checkboxInput("sys_use_gpu", "GPU support (CUDA, for SBERT and BERT)", FALSE), + + br(), + wellPanel( + h4("Model picker"), + fluidRow( + column(2, p("Language")), + column(4, uiOutput("sys_ex_lang_ui")), + column(6, p("Used to preset tokenization and embedding models")) + ), + fluidRow( + column(2), + column(4, strong("Recommended model")), + column(6, strong("Download instructions")) + ), + fluidRow( + column(2, p("SpaCy tokenization")), + column(4, uiOutput("sys_ex_spacy")), + column(6, uiOutput("sys_ex_spacy_dl")) + ), + fluidRow( + column(2, p("FastText word embeddings")), + column(4, uiOutput("sys_ex_ft")), + column(6, uiOutput("sys_ex_ft_dl")) + ), + fluidRow( + column(2, p("SBERT sentence embeddings")), + column(4, uiOutput("sys_ex_sb")), + column(6, p("(Auto download by python module)")) + ) + ) + )) + )) + ), + + tabPanel( + "Sample", + br(), + fluidRow( + column( + 4, + wellPanel( + fluidRow( + column(8, h4("Sample")), + column(4, actionButton("dataImport", "Import", width = "100%"))), + fluidRow( + column(6, numericInput("dataNrows", "N. rows", 500, 10, 1e4, 1)), + column(6, numericInput("dataSkipRows", "Skip rows", 0, 0, step = 1)) + ) + ) + ), + column(8, uiOutput("dataMessage"), uiOutput("panelData")) + ) + ), + + tabPanel( + "Scheme", + br(), + fluidRow( + # column(4, uiOutput("panelScheme")), + column( + 4, + wellPanel( + h4("Current scheme"), + fluidRow( + column(2, HTML(paste0( + "
    ", + actionButton("schemeDelete", "🗑", width = "100%"), + "
    "))), + column(6, uiOutput("printScheme")), + column(4, HTML(paste0( + "
    ", + actionButton("schemeDescrSave", "Save", width = "100%"), + "
    "))) + ), + br(), + textAreaInput("schemeDescr", NULL, width = "100%", rows = 10, + placeholder = "Write scheme description here"), + hr() + ) + ), + column(8, uiOutput("panelRetag")) + ) + ) + ) + ), + + + ######################################################################## + ## Text / visualization panel + ######################################################################## + tabPanel( + "Tagging", + fluidRow( + column( + 3, + br(), + fluidRow( + column(8, textInput("regexFilter", label = NULL, + placeholder = "(Regex filter)")), + column(4, checkboxInput("regexCaseSens", "Case"))), + + wellPanel( + ## Tagging buttons + fluidRow( + column(8, textInput("newLab", label = NULL, + placeholder = "(New label)")), + column(4, actionButton("currentAction", "Create")) + ), + + # fluidRow(uiOutput("oracleButtons")), + uiOutput("oracleButtons"), + + br(), + textInput("currentComment", NULL, "", width = "100%", + placeholder = "(Comment)"), + br(), + uiOutput("makeOracleConfirm") + ), + + # fluidRow( + # column(6, checkboxInput("showContext", "Context")), + # column(6, actionButton("oops", strong("Oops"))) + # ), + checkboxInput("showContext", "Context"), + conditionalPanel("input.showContext", htmlOutput("currentContext")) + ), + column( + 9, + fluidRow( + column(2, checkboxInput("panelText", "Text", TRUE)), + column(2, checkboxInput("panelVisu", "Visualization", FALSE), + offset = 8) + ), + uiOutput("textVisuCols") # Handled in server.R for adaptive columns + ) + ) + ), + + ######################################################################## + ## History panel + ######################################################################## + tabPanel( + "History", + br(), + actionButton("histSave", "Save changes"), + br(), + br(), + DT::dataTableOutput("histDTable") + ), + + + ######################################################################## + ## Stats panel + ######################################################################## + tabPanel( + "Stats", + br(), + fluidRow( + column( + 3, + h3("Counts"), + tableOutput("statsTagTable") + ), + column( + 9, + h3("10-CV diagnostics"), + actionButton("statsCVgo", "Compute 10-CV"), + br(), + verbatimTextOutput("statsCVoutput"), + DT::dataTableOutput("statsCVtable") + ) + ), + hr(), + h3("Gold Standard") + ), + + ######################################################################## + ## BERT panel + ######################################################################## + tabPanel( + "BERT", + + fluidRow( + column( + 3, + br(), + h3("Train new BERT"), + fluidRow( + column(6, actionButton("bertTrain", "Train BERT", width = "100%")), + column(6, checkboxInput("bertOptions", "Options"))), + fluidRow( + column(6, textInput( + "bertSaveName", NULL, placeholder = "(save name)")), + column(6, actionButton("bertSave", "Save", width = "100%"))), + actionLink("bertLast", "Last trained model"), + h3("Saved models"), + uiOutput("bertSaved") + ), + column( + 9, + br(), + conditionalPanel( + "input.bertOptions", + fluidRow( + column(6, selectInput( + "bertModel", "Model", + c("(Fr) CamemBERT-base" = "camembert/camembert-base", + "(Fr) CamemBERT-large" = "camembert/camembert-large", + "(Fr) FlauBERT-small" = "flaubert/flaubert_small_cased", + "(Fr) FlauBERT-base" = "flaubert/flaubert_base_cased", + "(Fr) FlauBERT-large" = "flaubert/flaubert_large_cased", + "(En) DistilBERT-base" = "distilbert-base-cased", + "(En) RoBERTa-base" = "roberta-base", + "(En) DeBERTa-base" = "microsoft/deberta-base", + "(Multi) DistilmBERT-base" = 
"distilbert-base-multilingual-cased", + "(Multi) MiniLM" = "microsoft/Multilingual-MiniLM-L12-H384", + "(Multi) XLM-RoBERTa-base" = "xlm-roberta-base"))), + column(6) + ), + fluidRow( + column(3, numericInput("bertEpochs", "Epochs", 3, 1, 20, 1)), + column(3, numericInput("bertLrate", "Learning rate", 2e-5, 1e-6, 1, 1e-6)), + column(3, numericInput("bertWdecay", "Weight decay", 0.01, 0, 10, 1e-6)), + column(3) + ), + fluidRow( + column(3, numericInput("bertBatchsize", "Batch size", 4, 1, 32, 1)), + column(3, numericInput("bertGradacc", "Gradient accum.", 4, 1, 32, 1)), + column(3, br(), checkboxInput("bertAdapt", "Adapt token length to batch", TRUE)), + column(3) + ), + fluidRow( + column(3, numericInput("bertValidFrac", "Validation fraction", .2, 0, .9)), + column(3, numericInput("bertValidSeed", "Validation seed", 1234, 1, 9e8)), + column(3, numericInput("bertNeval", "N. validation evals", 10, 1, 100, 1)), + column(3, br(), checkboxInput("bertBest", "Keep best", TRUE)) + ), + fluidRow( + column(3, numericInput("bertMinOccur", "Min. class occurences", 1, 1, 1e4, 1)), + column(3, br(), checkboxInput("bertBalance", "Balance classes", FALSE)), + column(3), + column(3) + ) + ), + + + fluidRow( + column( + 6, + # flowLayout( + # actionButton( + # "bertGoPred", "Infer on current data", width = "100%"), + # actionButton( + # "bertDelete", "Delete saved model", width = "100%")), + verbatimTextOutput("bertMsg")), + column(6, plotOutput("bertValPlot", height = 200))), + + verbatimTextOutput("bertMsgHyperpar"), + + DT::dataTableOutput("bertValstats") + ) + ) + ), + + ######################################################################## + ## Export panel + ######################################################################## + tabPanel( + "Export", + h4("Export tagged data"), + p("Download the tags and predicted probabilities from the complete model, on the current data sample."), + # downloadButton("downloadCsv", "Save csv"), + flowLayout( + selectInput( + "dlTagSelect", NULL, c("tags", "comments", "predictions"), + c("tags", "comments", "predictions"), multiple = TRUE), + selectInput("dlTagFormat", NULL, c("csv", "feather"), "csv"), + downloadButton("dlTagSave", NULL, title = "Save tags") + ), + + hr(), + h4("Export embeddings"), + p("Download the embeddings (incl. 
from visualization if present), on the current data sample."), + flowLayout( + selectInput( + "dlEmbedSelect", NULL, c("FastText" = "ft", "SBERT" = "sb"), + selected = "sb", multiple = TRUE), + selectInput("dlEmbedFormat", NULL, c("csv", "feather"), "feather"), + downloadButton("dlEmbedSave", NULL, title = "Save embeddings") + ), + + hr(), + h4("Export BERT predictions"), + p("Download the predicted probabilities from the chosen BERT model, on the complete dataset."), + flowLayout( + selectInput("dlBPSelect", NULL, NULL, NULL), + selectInput("dlBPFormat", NULL, c("csv", "feather"), "feather"), + actionButton("dlBPInfer", "Predict"), + verbatimTextOutput("dlBPMsg"), + uiOutput("dlBPDlButton") + ), + + hr(), + h4("Export BERT models") + ) + + ), + br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), + br(), br(), br(), br(), br(), br() +)) diff --git a/docker-images-datalab/activetigger/activetigger/www/active_tigger.jpeg b/docker-images-datalab/activetigger/activetigger/www/active_tigger.jpeg new file mode 100644 index 0000000..c2e913e Binary files /dev/null and b/docker-images-datalab/activetigger/activetigger/www/active_tigger.jpeg differ diff --git a/docker-images-datalab/activetigger/activetigger/www/active_tigger.png b/docker-images-datalab/activetigger/activetigger/www/active_tigger.png new file mode 100644 index 0000000..587d4ee Binary files /dev/null and b/docker-images-datalab/activetigger/activetigger/www/active_tigger.png differ diff --git a/docker-images-datalab/activetigger/activetigger/www/favicon.ico b/docker-images-datalab/activetigger/activetigger/www/favicon.ico new file mode 100644 index 0000000..84e8c0f Binary files /dev/null and b/docker-images-datalab/activetigger/activetigger/www/favicon.ico differ diff --git a/docker-images-datalab/activetigger/requirements.r b/docker-images-datalab/activetigger/requirements.r new file mode 100644 index 0000000..c1dc00c --- /dev/null +++ b/docker-images-datalab/activetigger/requirements.r @@ -0,0 +1 @@ +install.packages(c("arrow", "class", "data.table", "DT", "foreign", "glmnet", "haven", "LiblineaR", "Matrix", "Metrics", "quanteda", "quanteda.textmodels", "ranger", "readODS", "readxl", "RJSONIO", "rlang", "Rtsne", "shiny", "SparseM", "stringi", "uwot")) diff --git a/docker-images-datalab/activetigger/requirementspython.txt b/docker-images-datalab/activetigger/requirementspython.txt new file mode 100644 index 0000000..6bbb052 --- /dev/null +++ b/docker-images-datalab/activetigger/requirementspython.txt @@ -0,0 +1,6 @@ +pip install argparse datasets fasttext numpy pandas pyarrow scikit-learn +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +pip install transformers[torch] +pip install sentence_transformers +pip install -U typing-inspect==0.8.0 typing_extensions==4.6.1 +pip install spacy \ No newline at end of file diff --git a/docker-images-datalab/index.md b/docker-images-datalab/index.md new file mode 100644 index 0000000..8f9b045 --- /dev/null +++ b/docker-images-datalab/index.md @@ -0,0 +1,3 @@ +## GENES datalab Docker images + +Repository grouping all the Docker images created by GENES, which will be used to deploy new services in the datalab catalogue via Helm charts \ No newline at end of file diff --git a/docker-images-datalab/requirementspython.txt b/docker-images-datalab/requirementspython.txt new file mode 100644 index 0000000..af56f7c --- /dev/null +++ b/docker-images-datalab/requirementspython.txt @@ -0,0 +1,15 @@ +argparse
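+# argparse is part of the Python standard library (3.2+); the PyPI package of the same name is unnecessary here.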
+datasets +fasttext +numpy +pandas +pyarrow +scikit-learn +torch==1.8.0+cu118 +torchvision==0.9.0+cu118 +torchaudio==0.8.0 +transformers +sentence_transformers +typing-inspect==0.8.0 +typing-extensions==4.6.1 +spacy \ No newline at end of file diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/.helmignore b/helm-charts-datalab/helm-charts-test/charts/activetigger/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/Chart.yaml b/helm-charts-datalab/helm-charts-test/charts/activetigger/Chart.yaml new file mode 100644 index 0000000..f71d675 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: activetigger +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/index.md b/helm-charts-datalab/helm-charts-test/charts/activetigger/index.md new file mode 100644 index 0000000..f4d05fe --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/index.md @@ -0,0 +1,63 @@ +# Repository for automatic deployment of a Shiny server (R + python), to be made available to students for a few months + +See the ingress configuration in the values.yaml file of the activetigger helm chart + +Docker image where the deployment instructions were added: https://code.groupe-genes.fr/DSIT/datalab/src/branch/main/docker-images-datalab/activetigger + +# DEPLOYMENT INSTRUCTIONS + +Here is the Active Tigger git repository: https://gitlab.univ-lille.fr/julien.boelaert/activetigger +The program requires Python, R and Shiny Server.
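+ +A quick way to check that the three prerequisites are actually available before going further (a minimal sketch; the shiny-server path below is the .deb install default and may differ on your setup): +``` +R --version +python3 --version +/opt/shiny-server/bin/shiny-server --version +```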
+ +## R packages (to install from the R executable used by shiny server) +``` +install.packages(c("arrow", "class", "data.table", "DT", "foreign", "glmnet", "haven", "LiblineaR", "Matrix", "Metrics", "quanteda", "quanteda.textmodels", "ranger", "readODS", "readxl", "RJSONIO", "rlang", "Rtsne", "shiny", "SparseM", "stringi", "uwot")) +``` + +## Python environment +``` +conda create -n tigger python==3.10 +conda activate tigger +``` + +``` +pip install argparse datasets fasttext numpy pandas pyarrow scikit-learn +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +pip install transformers[torch] +pip install sentence_transformers +pip install -U typing-inspect==0.8.0 typing_extensions==4.6.1 +pip install spacy +``` +## Downloading the spacy and fasttext models + +### French +``` +python -m spacy download fr_core_news_sm +cd ~ +wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz +gunzip cc.fr.300.bin.gz +``` +### English +``` +python -m spacy download en_core_web_sm +cd ~ +wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz +gunzip cc.en.300.bin.gz +``` + +# On each instance creation + +## Git clone to create the new instance (replace "tigger-name" with the name the instance will take, i.e. https://analytics.huma-num.fr/Prenom.Nom/tigger-name/) +``` +cd ~/zPublish/shiny +git clone https://gitlab.univ-lille.fr/julien.boelaert/activetigger.git tigger-name +``` +## In the application + +At the very top left, use the "+" button to "create project". Then in the fields: + +- data directory: I always use ~/tagging/domain (e.g. ~/tagging/radio or ~/tagging/journaux), but it is up to you where you want the data and tags to be stored on your server +- I advise ticking all the boxes: python, spacy, fasttext, sbert, gpu +- python: "~/conda/envs/tigger/bin/python" +- fasttext: "~/cc.fr.300.bin" (i.e. you must give the path of the model on your server, not just its name) +- spacy and SBERT: keep the default values for the chosen language \ No newline at end of file diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/NOTES.txt b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/NOTES.txt new file mode 100644 index 0000000..52c6bee --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "test1.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "test1.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "test1.fullname" .
}} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "test1.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/_helpers.tpl b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/_helpers.tpl new file mode 100644 index 0000000..3625df9 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "test1.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "test1.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "test1.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "test1.labels" -}} +helm.sh/chart: {{ include "test1.chart" . }} +{{ include "test1.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "test1.selectorLabels" -}} +app.kubernetes.io/name: {{ include "test1.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "test1.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "test1.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/deployment.yaml b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/deployment.yaml new file mode 100644 index 0000000..bd09077 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "test1.fullname" . }} + labels: + {{- include "test1.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "test1.selectorLabels" . 
| nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "test1.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "test1.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.volumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.volumes }} + volumes: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/hpa.yaml b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/hpa.yaml new file mode 100644 index 0000000..f0b4cfe --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/hpa.yaml @@ -0,0 +1,32 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "test1.fullname" . }} + labels: + {{- include "test1.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "test1.fullname" . }} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/ingress.yaml b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/ingress.yaml new file mode 100644 index 0000000..8a26ce5 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/ingress.yaml @@ -0,0 +1,61 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "test1.fullname" . 
-}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "test1.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/service.yaml b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/service.yaml new file mode 100644 index 0000000..34760fd --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "test1.fullname" . }} + labels: + {{- include "test1.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "test1.selectorLabels" . | nindent 4 }} diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/serviceaccount.yaml b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/serviceaccount.yaml new file mode 100644 index 0000000..5e8f157 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "test1.serviceAccountName" . }} + labels: + {{- include "test1.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.serviceAccount.automount }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/tests/test-connection.yaml b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/tests/test-connection.yaml new file mode 100644 index 0000000..cf40217 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "test1.fullname" . }}-test-connection" + labels: + {{- include "test1.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "test1.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/helm-charts-datalab/helm-charts-test/charts/activetigger/values.yaml b/helm-charts-datalab/helm-charts-test/charts/activetigger/values.yaml new file mode 100644 index 0000000..ab2c50c --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/activetigger/values.yaml @@ -0,0 +1,107 @@ +# Default values for test1. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: code.groupe-genes.fr/datalab/docker-images-datalab/activetigger + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 8000 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +# Additional volumes on the output Deployment definition. +volumes: [] +# - name: foo +# secret: +# secretName: mysecret +# optional: false + +# Additional volumeMounts on the output Deployment definition.
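+# Note: Active Tigger writes projects and tags to the container filesystem (the data directory chosen in the app), so mount a persistent volume there or they will be lost on pod restart. Hypothetical example (adjust name and path to your setup): +# - name: at-data +#   mountPath: "/home/shiny/tagging"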
+volumeMounts: [] +# - name: foo +# mountPath: "/etc/foo" +# readOnly: true + +nodeSelector: {} + +tolerations: [] + +affinity: {} diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/.helmignore b/helm-charts-datalab/helm-charts-test/charts/overleaf/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/Chart.yaml b/helm-charts-datalab/helm-charts-test/charts/overleaf/Chart.yaml new file mode 100644 index 0000000..ac43f14 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: overleaf +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/index.md b/helm-charts-datalab/helm-charts-test/charts/overleaf/index.md new file mode 100644 index 0000000..f6146c0 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/index.md @@ -0,0 +1,20 @@ +## Useful information + +Insee example: +chart: + https://github.com/InseeFrLab/helm-charts-interactive-services/blob/main/charts/vscode-python/Chart.yaml + +dockerfile: + https://github.com/InseeFrLab/images-datascience/blob/main/vscode/Dockerfile + + +OVERLEAF: +image: +https://github.com/overleaf/overleaf + https://github.com/overleaf/overleaf/blob/main/server-ce/Dockerfile + https://github.com/overleaf/toolkit/ + https://github.com/overleaf/toolkit/blob/master/doc/quick-start-guide.md + +helm chart example: +https://artifacthub.io/packages/helm/geek-cookbook/overleaf +https://github.com/geek-cookbook/charts/tree/main/charts/stable/overleaf \ No newline at end of file diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/NOTES.txt b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/NOTES.txt new file mode 100644 index 0000000..c8680d6 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/NOTES.txt @@ -0,0 +1,22 @@ +1.
Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "overleaf.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "overleaf.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "overleaf.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "overleaf.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/_helpers.tpl b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/_helpers.tpl new file mode 100644 index 0000000..8fe42d4 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "overleaf.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "overleaf.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "overleaf.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "overleaf.labels" -}} +helm.sh/chart: {{ include "overleaf.chart" . }} +{{ include "overleaf.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "overleaf.selectorLabels" -}} +app.kubernetes.io/name: {{ include "overleaf.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "overleaf.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "overleaf.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/deployment.yaml b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/deployment.yaml new file mode 100644 index 0000000..e6e8a6b --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "overleaf.fullname" . }} + labels: + {{- include "overleaf.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "overleaf.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "overleaf.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "overleaf.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.volumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.volumes }} + volumes: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/hpa.yaml b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/hpa.yaml new file mode 100644 index 0000000..a9c4988 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/hpa.yaml @@ -0,0 +1,32 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "overleaf.fullname" . }} + labels: + {{- include "overleaf.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "overleaf.fullname" . 
}} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/ingress.yaml b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/ingress.yaml new file mode 100644 index 0000000..8e87c01 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/ingress.yaml @@ -0,0 +1,61 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "overleaf.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "overleaf.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/service.yaml b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/service.yaml new file mode 100644 index 0000000..407f9b3 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "overleaf.fullname" . }} + labels: + {{- include "overleaf.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "overleaf.selectorLabels" . 
| nindent 4 }} diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/serviceaccount.yaml b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/serviceaccount.yaml new file mode 100644 index 0000000..78193cb --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "overleaf.serviceAccountName" . }} + labels: + {{- include "overleaf.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.serviceAccount.automount }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/tests/test-connection.yaml b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/tests/test-connection.yaml new file mode 100644 index 0000000..d00c184 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "overleaf.fullname" . }}-test-connection" + labels: + {{- include "overleaf.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "overleaf.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/helm-charts-datalab/helm-charts-test/charts/overleaf/values.yaml b/helm-charts-datalab/helm-charts-test/charts/overleaf/values.yaml new file mode 100644 index 0000000..4ce91cd --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/overleaf/values.yaml @@ -0,0 +1,107 @@ +# Default values for overleaf. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: sharelatex/sharelatex + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: 4 + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 80 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
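+ # Note: Overleaf CE (the sharelatex image) bundles several services plus a full TeX Live install; in practice it will likely need resources well above the commented example below, so treat these numbers as placeholders.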
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +# Additional volumes on the output Deployment definition. +volumes: [] +# - name: foo +# secret: +# secretName: mysecret +# optional: false + +# Additional volumeMounts on the output Deployment definition. +volumeMounts: [] +# - name: foo +# mountPath: "/etc/foo" +# readOnly: true + +nodeSelector: {} + +tolerations: [] + +affinity: {} diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/.helmignore b/helm-charts-datalab/helm-charts-test/charts/test2/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/Chart.yaml b/helm-charts-datalab/helm-charts-test/charts/test2/Chart.yaml new file mode 100644 index 0000000..df7f09c --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: test2 +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/templates/NOTES.txt b/helm-charts-datalab/helm-charts-test/charts/test2/templates/NOTES.txt new file mode 100644 index 0000000..8f6d373 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "test2.fullname" . 
}}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of it by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "test2.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "test2.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "test2.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/templates/_helpers.tpl b/helm-charts-datalab/helm-charts-test/charts/test2/templates/_helpers.tpl new file mode 100644 index 0000000..b34baed --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "test2.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "test2.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "test2.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "test2.labels" -}} +helm.sh/chart: {{ include "test2.chart" . }} +{{ include "test2.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "test2.selectorLabels" -}} +app.kubernetes.io/name: {{ include "test2.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "test2.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "test2.fullname" .) 
.Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/templates/deployment.yaml b/helm-charts-datalab/helm-charts-test/charts/test2/templates/deployment.yaml new file mode 100644 index 0000000..f8ef1db --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/templates/deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "test2.fullname" . }} + labels: + {{- include "test2.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "test2.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "test2.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "test2.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.volumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.volumes }} + volumes: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/templates/hpa.yaml b/helm-charts-datalab/helm-charts-test/charts/test2/templates/hpa.yaml new file mode 100644 index 0000000..25f8047 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/templates/hpa.yaml @@ -0,0 +1,32 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "test2.fullname" . }} + labels: + {{- include "test2.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "test2.fullname" . 
}} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/templates/ingress.yaml b/helm-charts-datalab/helm-charts-test/charts/test2/templates/ingress.yaml new file mode 100644 index 0000000..3a4bbf6 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/templates/ingress.yaml @@ -0,0 +1,61 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "test2.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "test2.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/templates/service.yaml b/helm-charts-datalab/helm-charts-test/charts/test2/templates/service.yaml new file mode 100644 index 0000000..77975d4 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "test2.fullname" . }} + labels: + {{- include "test2.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "test2.selectorLabels" . 
| nindent 4 }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/templates/serviceaccount.yaml b/helm-charts-datalab/helm-charts-test/charts/test2/templates/serviceaccount.yaml new file mode 100644 index 0000000..d7e0a53 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "test2.serviceAccountName" . }} + labels: + {{- include "test2.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.serviceAccount.automount }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/templates/tests/test-connection.yaml b/helm-charts-datalab/helm-charts-test/charts/test2/templates/tests/test-connection.yaml new file mode 100644 index 0000000..e09a13e --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "test2.fullname" . }}-test-connection" + labels: + {{- include "test2.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "test2.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/helm-charts-datalab/helm-charts-test/charts/test2/values.yaml b/helm-charts-datalab/helm-charts-test/charts/test2/values.yaml new file mode 100644 index 0000000..5a142e5 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test2/values.yaml @@ -0,0 +1,107 @@ +# Default values for test2. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: nginx + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 80 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +# Additional volumes on the output Deployment definition. +volumes: [] +# - name: foo +# secret: +# secretName: mysecret +# optional: false + +# Additional volumeMounts on the output Deployment definition. +volumeMounts: [] +# - name: foo +# mountPath: "/etc/foo" +# readOnly: true + +nodeSelector: {} + +tolerations: [] + +affinity: {} diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/.helmignore b/helm-charts-datalab/helm-charts-test/charts/test3/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/Chart.yaml b/helm-charts-datalab/helm-charts-test/charts/test3/Chart.yaml new file mode 100644 index 0000000..7c447b0 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: test3 +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/templates/NOTES.txt b/helm-charts-datalab/helm-charts-test/charts/test3/templates/NOTES.txt new file mode 100644 index 0000000..ca312c2 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "test3.fullname" . 
}}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of it by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "test3.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "test3.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "test3.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/templates/_helpers.tpl b/helm-charts-datalab/helm-charts-test/charts/test3/templates/_helpers.tpl new file mode 100644 index 0000000..a9b6c9e --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "test3.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "test3.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "test3.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels
*/}} +{{- define "test3.labels" -}} +helm.sh/chart: {{ include "test3.chart" . }} +{{ include "test3.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "test3.selectorLabels" -}} +app.kubernetes.io/name: {{ include "test3.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "test3.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "test3.fullname" .) 
.Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/templates/deployment.yaml b/helm-charts-datalab/helm-charts-test/charts/test3/templates/deployment.yaml new file mode 100644 index 0000000..01a1afd --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/templates/deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "test3.fullname" . }} + labels: + {{- include "test3.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "test3.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "test3.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "test3.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.volumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.volumes }} + volumes: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/templates/hpa.yaml b/helm-charts-datalab/helm-charts-test/charts/test3/templates/hpa.yaml new file mode 100644 index 0000000..8282d6c --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/templates/hpa.yaml @@ -0,0 +1,32 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "test3.fullname" . }} + labels: + {{- include "test3.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "test3.fullname" . 
}} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/templates/ingress.yaml b/helm-charts-datalab/helm-charts-test/charts/test3/templates/ingress.yaml new file mode 100644 index 0000000..e813ac4 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/templates/ingress.yaml @@ -0,0 +1,61 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "test3.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "test3.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/templates/service.yaml b/helm-charts-datalab/helm-charts-test/charts/test3/templates/service.yaml new file mode 100644 index 0000000..6d7ecf8 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "test3.fullname" . }} + labels: + {{- include "test3.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "test3.selectorLabels" . 
| nindent 4 }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/templates/serviceaccount.yaml b/helm-charts-datalab/helm-charts-test/charts/test3/templates/serviceaccount.yaml new file mode 100644 index 0000000..b6bae8a --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "test3.serviceAccountName" . }} + labels: + {{- include "test3.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.serviceAccount.automount }} +{{- end }} diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/templates/tests/test-connection.yaml b/helm-charts-datalab/helm-charts-test/charts/test3/templates/tests/test-connection.yaml new file mode 100644 index 0000000..2b47022 --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "test3.fullname" . }}-test-connection" + labels: + {{- include "test3.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "test3.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/helm-charts-datalab/helm-charts-test/charts/test3/values.yaml b/helm-charts-datalab/helm-charts-test/charts/test3/values.yaml new file mode 100644 index 0000000..e2551bd --- /dev/null +++ b/helm-charts-datalab/helm-charts-test/charts/test3/values.yaml @@ -0,0 +1,107 @@ +# Default values for test3. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: nginx + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 80 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +# Additional volumes on the output Deployment definition. +volumes: [] +# - name: foo +# secret: +# secretName: mysecret +# optional: false + +# Additional volumeMounts on the output Deployment definition. +volumeMounts: [] +# - name: foo +# mountPath: "/etc/foo" +# readOnly: true + +nodeSelector: {} + +tolerations: [] + +affinity: {} diff --git a/helm-charts-datalab/index.md b/helm-charts-datalab/index.md new file mode 100644 index 0000000..796a044 --- /dev/null +++ b/helm-charts-datalab/index.md @@ -0,0 +1,3 @@ +## Helm charts test + +Test repository for adding new services to the datalab catalogue \ No newline at end of file diff --git a/index.md b/index.md new file mode 100644 index 0000000..6c6d2a2 --- /dev/null +++ b/index.md @@ -0,0 +1,5 @@ +## Genes datalab Helm charts + +Repository grouping all the Helm charts used to deploy the services; this version is currently in testing + +See the "values.yaml" file to add a new repository to the datalab catalogue \ No newline at end of file diff --git a/kk b/kk deleted file mode 100644 index a427aba..0000000 --- a/kk +++ /dev/null @@ -1 +0,0 @@ -hh \ No newline at end of file diff --git a/values.yaml b/values.yaml new file mode 100644 index 0000000..59e17dc --- /dev/null +++ b/values.yaml @@ -0,0 +1,18 @@ +## Snippet to add to the Onyxia app's values.yaml file to register the test catalogue (https://code.groupe-genes.fr/DSIT/kube-apps/src/branch/main/apps/onyxia/values.yaml) + + { + "id": "test", + "name": "test", + "description": "test services.", + "maintainer": "support@groupe-genes.fr", + "location": "link to the repository, to be added once created", + "status": "PROD", + "highlightedCharts": + [ + "overleaf", + "activetigger", + "test2", + "test3" + ], + "type": "helm", + }, \ No newline at end of file
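
Note: for reference, below is a minimal sketch of how the catalogue entry above might be wired into the Onyxia app's values.yaml. This is an illustration under stated assumptions, not the confirmed configuration: the key path (onyxia.api.catalogs) follows some versions of the Onyxia Helm chart and must be checked against the deployed version, and the location URL is a hypothetical placeholder for the chart repository once it is published.

# Hypothetical sketch: the onyxia.api.catalogs key path and the location URL
# are assumptions, to be adapted to the Onyxia chart version actually deployed.
onyxia:
  api:
    catalogs:
      - id: "test"
        name: "test"
        description: "test services."
        maintainer: "support@groupe-genes.fr"
        # Placeholder URL, not the real repository address
        location: "https://example.invalid/helm-charts-datalab"
        status: "PROD"
        highlightedCharts: ["overleaf", "activetigger", "test2", "test3"]
        type: "helm"

Once the entry is merged and the real repository URL filled in, Onyxia should list the four highlighted charts (overleaf, activetigger, test2, test3) in the datalab catalogue.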