ajout de l'image myactivetigger
This commit is contained in:
parent
710fd3bd45
commit
dd830a2cbb
48
docker-images-datalab/myactivetigger/.drone.yml
Normal file
48
docker-images-datalab/myactivetigger/.drone.yml
Normal file
|
@ -0,0 +1,48 @@
|
|||
kind: pipeline
|
||||
name: Build & publish main
|
||||
|
||||
steps:
|
||||
- name: publish-image
|
||||
pull: always
|
||||
image: plugins/kaniko:1.7.1-kaniko1.9.1
|
||||
settings:
|
||||
auto_tag: true
|
||||
auto_tag_suffix: latest
|
||||
registry: code.groupe-genes.fr
|
||||
repo: code.groupe-genes.fr/datalab/docker-images-datalab/activetigger
|
||||
username:
|
||||
from_secret: docker_username
|
||||
password:
|
||||
from_secret: docker_password
|
||||
when:
|
||||
event:
|
||||
exclude:
|
||||
- pull_request
|
||||
- name: deploy
|
||||
image: alpine
|
||||
environment:
|
||||
kubernetes_server:
|
||||
from_secret: kubernetes_server
|
||||
kubernetes_cert:
|
||||
from_secret: kubernetes_cert
|
||||
kubernetes_token:
|
||||
from_secret: kubernetes_token
|
||||
commands:
|
||||
- apk add --no-cache curl
|
||||
- curl -LL -o /usr/bin/kubectl "https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl"
|
||||
- curl -LL -o helm.tar.gz "https://get.helm.sh/helm-v3.14.0-linux-amd64.tar.gz"
|
||||
- tar xf "helm.tar.gz" && mv ./linux-amd64/helm /usr/bin/helm
|
||||
- chmod +x /usr/bin/kubectl
|
||||
- chmod +x /usr/bin/helm
|
||||
- kubectl config set-cluster default --server=$kubernetes_server --insecure-skip-tls-verify=true
|
||||
- kubectl config set-credentials user --token=$kubernetes_token
|
||||
- kubectl config set-context default --user=user --cluster=default --namespace=activetigger
|
||||
- kubectl config use-context default
|
||||
- kubectl get pods
|
||||
- helm ls -n activetigger --debug
|
||||
- helm dependency build ./helm-chart
|
||||
- helm upgrade activetigger ./helm-chart -f ./helm-chart/values.yaml -n activetigger
|
||||
when:
|
||||
event:
|
||||
exclude:
|
||||
- pull_request
|
130
docker-images-datalab/myactivetigger/Dockerfile
Normal file
130
docker-images-datalab/myactivetigger/Dockerfile
Normal file
|
@ -0,0 +1,130 @@
|
|||
|
||||
ARG CACHEBUST=1
|
||||
FROM ubuntu:22.04
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
COPY requirements.r /requirements.r
|
||||
COPY requirementspython.txt /requirementspython.txt
|
||||
|
||||
|
||||
# Installation python
|
||||
RUN apt-get update && apt-get install -y \
|
||||
python3.10 \
|
||||
python3.10-distutils \
|
||||
python3.10-venv \
|
||||
python3-pip \
|
||||
r-base \
|
||||
wget \
|
||||
&& apt-get clean
|
||||
|
||||
# Installation R + shiny server
|
||||
RUN apt-get update && \
|
||||
apt-get install -y r-base
|
||||
RUN R -e "install.packages('shiny', repos='https://cran.rstudio.com/')"
|
||||
|
||||
# Install gdebi-core and shiny-server
|
||||
RUN apt-get update
|
||||
RUN apt-get install -y gdebi-core
|
||||
RUN wget https://download3.rstudio.org/ubuntu-18.04/x86_64/shiny-server-1.5.21.1012-amd64.deb
|
||||
RUN gdebi --non-interactive shiny-server-1.5.21.1012-amd64.deb
|
||||
|
||||
|
||||
## Packages package R (à installer depuis l'exécutable R employé par shiny server)
|
||||
RUN Rscript /requirements.r
|
||||
|
||||
## Environnement python | a vérifier dans requirementspython.txt l'installation des cu118 se fais de cette manière
|
||||
|
||||
|
||||
|
||||
# Install Miniconda
|
||||
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
|
||||
bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
|
||||
rm Miniconda3-latest-Linux-x86_64.sh
|
||||
|
||||
# Add Conda binaries to PATH
|
||||
ENV PATH="/opt/conda/bin:${PATH}"
|
||||
|
||||
# Create a Conda environment and activate it
|
||||
RUN conda create -n tigger python==3.10 && \
|
||||
echo "conda activate tigger" >> ~/.bashrc
|
||||
|
||||
# Mise à jour et installation des dépendances système
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip3 install --no-cache-dir \
|
||||
torch torchvision torchaudio \
|
||||
-f https://download.pytorch.org/whl/cu118/torch_stable.html
|
||||
|
||||
RUN apt-get update && apt-get install -y curl build-essential
|
||||
RUN pip3 install --no-cache-dir six
|
||||
|
||||
# Install Rust using rustup
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
||||
|
||||
# Add Cargo's bin directory to the PATH environment variable
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
RUN pip3 install --no-cache-dir --upgrade setuptools
|
||||
|
||||
|
||||
# Autres installations de bibliothèques Python
|
||||
RUN pip3 install argparse
|
||||
RUN pip3 install datasets
|
||||
RUN pip3 install fasttext
|
||||
RUN pip3 install numpy
|
||||
RUN pip3 install pandas
|
||||
RUN pip3 install pyarrow
|
||||
RUN pip3 install scikit-learn
|
||||
RUN pip3 install sentence-transformers
|
||||
RUN pip3 install transformers
|
||||
RUN pip3 install typing-inspect==0.8.0
|
||||
RUN pip3 install typing-extensions==4.6.1
|
||||
RUN pip3 install spacy
|
||||
|
||||
# Mettre en place des configurations supplémentaires si nécessaire
|
||||
|
||||
# Commande par défaut à exécuter lorsque le conteneur démarre
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
## Téléchargement des modèles spacy et fasttext
|
||||
|
||||
### Français
|
||||
#WORKDIR ~
|
||||
#RUN python -m spacy download fr_core_news_sm
|
||||
|
||||
#RUN python -m spacy download fr_core_news_sm \
|
||||
#WORKDIR ~
|
||||
|
||||
RUN wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz \
|
||||
&& gunzip cc.fr.300.bin.gz
|
||||
|
||||
|
||||
# A chaque création d'instance
|
||||
|
||||
## Clone git pour créer la nouvelle instance (remplacer "tigger-name" par le nom que prendra l'instance, ie https://analytics.huma-num.fr/Prenom.Nom/tigger-name/)
|
||||
COPY activetigger/ ./activetigger
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Expose the default Shiny port
|
||||
EXPOSE 3838
|
||||
|
||||
# Command to run the Shiny app
|
||||
CMD ["R", "-e", "shiny::runApp('activetigger', port=3838, host='0.0.0.0')"]
|
||||
|
||||
RUN R -e "install.packages('quanteda')"
|
||||
|
||||
|
||||
## Dans l'application
|
||||
|
||||
## Tout en haut à gauche, bouton "+" pour "create project". Puis dans les champs :
|
||||
|
||||
## - data directory: moi j'utilise toujours ~/tagging/domaine (genre ~/tagging/radio ou ~/tagging/journaux), mais c'est à toi de voir où tu veux que les données et tags soient stockées sur ton serveur
|
||||
## - je conseille de cocher toutes les cases : python, spacy, fasttext, sbert, gpu
|
||||
## - python : "~/conda/envs/tigger/bin/python"
|
||||
## - fasttext : "~/cc.fr.300.bin" (càd qu'il faut donner le chemin du modèle sur ton serveur, pas juste le nom)
|
||||
## - spacy et SBERT : garder les valeurs par défaut pour la langue choisie
|
|
@ -0,0 +1,41 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
## FastText embed sentences
|
||||
## Requires data file with columns id and text
|
||||
|
||||
import argparse
|
||||
import fasttext
|
||||
from os.path import expanduser
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pyarrow.feather as feather
|
||||
import re
|
||||
|
||||
|
||||
def main(args):
|
||||
print("FastText: Importing data")
|
||||
datapath = expanduser(args.data)
|
||||
dat = feather.read_feather(datapath)
|
||||
outfile = re.sub("[.]feather$", "_ft.feather", datapath)
|
||||
|
||||
print("FastText: Loading model")
|
||||
ft = fasttext.load_model(expanduser(args.model))
|
||||
print("FastText: Embedding sentences")
|
||||
emb = [ft.get_sentence_vector(re.sub("\n", " ", x)) for x in dat["text"]]
|
||||
|
||||
print("FastText: Exporting")
|
||||
emb = pd.DataFrame(emb)
|
||||
emb.columns = ["ft%03d" % (x + 1) for x in range(len(emb.columns))]
|
||||
emb = pd.concat([dat["id"], emb], axis=1)
|
||||
feather.write_feather(emb, outfile)
|
||||
print("FastText: Done")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
argParser = argparse.ArgumentParser()
|
||||
argParser.add_argument("-m", "--model", help="Model path", default="/data/user/b/jboelaert/cc.fr.100.bin")
|
||||
argParser.add_argument("-d", "--data", help="Path to data (feather)")
|
||||
args = argParser.parse_args()
|
||||
main(args)
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
## SBERT embed sentences
|
||||
## Requires data file with columns id and text
|
||||
|
||||
import argparse
|
||||
from os.path import expanduser
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pyarrow.feather as feather
|
||||
import re
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
|
||||
def main(args):
|
||||
print("SBERT: Importing data")
|
||||
datapath = expanduser(args.data)
|
||||
dat = feather.read_feather(datapath)
|
||||
outfile = re.sub("[.]feather$", "_sb.feather", datapath)
|
||||
|
||||
print("SBERT: Loading model")
|
||||
sbert = SentenceTransformer(expanduser(args.model))
|
||||
sbert.max_seq_length = 512
|
||||
print("SBERT: Embedding sentences")
|
||||
emb = sbert.encode(dat["text"])
|
||||
|
||||
print("SBERT: Exporting")
|
||||
emb = pd.DataFrame(emb)
|
||||
emb.columns = ["sb%03d" % (x + 1) for x in range(len(emb.columns))]
|
||||
emb = pd.concat([dat["id"], emb], axis=1)
|
||||
feather.write_feather(emb, outfile)
|
||||
print("SBERT: Done")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
argParser = argparse.ArgumentParser()
|
||||
argParser.add_argument("-m", "--model", help="Model name or path", default="distiluse-base-multilingual-cased-v1")
|
||||
argParser.add_argument("-d", "--data", help="Path to data (feather)")
|
||||
args = argParser.parse_args()
|
||||
main(args)
|
||||
|
174
docker-images-datalab/myactivetigger/activetigger/gobert.py
Normal file
174
docker-images-datalab/myactivetigger/activetigger/gobert.py
Normal file
|
@ -0,0 +1,174 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
## BERT trainer to be called by server.R
|
||||
## Requires two data files with columns id, label and text
|
||||
|
||||
import argparse
|
||||
import datasets
|
||||
from datasets import load_metric
|
||||
import numpy as np
|
||||
from os.path import expanduser
|
||||
import os
|
||||
import pandas as pd
|
||||
import re
|
||||
from sklearn import metrics
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
from transformers import Trainer, TrainingArguments, TrainerCallback
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
def main(args):
|
||||
print("Importing data")
|
||||
dattrain = pd.read_csv(expanduser(args.traindat))
|
||||
datval = pd.read_csv(expanduser(args.valdat))
|
||||
datval_id = datval["id"]
|
||||
classcolname = "label"
|
||||
|
||||
## Make class_names
|
||||
class_names = [x for x in dattrain[classcolname].unique()]
|
||||
|
||||
## Labels to class number
|
||||
dattrain[classcolname] = [class_names.index(x) for x in dattrain[classcolname].to_list()]
|
||||
datval[classcolname] = [class_names.index(x) for x in datval[classcolname].to_list()]
|
||||
|
||||
## Transform to datasets
|
||||
dattrain = datasets.Dataset.from_pandas(dattrain[['text', 'label']])
|
||||
datval = datasets.Dataset.from_pandas(datval[['text', 'label']])
|
||||
|
||||
# Model choice
|
||||
modelname = expanduser(args.model)
|
||||
|
||||
## Tokenizer
|
||||
print("Tokenizing")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(modelname)
|
||||
|
||||
# toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length"), batched=True)
|
||||
# toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length"), batched=True)
|
||||
if args.adapt:
|
||||
toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding=True, max_length=512), batched=True)
|
||||
toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding=True, max_length=512), batched=True)
|
||||
else:
|
||||
toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length", max_length=512), batched=True)
|
||||
toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length", max_length=512), batched=True)
|
||||
|
||||
del(dattrain)
|
||||
|
||||
## Model
|
||||
print("Loading model")
|
||||
model = AutoModelForSequenceClassification.from_pretrained(modelname, num_labels = len(class_names))
|
||||
if (args.gpu):
|
||||
model.cuda()
|
||||
|
||||
## Train using Trainer interface
|
||||
print("Training...")
|
||||
BATCH_SIZE = args.batchsize
|
||||
GRAD_ACC = args.gradacc
|
||||
epochs = args.epochs
|
||||
|
||||
total_steps = (epochs * len(toktrain)) // (BATCH_SIZE * GRAD_ACC)
|
||||
warmup_steps = (total_steps) // 10
|
||||
eval_steps = total_steps // args.eval
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir=args.session + "_train",
|
||||
learning_rate=args.lrate,
|
||||
weight_decay=args.wdecay,
|
||||
num_train_epochs=epochs,
|
||||
gradient_accumulation_steps=GRAD_ACC,
|
||||
per_device_train_batch_size=BATCH_SIZE,
|
||||
# per_device_eval_batch_size=BATCH_SIZE,
|
||||
per_device_eval_batch_size=32,
|
||||
warmup_steps=warmup_steps,
|
||||
|
||||
eval_steps=eval_steps,
|
||||
evaluation_strategy="steps",
|
||||
save_strategy="steps",
|
||||
save_steps=eval_steps,
|
||||
logging_steps=eval_steps,
|
||||
do_eval=True,
|
||||
greater_is_better=False,
|
||||
load_best_model_at_end=bool(args.best),
|
||||
metric_for_best_model="eval_loss"
|
||||
)
|
||||
|
||||
trainer = Trainer(model=model, args=training_args,
|
||||
train_dataset=toktrain, eval_dataset=toktest)
|
||||
|
||||
the_session = args.session
|
||||
class HaltCallback(TrainerCallback):
|
||||
"A callback that checks for _stop file to interrupt training"
|
||||
|
||||
def on_step_begin(self, args, state, control, **kwargs):
|
||||
if os.path.exists(the_session + "_stop"):
|
||||
print("\nHalted by user.\n")
|
||||
control.should_training_stop = True
|
||||
return(control)
|
||||
else:
|
||||
print("\nNot halted by user.\n")
|
||||
|
||||
trainer.add_callback(HaltCallback)
|
||||
|
||||
trainer.train()
|
||||
|
||||
## Add class names to model
|
||||
label_to_id = {v: i for i, v in enumerate(class_names)}
|
||||
model.config.label2id = label_to_id
|
||||
model.config.id2label = {id: label for label, id in model.config.label2id.items()}
|
||||
|
||||
## Save model
|
||||
model.save_pretrained(args.session)
|
||||
|
||||
|
||||
## Prediction functions
|
||||
|
||||
|
||||
def get_predprobs(text):
|
||||
# inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
|
||||
inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
|
||||
if (args.gpu):
|
||||
inputs = inputs.to("cuda")
|
||||
outputs = model(**inputs)
|
||||
res = outputs[0]
|
||||
if (args.gpu):
|
||||
res = res.cpu()
|
||||
res = res.softmax(1).detach().numpy()
|
||||
return res
|
||||
|
||||
|
||||
def get_prediction(text):
|
||||
return class_names[get_predprobs(text).argmax()]
|
||||
|
||||
## Metrics on validation set
|
||||
print("Computing predictions")
|
||||
testpred = [get_prediction(txt) for txt in datval["text"]]
|
||||
testtruth = [class_names[x] for x in datval["label"]]
|
||||
|
||||
exportpred = pd.DataFrame(datval_id)
|
||||
exportpred.columns = ["id"]
|
||||
exportpred["bertpred"] = testpred
|
||||
exportpred.to_csv(args.session + "_predval.csv", index=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
argParser = argparse.ArgumentParser()
|
||||
argParser.add_argument("-m", "--model", help="Model name or path", default="microsoft/Multilingual-MiniLM-L12-H384")
|
||||
argParser.add_argument("-t", "--traindat", help="Path to training data")
|
||||
argParser.add_argument("-v", "--valdat", help="Path to validation data")
|
||||
argParser.add_argument("-b", "--batchsize", help="Batch size for training", type=int, default=4)
|
||||
argParser.add_argument("-g", "--gradacc", help="Gradient accumulation for training", type=int, default=1)
|
||||
argParser.add_argument("-e", "--epochs", help="Number of training epochs", type=float, default=3)
|
||||
argParser.add_argument("-l", "--lrate", help="Learning rate", type=float, default=5e-05)
|
||||
argParser.add_argument("-w", "--wdecay", help="Weight decay", type=float, default=.01)
|
||||
argParser.add_argument("-B", "--best", help="Load best model instead of last", type=int, choices=[0,1], default=1)
|
||||
argParser.add_argument("-E", "--eval", help="Number of intermediary evaluations", type=int, default=10)
|
||||
argParser.add_argument("-s", "--session", help="Session name (used to save results)")
|
||||
argParser.add_argument("-G", "--gpu", help="Use GPU (CUDA)", type=int, choices=[0,1], default=0)
|
||||
argParser.add_argument("-A", "--adapt", help="Adapt token length to batch", type=int, choices=[0,1], default=1)
|
||||
|
||||
|
||||
args = argParser.parse_args()
|
||||
|
||||
main(args)
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
## BERT inference to be called by server.R
|
||||
|
||||
import argparse
|
||||
import datasets
|
||||
import json
|
||||
import numpy as np
|
||||
from os import path, remove
|
||||
import pandas as pd
|
||||
import pyarrow.feather as feather
|
||||
import re
|
||||
from torch import no_grad
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
|
||||
|
||||
def chunker(seq, batch_size):
|
||||
return (seq[pos:pos + batch_size] for pos in range(0, len(seq), batch_size))
|
||||
|
||||
|
||||
def main(args):
|
||||
print("Importing data")
|
||||
with open(path.expanduser(args.logfile), "w") as progfile:
|
||||
progfile.write("Importing data")
|
||||
|
||||
dat = feather.read_feather(path.expanduser(args.dat))
|
||||
|
||||
with open(path.expanduser(args.logfile), "w") as progfile:
|
||||
progfile.write("Tokenizing")
|
||||
|
||||
## Tokenizer
|
||||
print("Tokenizing")
|
||||
with open(path.join(path.expanduser(args.model), "config.json"), "r") as jsonfile:
|
||||
modeltype = json.load(jsonfile)["_name_or_path"]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(modeltype)
|
||||
|
||||
## Model
|
||||
print("Loading model")
|
||||
model = AutoModelForSequenceClassification.from_pretrained(path.expanduser(args.model))
|
||||
if (args.gpu):
|
||||
model.cuda()
|
||||
|
||||
## Prediction functions
|
||||
|
||||
|
||||
def get_predprobs(text):
|
||||
inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
|
||||
if (args.gpu):
|
||||
inputs = inputs.to("cuda")
|
||||
with no_grad():
|
||||
outputs = model(**inputs)
|
||||
res = outputs[0]
|
||||
if (args.gpu):
|
||||
res = res.cpu()
|
||||
res = res.softmax(1).detach().numpy()
|
||||
return res
|
||||
|
||||
print("Computing predictions")
|
||||
|
||||
chunks = chunker([str(x) for x in dat[args.txtname]], args.batch)
|
||||
pred = []
|
||||
for i, x in enumerate(chunks):
|
||||
if (i % 5 == 0):
|
||||
percent = round(100 * i * args.batch / len(dat), 1)
|
||||
logmsg = "Computing: " + str(percent) + "% (" + str(i * args.batch) + "/" + str(len(dat)) + ")"
|
||||
with open(path.expanduser(args.logfile), "w") as progfile:
|
||||
progfile.write(logmsg)
|
||||
pred.append(get_predprobs(x))
|
||||
|
||||
pred = np.concatenate(pred)
|
||||
pred = pd.DataFrame(pred)
|
||||
pred.columns = ["bertpred_" + v for i, v in model.config.id2label.items()]
|
||||
pred = pd.concat([dat[args.idname], pred], axis=1)
|
||||
feather.write_feather(pred, path.abspath(args.output))
|
||||
remove(path.expanduser(args.logfile))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
argParser = argparse.ArgumentParser()
|
||||
argParser.add_argument("-m", "--model", help="Trained model path")
|
||||
argParser.add_argument("-d", "--dat", help="Path to data (feather file)")
|
||||
argParser.add_argument("-o", "--output", help="Output path of predictions", default="tiggerbert.feather")
|
||||
argParser.add_argument("-i", "--idname", help="Name of id variable", default="id")
|
||||
argParser.add_argument("-x", "--txtname", help="Name of text variable", default="text")
|
||||
argParser.add_argument("-l", "--logfile", help="Path to log file", default="tiggerbert-progress.txt")
|
||||
argParser.add_argument("-G", "--gpu", help="Use GPU (CUDA)", type=int, choices=[0,1], default=1)
|
||||
argParser.add_argument("-b", "--batch", help="Batch size", type=int, default=128)
|
||||
|
||||
args = argParser.parse_args()
|
||||
|
||||
main(args)
|
||||
|
159
docker-images-datalab/myactivetigger/activetigger/modelnames.csv
Normal file
159
docker-images-datalab/myactivetigger/activetigger/modelnames.csv
Normal file
|
@ -0,0 +1,159 @@
|
|||
"short","spacy_name","fasttext_name","fasttext_url","language","short_lang"
|
||||
"af","xx_ent_wiki_sm","cc.af.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.bin.gz","Afrikaans","(af) Afrikaans"
|
||||
"als","xx_ent_wiki_sm","cc.als.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.bin.gz","Alemannic","(als) Alemannic"
|
||||
"am","xx_ent_wiki_sm","cc.am.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.bin.gz","Amharic","(am) Amharic"
|
||||
"an","xx_ent_wiki_sm","cc.an.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.bin.gz","Aragonese","(an) Aragonese"
|
||||
"ar","xx_ent_wiki_sm","cc.ar.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz","Arabic","(ar) Arabic"
|
||||
"arz","xx_ent_wiki_sm","cc.arz.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.bin.gz","Egyptian Arabic","(arz) Egyptian Arabic"
|
||||
"as","xx_ent_wiki_sm","cc.as.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.bin.gz","Assamese","(as) Assamese"
|
||||
"ast","xx_ent_wiki_sm","cc.ast.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.bin.gz","Asturian","(ast) Asturian"
|
||||
"az","xx_ent_wiki_sm","cc.az.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.bin.gz","Azerbaijani","(az) Azerbaijani"
|
||||
"azb","xx_ent_wiki_sm","cc.azb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.bin.gz","Southern Azerbaijani","(azb) Southern Azerbaijani"
|
||||
"ba","xx_ent_wiki_sm","cc.ba.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.bin.gz","Bashkir","(ba) Bashkir"
|
||||
"bar","xx_ent_wiki_sm","cc.bar.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.bin.gz","Bavarian","(bar) Bavarian"
|
||||
"bcl","xx_ent_wiki_sm","cc.bcl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.bin.gz","Central Bicolano","(bcl) Central Bicolano"
|
||||
"be","xx_ent_wiki_sm","cc.be.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.bin.gz","Belarusian","(be) Belarusian"
|
||||
"bg","xx_ent_wiki_sm","cc.bg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.bin.gz","Bulgarian","(bg) Bulgarian"
|
||||
"bh","xx_ent_wiki_sm","cc.bh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.bin.gz","Bihari","(bh) Bihari"
|
||||
"bn","xx_ent_wiki_sm","cc.bn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz","Bengali","(bn) Bengali"
|
||||
"bo","xx_ent_wiki_sm","cc.bo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.bin.gz","Tibetan","(bo) Tibetan"
|
||||
"bpy","xx_ent_wiki_sm","cc.bpy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.bin.gz","Bishnupriya Manipuri","(bpy) Bishnupriya Manipuri"
|
||||
"br","xx_ent_wiki_sm","cc.br.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.bin.gz","Breton","(br) Breton"
|
||||
"bs","xx_ent_wiki_sm","cc.bs.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.bin.gz","Bosnian","(bs) Bosnian"
|
||||
"ca","ca_core_news_sm","cc.ca.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.bin.gz","Catalan","(ca) Catalan"
|
||||
"ce","xx_ent_wiki_sm","cc.ce.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.bin.gz","Chechen","(ce) Chechen"
|
||||
"ceb","xx_ent_wiki_sm","cc.ceb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.bin.gz","Cebuano","(ceb) Cebuano"
|
||||
"ckb","xx_ent_wiki_sm","cc.ckb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.bin.gz","Kurdish (Sorani)","(ckb) Kurdish (Sorani)"
|
||||
"co","xx_ent_wiki_sm","cc.co.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.bin.gz","Corsican","(co) Corsican"
|
||||
"cs","xx_ent_wiki_sm","cc.cs.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.bin.gz","Czech","(cs) Czech"
|
||||
"cv","xx_ent_wiki_sm","cc.cv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.bin.gz","Chuvash","(cv) Chuvash"
|
||||
"cy","xx_ent_wiki_sm","cc.cy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz","Welsh","(cy) Welsh"
|
||||
"da","da_core_news_sm","cc.da.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.bin.gz","Danish","(da) Danish"
|
||||
"de","de_core_news_sm","cc.de.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz","German","(de) German"
|
||||
"diq","xx_ent_wiki_sm","cc.diq.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.bin.gz","Zazaki","(diq) Zazaki"
|
||||
"dv","xx_ent_wiki_sm","cc.dv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.bin.gz","Divehi","(dv) Divehi"
|
||||
"el","el_core_news_sm","cc.el.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.bin.gz","Greek","(el) Greek"
|
||||
"eml","xx_ent_wiki_sm","cc.eml.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.bin.gz","Emilian-Romagnol","(eml) Emilian-Romagnol"
|
||||
"en","en_core_web_sm","cc.en.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz","English","(en) English"
|
||||
"eo","xx_ent_wiki_sm","cc.eo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.bin.gz","Esperanto","(eo) Esperanto"
|
||||
"es","es_core_news_sm","cc.es.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz","Spanish","(es) Spanish"
|
||||
"et","xx_ent_wiki_sm","cc.et.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.bin.gz","Estonian","(et) Estonian"
|
||||
"eu","xx_ent_wiki_sm","cc.eu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.bin.gz","Basque","(eu) Basque"
|
||||
"fa","xx_ent_wiki_sm","cc.fa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz","Persian","(fa) Persian"
|
||||
"fi","fi_core_news_sm","cc.fi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.bin.gz","Finnish","(fi) Finnish"
|
||||
"fr","fr_core_news_sm","cc.fr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz","French","(fr) French"
|
||||
"frr","xx_ent_wiki_sm","cc.frr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.bin.gz","North Frisian","(frr) North Frisian"
|
||||
"fy","xx_ent_wiki_sm","cc.fy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.bin.gz","West Frisian","(fy) West Frisian"
|
||||
"ga","xx_ent_wiki_sm","cc.ga.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.bin.gz","Irish","(ga) Irish"
|
||||
"gd","xx_ent_wiki_sm","cc.gd.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.bin.gz","Scottish Gaelic","(gd) Scottish Gaelic"
|
||||
"gl","xx_ent_wiki_sm","cc.gl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.bin.gz","Galician","(gl) Galician"
|
||||
"gom","xx_ent_wiki_sm","cc.gom.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.bin.gz","Goan Konkani","(gom) Goan Konkani"
|
||||
"gu","xx_ent_wiki_sm","cc.gu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.bin.gz","Gujarati","(gu) Gujarati"
|
||||
"gv","xx_ent_wiki_sm","cc.gv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.bin.gz","Manx","(gv) Manx"
|
||||
"he","xx_ent_wiki_sm","cc.he.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.bin.gz","Hebrew","(he) Hebrew"
|
||||
"hi","xx_ent_wiki_sm","cc.hi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz","Hindi","(hi) Hindi"
|
||||
"hif","xx_ent_wiki_sm","cc.hif.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.bin.gz","Fiji Hindi","(hif) Fiji Hindi"
|
||||
"hr","hr_core_news_sm","cc.hr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.bin.gz","Croatian","(hr) Croatian"
|
||||
"hsb","xx_ent_wiki_sm","cc.hsb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.bin.gz","Upper Sorbian","(hsb) Upper Sorbian"
|
||||
"ht","xx_ent_wiki_sm","cc.ht.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.bin.gz","Haitian","(ht) Haitian"
|
||||
"hu","xx_ent_wiki_sm","cc.hu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.bin.gz","Hungarian","(hu) Hungarian"
|
||||
"hy","xx_ent_wiki_sm","cc.hy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.bin.gz","Armenian","(hy) Armenian"
|
||||
"ia","xx_ent_wiki_sm","cc.ia.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.bin.gz","Interlingua","(ia) Interlingua"
|
||||
"id","xx_ent_wiki_sm","cc.id.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz","Indonesian","(id) Indonesian"
|
||||
"ilo","xx_ent_wiki_sm","cc.ilo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.bin.gz","Ilokano","(ilo) Ilokano"
|
||||
"io","xx_ent_wiki_sm","cc.io.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.bin.gz","Ido","(io) Ido"
|
||||
"is","xx_ent_wiki_sm","cc.is.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.bin.gz","Icelandic","(is) Icelandic"
|
||||
"it","it_core_news_sm","cc.it.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz","Italian","(it) Italian"
|
||||
"ja","ja_core_news_sm","cc.ja.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz","Japanese","(ja) Japanese"
|
||||
"jv","xx_ent_wiki_sm","cc.jv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.bin.gz","Javanese","(jv) Javanese"
|
||||
"ka","xx_ent_wiki_sm","cc.ka.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.bin.gz","Georgian","(ka) Georgian"
|
||||
"kk","xx_ent_wiki_sm","cc.kk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.bin.gz","Kazakh","(kk) Kazakh"
|
||||
"km","xx_ent_wiki_sm","cc.km.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.bin.gz","Khmer","(km) Khmer"
|
||||
"kn","xx_ent_wiki_sm","cc.kn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.bin.gz","Kannada","(kn) Kannada"
|
||||
"ko","ko_core_news_sm","cc.ko.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz","Korean","(ko) Korean"
|
||||
"ku","xx_ent_wiki_sm","cc.ku.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.bin.gz","Kurdish (Kurmanji)","(ku) Kurdish (Kurmanji)"
|
||||
"ky","xx_ent_wiki_sm","cc.ky.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.bin.gz","Kirghiz","(ky) Kirghiz"
|
||||
"la","xx_ent_wiki_sm","cc.la.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz","Latin","(la) Latin"
|
||||
"lb","xx_ent_wiki_sm","cc.lb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.bin.gz","Luxembourgish","(lb) Luxembourgish"
|
||||
"li","xx_ent_wiki_sm","cc.li.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.bin.gz","Limburgish","(li) Limburgish"
|
||||
"lmo","xx_ent_wiki_sm","cc.lmo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.bin.gz","Lombard","(lmo) Lombard"
|
||||
"lt","lt_core_news_sm","cc.lt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.bin.gz","Lithuanian","(lt) Lithuanian"
|
||||
"lv","xx_ent_wiki_sm","cc.lv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.bin.gz","Latvian","(lv) Latvian"
|
||||
"mai","xx_ent_wiki_sm","cc.mai.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.bin.gz","Maithili","(mai) Maithili"
|
||||
"mg","xx_ent_wiki_sm","cc.mg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.bin.gz","Malagasy","(mg) Malagasy"
|
||||
"mhr","xx_ent_wiki_sm","cc.mhr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.bin.gz","Meadow Mari","(mhr) Meadow Mari"
|
||||
"min","xx_ent_wiki_sm","cc.min.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.bin.gz","Minangkabau","(min) Minangkabau"
|
||||
"mk","mk_core_news_sm","cc.mk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.bin.gz","Macedonian","(mk) Macedonian"
|
||||
"ml","xx_ent_wiki_sm","cc.ml.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.bin.gz","Malayalam","(ml) Malayalam"
|
||||
"mn","xx_ent_wiki_sm","cc.mn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.bin.gz","Mongolian","(mn) Mongolian"
|
||||
"mr","xx_ent_wiki_sm","cc.mr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.bin.gz","Marathi","(mr) Marathi"
|
||||
"mrj","xx_ent_wiki_sm","cc.mrj.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.bin.gz","Hill Mari","(mrj) Hill Mari"
|
||||
"ms","xx_ent_wiki_sm","cc.ms.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.bin.gz","Malay","(ms) Malay"
|
||||
"mt","xx_ent_wiki_sm","cc.mt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.bin.gz","Maltese","(mt) Maltese"
|
||||
"mwl","xx_ent_wiki_sm","cc.mwl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.bin.gz","Mirandese","(mwl) Mirandese"
|
||||
"my","xx_ent_wiki_sm","cc.my.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.bin.gz","Burmese","(my) Burmese"
|
||||
"myv","xx_ent_wiki_sm","cc.myv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.bin.gz","Erzya","(myv) Erzya"
|
||||
"mzn","xx_ent_wiki_sm","cc.mzn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.bin.gz","Mazandarani","(mzn) Mazandarani"
|
||||
"nah","xx_ent_wiki_sm","cc.nah.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.bin.gz","Nahuatl","(nah) Nahuatl"
|
||||
"nap","xx_ent_wiki_sm","cc.nap.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.bin.gz","Neapolitan","(nap) Neapolitan"
|
||||
"nds","xx_ent_wiki_sm","cc.nds.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.bin.gz","Low Saxon","(nds) Low Saxon"
|
||||
"ne","xx_ent_wiki_sm","cc.ne.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.bin.gz","Nepali","(ne) Nepali"
|
||||
"new","xx_ent_wiki_sm","cc.new.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.bin.gz","Newar","(new) Newar"
|
||||
"nl","nl_core_news_sm","cc.nl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.bin.gz","Dutch","(nl) Dutch"
|
||||
"nn","xx_ent_wiki_sm","cc.nn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.bin.gz","Norwegian (Nynorsk)","(nn) Norwegian (Nynorsk)"
|
||||
"no","xx_ent_wiki_sm","cc.no.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.bin.gz","Norwegian (Bokmål)","(no) Norwegian (Bokmål)"
|
||||
"nso","xx_ent_wiki_sm","cc.nso.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.bin.gz","Northern Sotho","(nso) Northern Sotho"
|
||||
"oc","xx_ent_wiki_sm","cc.oc.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.bin.gz","Occitan","(oc) Occitan"
|
||||
"or","xx_ent_wiki_sm","cc.or.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.bin.gz","Oriya","(or) Oriya"
|
||||
"os","xx_ent_wiki_sm","cc.os.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.bin.gz","Ossetian","(os) Ossetian"
|
||||
"pa","xx_ent_wiki_sm","cc.pa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.bin.gz","Eastern Punjabi","(pa) Eastern Punjabi"
|
||||
"pam","xx_ent_wiki_sm","cc.pam.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.bin.gz","Kapampangan","(pam) Kapampangan"
|
||||
"pfl","xx_ent_wiki_sm","cc.pfl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.bin.gz","Palatinate German","(pfl) Palatinate German"
|
||||
"pl","pl_core_news_sm","cc.pl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz","Polish","(pl) Polish"
|
||||
"pms","xx_ent_wiki_sm","cc.pms.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.bin.gz","Piedmontese","(pms) Piedmontese"
|
||||
"pnb","xx_ent_wiki_sm","cc.pnb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.bin.gz","Western Punjabi","(pnb) Western Punjabi"
|
||||
"ps","xx_ent_wiki_sm","cc.ps.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.bin.gz","Pashto","(ps) Pashto"
|
||||
"pt","pt_core_news_sm","cc.pt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.bin.gz","Portuguese","(pt) Portuguese"
|
||||
"qu","xx_ent_wiki_sm","cc.qu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.bin.gz","Quechua","(qu) Quechua"
|
||||
"rm","xx_ent_wiki_sm","cc.rm.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.bin.gz","Romansh","(rm) Romansh"
|
||||
"ro","ro_core_news_sm","cc.ro.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.bin.gz","Romanian","(ro) Romanian"
|
||||
"ru","ru_core_news_sm","cc.ru.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz","Russian","(ru) Russian"
|
||||
"sa","xx_ent_wiki_sm","cc.sa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.bin.gz","Sanskrit","(sa) Sanskrit"
|
||||
"sah","xx_ent_wiki_sm","cc.sah.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.bin.gz","Sakha","(sah) Sakha"
|
||||
"sc","xx_ent_wiki_sm","cc.sc.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.bin.gz","Sardinian","(sc) Sardinian"
|
||||
"scn","xx_ent_wiki_sm","cc.scn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.bin.gz","Sicilian","(scn) Sicilian"
|
||||
"sco","xx_ent_wiki_sm","cc.sco.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.bin.gz","Scots","(sco) Scots"
|
||||
"sd","xx_ent_wiki_sm","cc.sd.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.bin.gz","Sindhi","(sd) Sindhi"
|
||||
"sh","xx_ent_wiki_sm","cc.sh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.bin.gz","Serbo-Croatian","(sh) Serbo-Croatian"
|
||||
"si","xx_ent_wiki_sm","cc.si.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz","Sinhalese","(si) Sinhalese"
|
||||
"sk","xx_ent_wiki_sm","cc.sk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.bin.gz","Slovak","(sk) Slovak"
|
||||
"sl","sl_core_news_sm","cc.sl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.bin.gz","Slovenian","(sl) Slovenian"
|
||||
"so","xx_ent_wiki_sm","cc.so.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.bin.gz","Somali","(so) Somali"
|
||||
"sq","xx_ent_wiki_sm","cc.sq.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.bin.gz","Albanian","(sq) Albanian"
|
||||
"sr","xx_ent_wiki_sm","cc.sr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.bin.gz","Serbian","(sr) Serbian"
|
||||
"su","xx_ent_wiki_sm","cc.su.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.bin.gz","Sundanese","(su) Sundanese"
|
||||
"sv","sv_core_news_sm","cc.sv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.bin.gz","Swedish","(sv) Swedish"
|
||||
"sw","xx_ent_wiki_sm","cc.sw.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.bin.gz","Swahili","(sw) Swahili"
|
||||
"ta","xx_ent_wiki_sm","cc.ta.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.bin.gz","Tamil","(ta) Tamil"
|
||||
"te","xx_ent_wiki_sm","cc.te.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.bin.gz","Telugu","(te) Telugu"
|
||||
"tg","xx_ent_wiki_sm","cc.tg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.bin.gz","Tajik","(tg) Tajik"
|
||||
"th","xx_ent_wiki_sm","cc.th.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.bin.gz","Thai","(th) Thai"
|
||||
"tk","xx_ent_wiki_sm","cc.tk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.bin.gz","Turkmen","(tk) Turkmen"
|
||||
"tl","xx_ent_wiki_sm","cc.tl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.bin.gz","Tagalog","(tl) Tagalog"
|
||||
"tr","xx_ent_wiki_sm","cc.tr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz","Turkish","(tr) Turkish"
|
||||
"tt","xx_ent_wiki_sm","cc.tt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.bin.gz","Tatar","(tt) Tatar"
|
||||
"ug","xx_ent_wiki_sm","cc.ug.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz","Uyghur","(ug) Uyghur"
|
||||
"uk","uk_core_news_sm","cc.uk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.bin.gz","Ukrainian","(uk) Ukrainian"
|
||||
"ur","xx_ent_wiki_sm","cc.ur.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.bin.gz","Urdu","(ur) Urdu"
|
||||
"uz","xx_ent_wiki_sm","cc.uz.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.bin.gz","Uzbek","(uz) Uzbek"
|
||||
"vec","xx_ent_wiki_sm","cc.vec.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.bin.gz","Venetian","(vec) Venetian"
|
||||
"vi","xx_ent_wiki_sm","cc.vi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.bin.gz","Vietnamese","(vi) Vietnamese"
|
||||
"vls","xx_ent_wiki_sm","cc.vls.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.bin.gz","West Flemish","(vls) West Flemish"
|
||||
"vo","xx_ent_wiki_sm","cc.vo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.bin.gz","Volapük","(vo) Volapük"
|
||||
"wa","xx_ent_wiki_sm","cc.wa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.bin.gz","Walloon","(wa) Walloon"
|
||||
"war","xx_ent_wiki_sm","cc.war.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.bin.gz","Waray","(war) Waray"
|
||||
"xmf","xx_ent_wiki_sm","cc.xmf.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.bin.gz","Mingrelian","(xmf) Mingrelian"
|
||||
"yi","xx_ent_wiki_sm","cc.yi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.bin.gz","Yiddish","(yi) Yiddish"
|
||||
"yo","xx_ent_wiki_sm","cc.yo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.bin.gz","Yoruba","(yo) Yoruba"
|
||||
"zea","xx_ent_wiki_sm","cc.zea.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.bin.gz","Zeelandic","(zea) Zeelandic"
|
||||
"zh","zh_core_web_sm","cc.zh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz","Chinese","(zh) Chinese"
|
|
4143
docker-images-datalab/myactivetigger/activetigger/server.R
Normal file
4143
docker-images-datalab/myactivetigger/activetigger/server.R
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,40 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
## Spacy tokenize texts
|
||||
## Requires data file with columns id and text
|
||||
|
||||
import argparse
|
||||
from os.path import expanduser
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pyarrow.feather as feather
|
||||
import spacy
|
||||
import re
|
||||
|
||||
|
||||
def main(args):
|
||||
print("Tokenizer: Importing data")
|
||||
datapath = expanduser(args.data)
|
||||
dat = feather.read_feather(datapath)
|
||||
outfile = re.sub("[.]feather$", "_spa.feather", datapath)
|
||||
|
||||
print("Tokenizer: Loading model")
|
||||
spa = spacy.load(expanduser(args.model))
|
||||
print("Tokenizer: Tokenizing sentences")
|
||||
tok = [" ".join([str(token) for token in spa.tokenizer(text)]) for text in dat["text"]]
|
||||
|
||||
print("Tokenizer: Exporting")
|
||||
tok = pd.concat([dat["id"], pd.DataFrame(tok)], axis=1)
|
||||
tok.columns = ["id", "text_spa"]
|
||||
feather.write_feather(tok, outfile)
|
||||
print("Tokenizer: Done")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
argParser = argparse.ArgumentParser()
|
||||
argParser.add_argument("-m", "--model", help="Model name", default="fr_core_news_sm")
|
||||
argParser.add_argument("-d", "--data", help="Path to data (feather)")
|
||||
args = argParser.parse_args()
|
||||
main(args)
|
||||
|
631
docker-images-datalab/myactivetigger/activetigger/ui.R
Normal file
631
docker-images-datalab/myactivetigger/activetigger/ui.R
Normal file
|
@ -0,0 +1,631 @@
|
|||
## 21/04/2020 : shiny pour active learning étiquetage de textes
|
||||
|
||||
shinyUI(fluidPage(
|
||||
title = "Active Tigger",
|
||||
|
||||
############################################################################
|
||||
## Top panel: Title, strategy, model options
|
||||
############################################################################
|
||||
fluidRow(
|
||||
column(
|
||||
4,
|
||||
fluidRow(
|
||||
column(4, br(),
|
||||
HTML('<img src="active_tigger.png" width="100%">')),
|
||||
column(
|
||||
8, br(),
|
||||
p(strong("Project / Scheme")),
|
||||
fluidRow(
|
||||
column(2, HTML(paste0(
|
||||
'<div title="New project">',
|
||||
actionButton("createProject", "+"),
|
||||
'</div>'))),
|
||||
column(10, selectInput("selectProject", NULL, NULL, NULL))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, HTML(paste0(
|
||||
'<div title="New scheme">',
|
||||
actionButton("createScheme", "+"),
|
||||
'</div>'))),
|
||||
column(10, selectInput("selectScheme", NULL, NULL, NULL))
|
||||
)
|
||||
))
|
||||
),
|
||||
column(
|
||||
2, br(),
|
||||
|
||||
HTML(paste0(
|
||||
'<div title="Querying strategy">',
|
||||
p(strong("Strategy")),
|
||||
selectInput("strategy", NULL,
|
||||
choices= c("Active" = "entropy",
|
||||
"MaxProb" = "maxprob",
|
||||
"Random" = "random",
|
||||
"Deterministic" = "sequential"),
|
||||
selected = "sequential"),
|
||||
conditionalPanel("input.strategy == 'maxprob'",
|
||||
selectInput("maxprobWhich", NULL, NULL)),
|
||||
'</div>'))
|
||||
),
|
||||
column(
|
||||
2, br(),
|
||||
HTML(paste0(
|
||||
'<div title="Query from which?">',
|
||||
p(strong("On")),
|
||||
selectInput("sampleChoice", NULL,
|
||||
choices = c("Untagged" = "untagged",
|
||||
"Tagged" = "tagged",
|
||||
"All" = "all")),
|
||||
conditionalPanel("input.sampleChoice == 'tagged'",
|
||||
selectInput("taggedWhich", NULL, NULL)),
|
||||
'</div>'))
|
||||
),
|
||||
|
||||
column(4, p(br()),
|
||||
verbatimTextOutput("trainDiagno"),
|
||||
fluidRow(
|
||||
column(8, HTML(paste0(
|
||||
'<div title="Train prediction model 🤖">',
|
||||
actionButton("modelTrain", "🤖 Train", width = "100%"),
|
||||
'</div>'))),
|
||||
column(4, HTML(paste0(
|
||||
'<div title="🤖 Prediction model options">',
|
||||
checkboxInput("showTrainOptions", "🔧"),
|
||||
'</div>')))
|
||||
))
|
||||
),
|
||||
|
||||
|
||||
## General training options
|
||||
conditionalPanel(
|
||||
"input.showTrainOptions",
|
||||
hr(),
|
||||
fluidRow(
|
||||
column(
|
||||
4,
|
||||
conditionalPanel(
|
||||
"input.showTrainOptions & input.use_regressors.includes('regex')",
|
||||
uiOutput("panelExtraRegex")
|
||||
)
|
||||
),
|
||||
column(
|
||||
4,
|
||||
p(strong("Predictors")),
|
||||
selectizeInput("use_regressors", NULL, "",
|
||||
multiple = TRUE, width = "100%"),
|
||||
conditionalPanel(
|
||||
"input.showTrainOptions & input.use_regressors.includes('extra')",
|
||||
selectizeInput("use_ootregnum", "Extra predictors: continuous", "",
|
||||
multiple = TRUE, width = "100%"),
|
||||
selectizeInput("use_ootregcat", "Extra predictors: categorical", "",
|
||||
multiple = TRUE, width = "100%")
|
||||
),
|
||||
|
||||
conditionalPanel(
|
||||
"input.showTrainOptions & input.use_regressors.includes('dfm')",
|
||||
hr(),
|
||||
fluidRow(
|
||||
column(
|
||||
6, numericInput("dfmMinTermfreq", "DFM : Min Termfreq",
|
||||
min= 1, max= 1e3, value= 5, step= 1)),
|
||||
column(
|
||||
6, numericInput("dfmMinDocfreq", "DFM : Min Docfreq",
|
||||
min= 1, max= 1e3, value= 5, step= 1))),
|
||||
fluidRow(
|
||||
column(
|
||||
4, checkboxInput("dfmTfIdf", "Tf-Idf", TRUE)),
|
||||
column(
|
||||
4, selectInput("dfmTfScheme", label = NULL,
|
||||
choices= c("logcount", "count", "prop", "propmax",
|
||||
"boolean", "augmented", "logave"),
|
||||
selected= "logcount")),
|
||||
column(
|
||||
4, selectInput("dfmDfScheme", label = NULL,
|
||||
choices= c("inverse", "count",
|
||||
"inversemax", "inverseprob", "unary"),
|
||||
selected= "inverse"))
|
||||
),
|
||||
numericInput("dfmNgrams", "N-grams", value= 1,
|
||||
min= 1, max= 10, step = 1)
|
||||
)
|
||||
),
|
||||
column(
|
||||
4,
|
||||
HTML(paste0(
|
||||
'<div title="Auto train after # tags (0=never)">',
|
||||
fluidRow(
|
||||
column(6, strong("Auto train every")),
|
||||
column(6, numericInput("trainCountdown", NULL, 0, 0, 1e6, 1))),
|
||||
'</div>'
|
||||
)),
|
||||
HTML(paste0(
|
||||
'<div title="🤖 model (recommended: Liblinear)">',
|
||||
fluidRow(
|
||||
column(6, strong("Model")),
|
||||
column(6, selectInput("predModel", NULL, selected = "linear",
|
||||
choices = c("Naive Bayes" = "naive bayes",
|
||||
"KNN" = "knn",
|
||||
"Liblinear" = "linear",
|
||||
"LASSO" = "lasso",
|
||||
"Random Forest" = "random forest")))
|
||||
),
|
||||
'</div>'
|
||||
)),
|
||||
|
||||
## Model-specific training options
|
||||
### Random forest options
|
||||
conditionalPanel(
|
||||
"input.showTrainOptions & input.predModel == 'random forest'",
|
||||
fluidRow(
|
||||
numericInput("rfNumTrees", label = "Num. trees",
|
||||
min = 1, max = 2e3, value = 500, step = 1),
|
||||
numericInput("rfMtry", label = "mtry",
|
||||
min = 0, max = 1e5, value = 0, step = 1),
|
||||
numericInput("rfSampleFrac", label = "Sample fraction",
|
||||
min = 0, max = 1, value = 1, step = .01)
|
||||
)
|
||||
),
|
||||
### Naive Bayes options
|
||||
conditionalPanel(
|
||||
"input.showTrainOptions & input.predModel == 'naive bayes'",
|
||||
flowLayout(
|
||||
numericInput("naiveSmooth", label = "Smooth",
|
||||
min = 0, max = 2e3,
|
||||
value = 1, step = 1e-3),
|
||||
selectInput("naivePrior", "Prior",
|
||||
c("uniform", "docfreq", "termfreq")),
|
||||
selectInput("naiveDistri", "Distribution",
|
||||
c("multinomial", "Bernoulli"))
|
||||
)
|
||||
),
|
||||
### Lasso options
|
||||
conditionalPanel(
|
||||
"input.showTrainOptions & input.predModel == 'lasso'",
|
||||
strong("Lasso penalty"),
|
||||
fluidRow(
|
||||
column(
|
||||
6, numericInput("glmLambda", label = NULL, min = 0, max = 2e3,
|
||||
value = 0, step = 1e-6)),
|
||||
column(
|
||||
6, actionButton("glmCV", label= "Find best (CV)")))
|
||||
),
|
||||
### Linear options
|
||||
conditionalPanel(
|
||||
"input.showTrainOptions & input.predModel == 'linear'",
|
||||
strong("Liblinear Cost"),
|
||||
fluidRow(
|
||||
column(
|
||||
6, numericInput("liblinCost", label= NULL, min= 0, max= 2e10,
|
||||
value= 32, step= 1)),
|
||||
column(
|
||||
6, actionButton("liblinCV", label= "Find best (CV)")))
|
||||
),
|
||||
### KNN options
|
||||
conditionalPanel(
|
||||
"input.showTrainOptions & input.predModel == 'knn'",
|
||||
flowLayout(
|
||||
strong("N. Neighbours"),
|
||||
numericInput("knnK", label = NULL, min = 1, max = 1e2,
|
||||
value = 3, step = 1)
|
||||
)
|
||||
)
|
||||
)
|
||||
),
|
||||
hr()
|
||||
),
|
||||
|
||||
|
||||
############################################################################
|
||||
## Main panel set
|
||||
############################################################################
|
||||
tabsetPanel(
|
||||
id = "mainPanelset",
|
||||
selected = "Tagging",
|
||||
|
||||
########################################################################
|
||||
## Project panel
|
||||
########################################################################
|
||||
tabPanel(
|
||||
"Project",
|
||||
br(),
|
||||
tabsetPanel(
|
||||
id = "tabsetProject",
|
||||
selected = "Sample",
|
||||
|
||||
tabPanel(
|
||||
"Settings",
|
||||
br(),
|
||||
actionButton("saveSystem", "Save changes"),
|
||||
|
||||
h4("Files"),
|
||||
fluidRow(
|
||||
column(2, p("Data directory")),
|
||||
column(4, uiOutput("sys_datadir")),
|
||||
column(6, p("Place (on the server) where the data and project are stored"))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, p("Data filename")),
|
||||
column(4, uiOutput("sys_datafile")),
|
||||
column(6, p("Main file, containing id and text columns"))
|
||||
),
|
||||
|
||||
h4("Variables"),
|
||||
fluidRow(
|
||||
column(2, p("ID")),
|
||||
column(4, uiOutput("sys_var_id")),
|
||||
column(6, p("Name of the id variable, unique identifier of each text"))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, p("Text")),
|
||||
column(4, uiOutput("sys_var_text")),
|
||||
column(6, p("Name of the text variables: if more than one, texts are concatenated in the specified order"))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, p("Tags")),
|
||||
column(4, uiOutput("sys_var_tag")),
|
||||
column(6, p("Names of scheme variables"))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, p("Comments")),
|
||||
column(4, uiOutput("sys_var_comm_ui")),
|
||||
column(6, p("Name of the comments variable"))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, p("Context")),
|
||||
column(4, uiOutput("sys_var_context_ui")),
|
||||
column(6, p("Names of variables not used in the models, but may be displayed during tagging"))
|
||||
),
|
||||
|
||||
h4("System"),
|
||||
fluidRow(
|
||||
column(2, checkboxInput("sys_use_python", "Python backend", FALSE)),
|
||||
column(4, conditionalPanel(
|
||||
"input.sys_use_python",
|
||||
textInput("sys_which_python", NULL, value = "python3",
|
||||
placeholder = "(custom python path)"))),
|
||||
column(6, conditionalPanel(
|
||||
"input.sys_use_python",
|
||||
p("This must be a working python3 environment, with the required modules installed (see documentation)")))
|
||||
),
|
||||
|
||||
conditionalPanel("input.sys_use_python", list(
|
||||
fluidRow(
|
||||
column(2, checkboxInput("sys_use_spacy", "SpaCy tokenization", FALSE)),
|
||||
column(4, conditionalPanel("input.sys_use_spacy", textInput(
|
||||
"sys_use_spacy_model", NULL, NULL, placeholder = "(spacy model name)"))),
|
||||
column(6, p("Name of the spacy tokenizer model, used in DTM and word embeddings"))
|
||||
),
|
||||
conditionalPanel("input.sys_use_spacy", fluidRow(
|
||||
column(2),
|
||||
column(9, uiOutput("sys_spacyDlUI")))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, checkboxInput("sys_use_ft", "FastText word embeddings", FALSE)),
|
||||
column(4, conditionalPanel("input.sys_use_ft", textInput(
|
||||
"sys_use_ft_model", NULL, NULL, placeholder = "(fasttext model path)"))),
|
||||
column(6, p("Path to the local fasttext model binary"))
|
||||
),
|
||||
conditionalPanel("input.sys_use_ft", fluidRow(
|
||||
column(2),
|
||||
column(9, uiOutput("sys_ftDlUI")))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, checkboxInput("sys_use_sb", "SBERT sentence embeddings", FALSE)),
|
||||
column(4, conditionalPanel("input.sys_use_sb", textInput(
|
||||
"sys_use_sb_model", NULL, NULL,
|
||||
placeholder = "(custom sentence_transformers model)"))),
|
||||
column(6, p("(GPU recommended) Name or path of the sentence-transformers model"))
|
||||
),
|
||||
|
||||
conditionalPanel("input.sys_use_python", list(
|
||||
checkboxInput("sys_use_gpu", "GPU support (CUDA, for SBERT and BERT)", FALSE),
|
||||
|
||||
br(),
|
||||
wellPanel(
|
||||
h4("Model picker"),
|
||||
fluidRow(
|
||||
column(2, p("Language")),
|
||||
column(4, uiOutput("sys_ex_lang_ui")),
|
||||
column(6, p("Used to preset tokenization and embedding models"))
|
||||
),
|
||||
fluidRow(
|
||||
column(2),
|
||||
column(4, strong("Recommended model")),
|
||||
column(6, strong("Download instructions"))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, p("SpaCy tokenization")),
|
||||
column(4, uiOutput("sys_ex_spacy")),
|
||||
column(6, uiOutput("sys_ex_spacy_dl"))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, p("FastText word embeddings")),
|
||||
column(4, uiOutput("sys_ex_ft")),
|
||||
column(6, uiOutput("sys_ex_ft_dl"))
|
||||
),
|
||||
fluidRow(
|
||||
column(2, p("SBERT sentence embeddings")),
|
||||
column(4, uiOutput("sys_ex_sb")),
|
||||
column(6, p("(Auto download by python module)"))
|
||||
)
|
||||
)
|
||||
))
|
||||
))
|
||||
),
|
||||
|
||||
tabPanel(
  "Sample",
  br(),
  fluidRow(
    column(
      4,
      wellPanel(
        fluidRow(
          column(8, h4("Sample")),
          column(4, actionButton("dataImport", "Import", width = "100%"))),
        fluidRow(
          column(6, numericInput("dataNrows", "N. rows", 500, 10, 1e4, 1)),
          column(6, numericInput("dataSkipRows", "Skip rows", 0, 0, step = 1))
        )
      )
    ),
    column(8, uiOutput("dataMessage"), uiOutput("panelData"))
  )
),
tabPanel(
  "Scheme",
  br(),
  fluidRow(
    # column(4, uiOutput("panelScheme")),
    column(
      4,
      wellPanel(
        h4("Current scheme"),
        fluidRow(
          column(2, HTML(paste0(
            "<div title='Delete scheme'>",
            actionButton("schemeDelete", "🗑", width = "100%"),
            "</div>"))),
          column(6, uiOutput("printScheme")),
          column(4, HTML(paste0(
            "<div title='Save scheme description'>",
            actionButton("schemeDescrSave", "Save", width = "100%"),
            "</div>")))
        ),
        br(),
        textAreaInput("schemeDescr", NULL, width = "100%", rows = 10,
                      placeholder = "Write scheme description here"),
        hr()
      )
    ),
    column(8, uiOutput("panelRetag"))
  )
)
)
),


########################################################################
## Text / visualization panel
########################################################################
tabPanel(
  "Tagging",
  fluidRow(
    column(
      3,
      br(),
      fluidRow(
        column(8, textInput("regexFilter", label = NULL,
                            placeholder = "(Regex filter)")),
        column(4, checkboxInput("regexCaseSens", "Case"))),

      wellPanel(
        ## Tagging buttons
        fluidRow(
          column(8, textInput("newLab", label = NULL,
                              placeholder = "(New label)")),
          column(4, actionButton("currentAction", "Create"))
        ),

        # fluidRow(uiOutput("oracleButtons")),
        uiOutput("oracleButtons"),

        br(),
        textInput("currentComment", NULL, "", width = "100%",
                  placeholder = "(Comment)"),
        br(),
        uiOutput("makeOracleConfirm")
      ),

      # fluidRow(
      #   column(6, checkboxInput("showContext", "Context")),
      #   column(6, actionButton("oops", strong("Oops")))
      # ),
      checkboxInput("showContext", "Context"),
      conditionalPanel("input.showContext", htmlOutput("currentContext"))
    ),
    column(
      9,
      fluidRow(
        column(2, checkboxInput("panelText", "Text", TRUE)),
        column(2, checkboxInput("panelVisu", "Visualization", FALSE),
               offset = 8)
      ),
      uiOutput("textVisuCols")  # Handled in server.R for adaptive columns
    )
  )
),

########################################################################
## History panel
########################################################################
tabPanel(
  "History",
  br(),
  actionButton("histSave", "Save changes"),
  br(),
  br(),
  DT::dataTableOutput("histDTable")
),


########################################################################
## Stats panel
########################################################################
tabPanel(
  "Stats",
  br(),
  fluidRow(
    column(
      3,
      h3("Counts"),
      tableOutput("statsTagTable")
    ),
    column(
      9,
      h3("10-CV diagnostics"),
      actionButton("statsCVgo", "Compute 10-CV"),
      br(),
      verbatimTextOutput("statsCVoutput"),
      DT::dataTableOutput("statsCVtable")
    )
  ),
  hr(),
  h3("Gold Standard")
),

########################################################################
## BERT panel
########################################################################
tabPanel(
  "BERT",

  fluidRow(
    column(
      3,
      br(),
      h3("Train new BERT"),
      fluidRow(
        column(6, actionButton("bertTrain", "Train BERT", width = "100%")),
        column(6, checkboxInput("bertOptions", "Options"))),
      fluidRow(
        column(6, textInput(
          "bertSaveName", NULL, placeholder = "(save name)")),
        column(6, actionButton("bertSave", "Save", width = "100%"))),
      actionLink("bertLast", "Last trained model"),
      h3("Saved models"),
      uiOutput("bertSaved")
    ),
    column(
      9,
      br(),
      conditionalPanel(
        "input.bertOptions",
        fluidRow(
          column(6, selectInput(
            "bertModel", "Model",
            c("(Fr) CamemBERT-base" = "camembert/camembert-base",
              "(Fr) CamemBERT-large" = "camembert/camembert-large",
              "(Fr) FlauBERT-small" = "flaubert/flaubert_small_cased",
              "(Fr) FlauBERT-base" = "flaubert/flaubert_base_cased",
              "(Fr) FlauBERT-large" = "flaubert/flaubert_large_cased",
              "(En) DistilBERT-base" = "distilbert-base-cased",
              "(En) RoBERTa-base" = "roberta-base",
              "(En) DeBERTa-base" = "microsoft/deberta-base",
              "(Multi) DistilmBERT-base" = "distilbert-base-multilingual-cased",
              "(Multi) MiniLM" = "microsoft/Multilingual-MiniLM-L12-H384",
              "(Multi) XLM-RoBERTa-base" = "xlm-roberta-base"))),
          column(6)
        ),
        fluidRow(
          column(3, numericInput("bertEpochs", "Epochs", 3, 1, 20, 1)),
          column(3, numericInput("bertLrate", "Learning rate", 2e-5, 1e-6, 1, 1e-6)),
          column(3, numericInput("bertWdecay", "Weight decay", 0.01, 0, 10, 1e-6)),
          column(3)
        ),
        fluidRow(
          column(3, numericInput("bertBatchsize", "Batch size", 4, 1, 32, 1)),
          column(3, numericInput("bertGradacc", "Gradient accum.", 4, 1, 32, 1)),
          column(3, br(), checkboxInput("bertAdapt", "Adapt token length to batch", TRUE)),
          column(3)
        ),
        fluidRow(
          column(3, numericInput("bertValidFrac", "Validation fraction", .2, 0, .9)),
          column(3, numericInput("bertValidSeed", "Validation seed", 1234, 1, 9e8)),
          column(3, numericInput("bertNeval", "N. validation evals", 10, 1, 100, 1)),
          column(3, br(), checkboxInput("bertBest", "Keep best", TRUE))
        ),
        fluidRow(
          column(3, numericInput("bertMinOccur", "Min. class occurrences", 1, 1, 1e4, 1)),
          column(3, br(), checkboxInput("bertBalance", "Balance classes", FALSE)),
          column(3),
          column(3)
        )
      ),

      fluidRow(
        column(
          6,
          # flowLayout(
          #   actionButton(
          #     "bertGoPred", "Infer on current data", width = "100%"),
          #   actionButton(
          #     "bertDelete", "Delete saved model", width = "100%")),
          verbatimTextOutput("bertMsg")),
        column(6, plotOutput("bertValPlot", height = 200))),

      verbatimTextOutput("bertMsgHyperpar"),

      DT::dataTableOutput("bertValstats")
    )
  )
),

########################################################################
## Export panel
########################################################################
tabPanel(
  "Export",
  h4("Export tagged data"),
  p("Download the tags and predicted probabilities from the complete model, on the current data sample."),
  # downloadButton("downloadCsv", "Save csv"),
  flowLayout(
    selectInput(
      "dlTagSelect", NULL, c("tags", "comments", "predictions"),
      c("tags", "comments", "predictions"), multiple = TRUE),
    selectInput("dlTagFormat", NULL, c("csv", "feather"), "csv"),
    downloadButton("dlTagSave", NULL, title = "Save tags")
  ),

  hr(),
  h4("Export embeddings"),
  p("Download the embeddings (incl. from visualization if present), on the current data sample."),
  flowLayout(
    selectInput(
      "dlEmbedSelect", NULL, c("FastText" = "ft", "SBERT" = "sb"),
      selected = "sb", multiple = TRUE),
    selectInput("dlEmbedFormat", NULL, c("csv", "feather"), "feather"),
    downloadButton("dlEmbedSave", NULL, title = "Save embeddings")
  ),

  hr(),
  h4("Export BERT predictions"),
  p("Download the predicted probabilities from the chosen BERT model, on the complete dataset."),
  flowLayout(
    selectInput("dlBPSelect", NULL, NULL, NULL),
    selectInput("dlBPFormat", NULL, c("csv", "feather"), "feather"),
    actionButton("dlBPInfer", "Predict"),
    verbatimTextOutput("dlBPMsg"),
    uiOutput("dlBPDlButton")
  ),

  hr(),
  h4("Export BERT models")
)

),
br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(),
br(), br(), br(), br(), br(), br()
))
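For orientation, here is a minimal, self-contained sketch of how a tabPanel/fluidRow layout like the one above is typically wired to a server function in Shiny. The page type (navbarPage), the input/output names and the iris data are illustrative assumptions, not taken from this commit.

library(shiny)

# Illustrative sketch only: names and data are assumptions, not from the commit.
ui <- navbarPage(
  "Demo",
  tabPanel(
    "Sample",
    fluidRow(
      column(4, numericInput("n", "N. rows", 6, 1, 150, 1)),
      column(8, tableOutput("head"))
    )
  )
)

server <- function(input, output, session) {
  # Render the first input$n rows of the example data set.
  output$head <- renderTable(head(iris, input$n))
}

shinyApp(ui, server)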
Binary file not shown. Size: 104 KiB
Binary file not shown. Size: 245 KiB
Binary file not shown. Size: 3.7 KiB
13
docker-images-datalab/myactivetigger/requirements.py
Normal file
@@ -0,0 +1,13 @@
argparse
datasets
fasttext
numpy
pandas
pyarrow
scikit-learn
torch
transformers[torch]
sentence_transformers
typing-inspect==0.8.0
typing_extensions==4.6.1
spacy
3
docker-images-datalab/myactivetigger/requirements.r
Normal file
@@ -0,0 +1,3 @@
packages <- c("arrow", "class", "data.table", "DT", "foreign", "glmnet", "haven", "LiblineaR", "Matrix", "Metrics", "quanteda", "quanteda.textmodels", "ranger", "readODS", "readxl", "RJSONIO", "rlang", "Rtsne", "shiny", "SparseM", "stringi", "uwot", "future", "htmlTable", "ggplot2")

install.packages(setdiff(packages, rownames(installed.packages())))
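A small optional check, shown here as an assumption-labelled sketch rather than part of the commit, could follow the install line above and reuse the same packages vector:

# Sketch: report any packages from the list above that still fail to load.
missing <- packages[!vapply(packages, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing) > 0) stop("Missing R packages: ", paste(missing, collapse = ", "))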
@@ -0,0 +1,6 @@
pip install argparse datasets fasttext numpy pandas pyarrow scikit-learn
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install transformers[torch]
pip install sentence_transformers
pip install -U typing-inspect==0.8.0 typing_extensions==4.6.1
pip install spacy
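As a rough cross-check of the UI's "Python backend" requirement (a working python3 with the modules above), here is an assumption-labelled R sketch; the interpreter path and the module names are guesses based on the lists above, not part of the commit.

# Sketch: try to import each python module installed above from R.
modules <- c("datasets", "fasttext", "numpy", "pandas", "pyarrow", "sklearn",
             "torch", "transformers", "sentence_transformers", "spacy")
snippet <- paste(sprintf("import %s", modules), collapse = "; ")
status  <- system2("python3", c("-c", shQuote(snippet)))  # "python3" assumed to be on PATH
if (status != 0) warning("Some python modules failed to import")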