ajout de l'image myactivetigger

This commit is contained in:
Alexis GUYOT 2024-03-06 15:54:50 +01:00
parent 710fd3bd45
commit dd830a2cbb
16 changed files with 5524 additions and 0 deletions

View File

@ -0,0 +1,48 @@
# Drone CI pipeline: build and push the activetigger image with Kaniko, then
# upgrade the Helm release in the "activetigger" namespace.
# Both steps are skipped for pull_request events.
kind: pipeline
name: Build & publish main

steps:
  # Build the image and push it to the internal registry.
  - name: publish-image
    pull: always
    image: plugins/kaniko:1.7.1-kaniko1.9.1
    settings:
      auto_tag: true
      auto_tag_suffix: latest
      registry: code.groupe-genes.fr
      repo: code.groupe-genes.fr/datalab/docker-images-datalab/activetigger
      username:
        from_secret: docker_username
      password:
        from_secret: docker_password
    when:
      event:
        exclude:
          - pull_request

  # Install kubectl + helm in a throwaway container and upgrade the release.
  - name: deploy
    # Pinned so that the deploy tooling does not change underneath the pipeline.
    image: alpine:3.19
    environment:
      kubernetes_server:
        from_secret: kubernetes_server
      kubernetes_cert:
        from_secret: kubernetes_cert
      kubernetes_token:
        from_secret: kubernetes_token
    commands:
      - apk add --no-cache curl
      # --fail: abort on an HTTP error instead of saving the error page as the binary.
      - curl -fL -o /usr/bin/kubectl "https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl"
      - curl -fL -o helm.tar.gz "https://get.helm.sh/helm-v3.14.0-linux-amd64.tar.gz"
      - tar xf "helm.tar.gz" && mv ./linux-amd64/helm /usr/bin/helm
      - chmod +x /usr/bin/kubectl
      - chmod +x /usr/bin/helm
      # NOTE(review): kubernetes_cert is injected above but never used, and TLS
      # verification of the API server is disabled here. Consider passing the CA
      # with --certificate-authority once the secret's format is confirmed.
      - kubectl config set-cluster default --server="$kubernetes_server" --insecure-skip-tls-verify=true
      - kubectl config set-credentials user --token="$kubernetes_token"
      - kubectl config set-context default --user=user --cluster=default --namespace=activetigger
      - kubectl config use-context default
      - kubectl get pods
      - helm ls -n activetigger --debug
      - helm dependency build ./helm-chart
      - helm upgrade activetigger ./helm-chart -f ./helm-chart/values.yaml -n activetigger
    when:
      event:
        exclude:
          - pull_request

View File

@ -0,0 +1,130 @@
# Image for the ActiveTigger Shiny application: R + Shiny Server, a Miniconda
# Python toolchain with the NLP libraries, and the French fastText vectors.

# NOTE(review): declared before FROM and not referenced by any FROM line, so it
# is not visible inside the build stage. Kept only so existing
# `--build-arg CACHEBUST=...` invocations keep working.
ARG CACHEBUST=1
FROM ubuntu:22.04

# Build-time only: keeps apt from prompting without leaking into the runtime env.
ARG DEBIAN_FRONTEND=noninteractive

# System packages: Python 3.10, R, download tools, a compiler toolchain and
# gdebi (used below for the Shiny Server .deb).
# `update` and `install` share one layer so the package index is never stale,
# and the index is removed in the same layer so it does not stay in the image.
# --no-install-recommends is deliberately not used, so the set of installed
# packages stays what the previous build produced.
RUN apt-get update && apt-get install -y \
        build-essential \
        curl \
        gdebi-core \
        python3-pip \
        python3.10 \
        python3.10-distutils \
        python3.10-venv \
        r-base \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# R: the shiny package first, then Shiny Server.
RUN R -e "install.packages('shiny', repos='https://cran.rstudio.com/')"

# gdebi resolves the .deb's dependencies through apt, so the index is refreshed
# here; the installer and the index are removed in the same layer.
RUN apt-get update \
    && wget https://download3.rstudio.org/ubuntu-18.04/x86_64/shiny-server-1.5.21.1012-amd64.deb \
    && gdebi --non-interactive shiny-server-1.5.21.1012-amd64.deb \
    && rm shiny-server-1.5.21.1012-amd64.deb \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

## R packages (to be installed with the R executable used by Shiny Server)
COPY requirements.r /requirements.r
RUN Rscript /requirements.r
# Installed before the application is copied so that an app change does not
# invalidate this layer.
RUN R -e "install.packages('quanteda', repos='https://cran.rstudio.com/')"

## Python environment
## TODO: check whether requirementspython.txt should install the cu118 builds this way.
COPY requirementspython.txt /requirementspython.txt

# Install Miniconda
# NOTE(review): "latest" is not pinned, so the bundled Python/conda can change between builds.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
    rm Miniconda3-latest-Linux-x86_64.sh

# Add Conda binaries to PATH
ENV PATH="/opt/conda/bin:${PATH}"

# Create a Conda environment and activate it for interactive bash shells.
# -y: never wait for confirmation during a non-interactive build.
RUN conda create -y -n tigger python==3.10 && \
    echo "conda activate tigger" >> ~/.bashrc

# NOTE(review): /opt/conda/bin is first on PATH, so the `pip3` calls below
# presumably resolve to the Conda base installation, not to the "tigger"
# environment created above; confirm which interpreter the app is meant to use.
RUN pip3 install --no-cache-dir \
    torch torchvision torchaudio \
    -f https://download.pytorch.org/whl/cu118/torch_stable.html
RUN pip3 install --no-cache-dir six

# Install Rust using rustup
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# Add Cargo's bin directory to the PATH environment variable
ENV PATH="/root/.cargo/bin:${PATH}"

RUN pip3 install --no-cache-dir --upgrade setuptools

# Other Python libraries. Kept as separate, ordered installs so that the two
# pinned typing-* versions are applied exactly where they were before.
RUN pip3 install --no-cache-dir argparse
RUN pip3 install --no-cache-dir datasets
RUN pip3 install --no-cache-dir fasttext
RUN pip3 install --no-cache-dir numpy
RUN pip3 install --no-cache-dir pandas
RUN pip3 install --no-cache-dir pyarrow
RUN pip3 install --no-cache-dir scikit-learn
RUN pip3 install --no-cache-dir sentence-transformers
RUN pip3 install --no-cache-dir transformers
RUN pip3 install --no-cache-dir typing-inspect==0.8.0
RUN pip3 install --no-cache-dir typing-extensions==4.6.1
RUN pip3 install --no-cache-dir spacy

## Download of the spaCy and fastText models
### French
# The spaCy model download is currently disabled:
#RUN python -m spacy download fr_core_news_sm
RUN wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz \
    && gunzip cc.fr.300.bin.gz

# For every new instance
## Application sources for the instance (when cloning by hand, replace
## "tigger-name" with the name the instance will take,
## i.e. https://analytics.huma-num.fr/Prenom.Nom/tigger-name/)
COPY activetigger/ ./activetigger

# Expose the default Shiny port
EXPOSE 3838

# Command to run the Shiny app. This is the only CMD: Docker honours just the
# last one, so the former `CMD ["/bin/bash"]` was dead and has been removed.
CMD ["R", "-e", "shiny::runApp('activetigger', port=3838, host='0.0.0.0')"]

## In the application
## Top left, "+" button for "create project". Then in the fields:
## - data directory: I always use ~/tagging/domain (e.g. ~/tagging/radio or ~/tagging/journaux), but it is up to you to decide where data and tags are stored on your server
## - I recommend ticking every box: python, spacy, fasttext, sbert, gpu
## - python : "~/conda/envs/tigger/bin/python"
## - fasttext : "~/cc.fr.300.bin" (i.e. give the path of the model on your server, not just its name)
## - spacy and SBERT: keep the default values for the chosen language

View File

@ -0,0 +1,41 @@
#!/usr/bin/env python
# coding: utf-8
## FastText embed sentences
## Requires data file with columns id and text
import argparse
import fasttext
from os.path import expanduser
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import re
def main(args):
    """Embed the ``text`` column of a feather file with a fastText model.

    Reads the feather file at ``args.data`` (columns ``id`` and ``text``),
    computes one sentence vector per row with the model at ``args.model`` and
    writes ``id`` plus the vector components (``ft001``, ``ft002``, ...) to a
    file named after the input with ``.feather`` replaced by ``_ft.feather``.

    Args:
        args: parsed command-line arguments exposing ``data`` and ``model``.
    """
    print("FastText: Importing data")
    datapath = expanduser(args.data)
    dat = feather.read_feather(datapath)

    # If the input does not end in ".feather" the substitution is a no-op and
    # the output would overwrite the input, so append the suffix instead.
    outfile, replaced = re.subn("[.]feather$", "_ft.feather", datapath)
    if not replaced:
        outfile = datapath + "_ft.feather"

    print("FastText: Loading model")
    ft = fasttext.load_model(expanduser(args.model))

    print("FastText: Embedding sentences")
    # Newlines are flattened to spaces: each text is embedded as a single line.
    emb = [ft.get_sentence_vector(re.sub("\n", " ", x)) for x in dat["text"]]

    print("FastText: Exporting")
    emb = pd.DataFrame(emb)
    emb.columns = ["ft%03d" % (x + 1) for x in range(len(emb.columns))]
    # Drop the stored index so ids and vectors are paired by position rather
    # than by index label.
    emb = pd.concat([dat["id"].reset_index(drop=True), emb], axis=1)
    feather.write_feather(emb, outfile)
    print("FastText: Done")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the embedding.
    argParser = argparse.ArgumentParser()
    # NOTE(review): the default points to a user-specific path; callers are
    # expected to pass --model explicitly.
    argParser.add_argument("-m", "--model", help="Model path", default="/data/user/b/jboelaert/cc.fr.100.bin")
    # Required: without it main() would fail inside expanduser(None).
    argParser.add_argument("-d", "--data", help="Path to data (feather)", required=True)
    args = argParser.parse_args()
    main(args)

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python
# coding: utf-8
## SBERT embed sentences
## Requires data file with columns id and text
import argparse
from os.path import expanduser
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import re
from sentence_transformers import SentenceTransformer
def main(args):
    """Embed the ``text`` column of a feather file with a SentenceTransformer model.

    Reads the feather file at ``args.data`` (columns ``id`` and ``text``),
    encodes every text with the model named by ``args.model`` and writes ``id``
    plus the embedding components (``sb001``, ``sb002``, ...) to a file named
    after the input with ``.feather`` replaced by ``_sb.feather``.

    Args:
        args: parsed command-line arguments exposing ``data`` and ``model``.
    """
    print("SBERT: Importing data")
    datapath = expanduser(args.data)
    dat = feather.read_feather(datapath)

    # If the input does not end in ".feather" the substitution is a no-op and
    # the output would overwrite the input, so append the suffix instead.
    outfile, replaced = re.subn("[.]feather$", "_sb.feather", datapath)
    if not replaced:
        outfile = datapath + "_sb.feather"

    print("SBERT: Loading model")
    sbert = SentenceTransformer(expanduser(args.model))
    sbert.max_seq_length = 512

    print("SBERT: Embedding sentences")
    # Pass a plain list: positional access into a pandas Series depends on its
    # index labels, a list does not.
    emb = sbert.encode(dat["text"].tolist())

    print("SBERT: Exporting")
    emb = pd.DataFrame(emb)
    emb.columns = ["sb%03d" % (x + 1) for x in range(len(emb.columns))]
    # Drop the stored index so ids and vectors are paired by position rather
    # than by index label.
    emb = pd.concat([dat["id"].reset_index(drop=True), emb], axis=1)
    feather.write_feather(emb, outfile)
    print("SBERT: Done")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the embedding.
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-m", "--model", help="Model name or path", default="distiluse-base-multilingual-cased-v1")
    # Required: without it main() would fail inside expanduser(None).
    argParser.add_argument("-d", "--data", help="Path to data (feather)", required=True)
    args = argParser.parse_args()
    main(args)

View File

@ -0,0 +1,174 @@
#!/usr/bin/env python
# coding: utf-8
## BERT trainer to be called by server.R
## Requires two data files with columns id, label and text
import argparse
import datasets
from datasets import load_metric
import numpy as np
from os.path import expanduser
import os
import pandas as pd
import re
from sklearn import metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments, TrainerCallback
# Set TOKENIZERS_PARALLELISM to "false" for this process before any tokenizer
# is created (presumably to avoid the tokenizers fork/parallelism warning - confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def main(args):
    """Fine-tune a sequence classifier and export validation predictions.

    Reads the training and validation CSV files (``args.traindat`` and
    ``args.valdat``, columns ``id``, ``label`` and ``text``), fine-tunes
    ``args.model`` with the Hugging Face ``Trainer``, saves the model under
    ``args.session`` and writes the predicted label of every validation row to
    ``<session>_predval.csv``. Training stops early when a file named
    ``<session>_stop`` appears.

    Args:
        args: parsed command-line arguments (see the argument parser of this script).
    """
    # Local import: only the final validation pass needs it.
    from torch import no_grad

    print("Importing data")
    dattrain = pd.read_csv(expanduser(args.traindat))
    datval = pd.read_csv(expanduser(args.valdat))
    datval_id = datval["id"]
    classcolname = "label"

    ## Make class_names
    class_names = [x for x in dattrain[classcolname].unique()]
    ## Labels to class number
    dattrain[classcolname] = [class_names.index(x) for x in dattrain[classcolname].to_list()]
    datval[classcolname] = [class_names.index(x) for x in datval[classcolname].to_list()]
    ## Transform to datasets
    dattrain = datasets.Dataset.from_pandas(dattrain[['text', 'label']])
    datval = datasets.Dataset.from_pandas(datval[['text', 'label']])

    # Model choice
    modelname = expanduser(args.model)

    ## Tokenizer
    print("Tokenizing")
    tokenizer = AutoTokenizer.from_pretrained(modelname)
    # --adapt pads each tokenized batch to its longest member; otherwise every
    # example is padded to the full 512 tokens.
    if args.adapt:
        toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding=True, max_length=512), batched=True)
        toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding=True, max_length=512), batched=True)
    else:
        toktrain = dattrain.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length", max_length=512), batched=True)
        toktest = datval.map(lambda e: tokenizer(e['text'], truncation=True, padding="max_length", max_length=512), batched=True)
    del(dattrain)

    ## Model
    print("Loading model")
    model = AutoModelForSequenceClassification.from_pretrained(modelname, num_labels = len(class_names))
    if (args.gpu):
        model.cuda()

    ## Train using Trainer interface
    print("Training...")
    BATCH_SIZE = args.batchsize
    GRAD_ACC = args.gradacc
    epochs = args.epochs
    # epochs is a float, so cast: step counts must be integers.
    total_steps = int((epochs * len(toktrain)) // (BATCH_SIZE * GRAD_ACC))
    warmup_steps = total_steps // 10
    # Never 0: on small datasets total_steps can be lower than the number of
    # requested evaluations, and a zero step interval is not usable for
    # evaluation, saving or logging.
    eval_steps = max(1, total_steps // args.eval)
    training_args = TrainingArguments(
        output_dir=args.session + "_train",
        learning_rate=args.lrate,
        weight_decay=args.wdecay,
        num_train_epochs=epochs,
        gradient_accumulation_steps=GRAD_ACC,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=32,
        warmup_steps=warmup_steps,
        eval_steps=eval_steps,
        evaluation_strategy="steps",
        save_strategy="steps",
        save_steps=eval_steps,
        logging_steps=eval_steps,
        do_eval=True,
        greater_is_better=False,
        load_best_model_at_end=bool(args.best),
        metric_for_best_model="eval_loss"
    )
    trainer = Trainer(model=model, args=training_args,
                      train_dataset=toktrain, eval_dataset=toktest)

    the_session = args.session

    class HaltCallback(TrainerCallback):
        "A callback that checks for _stop file to interrupt training"
        def on_step_begin(self, args, state, control, **kwargs):
            if os.path.exists(the_session + "_stop"):
                print("\nHalted by user.\n")
                control.should_training_stop = True
                return(control)
            else:
                print("\nNot halted by user.\n")

    trainer.add_callback(HaltCallback)
    trainer.train()

    ## Add class names to model
    label_to_id = {v: i for i, v in enumerate(class_names)}
    model.config.label2id = label_to_id
    model.config.id2label = {id: label for label, id in model.config.label2id.items()}

    ## Save model
    model.save_pretrained(args.session)

    ## Prediction functions
    # Inference mode: disable dropout so that predictions are deterministic.
    model.eval()

    def get_predprobs(text):
        """Return the class probabilities (softmax of the logits) for ``text``."""
        inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # Follow the model wherever training left it, instead of assuming the
        # device from --gpu alone.
        inputs = inputs.to(model.device)
        with no_grad():
            outputs = model(**inputs)
        res = outputs[0].cpu()
        res = res.softmax(1).detach().numpy()
        return res

    def get_prediction(text):
        """Return the name of the most probable class for ``text``."""
        return class_names[get_predprobs(text).argmax()]

    ## Predictions on validation set
    print("Computing predictions")
    testpred = [get_prediction(txt) for txt in datval["text"]]
    exportpred = pd.DataFrame(datval_id)
    exportpred.columns = ["id"]
    exportpred["bertpred"] = testpred
    exportpred.to_csv(args.session + "_predval.csv", index=False)
if __name__ == "__main__":
    # Command-line entry point: parse the training options and run the trainer.
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-m", "--model", help="Model name or path", default="microsoft/Multilingual-MiniLM-L12-H384")
    # The data paths and the session name have no usable default: main() would
    # fail on None, so they are required up front.
    argParser.add_argument("-t", "--traindat", help="Path to training data", required=True)
    argParser.add_argument("-v", "--valdat", help="Path to validation data", required=True)
    argParser.add_argument("-b", "--batchsize", help="Batch size for training", type=int, default=4)
    argParser.add_argument("-g", "--gradacc", help="Gradient accumulation for training", type=int, default=1)
    argParser.add_argument("-e", "--epochs", help="Number of training epochs", type=float, default=3)
    argParser.add_argument("-l", "--lrate", help="Learning rate", type=float, default=5e-05)
    argParser.add_argument("-w", "--wdecay", help="Weight decay", type=float, default=.01)
    argParser.add_argument("-B", "--best", help="Load best model instead of last", type=int, choices=[0,1], default=1)
    argParser.add_argument("-E", "--eval", help="Number of intermediary evaluations", type=int, default=10)
    argParser.add_argument("-s", "--session", help="Session name (used to save results)", required=True)
    argParser.add_argument("-G", "--gpu", help="Use GPU (CUDA)", type=int, choices=[0,1], default=0)
    argParser.add_argument("-A", "--adapt", help="Adapt token length to batch", type=int, choices=[0,1], default=1)
    args = argParser.parse_args()
    main(args)

View File

@ -0,0 +1,94 @@
#!/usr/bin/env python
# coding: utf-8
## BERT inference to be called by server.R
import argparse
import datasets
import json
import numpy as np
from os import path, remove
import pandas as pd
import pyarrow.feather as feather
import re
from torch import no_grad
from transformers import AutoModelForSequenceClassification, AutoTokenizer
def chunker(seq, batch_size):
    """Lazily split ``seq`` into consecutive slices of at most ``batch_size`` items.

    The last slice is shorter when ``len(seq)`` is not a multiple of
    ``batch_size``; an empty ``seq`` yields nothing.
    """
    starts = range(0, len(seq), batch_size)
    return map(lambda start: seq[start:start + batch_size], starts)
def main(args):
    """Predict class probabilities for a feather file with a fine-tuned model.

    Loads the model stored at ``args.model`` (the tokenizer is loaded from the
    ``_name_or_path`` entry of its ``config.json``), predicts the column
    ``args.txtname`` of ``args.dat`` in batches of ``args.batch`` and writes
    ``args.idname`` plus one ``bertpred_<label>`` column per class to
    ``args.output``. Progress is written to ``args.logfile``, which is removed
    once the predictions have been saved.

    Args:
        args: parsed command-line arguments (see the argument parser of this script).
    """
    logfile = path.expanduser(args.logfile)

    def report(message):
        # Overwrite the progress file with the latest status line.
        with open(logfile, "w") as progfile:
            progfile.write(message)

    print("Importing data")
    report("Importing data")
    dat = feather.read_feather(path.expanduser(args.dat))
    report("Tokenizing")

    ## Tokenizer
    print("Tokenizing")
    with open(path.join(path.expanduser(args.model), "config.json"), "r") as jsonfile:
        modeltype = json.load(jsonfile)["_name_or_path"]
    tokenizer = AutoTokenizer.from_pretrained(modeltype)

    ## Model
    print("Loading model")
    model = AutoModelForSequenceClassification.from_pretrained(path.expanduser(args.model))
    if (args.gpu):
        model.cuda()

    ## Prediction functions
    def get_predprobs(text):
        """Return the class probabilities (softmax of the logits) for a batch of texts."""
        inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
        if (args.gpu):
            inputs = inputs.to("cuda")
        with no_grad():
            outputs = model(**inputs)
        res = outputs[0]
        if (args.gpu):
            res = res.cpu()
        res = res.softmax(1).detach().numpy()
        return res

    # Column names in class-index order, independent of dict iteration order;
    # str() keeps non-string labels from breaking the concatenation.
    id2label = model.config.id2label
    colnames = ["bertpred_" + str(id2label[i]) for i in sorted(id2label)]

    print("Computing predictions")
    chunks = chunker([str(x) for x in dat[args.txtname]], args.batch)
    pred = []
    for i, x in enumerate(chunks):
        # Refresh the progress file every fifth batch.
        if (i % 5 == 0):
            percent = round(100 * i * args.batch / len(dat), 1)
            logmsg = "Computing: " + str(percent) + "% (" + str(i * args.batch) + "/" + str(len(dat)) + ")"
            report(logmsg)
        pred.append(get_predprobs(x))

    # np.concatenate rejects an empty list, so an empty input gets an empty
    # table with the right columns.
    pred = np.concatenate(pred) if pred else np.empty((0, len(colnames)))
    pred = pd.DataFrame(pred)
    pred.columns = colnames
    # Drop the stored index so ids and predictions are paired by position
    # rather than by index label.
    pred = pd.concat([dat[args.idname].reset_index(drop=True), pred], axis=1)
    # expanduser for consistency with the other path arguments ("~" in --output).
    feather.write_feather(pred, path.abspath(path.expanduser(args.output)))
    remove(logfile)
if __name__ == "__main__":
    # Command-line entry point: parse the inference options and run the predictions.
    argParser = argparse.ArgumentParser()
    # The model and data paths have no usable default: main() would fail on
    # None, so they are required up front.
    argParser.add_argument("-m", "--model", help="Trained model path", required=True)
    argParser.add_argument("-d", "--dat", help="Path to data (feather file)", required=True)
    argParser.add_argument("-o", "--output", help="Output path of predictions", default="tiggerbert.feather")
    argParser.add_argument("-i", "--idname", help="Name of id variable", default="id")
    argParser.add_argument("-x", "--txtname", help="Name of text variable", default="text")
    argParser.add_argument("-l", "--logfile", help="Path to log file", default="tiggerbert-progress.txt")
    argParser.add_argument("-G", "--gpu", help="Use GPU (CUDA)", type=int, choices=[0,1], default=1)
    argParser.add_argument("-b", "--batch", help="Batch size", type=int, default=128)
    args = argParser.parse_args()
    main(args)

View File

@ -0,0 +1,159 @@
"short","spacy_name","fasttext_name","fasttext_url","language","short_lang"
"af","xx_ent_wiki_sm","cc.af.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.bin.gz","Afrikaans","(af) Afrikaans"
"als","xx_ent_wiki_sm","cc.als.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.bin.gz","Alemannic","(als) Alemannic"
"am","xx_ent_wiki_sm","cc.am.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.bin.gz","Amharic","(am) Amharic"
"an","xx_ent_wiki_sm","cc.an.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.bin.gz","Aragonese","(an) Aragonese"
"ar","xx_ent_wiki_sm","cc.ar.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz","Arabic","(ar) Arabic"
"arz","xx_ent_wiki_sm","cc.arz.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.bin.gz","Egyptian Arabic","(arz) Egyptian Arabic"
"as","xx_ent_wiki_sm","cc.as.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.bin.gz","Assamese","(as) Assamese"
"ast","xx_ent_wiki_sm","cc.ast.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.bin.gz","Asturian","(ast) Asturian"
"az","xx_ent_wiki_sm","cc.az.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.bin.gz","Azerbaijani","(az) Azerbaijani"
"azb","xx_ent_wiki_sm","cc.azb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.bin.gz","Southern Azerbaijani","(azb) Southern Azerbaijani"
"ba","xx_ent_wiki_sm","cc.ba.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.bin.gz","Bashkir","(ba) Bashkir"
"bar","xx_ent_wiki_sm","cc.bar.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.bin.gz","Bavarian","(bar) Bavarian"
"bcl","xx_ent_wiki_sm","cc.bcl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.bin.gz","Central Bicolano","(bcl) Central Bicolano"
"be","xx_ent_wiki_sm","cc.be.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.bin.gz","Belarusian","(be) Belarusian"
"bg","xx_ent_wiki_sm","cc.bg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.bin.gz","Bulgarian","(bg) Bulgarian"
"bh","xx_ent_wiki_sm","cc.bh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.bin.gz","Bihari","(bh) Bihari"
"bn","xx_ent_wiki_sm","cc.bn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz","Bengali","(bn) Bengali"
"bo","xx_ent_wiki_sm","cc.bo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.bin.gz","Tibetan","(bo) Tibetan"
"bpy","xx_ent_wiki_sm","cc.bpy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.bin.gz","Bishnupriya Manipuri","(bpy) Bishnupriya Manipuri"
"br","xx_ent_wiki_sm","cc.br.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.bin.gz","Breton","(br) Breton"
"bs","xx_ent_wiki_sm","cc.bs.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.bin.gz","Bosnian","(bs) Bosnian"
"ca","ca_core_news_sm","cc.ca.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.bin.gz","Catalan","(ca) Catalan"
"ce","xx_ent_wiki_sm","cc.ce.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.bin.gz","Chechen","(ce) Chechen"
"ceb","xx_ent_wiki_sm","cc.ceb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.bin.gz","Cebuano","(ceb) Cebuano"
"ckb","xx_ent_wiki_sm","cc.ckb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.bin.gz","Kurdish (Sorani)","(ckb) Kurdish (Sorani)"
"co","xx_ent_wiki_sm","cc.co.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.bin.gz","Corsican","(co) Corsican"
"cs","xx_ent_wiki_sm","cc.cs.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.bin.gz","Czech","(cs) Czech"
"cv","xx_ent_wiki_sm","cc.cv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.bin.gz","Chuvash","(cv) Chuvash"
"cy","xx_ent_wiki_sm","cc.cy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz","Welsh","(cy) Welsh"
"da","da_core_news_sm","cc.da.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.bin.gz","Danish","(da) Danish"
"de","de_core_news_sm","cc.de.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz","German","(de) German"
"diq","xx_ent_wiki_sm","cc.diq.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.bin.gz","Zazaki","(diq) Zazaki"
"dv","xx_ent_wiki_sm","cc.dv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.bin.gz","Divehi","(dv) Divehi"
"el","el_core_news_sm","cc.el.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.bin.gz","Greek","(el) Greek"
"eml","xx_ent_wiki_sm","cc.eml.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.bin.gz","Emilian-Romagnol","(eml) Emilian-Romagnol"
"en","en_core_web_sm","cc.en.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz","English","(en) English"
"eo","xx_ent_wiki_sm","cc.eo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.bin.gz","Esperanto","(eo) Esperanto"
"es","es_core_news_sm","cc.es.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz","Spanish","(es) Spanish"
"et","xx_ent_wiki_sm","cc.et.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.bin.gz","Estonian","(et) Estonian"
"eu","xx_ent_wiki_sm","cc.eu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.bin.gz","Basque","(eu) Basque"
"fa","xx_ent_wiki_sm","cc.fa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz","Persian","(fa) Persian"
"fi","fi_core_news_sm","cc.fi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.bin.gz","Finnish","(fi) Finnish"
"fr","fr_core_news_sm","cc.fr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz","French","(fr) French"
"frr","xx_ent_wiki_sm","cc.frr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.bin.gz","North Frisian","(frr) North Frisian"
"fy","xx_ent_wiki_sm","cc.fy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.bin.gz","West Frisian","(fy) West Frisian"
"ga","xx_ent_wiki_sm","cc.ga.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.bin.gz","Irish","(ga) Irish"
"gd","xx_ent_wiki_sm","cc.gd.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.bin.gz","Scottish Gaelic","(gd) Scottish Gaelic"
"gl","xx_ent_wiki_sm","cc.gl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.bin.gz","Galician","(gl) Galician"
"gom","xx_ent_wiki_sm","cc.gom.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.bin.gz","Goan Konkani","(gom) Goan Konkani"
"gu","xx_ent_wiki_sm","cc.gu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.bin.gz","Gujarati","(gu) Gujarati"
"gv","xx_ent_wiki_sm","cc.gv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.bin.gz","Manx","(gv) Manx"
"he","xx_ent_wiki_sm","cc.he.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.bin.gz","Hebrew","(he) Hebrew"
"hi","xx_ent_wiki_sm","cc.hi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz","Hindi","(hi) Hindi"
"hif","xx_ent_wiki_sm","cc.hif.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.bin.gz","Fiji Hindi","(hif) Fiji Hindi"
"hr","hr_core_news_sm","cc.hr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.bin.gz","Croatian","(hr) Croatian"
"hsb","xx_ent_wiki_sm","cc.hsb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.bin.gz","Upper Sorbian","(hsb) Upper Sorbian"
"ht","xx_ent_wiki_sm","cc.ht.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.bin.gz","Haitian","(ht) Haitian"
"hu","xx_ent_wiki_sm","cc.hu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.bin.gz","Hungarian","(hu) Hungarian"
"hy","xx_ent_wiki_sm","cc.hy.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.bin.gz","Armenian","(hy) Armenian"
"ia","xx_ent_wiki_sm","cc.ia.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.bin.gz","Interlingua","(ia) Interlingua"
"id","xx_ent_wiki_sm","cc.id.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz","Indonesian","(id) Indonesian"
"ilo","xx_ent_wiki_sm","cc.ilo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.bin.gz","Ilokano","(ilo) Ilokano"
"io","xx_ent_wiki_sm","cc.io.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.bin.gz","Ido","(io) Ido"
"is","xx_ent_wiki_sm","cc.is.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.bin.gz","Icelandic","(is) Icelandic"
"it","it_core_news_sm","cc.it.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz","Italian","(it) Italian"
"ja","ja_core_news_sm","cc.ja.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz","Japanese","(ja) Japanese"
"jv","xx_ent_wiki_sm","cc.jv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.bin.gz","Javanese","(jv) Javanese"
"ka","xx_ent_wiki_sm","cc.ka.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.bin.gz","Georgian","(ka) Georgian"
"kk","xx_ent_wiki_sm","cc.kk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.bin.gz","Kazakh","(kk) Kazakh"
"km","xx_ent_wiki_sm","cc.km.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.bin.gz","Khmer","(km) Khmer"
"kn","xx_ent_wiki_sm","cc.kn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.bin.gz","Kannada","(kn) Kannada"
"ko","ko_core_news_sm","cc.ko.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz","Korean","(ko) Korean"
"ku","xx_ent_wiki_sm","cc.ku.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.bin.gz","Kurdish (Kurmanji)","(ku) Kurdish (Kurmanji)"
"ky","xx_ent_wiki_sm","cc.ky.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.bin.gz","Kirghiz","(ky) Kirghiz"
"la","xx_ent_wiki_sm","cc.la.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz","Latin","(la) Latin"
"lb","xx_ent_wiki_sm","cc.lb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.bin.gz","Luxembourgish","(lb) Luxembourgish"
"li","xx_ent_wiki_sm","cc.li.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.bin.gz","Limburgish","(li) Limburgish"
"lmo","xx_ent_wiki_sm","cc.lmo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.bin.gz","Lombard","(lmo) Lombard"
"lt","lt_core_news_sm","cc.lt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.bin.gz","Lithuanian","(lt) Lithuanian"
"lv","xx_ent_wiki_sm","cc.lv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.bin.gz","Latvian","(lv) Latvian"
"mai","xx_ent_wiki_sm","cc.mai.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.bin.gz","Maithili","(mai) Maithili"
"mg","xx_ent_wiki_sm","cc.mg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.bin.gz","Malagasy","(mg) Malagasy"
"mhr","xx_ent_wiki_sm","cc.mhr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.bin.gz","Meadow Mari","(mhr) Meadow Mari"
"min","xx_ent_wiki_sm","cc.min.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.bin.gz","Minangkabau","(min) Minangkabau"
"mk","mk_core_news_sm","cc.mk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.bin.gz","Macedonian","(mk) Macedonian"
"ml","xx_ent_wiki_sm","cc.ml.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.bin.gz","Malayalam","(ml) Malayalam"
"mn","xx_ent_wiki_sm","cc.mn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.bin.gz","Mongolian","(mn) Mongolian"
"mr","xx_ent_wiki_sm","cc.mr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.bin.gz","Marathi","(mr) Marathi"
"mrj","xx_ent_wiki_sm","cc.mrj.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.bin.gz","Hill Mari","(mrj) Hill Mari"
"ms","xx_ent_wiki_sm","cc.ms.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.bin.gz","Malay","(ms) Malay"
"mt","xx_ent_wiki_sm","cc.mt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.bin.gz","Maltese","(mt) Maltese"
"mwl","xx_ent_wiki_sm","cc.mwl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.bin.gz","Mirandese","(mwl) Mirandese"
"my","xx_ent_wiki_sm","cc.my.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.bin.gz","Burmese","(my) Burmese"
"myv","xx_ent_wiki_sm","cc.myv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.bin.gz","Erzya","(myv) Erzya"
"mzn","xx_ent_wiki_sm","cc.mzn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.bin.gz","Mazandarani","(mzn) Mazandarani"
"nah","xx_ent_wiki_sm","cc.nah.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.bin.gz","Nahuatl","(nah) Nahuatl"
"nap","xx_ent_wiki_sm","cc.nap.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.bin.gz","Neapolitan","(nap) Neapolitan"
"nds","xx_ent_wiki_sm","cc.nds.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.bin.gz","Low Saxon","(nds) Low Saxon"
"ne","xx_ent_wiki_sm","cc.ne.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.bin.gz","Nepali","(ne) Nepali"
"new","xx_ent_wiki_sm","cc.new.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.bin.gz","Newar","(new) Newar"
"nl","nl_core_news_sm","cc.nl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.bin.gz","Dutch","(nl) Dutch"
"nn","xx_ent_wiki_sm","cc.nn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.bin.gz","Norwegian (Nynorsk)","(nn) Norwegian (Nynorsk)"
"no","xx_ent_wiki_sm","cc.no.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.bin.gz","Norwegian (Bokmål)","(no) Norwegian (Bokmål)"
"nso","xx_ent_wiki_sm","cc.nso.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.bin.gz","Northern Sotho","(nso) Northern Sotho"
"oc","xx_ent_wiki_sm","cc.oc.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.bin.gz","Occitan","(oc) Occitan"
"or","xx_ent_wiki_sm","cc.or.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.bin.gz","Oriya","(or) Oriya"
"os","xx_ent_wiki_sm","cc.os.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.bin.gz","Ossetian","(os) Ossetian"
"pa","xx_ent_wiki_sm","cc.pa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.bin.gz","Eastern Punjabi","(pa) Eastern Punjabi"
"pam","xx_ent_wiki_sm","cc.pam.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.bin.gz","Kapampangan","(pam) Kapampangan"
"pfl","xx_ent_wiki_sm","cc.pfl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.bin.gz","Palatinate German","(pfl) Palatinate German"
"pl","pl_core_news_sm","cc.pl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz","Polish","(pl) Polish"
"pms","xx_ent_wiki_sm","cc.pms.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.bin.gz","Piedmontese","(pms) Piedmontese"
"pnb","xx_ent_wiki_sm","cc.pnb.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.bin.gz","Western Punjabi","(pnb) Western Punjabi"
"ps","xx_ent_wiki_sm","cc.ps.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.bin.gz","Pashto","(ps) Pashto"
"pt","pt_core_news_sm","cc.pt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.bin.gz","Portuguese","(pt) Portuguese"
"qu","xx_ent_wiki_sm","cc.qu.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.bin.gz","Quechua","(qu) Quechua"
"rm","xx_ent_wiki_sm","cc.rm.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.bin.gz","Romansh","(rm) Romansh"
"ro","ro_core_news_sm","cc.ro.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.bin.gz","Romanian","(ro) Romanian"
"ru","ru_core_news_sm","cc.ru.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz","Russian","(ru) Russian"
"sa","xx_ent_wiki_sm","cc.sa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.bin.gz","Sanskrit","(sa) Sanskrit"
"sah","xx_ent_wiki_sm","cc.sah.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.bin.gz","Sakha","(sah) Sakha"
"sc","xx_ent_wiki_sm","cc.sc.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.bin.gz","Sardinian","(sc) Sardinian"
"scn","xx_ent_wiki_sm","cc.scn.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.bin.gz","Sicilian","(scn) Sicilian"
"sco","xx_ent_wiki_sm","cc.sco.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.bin.gz","Scots","(sco) Scots"
"sd","xx_ent_wiki_sm","cc.sd.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.bin.gz","Sindhi","(sd) Sindhi"
"sh","xx_ent_wiki_sm","cc.sh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.bin.gz","Serbo-Croatian","(sh) Serbo-Croatian"
"si","xx_ent_wiki_sm","cc.si.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz","Sinhalese","(si) Sinhalese"
"sk","xx_ent_wiki_sm","cc.sk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.bin.gz","Slovak","(sk) Slovak"
"sl","sl_core_news_sm","cc.sl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.bin.gz","Slovenian","(sl) Slovenian"
"so","xx_ent_wiki_sm","cc.so.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.bin.gz","Somali","(so) Somali"
"sq","xx_ent_wiki_sm","cc.sq.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.bin.gz","Albanian","(sq) Albanian"
"sr","xx_ent_wiki_sm","cc.sr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.bin.gz","Serbian","(sr) Serbian"
"su","xx_ent_wiki_sm","cc.su.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.bin.gz","Sundanese","(su) Sundanese"
"sv","sv_core_news_sm","cc.sv.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.bin.gz","Swedish","(sv) Swedish"
"sw","xx_ent_wiki_sm","cc.sw.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.bin.gz","Swahili","(sw) Swahili"
"ta","xx_ent_wiki_sm","cc.ta.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.bin.gz","Tamil","(ta) Tamil"
"te","xx_ent_wiki_sm","cc.te.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.bin.gz","Telugu","(te) Telugu"
"tg","xx_ent_wiki_sm","cc.tg.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.bin.gz","Tajik","(tg) Tajik"
"th","xx_ent_wiki_sm","cc.th.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.bin.gz","Thai","(th) Thai"
"tk","xx_ent_wiki_sm","cc.tk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.bin.gz","Turkmen","(tk) Turkmen"
"tl","xx_ent_wiki_sm","cc.tl.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.bin.gz","Tagalog","(tl) Tagalog"
"tr","xx_ent_wiki_sm","cc.tr.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz","Turkish","(tr) Turkish"
"tt","xx_ent_wiki_sm","cc.tt.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.bin.gz","Tatar","(tt) Tatar"
"ug","xx_ent_wiki_sm","cc.ug.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz","Uyghur","(ug) Uyghur"
"uk","uk_core_news_sm","cc.uk.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.bin.gz","Ukrainian","(uk) Ukrainian"
"ur","xx_ent_wiki_sm","cc.ur.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.bin.gz","Urdu","(ur) Urdu"
"uz","xx_ent_wiki_sm","cc.uz.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.bin.gz","Uzbek","(uz) Uzbek"
"vec","xx_ent_wiki_sm","cc.vec.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.bin.gz","Venetian","(vec) Venetian"
"vi","xx_ent_wiki_sm","cc.vi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.bin.gz","Vietnamese","(vi) Vietnamese"
"vls","xx_ent_wiki_sm","cc.vls.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.bin.gz","West Flemish","(vls) West Flemish"
"vo","xx_ent_wiki_sm","cc.vo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.bin.gz","Volapük","(vo) Volapük"
"wa","xx_ent_wiki_sm","cc.wa.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.bin.gz","Walloon","(wa) Walloon"
"war","xx_ent_wiki_sm","cc.war.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.bin.gz","Waray","(war) Waray"
"xmf","xx_ent_wiki_sm","cc.xmf.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.bin.gz","Mingrelian","(xmf) Mingrelian"
"yi","xx_ent_wiki_sm","cc.yi.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.bin.gz","Yiddish","(yi) Yiddish"
"yo","xx_ent_wiki_sm","cc.yo.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.bin.gz","Yoruba","(yo) Yoruba"
"zea","xx_ent_wiki_sm","cc.zea.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.bin.gz","Zeelandic","(zea) Zeelandic"
"zh","zh_core_web_sm","cc.zh.300.bin","https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz","Chinese","(zh) Chinese"
1 short spacy_name fasttext_name fasttext_url language short_lang
2 af xx_ent_wiki_sm cc.af.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.bin.gz Afrikaans (af) Afrikaans
3 als xx_ent_wiki_sm cc.als.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.bin.gz Alemannic (als) Alemannic
4 am xx_ent_wiki_sm cc.am.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.bin.gz Amharic (am) Amharic
5 an xx_ent_wiki_sm cc.an.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.bin.gz Aragonese (an) Aragonese
6 ar xx_ent_wiki_sm cc.ar.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz Arabic (ar) Arabic
7 arz xx_ent_wiki_sm cc.arz.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.bin.gz Egyptian Arabic (arz) Egyptian Arabic
8 as xx_ent_wiki_sm cc.as.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.bin.gz Assamese (as) Assamese
9 ast xx_ent_wiki_sm cc.ast.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.bin.gz Asturian (ast) Asturian
10 az xx_ent_wiki_sm cc.az.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.bin.gz Azerbaijani (az) Azerbaijani
11 azb xx_ent_wiki_sm cc.azb.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.bin.gz Southern Azerbaijani (azb) Southern Azerbaijani
12 ba xx_ent_wiki_sm cc.ba.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.bin.gz Bashkir (ba) Bashkir
13 bar xx_ent_wiki_sm cc.bar.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.bin.gz Bavarian (bar) Bavarian
14 bcl xx_ent_wiki_sm cc.bcl.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.bin.gz Central Bicolano (bcl) Central Bicolano
15 be xx_ent_wiki_sm cc.be.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.bin.gz Belarusian (be) Belarusian
16 bg xx_ent_wiki_sm cc.bg.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.bin.gz Bulgarian (bg) Bulgarian
17 bh xx_ent_wiki_sm cc.bh.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.bin.gz Bihari (bh) Bihari
18 bn xx_ent_wiki_sm cc.bn.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz Bengali (bn) Bengali
19 bo xx_ent_wiki_sm cc.bo.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.bin.gz Tibetan (bo) Tibetan
20 bpy xx_ent_wiki_sm cc.bpy.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.bin.gz Bishnupriya Manipuri (bpy) Bishnupriya Manipuri
21 br xx_ent_wiki_sm cc.br.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.bin.gz Breton (br) Breton
22 bs xx_ent_wiki_sm cc.bs.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.bin.gz Bosnian (bs) Bosnian
23 ca ca_core_news_sm cc.ca.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.bin.gz Catalan (ca) Catalan
24 ce xx_ent_wiki_sm cc.ce.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.bin.gz Chechen (ce) Chechen
25 ceb xx_ent_wiki_sm cc.ceb.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.bin.gz Cebuano (ceb) Cebuano
26 ckb xx_ent_wiki_sm cc.ckb.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.bin.gz Kurdish (Sorani) (ckb) Kurdish (Sorani)
27 co xx_ent_wiki_sm cc.co.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.bin.gz Corsican (co) Corsican
28 cs xx_ent_wiki_sm cc.cs.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.bin.gz Czech (cs) Czech
29 cv xx_ent_wiki_sm cc.cv.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.bin.gz Chuvash (cv) Chuvash
30 cy xx_ent_wiki_sm cc.cy.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz Welsh (cy) Welsh
31 da da_core_news_sm cc.da.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.bin.gz Danish (da) Danish
32 de de_core_news_sm cc.de.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz German (de) German
33 diq xx_ent_wiki_sm cc.diq.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.bin.gz Zazaki (diq) Zazaki
34 dv xx_ent_wiki_sm cc.dv.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.bin.gz Divehi (dv) Divehi
35 el el_core_news_sm cc.el.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.bin.gz Greek (el) Greek
36 eml xx_ent_wiki_sm cc.eml.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.bin.gz Emilian-Romagnol (eml) Emilian-Romagnol
37 en en_core_web_sm cc.en.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz English (en) English
38 eo xx_ent_wiki_sm cc.eo.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.bin.gz Esperanto (eo) Esperanto
39 es es_core_news_sm cc.es.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz Spanish (es) Spanish
40 et xx_ent_wiki_sm cc.et.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.bin.gz Estonian (et) Estonian
41 eu xx_ent_wiki_sm cc.eu.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.bin.gz Basque (eu) Basque
42 fa xx_ent_wiki_sm cc.fa.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz Persian (fa) Persian
43 fi fi_core_news_sm cc.fi.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.bin.gz Finnish (fi) Finnish
44 fr fr_core_news_sm cc.fr.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz French (fr) French
45 frr xx_ent_wiki_sm cc.frr.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.bin.gz North Frisian (frr) North Frisian
46 fy xx_ent_wiki_sm cc.fy.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.bin.gz West Frisian (fy) West Frisian
47 ga xx_ent_wiki_sm cc.ga.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.bin.gz Irish (ga) Irish
48 gd xx_ent_wiki_sm cc.gd.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.bin.gz Scottish Gaelic (gd) Scottish Gaelic
49 gl xx_ent_wiki_sm cc.gl.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.bin.gz Galician (gl) Galician
50 gom xx_ent_wiki_sm cc.gom.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.bin.gz Goan Konkani (gom) Goan Konkani
51 gu xx_ent_wiki_sm cc.gu.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.bin.gz Gujarati (gu) Gujarati
52 gv xx_ent_wiki_sm cc.gv.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.bin.gz Manx (gv) Manx
53 he xx_ent_wiki_sm cc.he.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.bin.gz Hebrew (he) Hebrew
54 hi xx_ent_wiki_sm cc.hi.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz Hindi (hi) Hindi
55 hif xx_ent_wiki_sm cc.hif.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.bin.gz Fiji Hindi (hif) Fiji Hindi
56 hr hr_core_news_sm cc.hr.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.bin.gz Croatian (hr) Croatian
57 hsb xx_ent_wiki_sm cc.hsb.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.bin.gz Upper Sorbian (hsb) Upper Sorbian
58 ht xx_ent_wiki_sm cc.ht.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.bin.gz Haitian (ht) Haitian
59 hu xx_ent_wiki_sm cc.hu.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.bin.gz Hungarian (hu) Hungarian
60 hy xx_ent_wiki_sm cc.hy.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.bin.gz Armenian (hy) Armenian
61 ia xx_ent_wiki_sm cc.ia.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.bin.gz Interlingua (ia) Interlingua
62 id xx_ent_wiki_sm cc.id.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz Indonesian (id) Indonesian
63 ilo xx_ent_wiki_sm cc.ilo.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.bin.gz Ilokano (ilo) Ilokano
64 io xx_ent_wiki_sm cc.io.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.bin.gz Ido (io) Ido
65 is xx_ent_wiki_sm cc.is.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.bin.gz Icelandic (is) Icelandic
66 it it_core_news_sm cc.it.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz Italian (it) Italian
67 ja ja_core_news_sm cc.ja.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz Japanese (ja) Japanese
68 jv xx_ent_wiki_sm cc.jv.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.bin.gz Javanese (jv) Javanese
69 ka xx_ent_wiki_sm cc.ka.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.bin.gz Georgian (ka) Georgian
70 kk xx_ent_wiki_sm cc.kk.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.bin.gz Kazakh (kk) Kazakh
71 km xx_ent_wiki_sm cc.km.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.bin.gz Khmer (km) Khmer
72 kn xx_ent_wiki_sm cc.kn.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.bin.gz Kannada (kn) Kannada
73 ko ko_core_news_sm cc.ko.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz Korean (ko) Korean
74 ku xx_ent_wiki_sm cc.ku.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.bin.gz Kurdish (Kurmanji) (ku) Kurdish (Kurmanji)
75 ky xx_ent_wiki_sm cc.ky.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.bin.gz Kirghiz (ky) Kirghiz
76 la xx_ent_wiki_sm cc.la.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz Latin (la) Latin
77 lb xx_ent_wiki_sm cc.lb.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.bin.gz Luxembourgish (lb) Luxembourgish
78 li xx_ent_wiki_sm cc.li.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.bin.gz Limburgish (li) Limburgish
79 lmo xx_ent_wiki_sm cc.lmo.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.bin.gz Lombard (lmo) Lombard
80 lt lt_core_news_sm cc.lt.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.bin.gz Lithuanian (lt) Lithuanian
81 lv xx_ent_wiki_sm cc.lv.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.bin.gz Latvian (lv) Latvian
82 mai xx_ent_wiki_sm cc.mai.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.bin.gz Maithili (mai) Maithili
83 mg xx_ent_wiki_sm cc.mg.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.bin.gz Malagasy (mg) Malagasy
84 mhr xx_ent_wiki_sm cc.mhr.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.bin.gz Meadow Mari (mhr) Meadow Mari
85 min xx_ent_wiki_sm cc.min.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.bin.gz Minangkabau (min) Minangkabau
86 mk mk_core_news_sm cc.mk.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.bin.gz Macedonian (mk) Macedonian
87 ml xx_ent_wiki_sm cc.ml.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.bin.gz Malayalam (ml) Malayalam
88 mn xx_ent_wiki_sm cc.mn.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.bin.gz Mongolian (mn) Mongolian
89 mr xx_ent_wiki_sm cc.mr.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.bin.gz Marathi (mr) Marathi
90 mrj xx_ent_wiki_sm cc.mrj.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.bin.gz Hill Mari (mrj) Hill Mari
91 ms xx_ent_wiki_sm cc.ms.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.bin.gz Malay (ms) Malay
92 mt xx_ent_wiki_sm cc.mt.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.bin.gz Maltese (mt) Maltese
93 mwl xx_ent_wiki_sm cc.mwl.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.bin.gz Mirandese (mwl) Mirandese
94 my xx_ent_wiki_sm cc.my.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.bin.gz Burmese (my) Burmese
95 myv xx_ent_wiki_sm cc.myv.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.bin.gz Erzya (myv) Erzya
96 mzn xx_ent_wiki_sm cc.mzn.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.bin.gz Mazandarani (mzn) Mazandarani
97 nah xx_ent_wiki_sm cc.nah.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.bin.gz Nahuatl (nah) Nahuatl
98 nap xx_ent_wiki_sm cc.nap.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.bin.gz Neapolitan (nap) Neapolitan
99 nds xx_ent_wiki_sm cc.nds.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.bin.gz Low Saxon (nds) Low Saxon
100 ne xx_ent_wiki_sm cc.ne.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.bin.gz Nepali (ne) Nepali
101 new xx_ent_wiki_sm cc.new.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.bin.gz Newar (new) Newar
102 nl nl_core_news_sm cc.nl.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.bin.gz Dutch (nl) Dutch
103 nn xx_ent_wiki_sm cc.nn.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.bin.gz Norwegian (Nynorsk) (nn) Norwegian (Nynorsk)
104 no xx_ent_wiki_sm cc.no.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.bin.gz Norwegian (Bokmål) (no) Norwegian (Bokmål)
105 nso xx_ent_wiki_sm cc.nso.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.bin.gz Northern Sotho (nso) Northern Sotho
106 oc xx_ent_wiki_sm cc.oc.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.bin.gz Occitan (oc) Occitan
107 or xx_ent_wiki_sm cc.or.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.bin.gz Oriya (or) Oriya
108 os xx_ent_wiki_sm cc.os.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.bin.gz Ossetian (os) Ossetian
109 pa xx_ent_wiki_sm cc.pa.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.bin.gz Eastern Punjabi (pa) Eastern Punjabi
110 pam xx_ent_wiki_sm cc.pam.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.bin.gz Kapampangan (pam) Kapampangan
111 pfl xx_ent_wiki_sm cc.pfl.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.bin.gz Palatinate German (pfl) Palatinate German
112 pl pl_core_news_sm cc.pl.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz Polish (pl) Polish
113 pms xx_ent_wiki_sm cc.pms.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.bin.gz Piedmontese (pms) Piedmontese
114 pnb xx_ent_wiki_sm cc.pnb.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.bin.gz Western Punjabi (pnb) Western Punjabi
115 ps xx_ent_wiki_sm cc.ps.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.bin.gz Pashto (ps) Pashto
116 pt pt_core_news_sm cc.pt.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.bin.gz Portuguese (pt) Portuguese
117 qu xx_ent_wiki_sm cc.qu.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.bin.gz Quechua (qu) Quechua
118 rm xx_ent_wiki_sm cc.rm.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.bin.gz Romansh (rm) Romansh
119 ro ro_core_news_sm cc.ro.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.bin.gz Romanian (ro) Romanian
120 ru ru_core_news_sm cc.ru.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz Russian (ru) Russian
121 sa xx_ent_wiki_sm cc.sa.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.bin.gz Sanskrit (sa) Sanskrit
122 sah xx_ent_wiki_sm cc.sah.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.bin.gz Sakha (sah) Sakha
123 sc xx_ent_wiki_sm cc.sc.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.bin.gz Sardinian (sc) Sardinian
124 scn xx_ent_wiki_sm cc.scn.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.bin.gz Sicilian (scn) Sicilian
125 sco xx_ent_wiki_sm cc.sco.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.bin.gz Scots (sco) Scots
126 sd xx_ent_wiki_sm cc.sd.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.bin.gz Sindhi (sd) Sindhi
127 sh xx_ent_wiki_sm cc.sh.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.bin.gz Serbo-Croatian (sh) Serbo-Croatian
128 si xx_ent_wiki_sm cc.si.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz Sinhalese (si) Sinhalese
129 sk xx_ent_wiki_sm cc.sk.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.bin.gz Slovak (sk) Slovak
130 sl sl_core_news_sm cc.sl.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.bin.gz Slovenian (sl) Slovenian
131 so xx_ent_wiki_sm cc.so.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.bin.gz Somali (so) Somali
132 sq xx_ent_wiki_sm cc.sq.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.bin.gz Albanian (sq) Albanian
133 sr xx_ent_wiki_sm cc.sr.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.bin.gz Serbian (sr) Serbian
134 su xx_ent_wiki_sm cc.su.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.bin.gz Sundanese (su) Sundanese
135 sv sv_core_news_sm cc.sv.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.bin.gz Swedish (sv) Swedish
136 sw xx_ent_wiki_sm cc.sw.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.bin.gz Swahili (sw) Swahili
137 ta xx_ent_wiki_sm cc.ta.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.bin.gz Tamil (ta) Tamil
138 te xx_ent_wiki_sm cc.te.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.bin.gz Telugu (te) Telugu
139 tg xx_ent_wiki_sm cc.tg.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.bin.gz Tajik (tg) Tajik
140 th xx_ent_wiki_sm cc.th.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.bin.gz Thai (th) Thai
141 tk xx_ent_wiki_sm cc.tk.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.bin.gz Turkmen (tk) Turkmen
142 tl xx_ent_wiki_sm cc.tl.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.bin.gz Tagalog (tl) Tagalog
143 tr xx_ent_wiki_sm cc.tr.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz Turkish (tr) Turkish
144 tt xx_ent_wiki_sm cc.tt.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.bin.gz Tatar (tt) Tatar
145 ug xx_ent_wiki_sm cc.ug.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz Uyghur (ug) Uyghur
146 uk uk_core_news_sm cc.uk.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.bin.gz Ukrainian (uk) Ukrainian
147 ur xx_ent_wiki_sm cc.ur.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.bin.gz Urdu (ur) Urdu
148 uz xx_ent_wiki_sm cc.uz.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.bin.gz Uzbek (uz) Uzbek
149 vec xx_ent_wiki_sm cc.vec.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.bin.gz Venetian (vec) Venetian
150 vi xx_ent_wiki_sm cc.vi.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.bin.gz Vietnamese (vi) Vietnamese
151 vls xx_ent_wiki_sm cc.vls.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.bin.gz West Flemish (vls) West Flemish
152 vo xx_ent_wiki_sm cc.vo.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.bin.gz Volapük (vo) Volapük
153 wa xx_ent_wiki_sm cc.wa.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.bin.gz Walloon (wa) Walloon
154 war xx_ent_wiki_sm cc.war.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.bin.gz Waray (war) Waray
155 xmf xx_ent_wiki_sm cc.xmf.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.bin.gz Mingrelian (xmf) Mingrelian
156 yi xx_ent_wiki_sm cc.yi.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.bin.gz Yiddish (yi) Yiddish
157 yo xx_ent_wiki_sm cc.yo.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.bin.gz Yoruba (yo) Yoruba
158 zea xx_ent_wiki_sm cc.zea.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.bin.gz Zeelandic (zea) Zeelandic
159 zh zh_core_web_sm cc.zh.300.bin https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz Chinese (zh) Chinese

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,40 @@
#!/usr/bin/env python
# coding: utf-8
## Spacy tokenize texts
## Requires data file with columns id and text
import argparse
from os.path import expanduser
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import spacy
import re
def main(args):
    """Tokenize the ``text`` column of a feather file with a spaCy tokenizer.

    Reads the feather file at ``args.data`` (``~`` is expanded), runs the
    tokenizer of the spaCy model ``args.model`` on every entry of its ``text``
    column, and writes a feather file with two columns: ``id`` (copied from the
    input) and ``text_spa`` (the tokens joined by single spaces).

    The output path is the input path with the ``.feather`` extension replaced
    by ``_spa.feather``; if the input path has no ``.feather`` extension the
    suffix is appended instead, so the input file is never overwritten.

    Args:
        args: Namespace with attributes ``data`` (path to the input feather
            file) and ``model`` (spaCy model name or path).
    """
    print("Tokenizer: Importing data")
    datapath = expanduser(args.data)
    dat = feather.read_feather(datapath)
    outfile = re.sub("[.]feather$", "_spa.feather", datapath)
    if outfile == datapath:
        # The substitution did not match: writing to `outfile` as-is would
        # replace the input data with the tokenized output.
        outfile = datapath + "_spa.feather"

    print("Tokenizer: Loading model")
    spa = spacy.load(expanduser(args.model))

    print("Tokenizer: Tokenizing sentences")
    tokenized = [
        " ".join(str(token) for token in spa.tokenizer(text))
        for text in dat["text"]
    ]

    print("Tokenizer: Exporting")
    # Build the frame positionally: ids and tokens are paired row by row even
    # if the input frame carries a non-default index, and an empty input still
    # yields the two expected columns.
    tok = pd.DataFrame({
        "id": dat["id"].reset_index(drop=True),
        "text_spa": tokenized,
    })
    feather.write_feather(tok, outfile)
    print("Tokenizer: Done")
if __name__ == "__main__":
    # Command-line entry point: parse the spaCy model name and the data path,
    # then run the tokenizer.
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-m", "--model", help="Model name", default="fr_core_news_sm")
    # --data has no usable default: without it main() fails inside
    # expanduser(None), so let argparse report the missing option with a
    # usage message instead of a traceback.
    argParser.add_argument("-d", "--data", help="Path to data (feather)", required=True)
    args = argParser.parse_args()
    main(args)

View File

@ -0,0 +1,631 @@
## 21/04/2020 : shiny pour active learning étiquetage de textes
shinyUI(fluidPage(
title = "Active Tigger",
############################################################################
## Top panel: Title, strategy, model options
############################################################################
## Top row, in four columns: logo with project/scheme pickers, querying
## strategy, sample choice, and training diagnostics with the train button.
fluidRow(
column(
4,
fluidRow(
column(4, br(),
HTML('<img src="active_tigger.png" width="100%">')),
column(
8, br(),
p(strong("Project / Scheme")),
## Project picker; the "+" button is wrapped in a <div title=...> to give it hover text
fluidRow(
column(2, HTML(paste0(
'<div title="New project">',
actionButton("createProject", "+"),
'</div>'))),
column(10, selectInput("selectProject", NULL, NULL, NULL))
),
## Scheme picker, same layout as the project picker
fluidRow(
column(2, HTML(paste0(
'<div title="New scheme">',
actionButton("createScheme", "+"),
'</div>'))),
column(10, selectInput("selectScheme", NULL, NULL, NULL))
)
))
),
## Querying strategy ("sequential" is preselected); the extra "maxprobWhich"
## select is only shown when the strategy is 'maxprob'
column(
2, br(),
HTML(paste0(
'<div title="Querying strategy">',
p(strong("Strategy")),
selectInput("strategy", NULL,
choices= c("Active" = "entropy",
"MaxProb" = "maxprob",
"Random" = "random",
"Deterministic" = "sequential"),
selected = "sequential"),
conditionalPanel("input.strategy == 'maxprob'",
selectInput("maxprobWhich", NULL, NULL)),
'</div>'))
),
## Which texts to query from; the extra "taggedWhich" select is only shown
## when 'tagged' is chosen
column(
2, br(),
HTML(paste0(
'<div title="Query from which?">',
p(strong("On")),
selectInput("sampleChoice", NULL,
choices = c("Untagged" = "untagged",
"Tagged" = "tagged",
"All" = "all")),
conditionalPanel("input.sampleChoice == 'tagged'",
selectInput("taggedWhich", NULL, NULL)),
'</div>'))
),
## Training diagnostics output, train button, and the checkbox
## ("showTrainOptions") that toggles the training options panel
column(4, p(br()),
verbatimTextOutput("trainDiagno"),
fluidRow(
column(8, HTML(paste0(
'<div title="Train prediction model 🤖">',
actionButton("modelTrain", "🤖 Train", width = "100%"),
'</div>'))),
column(4, HTML(paste0(
'<div title="🤖 Prediction model options">',
checkboxInput("showTrainOptions", "🔧"),
'</div>')))
))
),
## General training options
## Training options panel, shown only while the "showTrainOptions" checkbox is ticked.
## Three columns: regex predictors, predictor selection (with DFM settings),
## and model choice with model-specific options.
## NOTE(review): the nested conditions below combine terms with the JavaScript
## bitwise '&' (no short-circuit), so `input.use_regressors.includes(...)` is
## always evaluated -- confirm this is intended rather than '&&'.
conditionalPanel(
"input.showTrainOptions",
hr(),
fluidRow(
## Column 1: regex predictors UI (rendered server-side as "panelExtraRegex"),
## shown when 'regex' is among the selected predictors
column(
4,
conditionalPanel(
"input.showTrainOptions & input.use_regressors.includes('regex')",
uiOutput("panelExtraRegex")
)
),
## Column 2: predictor selection
column(
4,
p(strong("Predictors")),
selectizeInput("use_regressors", NULL, "",
multiple = TRUE, width = "100%"),
## Extra predictors (continuous / categorical), shown when 'extra' is selected
conditionalPanel(
"input.showTrainOptions & input.use_regressors.includes('extra')",
selectizeInput("use_ootregnum", "Extra predictors: continuous", "",
multiple = TRUE, width = "100%"),
selectizeInput("use_ootregcat", "Extra predictors: categorical", "",
multiple = TRUE, width = "100%")
),
## DFM settings (frequency thresholds, tf-idf weighting schemes, n-grams),
## shown when 'dfm' is selected
conditionalPanel(
"input.showTrainOptions & input.use_regressors.includes('dfm')",
hr(),
fluidRow(
column(
6, numericInput("dfmMinTermfreq", "DFM : Min Termfreq",
min= 1, max= 1e3, value= 5, step= 1)),
column(
6, numericInput("dfmMinDocfreq", "DFM : Min Docfreq",
min= 1, max= 1e3, value= 5, step= 1))),
fluidRow(
column(
4, checkboxInput("dfmTfIdf", "Tf-Idf", TRUE)),
column(
4, selectInput("dfmTfScheme", label = NULL,
choices= c("logcount", "count", "prop", "propmax",
"boolean", "augmented", "logave"),
selected= "logcount")),
column(
4, selectInput("dfmDfScheme", label = NULL,
choices= c("inverse", "count",
"inversemax", "inverseprob", "unary"),
selected= "inverse"))
),
numericInput("dfmNgrams", "N-grams", value= 1,
min= 1, max= 10, step = 1)
)
),
## Column 3: auto-train interval, model choice, and per-model options
column(
4,
HTML(paste0(
'<div title="Auto train after # tags (0=never)">',
fluidRow(
column(6, strong("Auto train every")),
column(6, numericInput("trainCountdown", NULL, 0, 0, 1e6, 1))),
'</div>'
)),
HTML(paste0(
'<div title="🤖 model (recommended: Liblinear)">',
fluidRow(
column(6, strong("Model")),
column(6, selectInput("predModel", NULL, selected = "linear",
choices = c("Naive Bayes" = "naive bayes",
"KNN" = "knn",
"Liblinear" = "linear",
"LASSO" = "lasso",
"Random Forest" = "random forest")))
),
'</div>'
)),
## Model-specific training options: each panel is shown only for the
## matching value of "predModel"
### Random forest options
conditionalPanel(
"input.showTrainOptions & input.predModel == 'random forest'",
fluidRow(
numericInput("rfNumTrees", label = "Num. trees",
min = 1, max = 2e3, value = 500, step = 1),
numericInput("rfMtry", label = "mtry",
min = 0, max = 1e5, value = 0, step = 1),
numericInput("rfSampleFrac", label = "Sample fraction",
min = 0, max = 1, value = 1, step = .01)
)
),
### Naive Bayes options
conditionalPanel(
"input.showTrainOptions & input.predModel == 'naive bayes'",
flowLayout(
numericInput("naiveSmooth", label = "Smooth",
min = 0, max = 2e3,
value = 1, step = 1e-3),
selectInput("naivePrior", "Prior",
c("uniform", "docfreq", "termfreq")),
selectInput("naiveDistri", "Distribution",
c("multinomial", "Bernoulli"))
)
),
### Lasso options
conditionalPanel(
"input.showTrainOptions & input.predModel == 'lasso'",
strong("Lasso penalty"),
fluidRow(
column(
6, numericInput("glmLambda", label = NULL, min = 0, max = 2e3,
value = 0, step = 1e-6)),
column(
6, actionButton("glmCV", label= "Find best (CV)")))
),
### Linear options
conditionalPanel(
"input.showTrainOptions & input.predModel == 'linear'",
strong("Liblinear Cost"),
fluidRow(
column(
6, numericInput("liblinCost", label= NULL, min= 0, max= 2e10,
value= 32, step= 1)),
column(
6, actionButton("liblinCV", label= "Find best (CV)")))
),
### KNN options
conditionalPanel(
"input.showTrainOptions & input.predModel == 'knn'",
flowLayout(
strong("N. Neighbours"),
numericInput("knnK", label = NULL, min = 1, max = 1e2,
value = 3, step = 1)
)
)
)
),
hr()
),
############################################################################
## Main panel set
############################################################################
tabsetPanel(
id = "mainPanelset",
selected = "Tagging",
########################################################################
## Project panel
########################################################################
tabPanel(
"Project",
br(),
tabsetPanel(
id = "tabsetProject",
selected = "Sample",
## "Settings" tab: one row per setting, laid out as label (2) / input (4) /
## explanation (6). Most inputs are rendered server-side through uiOutput().
tabPanel(
"Settings",
br(),
actionButton("saveSystem", "Save changes"),
## Data location
h4("Files"),
fluidRow(
column(2, p("Data directory")),
column(4, uiOutput("sys_datadir")),
column(6, p("Place (on the server) where the data and project are stored"))
),
fluidRow(
column(2, p("Data filename")),
column(4, uiOutput("sys_datafile")),
column(6, p("Main file, containing id and text columns"))
),
## Mapping of dataset variables to their roles
h4("Variables"),
fluidRow(
column(2, p("ID")),
column(4, uiOutput("sys_var_id")),
column(6, p("Name of the id variable, unique identifier of each text"))
),
fluidRow(
column(2, p("Text")),
column(4, uiOutput("sys_var_text")),
column(6, p("Name of the text variables: if more than one, texts are concatenated in the specified order"))
),
fluidRow(
column(2, p("Tags")),
column(4, uiOutput("sys_var_tag")),
column(6, p("Names of scheme variables"))
),
fluidRow(
column(2, p("Comments")),
column(4, uiOutput("sys_var_comm_ui")),
column(6, p("Name of the comments variable"))
),
fluidRow(
column(2, p("Context")),
column(4, uiOutput("sys_var_context_ui")),
column(6, p("Names of variables not used in the models, but may be displayed during tagging"))
),
## Python backend switch; the python path input and its hint are only shown
## when the checkbox is ticked
h4("System"),
fluidRow(
column(2, checkboxInput("sys_use_python", "Python backend", FALSE)),
column(4, conditionalPanel(
"input.sys_use_python",
textInput("sys_which_python", NULL, value = "python3",
placeholder = "(custom python path)"))),
column(6, conditionalPanel(
"input.sys_use_python",
p("This must be a working python3 environment, with the required modules installed (see documentation)")))
),
## Everything below is only shown when the Python backend is enabled
conditionalPanel("input.sys_use_python", list(
## SpaCy tokenization: model name input and server-rendered "sys_spacyDlUI"
fluidRow(
column(2, checkboxInput("sys_use_spacy", "SpaCy tokenization", FALSE)),
column(4, conditionalPanel("input.sys_use_spacy", textInput(
"sys_use_spacy_model", NULL, NULL, placeholder = "(spacy model name)"))),
column(6, p("Name of the spacy tokenizer model, used in DTM and word embeddings"))
),
conditionalPanel("input.sys_use_spacy", fluidRow(
column(2),
column(9, uiOutput("sys_spacyDlUI")))
),
## FastText embeddings: model path input and server-rendered "sys_ftDlUI"
fluidRow(
column(2, checkboxInput("sys_use_ft", "FastText word embeddings", FALSE)),
column(4, conditionalPanel("input.sys_use_ft", textInput(
"sys_use_ft_model", NULL, NULL, placeholder = "(fasttext model path)"))),
column(6, p("Path to the local fasttext model binary"))
),
conditionalPanel("input.sys_use_ft", fluidRow(
column(2),
column(9, uiOutput("sys_ftDlUI")))
),
## SBERT sentence embeddings: model name/path input
fluidRow(
column(2, checkboxInput("sys_use_sb", "SBERT sentence embeddings", FALSE)),
column(4, conditionalPanel("input.sys_use_sb", textInput(
"sys_use_sb_model", NULL, NULL,
placeholder = "(custom sentence_transformers model)"))),
column(6, p("(GPU recommended) Name or path of the sentence-transformers model"))
),
## NOTE(review): this panel repeats the condition of the enclosing
## conditionalPanel ("input.sys_use_python") -- looks redundant; confirm.
conditionalPanel("input.sys_use_python", list(
checkboxInput("sys_use_gpu", "GPU support (CUDA, for SBERT and BERT)", FALSE),
br(),
## Model picker: language choice plus recommended models and download
## instructions, all rendered server-side
wellPanel(
h4("Model picker"),
fluidRow(
column(2, p("Language")),
column(4, uiOutput("sys_ex_lang_ui")),
column(6, p("Used to preset tokenization and embedding models"))
),
fluidRow(
column(2),
column(4, strong("Recommended model")),
column(6, strong("Download instructions"))
),
fluidRow(
column(2, p("SpaCy tokenization")),
column(4, uiOutput("sys_ex_spacy")),
column(6, uiOutput("sys_ex_spacy_dl"))
),
fluidRow(
column(2, p("FastText word embeddings")),
column(4, uiOutput("sys_ex_ft")),
column(6, uiOutput("sys_ex_ft_dl"))
),
fluidRow(
column(2, p("SBERT sentence embeddings")),
column(4, uiOutput("sys_ex_sb")),
column(6, p("(Auto download by python module)"))
)
)
))
))
),
## "Sample" tab: import controls (button, number of rows, rows to skip) on the
## left; the server-rendered "dataMessage" and "panelData" outputs on the right.
tabPanel(
  "Sample",
  br(),
  fluidRow(
    column(
      width = 4,
      wellPanel(
        fluidRow(
          column(width = 8, h4("Sample")),
          column(width = 4,
                 actionButton("dataImport", label = "Import", width = "100%"))
        ),
        fluidRow(
          column(width = 6,
                 numericInput("dataNrows", label = "N. rows",
                              value = 500, min = 10, max = 1e4, step = 1)),
          column(width = 6,
                 numericInput("dataSkipRows", label = "Skip rows",
                              value = 0, min = 0, step = 1))
        )
      )
    ),
    column(width = 8, uiOutput("dataMessage"), uiOutput("panelData"))
  )
),
## "Scheme" tab: current-scheme box (delete button, scheme display, save
## button, free-text description) on the left; the server-rendered
## "panelRetag" output on the right.
tabPanel(
"Scheme",
br(),
fluidRow(
# column(4, uiOutput("panelScheme")),
column(
4,
wellPanel(
h4("Current scheme"),
## Buttons are wrapped in <div title=...> to give them hover text
fluidRow(
column(2, HTML(paste0(
"<div title='Delete scheme'>",
actionButton("schemeDelete", "🗑", width = "100%"),
"</div>"))),
column(6, uiOutput("printScheme")),
column(4, HTML(paste0(
"<div title='Save scheme description'>",
actionButton("schemeDescrSave", "Save", width = "100%"),
"</div>")))
),
br(),
textAreaInput("schemeDescr", NULL, width = "100%", rows = 10,
placeholder = "Write scheme description here"),
hr()
)
),
column(8, uiOutput("panelRetag"))
)
)
)
),
########################################################################
## Text / visualization panel
########################################################################
## "Tagging" tab: tagging controls in the left column (3/12), text and/or
## visualization in the right column (9/12).
tabPanel(
"Tagging",
fluidRow(
column(
3,
br(),
## Regex filter with a "Case" checkbox ("regexCaseSens")
fluidRow(
column(8, textInput("regexFilter", label = NULL,
placeholder = "(Regex filter)")),
column(4, checkboxInput("regexCaseSens", "Case"))),
wellPanel(
## Tagging buttons
fluidRow(
column(8, textInput("newLab", label = NULL,
placeholder = "(New label)")),
column(4, actionButton("currentAction", "Create"))
),
# fluidRow(uiOutput("oracleButtons")),
## Server-rendered tag buttons, free-text comment, and confirmation UI
uiOutput("oracleButtons"),
br(),
textInput("currentComment", NULL, "", width = "100%",
placeholder = "(Comment)"),
br(),
uiOutput("makeOracleConfirm")
),
# fluidRow(
# column(6, checkboxInput("showContext", "Context")),
# column(6, actionButton("oops", strong("Oops")))
# ),
## Context output ("currentContext") is only shown when the checkbox is ticked
checkboxInput("showContext", "Context"),
conditionalPanel("input.showContext", htmlOutput("currentContext"))
),
column(
9,
## Toggles for the text and visualization panels (text is on by default)
fluidRow(
column(2, checkboxInput("panelText", "Text", TRUE)),
column(2, checkboxInput("panelVisu", "Visualization", FALSE),
offset = 8)
),
uiOutput("textVisuCols") # Handled in server.R for adaptive columns
)
)
),
########################################################################
## History panel
########################################################################
## "History" tab: a "Save changes" button above the "histDTable" data table.
tabPanel(
  title = "History",
  br(),
  actionButton(inputId = "histSave", label = "Save changes"),
  br(), br(),
  DT::dataTableOutput(outputId = "histDTable")
),
########################################################################
## Stats panel
########################################################################
## "Stats" tab: tag counts table on the left; 10-fold CV trigger button with
## its text and table outputs on the right; "Gold Standard" heading below.
tabPanel(
  title = "Stats",
  br(),
  fluidRow(
    column(width = 3,
           h3("Counts"),
           tableOutput("statsTagTable")),
    column(width = 9,
           h3("10-CV diagnostics"),
           actionButton("statsCVgo", label = "Compute 10-CV"),
           br(),
           verbatimTextOutput("statsCVoutput"),
           DT::dataTableOutput("statsCVtable"))
  ),
  hr(),
  h3("Gold Standard")
),
########################################################################
## BERT panel
########################################################################
## "BERT" tab: train/save controls and saved-model list on the left (3/12);
## hyperparameter options, messages, validation plot and statistics on the
## right (9/12).
tabPanel(
  "BERT",
  fluidRow(
    ## Left column: training and saving
    column(
      width = 3,
      br(),
      h3("Train new BERT"),
      fluidRow(
        column(width = 6,
               actionButton("bertTrain", label = "Train BERT", width = "100%")),
        column(width = 6,
               checkboxInput("bertOptions", label = "Options"))
      ),
      fluidRow(
        column(width = 6,
               textInput("bertSaveName", label = NULL,
                         placeholder = "(save name)")),
        column(width = 6,
               actionButton("bertSave", label = "Save", width = "100%"))
      ),
      actionLink("bertLast", label = "Last trained model"),
      h3("Saved models"),
      uiOutput("bertSaved")
    ),
    ## Right column: options (only shown when "bertOptions" is ticked) and outputs
    column(
      width = 9,
      br(),
      conditionalPanel(
        condition = "input.bertOptions",
        fluidRow(
          column(
            width = 6,
            selectInput(
              "bertModel",
              label = "Model",
              choices = c(
                "(Fr) CamemBERT-base" = "camembert/camembert-base",
                "(Fr) CamemBERT-large" = "camembert/camembert-large",
                "(Fr) FlauBERT-small" = "flaubert/flaubert_small_cased",
                "(Fr) FlauBERT-base" = "flaubert/flaubert_base_cased",
                "(Fr) FlauBERT-large" = "flaubert/flaubert_large_cased",
                "(En) DistilBERT-base" = "distilbert-base-cased",
                "(En) RoBERTa-base" = "roberta-base",
                "(En) DeBERTa-base" = "microsoft/deberta-base",
                "(Multi) DistilmBERT-base" = "distilbert-base-multilingual-cased",
                "(Multi) MiniLM" = "microsoft/Multilingual-MiniLM-L12-H384",
                "(Multi) XLM-RoBERTa-base" = "xlm-roberta-base"
              )
            )
          ),
          column(width = 6)
        ),
        fluidRow(
          column(width = 3,
                 numericInput("bertEpochs", label = "Epochs",
                              value = 3, min = 1, max = 20, step = 1)),
          column(width = 3,
                 numericInput("bertLrate", label = "Learning rate",
                              value = 2e-5, min = 1e-6, max = 1, step = 1e-6)),
          column(width = 3,
                 numericInput("bertWdecay", label = "Weight decay",
                              value = 0.01, min = 0, max = 10, step = 1e-6)),
          column(width = 3)
        ),
        fluidRow(
          column(width = 3,
                 numericInput("bertBatchsize", label = "Batch size",
                              value = 4, min = 1, max = 32, step = 1)),
          column(width = 3,
                 numericInput("bertGradacc", label = "Gradient accum.",
                              value = 4, min = 1, max = 32, step = 1)),
          column(width = 3, br(),
                 checkboxInput("bertAdapt", label = "Adapt token length to batch",
                               value = TRUE)),
          column(width = 3)
        ),
        fluidRow(
          column(width = 3,
                 numericInput("bertValidFrac", label = "Validation fraction",
                              value = .2, min = 0, max = .9)),
          column(width = 3,
                 numericInput("bertValidSeed", label = "Validation seed",
                              value = 1234, min = 1, max = 9e8)),
          column(width = 3,
                 numericInput("bertNeval", label = "N. validation evals",
                              value = 10, min = 1, max = 100, step = 1)),
          column(width = 3, br(),
                 checkboxInput("bertBest", label = "Keep best", value = TRUE))
        ),
        fluidRow(
          column(width = 3,
                 numericInput("bertMinOccur", label = "Min. class occurences",
                              value = 1, min = 1, max = 1e4, step = 1)),
          column(width = 3, br(),
                 checkboxInput("bertBalance", label = "Balance classes",
                               value = FALSE)),
          column(width = 3),
          column(width = 3)
        )
      ),
      fluidRow(
        column(
          width = 6,
          # flowLayout(
          #   actionButton(
          #     "bertGoPred", "Infer on current data", width = "100%"),
          #   actionButton(
          #     "bertDelete", "Delete saved model", width = "100%")),
          verbatimTextOutput("bertMsg")
        ),
        column(width = 6, plotOutput("bertValPlot", height = 200))
      ),
      verbatimTextOutput("bertMsgHyperpar"),
      DT::dataTableOutput("bertValstats")
    )
  )
),
########################################################################
## Export panel
########################################################################
## "Export" tab: three download sections (tagged data, embeddings, BERT
## predictions), each with a content/model selector and a file-format
## selector, followed by a "Export BERT models" heading.
tabPanel(
  "Export",

  ## Tags, comments and predictions
  h4("Export tagged data"),
  p("Download the tags and predicted probabilities from the complete model, on the current data sample."),
  # downloadButton("downloadCsv", "Save csv"),
  flowLayout(
    selectInput("dlTagSelect", label = NULL,
                choices = c("tags", "comments", "predictions"),
                selected = c("tags", "comments", "predictions"),
                multiple = TRUE),
    selectInput("dlTagFormat", label = NULL,
                choices = c("csv", "feather"), selected = "csv"),
    downloadButton("dlTagSave", label = NULL, title = "Save tags")
  ),
  hr(),

  ## Embeddings
  h4("Export embeddings"),
  p("Download the embeddings (incl. from visualization if present), on the current data sample."),
  flowLayout(
    selectInput("dlEmbedSelect", label = NULL,
                choices = c("FastText" = "ft", "SBERT" = "sb"),
                selected = "sb", multiple = TRUE),
    selectInput("dlEmbedFormat", label = NULL,
                choices = c("csv", "feather"), selected = "feather"),
    downloadButton("dlEmbedSave", label = NULL, title = "Save embeddings")
  ),
  hr(),

  ## BERT predictions: choices, message and download button come from the server
  h4("Export BERT predictions"),
  p("Download the predicted probabilities from the chosen BERT model, on the complete dataset."),
  flowLayout(
    selectInput("dlBPSelect", label = NULL, choices = NULL, selected = NULL),
    selectInput("dlBPFormat", label = NULL,
                choices = c("csv", "feather"), selected = "feather"),
    actionButton("dlBPInfer", label = "Predict"),
    verbatimTextOutput("dlBPMsg"),
    uiOutput("dlBPDlButton")
  ),
  hr(),
  h4("Export BERT models")
)
),
br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(), br(),
br(), br(), br(), br(), br(), br()
))

Binary file not shown.

After

Width:  |  Height:  |  Size: 104 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 245 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

View File

@ -0,0 +1,13 @@
# Python packages to install with pip (one requirement per line).
argparse
datasets
fasttext
numpy
pandas
pyarrow
scikit-learn
torch
transformers[torch]
sentence_transformers
# The two pins below are the only version constraints in this list.
# NOTE(review): the reason is not recorded here (presumably a compatibility
# workaround) -- confirm before changing or removing them.
typing-inspect==0.8.0
typing_extensions==4.6.1
spacy

View File

@ -0,0 +1,3 @@
## R packages required by the application.
packages <- c("arrow", "class", "data.table", "DT", "foreign", "glmnet", "haven", "LiblineaR", "Matrix", "Metrics", "quanteda", "quanteda.textmodels", "ranger", "readODS", "readxl", "RJSONIO", "rlang", "Rtsne", "shiny", "SparseM", "stringi", "uwot", "future","htmlTable","ggplot2")

## Install only the packages that are not already present.
missing_pkgs <- setdiff(packages, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

## install.packages() only emits a warning when a package cannot be downloaded
## or built, so a failed install would otherwise go unnoticed (e.g. during an
## image build). Re-check the library and abort with the list of failures.
still_missing <- setdiff(packages, rownames(installed.packages()))
if (length(still_missing) > 0) {
  stop("Failed to install R package(s): ", paste(still_missing, collapse = ", "))
}

View File

@ -0,0 +1,6 @@
# Install the Python dependencies with pip.
# The PyPI project is "scikit-learn"; the "sklearn" name is a deprecated alias
# whose installation fails.
pip install argparse datasets fasttext numpy pandas pyarrow scikit-learn
# PyTorch wheels from the CUDA 11.8 wheel index.
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Quoted so the shell never treats the brackets as a glob pattern.
pip install 'transformers[torch]'
pip install sentence_transformers
# Pinned versions (NOTE(review): reason not recorded here -- confirm before changing).
pip install -U typing-inspect==0.8.0 typing_extensions==4.6.1
pip install spacy