datalab/docker-images-datalab/myactivetigger/activetigger/embed_fasttext.py

42 lines
1.2 KiB
Python
Raw Normal View History

2024-03-06 15:54:50 +01:00
#!/usr/bin/env python
# coding: utf-8
## Embed sentences with a FastText model
## Requires a feather data file with columns `id` and `text`;
## writes the embeddings to a sibling `*_ft.feather` file
import argparse
import fasttext
from os.path import expanduser
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import re
def main(args):
    """Embed every row of a feather dataset with a FastText model.

    Reads the feather file at ``args.data``, computes one FastText sentence
    vector per entry of its ``text`` column using the model at
    ``args.model``, and writes a new feather file containing the ``id``
    column followed by the embedding columns ``ft001``, ``ft002``, ...

    The output is written next to the input: ``<name>.feather`` becomes
    ``<name>_ft.feather``. If the input path does not end in ``.feather``,
    ``_ft.feather`` is appended instead, so the input file is never
    overwritten.

    Args:
        args: Object with string attributes ``data`` (path to the input
            feather file) and ``model`` (path to the FastText model).
            A leading ``~`` is expanded in both.
    """
    print("FastText: Importing data")
    datapath = expanduser(args.data)
    dat = feather.read_feather(datapath)

    # Derive the output path. When the suffix is not ".feather" the
    # substitution is a no-op and would make the output path equal to the
    # input path, so fall back to appending the suffix.
    outfile, n_subs = re.subn("[.]feather$", "_ft.feather", datapath)
    if n_subs == 0:
        outfile = datapath + "_ft.feather"

    print("FastText: Loading model")
    ft = fasttext.load_model(expanduser(args.model))

    print("FastText: Embedding sentences")
    # Line breaks are flattened to spaces so each text is embedded as a
    # single line.
    emb = [ft.get_sentence_vector(text.replace("\n", " ")) for text in dat["text"]]

    print("FastText: Exporting")
    emb = pd.DataFrame(emb)
    emb.columns = ["ft%03d" % (x + 1) for x in range(len(emb.columns))]
    # concat(axis=1) aligns on index labels; `emb` has a fresh 0..n-1 index,
    # so give the ids the same positional index to keep rows paired even if
    # the input frame was stored with a non-default index.
    ids = dat["id"].reset_index(drop=True)
    emb = pd.concat([ids, emb], axis=1)
    feather.write_feather(emb, outfile)
    print("FastText: Done")
if __name__ == "__main__":
    # Command-line entry point: collect the model and data paths, then run
    # the embedding.
    argParser = argparse.ArgumentParser()
    argParser.add_argument(
        "-m",
        "--model",
        help="Model path",
        default="/data/user/b/jboelaert/cc.fr.100.bin",
    )
    # --data has no sensible default; requiring it makes argparse print a
    # usage error instead of main() failing later on a None path.
    argParser.add_argument(
        "-d",
        "--data",
        help="Path to data (feather)",
        required=True,
    )
    args = argParser.parse_args()
    main(args)