datalab/docker-images-datalab/myactivetigger/activetigger/embed_fasttext.py

42 lines
1.2 KiB
Python

#!/usr/bin/env python
# coding: utf-8
## Embed sentences with FastText.
## Requires a feather data file with columns "id" and "text"; the embeddings
## are written next to it as <name>_ft.feather.
import argparse
import fasttext
from os.path import expanduser
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import re
def main(args: argparse.Namespace) -> None:
    """Embed every sentence of a feather file with a FastText model.

    Reads the feather file at ``args.data`` (columns ``id`` and ``text``),
    computes one FastText sentence vector per row with the model at
    ``args.model``, and writes the ``id`` column followed by the vector
    components (columns ``ft001``, ``ft002``, ...) to ``<name>_ft.feather``
    next to the input file.

    Args:
        args: Parsed command-line arguments providing the ``data`` and
            ``model`` paths; a leading ``~`` is expanded in both.

    Raises:
        ValueError: If the data path does not end in ``.feather``. The output
            name is derived from that suffix, so without it the output path
            would equal the input path and the input would be overwritten.
    """
    datapath = expanduser(args.data)

    # Derive the output path before doing any expensive work: when the suffix
    # is missing the substitution is a no-op and writing would clobber the
    # input data file.
    outfile = re.sub("[.]feather$", "_ft.feather", datapath)
    if outfile == datapath:
        raise ValueError(
            "Data path must end in '.feather' (got {!r}); refusing to "
            "overwrite the input file.".format(datapath)
        )

    print("FastText: Importing data")
    dat = feather.read_feather(datapath)

    print("FastText: Loading model")
    ft = fasttext.load_model(expanduser(args.model))

    print("FastText: Embedding sentences")
    # Newlines are flattened to spaces so each text is passed to the model as
    # a single line.
    emb = [ft.get_sentence_vector(x.replace("\n", " ")) for x in dat["text"]]

    print("FastText: Exporting")
    emb = pd.DataFrame(emb)
    emb.columns = ["ft%03d" % (x + 1) for x in range(len(emb.columns))]
    # `emb` carries a fresh 0..n-1 index; reset the id index as well so the
    # column-wise concat pairs rows by position even if the feather file
    # restored a non-default index.
    emb = pd.concat([dat["id"].reset_index(drop=True), emb], axis=1)
    feather.write_feather(emb, outfile)
    print("FastText: Done")
# Script entry point: parse the command line and run the embedding.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Embed the sentences of a feather file with FastText."
    )
    # NOTE(review): the default model path is specific to one user's
    # environment; pass --model explicitly when running elsewhere.
    parser.add_argument(
        "-m",
        "--model",
        help="Model path",
        default="/data/user/b/jboelaert/cc.fr.100.bin",
    )
    # Required: without a data path main() cannot do anything, and a missing
    # value would otherwise fail later with an opaque TypeError on None.
    parser.add_argument(
        "-d",
        "--data",
        help="Path to data (feather)",
        required=True,
    )
    args = parser.parse_args()
    main(args)