42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
|
#!/usr/bin/env python
|
||
|
# coding: utf-8
|
||
|
|
||
|
## Embed sentences with FastText.
## Requires a Feather data file with columns `id` and `text`.
## For an input `<name>.feather`, the embeddings are written to `<name>_ft.feather`.
|
||
|
|
||
|
import argparse
|
||
|
import fasttext
|
||
|
from os.path import expanduser
|
||
|
import pandas as pd
|
||
|
import pyarrow as pa
|
||
|
import pyarrow.feather as feather
|
||
|
import re
|
||
|
|
||
|
|
||
|
def main(args):
    """Embed every row of a Feather table with a FastText model.

    Reads the table at ``args.data``, computes one sentence vector per entry
    of its ``text`` column with the model at ``args.model``, and writes a new
    Feather file holding the ``id`` column followed by one column per
    embedding dimension, named ``ft001``, ``ft002``, ...

    The output is written next to the input: ``<name>.feather`` becomes
    ``<name>_ft.feather``. If the input path does not end in ``.feather``,
    ``_ft.feather`` is appended instead, so the input file is never
    overwritten by the output.

    Args:
        args: Parsed command-line namespace with attributes ``data`` (path to
            the input Feather file) and ``model`` (path to the FastText
            model). A leading ``~`` is expanded in both paths.
    """
    print("FastText: Importing data")
    datapath = expanduser(args.data)
    dat = feather.read_feather(datapath)

    # Derive the output path. When the suffix is missing, the substitution is
    # a no-op and would make the output path equal to the input path.
    outfile, n_replaced = re.subn(r"[.]feather$", "_ft.feather", datapath)
    if n_replaced == 0:
        outfile = datapath + "_ft.feather"

    print("FastText: Loading model")
    ft = fasttext.load_model(expanduser(args.model))

    print("FastText: Embedding sentences")
    # Newlines are flattened to spaces before embedding; as far as we know
    # the FastText Python API refuses text that contains a newline.
    emb = [ft.get_sentence_vector(x.replace("\n", " ")) for x in dat["text"]]

    print("FastText: Exporting")
    emb = pd.DataFrame(emb)
    emb.columns = ["ft%03d" % i for i in range(1, len(emb.columns) + 1)]
    # concat(axis=1) aligns on index labels. Reset the id index so rows are
    # paired by position even if the loaded table has a non-default index.
    ids = dat["id"].reset_index(drop=True)
    emb = pd.concat([ids, emb], axis=1)
    feather.write_feather(emb, outfile)
    print("FastText: Done")
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Command-line entry point: parse the model and data paths, then run the
    # embedding pipeline.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-m",
        "--model",
        help="Model path",
        default="/data/user/b/jboelaert/cc.fr.100.bin",
    )
    # --data has no sensible default; requiring it yields a clear usage error
    # instead of a TypeError later when the missing path is expanded.
    arg_parser.add_argument(
        "-d",
        "--data",
        help="Path to data (feather)",
        required=True,
    )
    args = arg_parser.parse_args()
    main(args)
|
||
|
|