#!/usr/bin/env python
# coding: utf-8

## FastText embed sentences
## Requires data file with columns id and text

import argparse
import fasttext
from os.path import expanduser
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import re


def main(args):
    """Embed every row of a feather file with a FastText model.

    Reads the feather file at ``args.data`` (columns ``id`` and ``text``),
    computes one FastText sentence vector per ``text`` value, and writes a
    new feather file holding the ``id`` column followed by one column per
    embedding dimension, named ``ft001``, ``ft002``, ...

    The output is written next to the input: a trailing ``.feather`` is
    replaced by ``_ft.feather``; for any other file name ``_ft.feather``
    is appended.

    Args:
        args: Parsed command-line namespace with attributes ``data`` (path
            to the input feather file) and ``model`` (path to the FastText
            binary model). ``~`` is expanded in both paths.
    """
    print("FastText: Importing data")
    datapath = expanduser(args.data)
    dat = feather.read_feather(datapath)

    # If the input does not end in ".feather" the substitution is a no-op and
    # the output path would equal the input path, overwriting the source
    # data. Append the suffix in that case instead.
    outfile, n_subs = re.subn(r"[.]feather$", "_ft.feather", datapath)
    if n_subs == 0:
        outfile = datapath + "_ft.feather"

    print("FastText: Loading model")
    ft = fasttext.load_model(expanduser(args.model))

    print("FastText: Embedding sentences")
    # Newlines are replaced by spaces before each text is handed to
    # get_sentence_vector.
    vectors = [
        ft.get_sentence_vector(text.replace("\n", " ")) for text in dat["text"]
    ]

    print("FastText: Exporting")
    emb = pd.DataFrame(vectors)
    emb.columns = [f"ft{i:03d}" for i in range(1, emb.shape[1] + 1)]
    # `emb` carries a fresh 0..n-1 index; reset the id column's index so the
    # column-wise concat pairs rows by position even when the feather file
    # restored a non-default index.
    ids = dat["id"].reset_index(drop=True)
    emb = pd.concat([ids, emb], axis=1)
    feather.write_feather(emb, outfile)

    print("FastText: Done")


if __name__ == "__main__":
    argParser = argparse.ArgumentParser()
    argParser.add_argument(
        "-m",
        "--model",
        help="Model path",
        default="/data/user/b/jboelaert/cc.fr.100.bin",
    )
    # Required: without it the script would fail later with an opaque
    # TypeError from expanduser(None).
    argParser.add_argument(
        "-d",
        "--data",
        help="Path to data (feather)",
        required=True,
    )
    args = argParser.parse_args()
    main(args)