#!/usr/bin/env python
# coding: utf-8

"""Tokenize texts with spaCy.

Requires a feather data file with columns ``id`` and ``text``; writes a
feather file with columns ``id`` and ``text_spa`` next to the input.
"""

import argparse
import re
from os.path import expanduser

import pandas as pd
import pyarrow.feather as feather
import spacy


def _output_path(datapath):
    """Derive the output path by replacing a trailing ``.feather`` with ``_spa.feather``.

    If *datapath* does not end in ``.feather``, append ``_spa.feather`` instead,
    so the script never silently overwrites its input file.
    """
    outfile = re.sub(r"[.]feather$", "_spa.feather", datapath)
    if outfile == datapath:
        # No suffix matched -> would clobber the input; append instead.
        outfile = datapath + "_spa.feather"
    return outfile


def main(args):
    """Read the data file, tokenize the ``text`` column, export ``id``/``text_spa``.

    args.data  -- path to the input feather file (columns ``id`` and ``text``)
    args.model -- spaCy model name or path passed to ``spacy.load``
    """
    print("Tokenizer: Importing data")
    datapath = expanduser(args.data)
    dat = feather.read_feather(datapath)
    outfile = _output_path(datapath)

    print("Tokenizer: Loading model")
    spa = spacy.load(expanduser(args.model))

    print("Tokenizer: Tokenizing sentences")
    # Only the tokenizer is invoked (no tagger/parser); each row becomes a
    # single space-joined string of its tokens.
    tok = [
        " ".join(str(token) for token in spa.tokenizer(text))
        for text in dat["text"]
    ]

    print("Tokenizer: Exporting")
    # Build the output frame from raw values rather than pd.concat(axis=1):
    # concat aligns on index and would misalign id/text_spa pairs whenever
    # the input frame carries a non-default index.
    out = pd.DataFrame({"id": dat["id"].to_numpy(), "text_spa": tok})
    feather.write_feather(out, outfile)
    print("Tokenizer: Done")


if __name__ == "__main__":
    argParser = argparse.ArgumentParser()
    argParser.add_argument(
        "-m", "--model", help="Model name", default="fr_core_news_sm"
    )
    argParser.add_argument("-d", "--data", help="Path to data (feather)")
    args = argParser.parse_args()
    main(args)