#!/usr/bin/env python
# coding: utf-8

## Spacy tokenize texts
## Requires data file with columns id and text

import argparse
import re
from os.path import expanduser

import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import spacy


def main(args):
    """Tokenize the ``text`` column of a feather file with spaCy.

    Reads the feather file at ``args.data`` (expected columns: ``id`` and
    ``text``), tokenizes each text with the tokenizer of the spaCy model
    named by ``args.model``, and writes a feather file with columns ``id``
    and ``text_spa`` (space-joined token strings). The output path is the
    input path with the ``.feather`` suffix replaced by ``_spa.feather``.
    """
    print("Tokenizer: Importing data")
    datapath = expanduser(args.data)
    dat = feather.read_feather(datapath)

    outfile = re.sub(r"[.]feather$", "_spa.feather", datapath)
    # Guard: if the input path does not end in ".feather", the re.sub above
    # is a no-op and we would silently overwrite the input file.
    if outfile == datapath:
        outfile = datapath + "_spa.feather"

    print("Tokenizer: Loading model")
    spa = spacy.load(expanduser(args.model))

    print("Tokenizer: Tokenizing sentences")
    # Only the tokenizer is invoked (no tagger/parser), which keeps this fast.
    tok = [
        " ".join(str(token) for token in spa.tokenizer(text))
        for text in dat["text"]
    ]

    print("Tokenizer: Exporting")
    # Build the output frame directly from positional values. The original
    # pd.concat([dat["id"], pd.DataFrame(tok)], axis=1) aligns on index and
    # misorders/NaN-pads rows whenever the input index is not a default
    # RangeIndex; constructing from a dict avoids that entirely.
    out = pd.DataFrame({"id": dat["id"].to_numpy(), "text_spa": tok})
    feather.write_feather(out, outfile)
    print("Tokenizer: Done")


if __name__ == "__main__":
    # Command-line entry point: parse arguments and hand off to main().
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", help="Model name", default="fr_core_news_sm")
    parser.add_argument("-d", "--data", help="Path to data (feather)")
    main(parser.parse_args())