# File: datalab/docker-images-datalab/myactivetigger/activetigger/tokenize_spacy.py
# (41 lines, 1.1 KiB, Python)

#!/usr/bin/env python
# coding: utf-8
## Spacy tokenize texts
## Requires data file with columns id and text
import argparse
from os.path import expanduser
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import spacy
import re
def main(args):
    """Tokenize the 'text' column of a feather file with a spaCy tokenizer.

    Reads the feather file at ``args.data`` (expected columns: ``id`` and
    ``text``), tokenizes each text with the tokenizer of the spaCy model
    ``args.model``, and writes a feather file next to the input (suffix
    ``_spa.feather``) with columns ``id`` and ``text_spa`` (tokens joined
    by single spaces).

    Parameters
    ----------
    args : argparse.Namespace
        Needs ``data`` (path to input feather) and ``model`` (spaCy model
        name or path).
    """
    print("Tokenizer: Importing data")
    datapath = expanduser(args.data)
    dat = feather.read_feather(datapath)
    # Derive the output path by swapping the trailing ".feather" suffix.
    outfile = re.sub("[.]feather$", "_spa.feather", datapath)
    print("Tokenizer: Loading model")
    spa = spacy.load(expanduser(args.model))
    print("Tokenizer: Tokenizing sentences")
    # Only the tokenizer is used (no tagger/parser), so call it directly.
    tok = [" ".join(str(token) for token in spa.tokenizer(text))
           for text in dat["text"]]
    print("Tokenizer: Exporting")
    # Build the output frame positionally. The previous
    # pd.concat([...], axis=1) aligned on the index, which silently
    # misaligns rows (or yields NaN ids) whenever the input frame does
    # not carry a default RangeIndex (e.g. after upstream filtering).
    out = pd.DataFrame({"id": dat["id"].to_numpy(), "text_spa": tok})
    feather.write_feather(out, outfile)
    print("Tokenizer: Done")
if __name__ == "__main__":
    # CLI entry point:
    #   -m/--model  spaCy model name or path (default: fr_core_news_sm)
    #   -d/--data   path to the input feather file (required — without
    #               required=True a missing -d would only surface later
    #               as TypeError inside expanduser(None))
    argParser = argparse.ArgumentParser(
        description="Tokenize the 'text' column of a feather file with spaCy."
    )
    argParser.add_argument("-m", "--model", help="Model name",
                           default="fr_core_news_sm")
    argParser.add_argument("-d", "--data", help="Path to data (feather)",
                           required=True)
    args = argParser.parse_args()
    main(args)