41 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			41 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
#!/usr/bin/env python
# coding: utf-8

## Spacy tokenize texts
## Requires data file with columns id and text

import argparse
from os.path import expanduser
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
import spacy
import re


| def main(args):
 | |
|     print("Tokenizer: Importing data")
 | |
|     datapath = expanduser(args.data)
 | |
|     dat = feather.read_feather(datapath)
 | |
|     outfile = re.sub("[.]feather$", "_spa.feather", datapath)
 | |
|     
 | |
|     print("Tokenizer: Loading model")
 | |
|     spa = spacy.load(expanduser(args.model))
 | |
|     print("Tokenizer: Tokenizing sentences")
 | |
|     tok = [" ".join([str(token) for token in spa.tokenizer(text)]) for text in dat["text"]]
 | |
|     
 | |
|     print("Tokenizer: Exporting")
 | |
|     tok = pd.concat([dat["id"], pd.DataFrame(tok)], axis=1)
 | |
|     tok.columns = ["id", "text_spa"]
 | |
|     feather.write_feather(tok, outfile)
 | |
|     print("Tokenizer: Done")
 | |
| 
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|     argParser = argparse.ArgumentParser()
 | |
|     argParser.add_argument("-m", "--model", help="Model name", default="fr_core_news_sm")
 | |
|     argParser.add_argument("-d", "--data", help="Path to data (feather)")
 | |
|     args = argParser.parse_args()
 | |
|     main(args)
 | |
| 
 |