From ada5bc2674f6044ca9dfa0a5e1dc18eb6390e5c5 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Sun, 2 Oct 2022 00:14:40 -0700 Subject: oops --- stanza/models/tokenization/tokenize_files.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/stanza/models/tokenization/tokenize_files.py b/stanza/models/tokenization/tokenize_files.py index 7bf421b8..0f5eac4a 100644 --- a/stanza/models/tokenization/tokenize_files.py +++ b/stanza/models/tokenization/tokenize_files.py @@ -7,6 +7,7 @@ For example, this kind of input is suitable for Glove import argparse import os +import re import torch @@ -18,11 +19,12 @@ from stanza.pipeline.tokenize_processor import TokenizeProcessor tqdm = get_tqdm() +NEWLINE_SPLIT_RE = re.compile(r"\n\s*\n") + def tokenize_to_file(tokenizer, fin, fout): # TODO: split text? this could be kinda long raw_text = fin.read() - documents = raw_text.split(r"\n\n") - print("Number of docs: %d" % len(documents)) + documents = NEWLINE_SPLIT_RE.split(raw_text) in_docs = [stanza.Document([], text=d) for d in documents] out_docs = tokenizer.bulk_process(in_docs) for document in out_docs: -- cgit v1.2.3