author     John Bauer <horatio@gmail.com>  2022-10-02 10:14:40 +0300
committer  John Bauer <horatio@gmail.com>  2022-10-02 10:14:40 +0300
commit     ada5bc2674f6044ca9dfa0a5e1dc18eb6390e5c5
tree       fd855df986e5d7aeb1036d19aba7dd0b013311cf
parent     31a413a0fd884c0a133863f9b454edac1997af32
oopstoken
 stanza/models/tokenization/tokenize_files.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/stanza/models/tokenization/tokenize_files.py b/stanza/models/tokenization/tokenize_files.py
index 7bf421b8..0f5eac4a 100644
--- a/stanza/models/tokenization/tokenize_files.py
+++ b/stanza/models/tokenization/tokenize_files.py
@@ -7,6 +7,7 @@ For example, this kind of input is suitable for Glove
 
 import argparse
 import os
+import re
 
 import torch
 
@@ -18,11 +19,12 @@ from stanza.pipeline.tokenize_processor import TokenizeProcessor
 
 tqdm = get_tqdm()
 
+NEWLINE_SPLIT_RE = re.compile(r"\n\s*\n")
+
 def tokenize_to_file(tokenizer, fin, fout):
     # TODO: split text? this could be kinda long
     raw_text = fin.read()
-    documents = raw_text.split(r"\n\n")
-    print("Number of docs: %d" % len(documents))
+    documents = NEWLINE_SPLIT_RE.split(raw_text)
     in_docs = [stanza.Document([], text=d) for d in documents]
     out_docs = tokenizer.bulk_process(in_docs)
     for document in out_docs:
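For context on the fix: r"\n\n" is a Python raw string, so the old str.split call searched for the literal four characters backslash-n-backslash-n rather than for two newline characters. Plain text files almost never contain that literal sequence, so the split never fired and the entire file was treated as a single document. The new code compiles \n\s*\n as a regex, which splits on real blank lines, including lines that contain only stray whitespace. A minimal sketch of the two behaviors (the sample text here is invented for illustration):

    import re

    # Two documents separated by a blank line, the layout this script expects.
    raw_text = "First doc line one.\nFirst doc line two.\n\nSecond doc."

    # Old behavior: r"\n\n" is the literal characters backslash, n,
    # backslash, n, so str.split finds no match and yields one "document".
    assert raw_text.split(r"\n\n") == [raw_text]

    # New behavior: the compiled pattern matches an actual blank line
    # (even one containing only whitespace) and yields two documents.
    NEWLINE_SPLIT_RE = re.compile(r"\n\s*\n")
    docs = NEWLINE_SPLIT_RE.split(raw_text)
    assert docs == ["First doc line one.\nFirst doc line two.", "Second doc."]

Using \n\s*\n rather than a correctly escaped "\n\n" also tolerates blank lines that carry trailing spaces or tabs, which a plain two-newline split would miss. Removing the print of the document count is an incidental cleanup of leftover debug output.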