Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Bauer <horatio@gmail.com>2022-10-02 10:14:40 +0300
committerJohn Bauer <horatio@gmail.com>2022-10-02 10:14:40 +0300
commitada5bc2674f6044ca9dfa0a5e1dc18eb6390e5c5 (patch)
treefd855df986e5d7aeb1036d19aba7dd0b013311cf
parent31a413a0fd884c0a133863f9b454edac1997af32 (diff)
oopstoken
-rw-r--r--stanza/models/tokenization/tokenize_files.py6
1 file changed, 4 insertions, 2 deletions
diff --git a/stanza/models/tokenization/tokenize_files.py b/stanza/models/tokenization/tokenize_files.py
index 7bf421b8..0f5eac4a 100644
--- a/stanza/models/tokenization/tokenize_files.py
+++ b/stanza/models/tokenization/tokenize_files.py
@@ -7,6 +7,7 @@ For example, this kind of input is suitable for Glove
import argparse
import os
+import re
import torch
@@ -18,11 +19,12 @@ from stanza.pipeline.tokenize_processor import TokenizeProcessor
tqdm = get_tqdm()
+NEWLINE_SPLIT_RE = re.compile(r"\n\s*\n")
+
def tokenize_to_file(tokenizer, fin, fout):
# TODO: split text? this could be kinda long
raw_text = fin.read()
- documents = raw_text.split(r"\n\n")
- print("Number of docs: %d" % len(documents))
+ documents = NEWLINE_SPLIT_RE.split(raw_text)
in_docs = [stanza.Document([], text=d) for d in documents]
out_docs = tokenizer.bulk_process(in_docs)
for document in out_docs: