From ada5bc2674f6044ca9dfa0a5e1dc18eb6390e5c5 Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Sun, 2 Oct 2022 00:14:40 -0700
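Subject: Fix tokenize_files document splitting: split on blank lines with a regex instead of a literal r"\n\n", and drop the debug print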

---
 stanza/models/tokenization/tokenize_files.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/stanza/models/tokenization/tokenize_files.py b/stanza/models/tokenization/tokenize_files.py
index 7bf421b8..0f5eac4a 100644
--- a/stanza/models/tokenization/tokenize_files.py
+++ b/stanza/models/tokenization/tokenize_files.py
@@ -7,6 +7,7 @@ For example, this kind of input is suitable for Glove
 
 import argparse
 import os
+import re
 
 import torch
 
@@ -18,11 +19,12 @@ from stanza.pipeline.tokenize_processor import TokenizeProcessor
 
 tqdm = get_tqdm()
 
+NEWLINE_SPLIT_RE = re.compile(r"\n\s*\n")
+
 def tokenize_to_file(tokenizer, fin, fout):
     # TODO: split text?  this could be kinda long
     raw_text = fin.read()
-    documents = raw_text.split(r"\n\n")
-    print("Number of docs: %d" % len(documents))
+    documents = NEWLINE_SPLIT_RE.split(raw_text)
     in_docs = [stanza.Document([], text=d) for d in documents]
     out_docs = tokenizer.bulk_process(in_docs)
     for document in out_docs:
-- 
cgit v1.2.3