Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: John Bauer <horatio@gmail.com> 2022-10-28 10:54:25 +0300
committer: John Bauer <horatio@gmail.com> 2022-10-28 10:54:25 +0300
commit: ba5c8c7fa3f001a034f7095468a10c0f920c2dac (patch)
tree: 3a613ecafca7c685b1a6ae2b97116b5dd1adcb19
parent: bf4204f948dd8db716acfa000e20c8576abd6734 (diff)
Fix bug in the "&lt;" / "&gt;" detection: the entity can start a line, in which case find() returns 0 and the old "> 0" test missed it. Use FoundationCache to save on memory.
-rw-r--r-- stanza/utils/datasets/constituency/selftrain_wiki.py 8
1 file changed, 5 insertions, 3 deletions
diff --git a/stanza/utils/datasets/constituency/selftrain_wiki.py b/stanza/utils/datasets/constituency/selftrain_wiki.py
index f091780d..6c2604fe 100644
--- a/stanza/utils/datasets/constituency/selftrain_wiki.py
+++ b/stanza/utils/datasets/constituency/selftrain_wiki.py
@@ -17,6 +17,7 @@ import os
import random
from stanza.models.common import utils
+from stanza.models.common.foundation_cache import FoundationCache
from stanza.utils.datasets.constituency import selftrain
tqdm = utils.get_tqdm()
@@ -91,7 +92,7 @@ def read_wiki_file(filename):
line = line.replace("()", " ")
line = line.replace("( )", " ")
line = line.strip()
- if line.find("&lt;") > 0 or line.find("&gt;") > 0:
+ if line.find("&lt;") >= 0 or line.find("&gt;") >= 0:
line = ""
if line:
current_doc.append(line)
@@ -110,8 +111,9 @@ def main():
if args.shuffle:
random.shuffle(wiki_files)
- tag_pipe = selftrain.build_tag_pipe(ssplit=True, lang=args.lang)
- parser_pipes = selftrain.build_parser_pipes(args.lang, args.models)
+ foundation_cache = FoundationCache()
+ tag_pipe = selftrain.build_tag_pipe(ssplit=True, lang=args.lang, foundation_cache=foundation_cache)
+ parser_pipes = selftrain.build_parser_pipes(args.lang, args.models, foundation_cache=foundation_cache)
# create a blank file. we will append to this file so that partial results can be used
with open(args.output_file, "w") as fout: