diff options
author | John Bauer <horatio@gmail.com> | 2022-10-28 10:54:25 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-10-28 10:54:25 +0300 |
commit | ba5c8c7fa3f001a034f7095468a10c0f920c2dac (patch) | |
tree | 3a613ecafca7c685b1a6ae2b97116b5dd1adcb19 | |
parent | bf4204f948dd8db716acfa000e20c8576abd6734 (diff) |
Fix bug in the lt/gt finding: a "<" or ">" can start a line (index 0), which the old `> 0` check missed. Use FoundationCache to save on memory.
-rw-r--r-- | stanza/utils/datasets/constituency/selftrain_wiki.py | 8 |
1 file changed, 5 insertions, 3 deletions
diff --git a/stanza/utils/datasets/constituency/selftrain_wiki.py b/stanza/utils/datasets/constituency/selftrain_wiki.py index f091780d..6c2604fe 100644 --- a/stanza/utils/datasets/constituency/selftrain_wiki.py +++ b/stanza/utils/datasets/constituency/selftrain_wiki.py @@ -17,6 +17,7 @@ import os import random from stanza.models.common import utils +from stanza.models.common.foundation_cache import FoundationCache from stanza.utils.datasets.constituency import selftrain tqdm = utils.get_tqdm() @@ -91,7 +92,7 @@ def read_wiki_file(filename): line = line.replace("()", " ") line = line.replace("( )", " ") line = line.strip() - if line.find("<") > 0 or line.find(">") > 0: + if line.find("<") >= 0 or line.find(">") >= 0: line = "" if line: current_doc.append(line) @@ -110,8 +111,9 @@ def main(): if args.shuffle: random.shuffle(wiki_files) - tag_pipe = selftrain.build_tag_pipe(ssplit=True, lang=args.lang) - parser_pipes = selftrain.build_parser_pipes(args.lang, args.models) + foundation_cache = FoundationCache() + tag_pipe = selftrain.build_tag_pipe(ssplit=True, lang=args.lang, foundation_cache=foundation_cache) + parser_pipes = selftrain.build_parser_pipes(args.lang, args.models, foundation_cache=foundation_cache) # create a blank file. we will append to this file so that partial results can be used with open(args.output_file, "w") as fout: |