diff options
author | John Bauer <horatio@gmail.com> | 2022-11-03 10:40:11 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-11-03 10:40:11 +0300 |
commit | ad8358e37c6ba467760258c165c7d61be5de28a9 (patch) | |
tree | 9787a10810f89a3f0cbd094e73271ae10321fa13 | |
parent | 8bda5dfb52fdbc84a86f6937e06c22c6f5206bed (diff) |
Add min_len and max_len args to tokenize_wiki.py. Skip one line wiki docs, since those are likely to be useless
-rw-r--r-- | stanza/utils/datasets/constituency/selftrain_wiki.py | 5 | ||||
-rw-r--r-- | stanza/utils/datasets/constituency/tokenize_wiki.py | 30 |
2 files changed, 34 insertions, 1 deletions
diff --git a/stanza/utils/datasets/constituency/selftrain_wiki.py b/stanza/utils/datasets/constituency/selftrain_wiki.py index 01692da4..330c2b49 100644 --- a/stanza/utils/datasets/constituency/selftrain_wiki.py +++ b/stanza/utils/datasets/constituency/selftrain_wiki.py @@ -87,7 +87,10 @@ def read_wiki_file(filename): line = next(line_iterator, None) elif line.startswith("</doc"): if current_doc: - docs.append("\n\n".join(current_doc)) + if len(current_doc) > 1: + # a lot of single line documents are links to related documents + # a single wikipedia can have tens of thousands of useless almost-duplicates + docs.append("\n\n".join(current_doc)) current_doc = [] else: # not the start or end of a doc diff --git a/stanza/utils/datasets/constituency/tokenize_wiki.py b/stanza/utils/datasets/constituency/tokenize_wiki.py index 3a3ef24e..feeda4b2 100644 --- a/stanza/utils/datasets/constituency/tokenize_wiki.py +++ b/stanza/utils/datasets/constituency/tokenize_wiki.py @@ -39,6 +39,32 @@ def parse_args(): default='extern_data/vietnamese/wikipedia/text/AA', help='Path to the wikipedia dump after processing by wikiextractor' ) + parser.add_argument( + '--min_len', + default=5, + type=int, + help='Minimum length sentence to keep. None = unlimited' + ) + parser.add_argument( + '--no_min_len', + dest='min_len', + action='store_const', + const=None, + help='No minimum length' + ) + parser.add_argument( + '--max_len', + default=100, + type=int, + help='Maximum length sentence to keep. None = unlimited' + ) + parser.add_argument( + '--no_max_len', + dest='max_len', + action='store_const', + const=None, + help='No maximum length' + ) args = parser.parse_args() return args @@ -57,6 +83,10 @@ def main(): for doc in docs: for sentence in doc.sentences: + if args.min_len and len(sentence.words) < args.min_len: + continue + if args.max_len and len(sentence.words) > args.max_len: + continue text = sentence.text if (text.find("|") >= 0 or text.find("_") >= 0 or text.find("<") >= 0 or text.find(">") >= 0 or |