github.com/stanfordnlp/stanza.git
author     John Bauer <horatio@gmail.com>  2022-11-03 10:40:11 +0300
committer  John Bauer <horatio@gmail.com>  2022-11-03 10:40:11 +0300
commit     ad8358e37c6ba467760258c165c7d61be5de28a9 (patch)
tree       9787a10810f89a3f0cbd094e73271ae10321fa13
parent     8bda5dfb52fdbc84a86f6937e06c22c6f5206bed (diff)
Add min_len and max_len args to tokenize_wiki.py. Skip one-line wiki docs, since those are likely to be useless.
-rw-r--r--  stanza/utils/datasets/constituency/selftrain_wiki.py  |  5
-rw-r--r--  stanza/utils/datasets/constituency/tokenize_wiki.py   | 30
2 files changed, 34 insertions(+), 1 deletion(-)
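
For context, a hedged usage sketch of the new options (assuming the script is run from the repository root and its remaining defaults, such as the wikipedia dump path, apply):

    # keep only sentences of 5 to 100 words (the new defaults, shown explicitly)
    python stanza/utils/datasets/constituency/tokenize_wiki.py --min_len 5 --max_len 100

    # lift the upper bound entirely
    python stanza/utils/datasets/constituency/tokenize_wiki.py --no_max_len
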
diff --git a/stanza/utils/datasets/constituency/selftrain_wiki.py b/stanza/utils/datasets/constituency/selftrain_wiki.py
index 01692da4..330c2b49 100644
--- a/stanza/utils/datasets/constituency/selftrain_wiki.py
+++ b/stanza/utils/datasets/constituency/selftrain_wiki.py
@@ -87,7 +87,10 @@ def read_wiki_file(filename):
             line = next(line_iterator, None)
         elif line.startswith("</doc"):
             if current_doc:
-                docs.append("\n\n".join(current_doc))
+                if len(current_doc) > 1:
+                    # many single-line documents are just links to related documents;
+                    # a single wikipedia dump can have tens of thousands of these useless near-duplicates
+                    docs.append("\n\n".join(current_doc))
                 current_doc = []
         else:
             # not the start or end of a doc
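
In effect, a document whose body is a single line is now dropped before it is joined into the output. A minimal sketch of the guard in isolation, with hypothetical document contents:

    current_doc = ["See also: related articles"]  # one-line doc, typically a link stub
    docs = []
    if len(current_doc) > 1:                      # the guard added by this patch
        docs.append("\n\n".join(current_doc))
    assert docs == []                             # the single-line doc is skipped
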
diff --git a/stanza/utils/datasets/constituency/tokenize_wiki.py b/stanza/utils/datasets/constituency/tokenize_wiki.py
index 3a3ef24e..feeda4b2 100644
--- a/stanza/utils/datasets/constituency/tokenize_wiki.py
+++ b/stanza/utils/datasets/constituency/tokenize_wiki.py
@@ -39,6 +39,32 @@ def parse_args():
         default='extern_data/vietnamese/wikipedia/text/AA',
         help='Path to the wikipedia dump after processing by wikiextractor'
     )
+    parser.add_argument(
+        '--min_len',
+        default=5,
+        type=int,
+        help='Minimum length sentence to keep. None = unlimited'
+    )
+    parser.add_argument(
+        '--no_min_len',
+        dest='min_len',
+        action='store_const',
+        const=None,
+        help='No minimum length'
+    )
+    parser.add_argument(
+        '--max_len',
+        default=100,
+        type=int,
+        help='Maximum length sentence to keep. None = unlimited'
+    )
+    parser.add_argument(
+        '--no_max_len',
+        dest='max_len',
+        action='store_const',
+        const=None,
+        help='No maximum length'
+    )
     args = parser.parse_args()
     return args
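
The --no_min_len / --no_max_len flags use argparse's store_const action to overwrite the integer default with None. A minimal standalone sketch of that pattern, using plain argparse (nothing here is specific to stanza):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--min_len', default=5, type=int)
    parser.add_argument('--no_min_len', dest='min_len', action='store_const', const=None)

    print(parser.parse_args([]).min_len)                   # 5    (default)
    print(parser.parse_args(['--min_len', '10']).min_len)  # 10   (explicit value)
    print(parser.parse_args(['--no_min_len']).min_len)     # None (limit disabled)

Because the filter in main() below tests truthiness (args.min_len and ...), the None produced by these flags simply switches the corresponding bound off.
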
@@ -57,6 +83,10 @@ def main():
     for doc in docs:
         for sentence in doc.sentences:
+            if args.min_len and len(sentence.words) < args.min_len:
+                continue
+            if args.max_len and len(sentence.words) > args.max_len:
+                continue
             text = sentence.text
             if (text.find("|") >= 0 or text.find("_") >= 0 or
                     text.find("<") >= 0 or text.find(">") >= 0 or