diff options
author | John Bauer <horatio@gmail.com> | 2022-11-03 10:40:11 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-11-03 10:40:11 +0300 |
commit | ad8358e37c6ba467760258c165c7d61be5de28a9 (patch) | |
tree | 9787a10810f89a3f0cbd094e73271ae10321fa13 | |
parent | 8bda5dfb52fdbc84a86f6937e06c22c6f5206bed (diff) |
Add min_len and max_len args to tokenize_wiki.py. Skip one line wiki docs, since those are likely to be useless
-rw-r--r-- | stanza/utils/datasets/constituency/selftrain_wiki.py | 5 | ||||
-rw-r--r-- | stanza/utils/datasets/constituency/tokenize_wiki.py | 30 |
2 files changed, 34 insertions, 1 deletions
diff --git a/stanza/utils/datasets/constituency/selftrain_wiki.py b/stanza/utils/datasets/constituency/selftrain_wiki.py index 01692da4..330c2b49 100644 --- a/stanza/utils/datasets/constituency/selftrain_wiki.py +++ b/stanza/utils/datasets/constituency/selftrain_wiki.py @@ -87,7 +87,10 @@ def read_wiki_file(filename): line = next(line_iterator, None) elif line.startswith("</doc"): if current_doc: - docs.append("\n\n".join(current_doc)) + if len(current_doc) > 1: + # a lot of single line documents are links to related documents + # a single wikipedia can have tens of thousands of useless almost-duplicates + docs.append("\n\n".join(current_doc)) current_doc = [] else: # not the start or end of a doc diff --git a/stanza/utils/datasets/constituency/tokenize_wiki.py b/stanza/utils/datasets/constituency/tokenize_wiki.py index 3a3ef24e..feeda4b2 100644 --- a/stanza/utils/datasets/constituency/tokenize_wiki.py +++ b/stanza/utils/datasets/constituency/tokenize_wiki.py @@ -39,6 +39,32 @@ def parse_args(): default='extern_data/vietnamese/wikipedia/text/AA', help='Path to the wikipedia dump after processing by wikiextractor' ) + parser.add_argument( + '--min_len', + default=5, + type=int, + help='Minimum length sentence to keep. None = unlimited' + ) + parser.add_argument( + '--no_min_len', + dest='min_len', + action='store_const', + const=None, + help='No minimum length' + ) + parser.add_argument( + '--max_len', + default=100, + type=int, + help='Maximum length sentence to keep. None = unlimited' + ) + parser.add_argument( + '--no_max_len', + dest='max_len', + action='store_const', + const=None, + help='No maximum length' + ) args = parser.parse_args() return args @@ -57,6 +83,10 @@ def main(): for doc in docs: for sentence in doc.sentences: + if args.min_len and len(sentence.words) < args.min_len: + continue + if args.max_len and len(sentence.words) > args.max_len: + continue text = sentence.text if (text.find("|") >= 0 or text.find("_") >= 0 or text.find("<") >= 0 or text.find(">") >= 0 or |