Throw out wiki docs of length 2 as well when building a silver dataset

author: John Bauer <horatio@gmail.com> 2022-11-03 18:59:57 +0300
committer: John Bauer <horatio@gmail.com> 2022-11-03 18:59:57 +0300
commit: 3ab75287b10331d5c7228b8a91cb6bf91580a0b5 (patch)
tree: 2377db905d9b778e7ad22a849c9b49441ee7e4ad
parent: ad8358e37c6ba467760258c165c7d61be5de28a9 (diff)
1 file changed, 2 insertions, 2 deletions
diff --git a/stanza/utils/datasets/constituency/selftrain_wiki.py b/stanza/utils/datasets/constituency/selftrain_wiki.py
index 330c2b49..8d1170fb 100644
--- a/stanza/utils/datasets/constituency/selftrain_wiki.py
+++ b/stanza/utils/datasets/constituency/selftrain_wiki.py
@@ -87,8 +87,8 @@ def read_wiki_file(filename):
             line = next(line_iterator, None)
         elif line.startswith("</doc"):
             if current_doc:
-                if len(current_doc) > 1:
-                    # a lot of single line documents are links to related documents
+                if len(current_doc) > 2:
+                    # a lot of very short documents are links to related documents
                     # a single wikipedia can have tens of thousands of useless almost-duplicates
                     docs.append("\n\n".join(current_doc))
                 current_doc = []
author	John Bauer <horatio@gmail.com>	2022-11-03 18:59:57 +0300
committer	John Bauer <horatio@gmail.com>	2022-11-03 18:59:57 +0300
commit	3ab75287b10331d5c7228b8a91cb6bf91580a0b5 (patch)
tree	2377db905d9b778e7ad22a849c9b49441ee7e4ad
parent	ad8358e37c6ba467760258c165c7d61be5de28a9 (diff)