diff options
author | John Bauer <horatio@gmail.com> | 2022-11-03 18:59:57 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-11-03 18:59:57 +0300 |
commit | 3ab75287b10331d5c7228b8a91cb6bf91580a0b5 (patch) | |
tree | 2377db905d9b778e7ad22a849c9b49441ee7e4ad | |
parent | ad8358e37c6ba467760258c165c7d61be5de28a9 (diff) |
Throw out wiki docs of length 2 as well when building a silver dataset
-rw-r--r-- | stanza/utils/datasets/constituency/selftrain_wiki.py | 4 |
1 files changed, 2 insertions, 2 deletions
diff --git a/stanza/utils/datasets/constituency/selftrain_wiki.py b/stanza/utils/datasets/constituency/selftrain_wiki.py
index 330c2b49..8d1170fb 100644
--- a/stanza/utils/datasets/constituency/selftrain_wiki.py
+++ b/stanza/utils/datasets/constituency/selftrain_wiki.py
@@ -87,8 +87,8 @@ def read_wiki_file(filename):
             line = next(line_iterator, None)
         elif line.startswith("</doc"):
             if current_doc:
-                if len(current_doc) > 1:
-                    # a lot of single line documents are links to related documents
+                if len(current_doc) > 2:
+                    # a lot of very short documents are links to related documents
                     # a single wikipedia can have tens of thousands of useless almost-duplicates
                     docs.append("\n\n".join(current_doc))
                 current_doc = []