author    | John Bauer <horatio@gmail.com> | 2022-11-05 04:33:44 +0300
committer | John Bauer <horatio@gmail.com> | 2022-11-05 04:33:44 +0300
commit    | 758bc862929d93a25303f7121fcddd746b0e3297 (patch)
tree      | 70153d1c5fb5ed7db3473fee099e5d9a15e748e2
parent    | 03c5ede401dc3d012fbc8cc0503b4920fc2c0d30 (diff)
Also chuck some sentences with long words
-rw-r--r-- | stanza/utils/datasets/constituency/selftrain.py | 4
1 file changed, 4 insertions(+), 0 deletions(-)
diff --git a/stanza/utils/datasets/constituency/selftrain.py b/stanza/utils/datasets/constituency/selftrain.py
index bfa0dbf1..71431b50 100644
--- a/stanza/utils/datasets/constituency/selftrain.py
+++ b/stanza/utils/datasets/constituency/selftrain.py
@@ -171,6 +171,10 @@ def tokenize_docs(docs, pipe, min_len, max_len):
                 continue
             text = [w.text.replace(" ", "_") for w in sentence.words]
             text = " ".join(text)
+            if any(len(w.text) >= 50 for w in sentence.words):
+                # skip sentences where some of the words are unreasonably long
+                # could make this an argument
+                continue
             if not is_zh and len(ZH_RE.findall(text)) > 250:
                 # some Chinese sentences show up in VI Wikipedia
                 # we want to eliminate ones which will choke the bert models
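For context on what the patch does: tokenize_docs now drops any sentence containing a token of 50 or more characters before it is emitted as self-training data. Below is a minimal standalone sketch of the same check outside the Stanza pipeline; the helper name, constant, and sample data are illustrative and not part of the patch, which hardcodes the threshold inline and notes in a comment that it could become an argument.

# Sketch of the long-word filter added above, with illustrative names.
# The patch itself hardcodes the 50-character threshold inline.
MAX_WORD_LEN = 50

def has_unreasonably_long_word(words, max_word_len=MAX_WORD_LEN):
    """Return True if any token is max_word_len characters or longer."""
    return any(len(word) >= max_word_len for word in words)

sentences = [
    ["This", "sentence", "is", "fine", "."],
    ["Bad", "tokenization:", "a" * 60],  # one 60-character "word"
]
kept = [s for s in sentences if not has_unreasonably_long_word(s)]
assert kept == [sentences[0]]  # the sentence with the 60-char token is dropped

Extremely long tokens of this kind usually come from tokenization failures or markup debris rather than real words, so filtering whole sentences is a cheap way to keep them from reaching downstream models.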