author    | John Bauer <horatio@gmail.com> | 2022-11-05 04:33:44 +0300
committer | John Bauer <horatio@gmail.com> | 2022-11-05 04:33:44 +0300
commit    | 758bc862929d93a25303f7121fcddd746b0e3297 (patch)
tree      | 70153d1c5fb5ed7db3473fee099e5d9a15e748e2
parent    | 03c5ede401dc3d012fbc8cc0503b4920fc2c0d30 (diff)
Also chuck some sentences with long words
-rw-r--r-- | stanza/utils/datasets/constituency/selftrain.py | 4
1 file changed, 4 insertions(+), 0 deletions(-)
diff --git a/stanza/utils/datasets/constituency/selftrain.py b/stanza/utils/datasets/constituency/selftrain.py
index bfa0dbf1..71431b50 100644
--- a/stanza/utils/datasets/constituency/selftrain.py
+++ b/stanza/utils/datasets/constituency/selftrain.py
@@ -171,6 +171,10 @@ def tokenize_docs(docs, pipe, min_len, max_len):
                 continue
             text = [w.text.replace(" ", "_") for w in sentence.words]
             text = " ".join(text)
+            if any(len(w.text) >= 50 for w in sentence.words):
+                # skip sentences where some of the words are unreasonably long
+                # could make this an argument
+                continue
             if not is_zh and len(ZH_RE.findall(text)) > 250:
                 # some Chinese sentences show up in VI Wikipedia
                 # we want to eliminate ones which will choke the bert models
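For context on what the patch does: tokenize_docs now drops any sentence containing a token of 50 or more characters before it is emitted as self-training data. Below is a minimal standalone sketch of the same check outside the Stanza pipeline; the helper name, constant, and sample data are illustrative and not part of the patch, which hardcodes the threshold inline and notes in a comment that it could become an argument.

# Sketch of the long-word filter added above, with illustrative names.
# The patch itself hardcodes the 50-character threshold inline.
MAX_WORD_LEN = 50

def has_unreasonably_long_word(words, max_word_len=MAX_WORD_LEN):
    """Return True if any token is max_word_len characters or longer."""
    return any(len(word) >= max_word_len for word in words)

sentences = [
    ["This", "sentence", "is", "fine", "."],
    ["Bad", "tokenization:", "a" * 60],  # one 60-character "word"
]
kept = [s for s in sentences if not has_unreasonably_long_word(s)]
assert kept == [sentences[0]]  # the sentence with the 60-char token is dropped

Extremely long tokens of this kind usually come from tokenization failures or markup debris rather than real words, so filtering whole sentences is a cheap way to keep them from reaching downstream models.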