github.com/stanfordnlp/stanza.git
author     John Bauer <horatio@gmail.com>  2022-11-05 04:33:44 +0300
committer  John Bauer <horatio@gmail.com>  2022-11-05 04:33:44 +0300
commit     758bc862929d93a25303f7121fcddd746b0e3297 (patch)
tree       70153d1c5fb5ed7db3473fee099e5d9a15e748e2
parent     03c5ede401dc3d012fbc8cc0503b4920fc2c0d30 (diff)
Also chuck some sentences with long words
-rw-r--r--  stanza/utils/datasets/constituency/selftrain.py | 4
1 file changed, 4 insertions(+), 0 deletions(-)
diff --git a/stanza/utils/datasets/constituency/selftrain.py b/stanza/utils/datasets/constituency/selftrain.py
index bfa0dbf1..71431b50 100644
--- a/stanza/utils/datasets/constituency/selftrain.py
+++ b/stanza/utils/datasets/constituency/selftrain.py
@@ -171,6 +171,10 @@ def tokenize_docs(docs, pipe, min_len, max_len):
                 continue
             text = [w.text.replace(" ", "_") for w in sentence.words]
             text = " ".join(text)
+            if any(len(w.text) >= 50 for w in sentence.words):
+                # skip sentences where some of the words are unreasonably long
+                # could make this an argument
+                continue
             if not is_zh and len(ZH_RE.findall(text)) > 250:
                 # some Chinese sentences show up in VI Wikipedia
                 # we want to eliminate ones which will choke the bert models
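For context, a minimal standalone sketch of the same length-based filter added above (the 50-character threshold mirrors the diff; MAX_WORD_LEN, keep_sentence, and the sample data are hypothetical and not part of the stanza API):

MAX_WORD_LEN = 50  # threshold used in the diff; the comment there notes it could become an argument

def keep_sentence(words):
    # drop any sentence in which a single token is unreasonably long
    return not any(len(w) >= MAX_WORD_LEN for w in words)

sentences = [
    ["a", "short", "sentence", "."],
    ["tokenizer", "noise", "x" * 80, "."],  # rejected: third token is 80 characters
]

kept = [s for s in sentences if keep_sentence(s)]
print(kept)  # [['a', 'short', 'sentence', '.']]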