diff options
author | John Bauer <horatio@gmail.com> | 2022-10-29 02:53:21 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-10-29 02:53:21 +0300 |
commit | 29fb29f7425ea22025fb4986c33b3f2c9fa1f155 (patch) | |
tree | c7a96440eaa1ada6ef317d2317261fe0874f9e65 | |
parent | 046cef7dac600670a46a3adb6024729711bdab1d (diff) |
ignore em dashes in Wikipedia, as that seems to be lists
-rw-r--r-- | stanza/utils/datasets/constituency/tokenize_wiki.py | 3 |
1 file changed, 2 insertions, 1 deletion
```diff
diff --git a/stanza/utils/datasets/constituency/tokenize_wiki.py b/stanza/utils/datasets/constituency/tokenize_wiki.py
index cdc9a9a3..3a3ef24e 100644
--- a/stanza/utils/datasets/constituency/tokenize_wiki.py
+++ b/stanza/utils/datasets/constituency/tokenize_wiki.py
@@ -60,7 +60,8 @@ def main():
                 text = sentence.text
                 if (text.find("|") >= 0 or text.find("_") >= 0 or
                     text.find("<") >= 0 or text.find(">") >= 0 or
-                    text.find("[") >= 0 or text.find("]") >= 0):
+                    text.find("[") >= 0 or text.find("]") >= 0 or
+                    text.find('—') >= 0): # an em dash, seems to be part of lists
                     continue
                 # the VI tokenizer in particular doesn't split these well
                 if any(any(w.text.find(c) >= 0 and len(w.text) > 1 for w in sentence.words)
```