Ignore Wikipedia sentences containing em dashes, as they seem to be parts of lists

author: John Bauer <horatio@gmail.com> 2022-10-29 02:53:21 +0300
committer: John Bauer <horatio@gmail.com> 2022-10-29 02:53:21 +0300
commit: 29fb29f7425ea22025fb4986c33b3f2c9fa1f155 (patch)
tree: c7a96440eaa1ada6ef317d2317261fe0874f9e65
parent: 046cef7dac600670a46a3adb6024729711bdab1d (diff)
1 file changed, 2 insertions, 1 deletion
diff --git a/stanza/utils/datasets/constituency/tokenize_wiki.py b/stanza/utils/datasets/constituency/tokenize_wiki.py
index cdc9a9a3..3a3ef24e 100644
--- a/stanza/utils/datasets/constituency/tokenize_wiki.py
+++ b/stanza/utils/datasets/constituency/tokenize_wiki.py
@@ -60,7 +60,8 @@ def main():
                     text = sentence.text
                     if (text.find("|") >= 0 or text.find("_") >= 0 or
                         text.find("<") >= 0 or text.find(">") >= 0 or
-                        text.find("[") >= 0 or text.find("]") >= 0):
+                        text.find("[") >= 0 or text.find("]") >= 0 or
+                        text.find('—') >= 0):   # an em dash, seems to be part of lists
                         continue
                     # the VI tokenizer in particular doesn't split these well
                     if any(any(w.text.find(c) >= 0 and len(w.text) > 1 for w in sentence.words)
author	John Bauer <horatio@gmail.com>	2022-10-29 02:53:21 +0300
committer	John Bauer <horatio@gmail.com>	2022-10-29 02:53:21 +0300
commit	29fb29f7425ea22025fb4986c33b3f2c9fa1f155 (patch)
tree	c7a96440eaa1ada6ef317d2317261fe0874f9e65
parent	046cef7dac600670a46a3adb6024729711bdab1d (diff)