Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Bauer <horatio@gmail.com>2022-10-29 02:53:21 +0300
committerJohn Bauer <horatio@gmail.com>2022-10-29 02:53:21 +0300
commit29fb29f7425ea22025fb4986c33b3f2c9fa1f155 (patch)
treec7a96440eaa1ada6ef317d2317261fe0874f9e65
parent046cef7dac600670a46a3adb6024729711bdab1d (diff)
ignore em dashes in Wikipedia, as they seem to be part of lists
-rw-r--r--stanza/utils/datasets/constituency/tokenize_wiki.py3
1 file changed, 2 insertions, 1 deletion
diff --git a/stanza/utils/datasets/constituency/tokenize_wiki.py b/stanza/utils/datasets/constituency/tokenize_wiki.py
index cdc9a9a3..3a3ef24e 100644
--- a/stanza/utils/datasets/constituency/tokenize_wiki.py
+++ b/stanza/utils/datasets/constituency/tokenize_wiki.py
@@ -60,7 +60,8 @@ def main():
text = sentence.text
if (text.find("|") >= 0 or text.find("_") >= 0 or
text.find("<") >= 0 or text.find(">") >= 0 or
- text.find("[") >= 0 or text.find("]") >= 0):
+ text.find("[") >= 0 or text.find("]") >= 0 or
+ text.find('—') >= 0): # an em dash, seems to be part of lists
continue
# the VI tokenizer in particular doesn't split these well
if any(any(w.text.find(c) >= 0 and len(w.text) > 1 for w in sentence.words)