diff options
author | John Bauer <horatio@gmail.com> | 2022-09-07 08:22:41 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-09-07 08:22:41 +0300 |
commit | 54f2e542f5e6b1f7335aee18d44e28b25ca3554a (patch) | |
tree | 97c70bb82163d0a62be3da0e39a07f9d1ebfef39 | |
parent | 20e03450e239b570bc102cd31d3b6e2d769841e9 (diff) |
If a ValueError happens while tokenizing, try to make it a bit more descriptive. Apparently does not impact tokenization time
-rw-r--r-- | stanza/models/tokenization/utils.py | 8 |
1 file changed, 7 insertions, 1 deletion
diff --git a/stanza/models/tokenization/utils.py b/stanza/models/tokenization/utils.py index c53a306e..54d3cb88 100644 --- a/stanza/models/tokenization/utils.py +++ b/stanza/models/tokenization/utils.py @@ -374,7 +374,13 @@ def decode_predictions(vocab, mwt_dict, orig_text, all_raw, all_preds, no_ssplit partlen = match.end(0) - match.start(0) lstripped = match.group(0).lstrip() else: - st0 = text.index(part, char_offset) - char_offset + try: + st0 = text.index(part, char_offset) - char_offset + except ValueError as e: + sub_start = max(0, char_offset - 20) + sub_end = min(len(text), char_offset + 20) + sub = text[sub_start:sub_end] + raise ValueError("Could not find |%s| starting from char_offset %d. Surrounding text: |%s|" % (part, char_offset, sub)) from e partlen = len(part) lstripped = part.lstrip() if st < 0: |