Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--stanza/models/tokenization/utils.py8
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/stanza/models/tokenization/utils.py b/stanza/models/tokenization/utils.py
index c53a306e..54d3cb88 100644
--- a/stanza/models/tokenization/utils.py
+++ b/stanza/models/tokenization/utils.py
@@ -374,7 +374,13 @@ def decode_predictions(vocab, mwt_dict, orig_text, all_raw, all_preds, no_ssplit
partlen = match.end(0) - match.start(0)
lstripped = match.group(0).lstrip()
else:
- st0 = text.index(part, char_offset) - char_offset
+ try:
+ st0 = text.index(part, char_offset) - char_offset
+ except ValueError as e:
+ sub_start = max(0, char_offset - 20)
+ sub_end = min(len(text), char_offset + 20)
+ sub = text[sub_start:sub_end]
+ raise ValueError("Could not find |%s| starting from char_offset %d. Surrounding text: |%s|" % (part, char_offset, sub)) from e
partlen = len(part)
lstripped = part.lstrip()
if st < 0: