diff options
-rw-r--r-- | stanza/utils/datasets/constituency/convert_it_vit.py | 10 |
1 files changed, 8 insertions, 2 deletions
diff --git a/stanza/utils/datasets/constituency/convert_it_vit.py b/stanza/utils/datasets/constituency/convert_it_vit.py index 7c4ca0bc..20ba9814 100644 --- a/stanza/utils/datasets/constituency/convert_it_vit.py +++ b/stanza/utils/datasets/constituency/convert_it_vit.py @@ -434,7 +434,13 @@ def get_mwt(*dep_datasets): mwt_map[token.text] = expansion return mwt_map -def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon_processor): +def update_mwts_and_special_cases(original_tree, dep_sentence, mwt_map, tsurgeon_processor): + """ + Replace MWT structures with their UD equivalents, along with some other minor tsurgeon based edits + + original_tree: the tree as read from VIT + dep_sentence: the UD dependency dataset version of this sentence + """ updated_tree = original_tree operations = [] @@ -468,7 +474,7 @@ def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon # now assemble a bunch of regex to split and otherwise manipulate # the MWT in the trees - for token in dev_sentence.tokens: + for token in dep_sentence.tokens: if len(token.words) == 1: continue if token.text in mwt_map: |