From 4dca146a8c34393c4b3887c76b3d4261e39f2029 Mon Sep 17 00:00:00 2001
From: John Bauer
Date: Fri, 9 Sep 2022 23:31:52 -0700
Subject: Add some doc and update dev_sentence -> dep_sentence to reflect where the variable comes from

---
 stanza/utils/datasets/constituency/convert_it_vit.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/stanza/utils/datasets/constituency/convert_it_vit.py b/stanza/utils/datasets/constituency/convert_it_vit.py
index 7c4ca0bc..20ba9814 100644
--- a/stanza/utils/datasets/constituency/convert_it_vit.py
+++ b/stanza/utils/datasets/constituency/convert_it_vit.py
@@ -434,7 +434,13 @@ def get_mwt(*dep_datasets):
                 mwt_map[token.text] = expansion
     return mwt_map
 
-def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon_processor):
+def update_mwts_and_special_cases(original_tree, dep_sentence, mwt_map, tsurgeon_processor):
+    """
+    Replace MWT structures with their UD equivalents, along with some other minor tsurgeon based edits
+
+    original_tree: the tree as read from VIT
+    dep_sentence: the UD dependency dataset version of this sentence
+    """
     updated_tree = original_tree
 
     operations = []
@@ -468,7 +474,7 @@ def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon
 
     # now assemble a bunch of regex to split and otherwise manipulate
     # the MWT in the trees
-    for token in dev_sentence.tokens:
+    for token in dep_sentence.tokens:
         if len(token.words) == 1:
             continue
         if token.text in mwt_map:
-- 
cgit v1.2.3