diff options
author | John Bauer <horatio@gmail.com> | 2022-09-10 09:31:52 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-09-10 09:31:52 +0300 |
commit | 4dca146a8c34393c4b3887c76b3d4261e39f2029 (patch) | |
tree | 113a7a5f8565ec283a19cc1aeaaafde66d2443b0 | |
parent | e904b88d552ecf14a5ff21e8109b4d80656697fb (diff) |
Add some doc and update dev_sentence -> dep_sentence to reflect where the variable comes from
-rw-r--r-- | stanza/utils/datasets/constituency/convert_it_vit.py | 10 |
1 file changed, 8 insertions, 2 deletions
```diff
diff --git a/stanza/utils/datasets/constituency/convert_it_vit.py b/stanza/utils/datasets/constituency/convert_it_vit.py
index 7c4ca0bc..20ba9814 100644
--- a/stanza/utils/datasets/constituency/convert_it_vit.py
+++ b/stanza/utils/datasets/constituency/convert_it_vit.py
@@ -434,7 +434,13 @@ def get_mwt(*dep_datasets):
                 mwt_map[token.text] = expansion
     return mwt_map
 
-def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon_processor):
+def update_mwts_and_special_cases(original_tree, dep_sentence, mwt_map, tsurgeon_processor):
+    """
+    Replace MWT structures with their UD equivalents, along with some other minor tsurgeon based edits
+
+    original_tree: the tree as read from VIT
+    dep_sentence: the UD dependency dataset version of this sentence
+    """
     updated_tree = original_tree
 
     operations = []
@@ -468,7 +474,7 @@ def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon
 
     # now assemble a bunch of regex to split and otherwise manipulate
     # the MWT in the trees
-    for token in dev_sentence.tokens:
+    for token in dep_sentence.tokens:
         if len(token.words) == 1:
             continue
         if token.text in mwt_map:
```