Add a docstring and rename dev_sentence -> dep_sentence to reflect that the sentence comes from the UD dependency dataset

author: John Bauer <horatio@gmail.com> 2022-09-10 09:31:52 +0300
committer: John Bauer <horatio@gmail.com> 2022-09-10 09:31:52 +0300
commit: 4dca146a8c34393c4b3887c76b3d4261e39f2029 (patch)
tree: 113a7a5f8565ec283a19cc1aeaaafde66d2443b0
parent: e904b88d552ecf14a5ff21e8109b4d80656697fb (diff)
1 file changed, 8 insertions, 2 deletions
diff --git a/stanza/utils/datasets/constituency/convert_it_vit.py b/stanza/utils/datasets/constituency/convert_it_vit.py
index 7c4ca0bc..20ba9814 100644
--- a/stanza/utils/datasets/constituency/convert_it_vit.py
+++ b/stanza/utils/datasets/constituency/convert_it_vit.py
@@ -434,7 +434,13 @@ def get_mwt(*dep_datasets):
                 mwt_map[token.text] = expansion
     return mwt_map
 
-def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon_processor):
+def update_mwts_and_special_cases(original_tree, dep_sentence, mwt_map, tsurgeon_processor):
+    """
+    Replace MWT structures with their UD equivalents, along with some other minor tsurgeon based edits
+
+    original_tree: the tree as read from VIT
+    dep_sentence: the UD dependency dataset version of this sentence
+    """
     updated_tree = original_tree
 
     operations = []
@@ -468,7 +474,7 @@ def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon
 
     # now assemble a bunch of regex to split and otherwise manipulate
     # the MWT in the trees
-    for token in dev_sentence.tokens:
+    for token in dep_sentence.tokens:
         if len(token.words) == 1:
             continue
         if token.text in mwt_map:
author	John Bauer <horatio@gmail.com>	2022-09-10 09:31:52 +0300
committer	John Bauer <horatio@gmail.com>	2022-09-10 09:31:52 +0300
commit	4dca146a8c34393c4b3887c76b3d4261e39f2029 (patch)
tree	113a7a5f8565ec283a19cc1aeaaafde66d2443b0
parent	e904b88d552ecf14a5ff21e8109b4d80656697fb (diff)