Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: John Bauer <horatio@gmail.com> 2022-09-10 09:31:52 +0300
committer: John Bauer <horatio@gmail.com> 2022-09-10 09:31:52 +0300
commit: 4dca146a8c34393c4b3887c76b3d4261e39f2029 (patch)
tree: 113a7a5f8565ec283a19cc1aeaaafde66d2443b0
parent: e904b88d552ecf14a5ff21e8109b4d80656697fb (diff)
Add some doc and update dev_sentence -> dep_sentence to reflect where the variable comes from
-rw-r--r-- stanza/utils/datasets/constituency/convert_it_vit.py | 10
1 file changed, 8 insertions, 2 deletions
diff --git a/stanza/utils/datasets/constituency/convert_it_vit.py b/stanza/utils/datasets/constituency/convert_it_vit.py
index 7c4ca0bc..20ba9814 100644
--- a/stanza/utils/datasets/constituency/convert_it_vit.py
+++ b/stanza/utils/datasets/constituency/convert_it_vit.py
@@ -434,7 +434,13 @@ def get_mwt(*dep_datasets):
mwt_map[token.text] = expansion
return mwt_map
-def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon_processor):
+def update_mwts_and_special_cases(original_tree, dep_sentence, mwt_map, tsurgeon_processor):
+ """
+ Replace MWT structures with their UD equivalents, along with some other minor tsurgeon based edits
+
+ original_tree: the tree as read from VIT
+ dep_sentence: the UD dependency dataset version of this sentence
+ """
updated_tree = original_tree
operations = []
@@ -468,7 +474,7 @@ def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon
# now assemble a bunch of regex to split and otherwise manipulate
# the MWT in the trees
- for token in dev_sentence.tokens:
+ for token in dep_sentence.tokens:
if len(token.words) == 1:
continue
if token.text in mwt_map: