Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author John Bauer <horatio@gmail.com> 2022-09-10 09:05:11 +0300
committer John Bauer <horatio@gmail.com> 2022-09-10 09:07:00 +0300
commit e904b88d552ecf14a5ff21e8109b4d80656697fb (patch)
tree 8a52d40ec3f142e994baed7454119bde37b57148
parent 0e6de808eacf14cd64622415eeaeeac2d60faab2 (diff)
Fix a few tag errors when reading VIT constituents
-rw-r--r-- stanza/utils/datasets/constituency/convert_it_vit.py 11
1 files changed, 11 insertions, 0 deletions
diff --git a/stanza/utils/datasets/constituency/convert_it_vit.py b/stanza/utils/datasets/constituency/convert_it_vit.py
index 18bb0b76..7c4ca0bc 100644
--- a/stanza/utils/datasets/constituency/convert_it_vit.py
+++ b/stanza/utils/datasets/constituency/convert_it_vit.py
@@ -455,6 +455,17 @@ def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon
# these lines used to say "riempire" which was rather odd
operations.append(["/^[.][.][.]$/ . /^[.]$/=prune", "prune prune"])
+ # a few constituent tags are simply errors which need to be fixed
+ if original_tree.children[0].label == 'p':
+ # 'p' shouldn't be at root
+ operations.append(["_ROOT_ < p=p", "relabel p cp"])
+ # fix one specific tree if it has an s_top in it
+ operations.append(["s_top=stop < (in=in < più=piu)", "replace piu (q più)", "relabel in sq", "relabel stop sa"])
+ # sect doesn't exist as a constituent. replace it with sa
+ operations.append(["sect=sect < num", "relabel sect sa"])
+ # ppas as an internal node gets removed
+ operations.append(["ppas=ppas < (__ < __)", "excise ppas ppas"])
+
# now assemble a bunch of regex to split and otherwise manipulate
# the MWT in the trees
for token in dev_sentence.tokens: