diff options
author | John Bauer <horatio@gmail.com> | 2022-09-10 09:05:11 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-09-10 09:07:00 +0300 |
commit | e904b88d552ecf14a5ff21e8109b4d80656697fb (patch) | |
tree | 8a52d40ec3f142e994baed7454119bde37b57148 | |
parent | 0e6de808eacf14cd64622415eeaeeac2d60faab2 (diff) |
Fix a few tag errors when reading VIT constituents
-rw-r--r-- | stanza/utils/datasets/constituency/convert_it_vit.py | 11 |
1 file changed, 11 insertions, 0 deletions
```diff
diff --git a/stanza/utils/datasets/constituency/convert_it_vit.py b/stanza/utils/datasets/constituency/convert_it_vit.py
index 18bb0b76..7c4ca0bc 100644
--- a/stanza/utils/datasets/constituency/convert_it_vit.py
+++ b/stanza/utils/datasets/constituency/convert_it_vit.py
@@ -455,6 +455,17 @@ def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon
     # these lines used to say "riempire" which was rather odd
     operations.append(["/^[.][.][.]$/ . /^[.]$/=prune", "prune prune"])
 
+    # a few constituent tags are simply errors which need to be fixed
+    if original_tree.children[0].label == 'p':
+        # 'p' shouldn't be at root
+        operations.append(["_ROOT_ < p=p", "relabel p cp"])
+    # fix one specific tree if it has an s_top in it
+    operations.append(["s_top=stop < (in=in < più=piu)", "replace piu (q più)", "relabel in sq", "relabel stop sa"])
+    # sect doesn't exist as a constituent. replace it with sa
+    operations.append(["sect=sect < num", "relabel sect sa"])
+    # ppas as an internal node gets removed
+    operations.append(["ppas=ppas < (__ < __)", "excise ppas ppas"])
+
     # now assemble a bunch of regex to split and otherwise manipulate
     # the MWT in the trees
     for token in dev_sentence.tokens:
```