diff options
author | John Bauer <horatio@gmail.com> | 2022-09-07 09:39:26 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-09-07 09:45:57 +0300 |
commit | c2941d7de6a4f1c22a3dabf0632852f9cd92bf8b (patch) | |
tree | a8654a9d74acbede829fd82bf7d0d99eb09dbf87 | |
parent | 54f2e542f5e6b1f7335aee18d44e28b25ca3554a (diff) |
Update for the latest version of the constituency treebank
Some sentences were fixed in UD, and there were some updates to the constituency treebank.
-rw-r--r-- | stanza/utils/datasets/constituency/convert_it_vit.py | 55 | ||||
-rw-r--r-- | stanza/utils/datasets/constituency/prepare_con_dataset.py | 5 |
2 files changed, 50 insertions, 10 deletions
diff --git a/stanza/utils/datasets/constituency/convert_it_vit.py b/stanza/utils/datasets/constituency/convert_it_vit.py index 8db7766d..305ce029 100644 --- a/stanza/utils/datasets/constituency/convert_it_vit.py +++ b/stanza/utils/datasets/constituency/convert_it_vit.py @@ -58,8 +58,13 @@ In August 2022, Prof. Delmonte made a slight update in a zip file here. Contact Chris or John for a copy if not updated yet, or go back in git history to get the older version of the code which works with the 2022 ELRA update. -In later 2022, there is a new update, Archive.zip Put that file in -$CONSTITUENCY_BASE/italian/it_vit and unzip it there + +Later, in September 2022, there is yet another update, +New version of VIT.zip +Unzip the contents into a folder +$CONSTITUENCY_BASE/italian/it_vit +so there should be a file +$CONSTITUENCY_BASE/italian/it_vit/VITwritten/VITconstsyntNumb """ from collections import defaultdict, deque @@ -207,6 +212,32 @@ def raw_tree(text): "num-0/69%minus": "(num 0,69%) (num minus)", "num-0_39%minus": "(num 0,39%) (num minus)", "num-9_11/16": "(num 9-11,16)", + "num-2/184_90": "(num 2=184/90)", + "num-3/429_20": "(num 3eq429/20)", + # TODO: remove the following num conversions if possible + # this would require editing either constituency or UD + "num-1:28_124": "(num 1=8/1242)", + "num-1:28_397": "(num 1=8/3972)", + "num-1:28_947": "(num 1=8/9472)", + "num-1:29_657": "(num 1=9/6572)", + "num-1:29_867": "(num 1=9/8672)", + "num-1:29_874": "(num 1=9/8742)", + "num-1:30_083": "(num 1=0/0833)", + "num-1:30_140": "(num 1=0/1403)", + "num-1:30_354": "(num 1=0/3543)", + "num-1:30_453": "(num 1=0/4533)", + "num-1:30_946": "(num 1=0/9463)", + "num-1:31_602": "(num 1=1/6023)", + "num-1:31_842": "(num 1=1/8423)", + "num-1:32_087": "(num 1=2/0873)", + "num-1:32_259": "(num 1=2/2593)", + "num-1:33_166": "(num 1=3/1663)", + "num-1:34_154": "(num 1=4/1543)", + "num-1:34_556": "(num 1=4/5563)", + "num-1:35_323": "(num 1=5/3233)", + "num-1:36_023": "(num 1=6/0233)", 
+ "num-1:36_076": "(num 1=6/0763)", + "num-1:36_651": "(num 1=6/6513)", "n-giga_flop/s": "(n giga_flop/s)", "sect-'g-1'": "(sect g-1)", "sect-'h-1'": "(sect h-1)", @@ -214,10 +245,13 @@ def raw_tree(text): "sect-'h-3'": "(sect h-3)", "abbr-'a-b-c'": "(abbr a-b-c)", "abbr-d_o_a_": "(abbr DOA)", + "abbr-d_l_": "(abbr DL)", + "abbr-i_s_e_f_": "(abbr ISEF)", "abbr-d_p_r_": "(abbr DPR)", "abbr-D_P_R_": "(abbr DPR)", "abbr-d_m_": "(abbr dm)", "abbr-T_U_": "(abbr TU)", + "abbr-F_A_M_E_": "(abbr Fame)", "dots-'...'": "(dots ...)", } new_pieces = ["(ROOT "] @@ -415,6 +449,12 @@ def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon operations.append(["/^testo$/ !, __ . /^:$/=prune", "prune prune"]) operations.append(["/^testo$/=prune !, __", "prune prune"]) + if len(con_words) >= 2 and con_words[-2] == '...' and con_words[-1] == '.': + # the most recent VIT constituency has some sentence final . after a ... + # the UD dataset has a more typical ... ending instead + # these lines used to say "riempire" which was rather odd + operations.append(["/^[.][.][.]$/ . /^[.]$/=prune", "prune prune"]) + # now assemble a bunch of regex to split and otherwise manipulate # the MWT in the trees for token in dev_sentence.tokens: @@ -505,13 +545,12 @@ def update_tree(original_tree, dep_sentence, con_id, dep_id, mwt_map, tsurgeon_p # 2375: the problem is inconsistent treatment of s_p_a_ # 05052: the heuristic to fill in a missing "si" doesn't work because there's # already another "si" immediately after -# 07069: "i" edited out in UD incorrectly? -# 07117: FAME -> F A ME wtf? 
-# 08371: da riempire inconsistency +# 07683: weird token # # test set: -# 09765: da riempire inconsistency -IGNORE_IDS = ["sent_00867", "sent_01169", "sent_02375", "sent_05052", "sent_07069", "sent_07117", "sent_08371", "sent_09765"] +# 09764: weird punct at end +# 10058: weird punct at end +IGNORE_IDS = ["sent_00867", "sent_01169", "sent_02375", "sent_05052", "sent_09764", "sent_10058"] def extract_updated_dataset(con_tree_map, dep_sentence_map, split_ids, mwt_map, tsurgeon_processor): """ @@ -537,7 +576,7 @@ def convert_it_vit(con_directory, ud_directory, output_directory, dataset_name, # the most recent update from ELRA may look like this? # it's what we got, at least # con_filename = os.path.join(con_directory, "italian", "VITwritten", "VITconstsyntNumb") - con_filename = os.path.join(con_directory, "italian", "john", "VITconstsyntNumb") + con_filename = os.path.join(con_directory, "italian", "it_vit", "VITwritten", "VITconstsyntNumb") ud_vit_train = os.path.join(ud_directory, "it_vit-ud-train.conllu") ud_vit_dev = os.path.join(ud_directory, "it_vit-ud-dev.conllu") ud_vit_test = os.path.join(ud_directory, "it_vit-ud-test.conllu") diff --git a/stanza/utils/datasets/constituency/prepare_con_dataset.py b/stanza/utils/datasets/constituency/prepare_con_dataset.py index 39f492a8..ad0c10a0 100644 --- a/stanza/utils/datasets/constituency/prepare_con_dataset.py +++ b/stanza/utils/datasets/constituency/prepare_con_dataset.py @@ -122,8 +122,9 @@ def process_it_turin(paths): convert_it_turin(input_dir, output_dir) def process_it_vit(paths): - # needs at least UD 2.10 or this will not work - ud_dir = os.path.join(paths["UDBASE"], "UD_Italian-VIT") + # needs at least UD 2.11 or this will not work + # in the meantime, the git version of VIT will suffice + ud_dir = os.path.join(paths["UDBASE_GIT"], "UD_Italian-VIT") output_dir = paths["CONSTITUENCY_DATA_DIR"] convert_it_vit(paths["CONSTITUENCY_BASE"], ud_dir, output_dir, "it_vit") |