Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author John Bauer <horatio@gmail.com> 2022-09-07 09:39:26 +0300
committer John Bauer <horatio@gmail.com> 2022-09-07 09:45:57 +0300
commit c2941d7de6a4f1c22a3dabf0632852f9cd92bf8b (patch)
tree a8654a9d74acbede829fd82bf7d0d99eb09dbf87
parent 54f2e542f5e6b1f7335aee18d44e28b25ca3554a (diff)
Update for the latest version of the constituency treebank
some sentences fixed in UD, some updates to the constituency treebank
-rw-r--r-- stanza/utils/datasets/constituency/convert_it_vit.py 55
-rw-r--r-- stanza/utils/datasets/constituency/prepare_con_dataset.py 5
2 files changed, 50 insertions, 10 deletions
diff --git a/stanza/utils/datasets/constituency/convert_it_vit.py b/stanza/utils/datasets/constituency/convert_it_vit.py
index 8db7766d..305ce029 100644
--- a/stanza/utils/datasets/constituency/convert_it_vit.py
+++ b/stanza/utils/datasets/constituency/convert_it_vit.py
@@ -58,8 +58,13 @@ In August 2022, Prof. Delmonte made a slight update in a zip file
here. Contact Chris or John for a copy if not updated yet, or go
back in git history to get the older version of the code which
works with the 2022 ELRA update.
-In later 2022, there is a new update, Archive.zip Put that file in
-$CONSTITUENCY_BASE/italian/it_vit and unzip it there
+
+Later, in September 2022, there is yet another update,
+New version of VIT.zip
+Unzip the contents into a folder
+$CONSTITUENCY_BASE/italian/it_vit
+so there should be a file
+$CONSTITUENCY_BASE/italian/it_vit/VITwritten/VITconstsyntNumb
"""
from collections import defaultdict, deque
@@ -207,6 +212,32 @@ def raw_tree(text):
"num-0/69%minus": "(num 0,69%) (num minus)",
"num-0_39%minus": "(num 0,39%) (num minus)",
"num-9_11/16": "(num 9-11,16)",
+ "num-2/184_90": "(num 2=184/90)",
+ "num-3/429_20": "(num 3eq429/20)",
+ # TODO: remove the following num conversions if possible
+ # this would require editing either constituency or UD
+ "num-1:28_124": "(num 1=8/1242)",
+ "num-1:28_397": "(num 1=8/3972)",
+ "num-1:28_947": "(num 1=8/9472)",
+ "num-1:29_657": "(num 1=9/6572)",
+ "num-1:29_867": "(num 1=9/8672)",
+ "num-1:29_874": "(num 1=9/8742)",
+ "num-1:30_083": "(num 1=0/0833)",
+ "num-1:30_140": "(num 1=0/1403)",
+ "num-1:30_354": "(num 1=0/3543)",
+ "num-1:30_453": "(num 1=0/4533)",
+ "num-1:30_946": "(num 1=0/9463)",
+ "num-1:31_602": "(num 1=1/6023)",
+ "num-1:31_842": "(num 1=1/8423)",
+ "num-1:32_087": "(num 1=2/0873)",
+ "num-1:32_259": "(num 1=2/2593)",
+ "num-1:33_166": "(num 1=3/1663)",
+ "num-1:34_154": "(num 1=4/1543)",
+ "num-1:34_556": "(num 1=4/5563)",
+ "num-1:35_323": "(num 1=5/3233)",
+ "num-1:36_023": "(num 1=6/0233)",
+ "num-1:36_076": "(num 1=6/0763)",
+ "num-1:36_651": "(num 1=6/6513)",
"n-giga_flop/s": "(n giga_flop/s)",
"sect-'g-1'": "(sect g-1)",
"sect-'h-1'": "(sect h-1)",
@@ -214,10 +245,13 @@ def raw_tree(text):
"sect-'h-3'": "(sect h-3)",
"abbr-'a-b-c'": "(abbr a-b-c)",
"abbr-d_o_a_": "(abbr DOA)",
+ "abbr-d_l_": "(abbr DL)",
+ "abbr-i_s_e_f_": "(abbr ISEF)",
"abbr-d_p_r_": "(abbr DPR)",
"abbr-D_P_R_": "(abbr DPR)",
"abbr-d_m_": "(abbr dm)",
"abbr-T_U_": "(abbr TU)",
+ "abbr-F_A_M_E_": "(abbr Fame)",
"dots-'...'": "(dots ...)",
}
new_pieces = ["(ROOT "]
@@ -415,6 +449,12 @@ def update_mwts_and_special_cases(original_tree, dev_sentence, mwt_map, tsurgeon
operations.append(["/^testo$/ !, __ . /^:$/=prune", "prune prune"])
operations.append(["/^testo$/=prune !, __", "prune prune"])
+ if len(con_words) >= 2 and con_words[-2] == '...' and con_words[-1] == '.':
+ # the most recent VIT constituency has some sentence final . after a ...
+ # the UD dataset has a more typical ... ending instead
+ # these lines used to say "riempire" which was rather odd
+ operations.append(["/^[.][.][.]$/ . /^[.]$/=prune", "prune prune"])
+
# now assemble a bunch of regex to split and otherwise manipulate
# the MWT in the trees
for token in dev_sentence.tokens:
@@ -505,13 +545,12 @@ def update_tree(original_tree, dep_sentence, con_id, dep_id, mwt_map, tsurgeon_p
# 2375: the problem is inconsistent treatment of s_p_a_
# 05052: the heuristic to fill in a missing "si" doesn't work because there's
# already another "si" immediately after
-# 07069: "i" edited out in UD incorrectly?
-# 07117: FAME -> F A ME wtf?
-# 08371: da riempire inconsistency
+# 07683: weird token
#
# test set:
-# 09765: da riempire inconsistency
-IGNORE_IDS = ["sent_00867", "sent_01169", "sent_02375", "sent_05052", "sent_07069", "sent_07117", "sent_08371", "sent_09765"]
+# 09764: weird punct at end
+# 10058: weird punct at end
+IGNORE_IDS = ["sent_00867", "sent_01169", "sent_02375", "sent_05052", "sent_09764", "sent_10058"]
def extract_updated_dataset(con_tree_map, dep_sentence_map, split_ids, mwt_map, tsurgeon_processor):
"""
@@ -537,7 +576,7 @@ def convert_it_vit(con_directory, ud_directory, output_directory, dataset_name,
# the most recent update from ELRA may look like this?
# it's what we got, at least
# con_filename = os.path.join(con_directory, "italian", "VITwritten", "VITconstsyntNumb")
- con_filename = os.path.join(con_directory, "italian", "john", "VITconstsyntNumb")
+ con_filename = os.path.join(con_directory, "italian", "it_vit", "VITwritten", "VITconstsyntNumb")
ud_vit_train = os.path.join(ud_directory, "it_vit-ud-train.conllu")
ud_vit_dev = os.path.join(ud_directory, "it_vit-ud-dev.conllu")
ud_vit_test = os.path.join(ud_directory, "it_vit-ud-test.conllu")
diff --git a/stanza/utils/datasets/constituency/prepare_con_dataset.py b/stanza/utils/datasets/constituency/prepare_con_dataset.py
index 39f492a8..ad0c10a0 100644
--- a/stanza/utils/datasets/constituency/prepare_con_dataset.py
+++ b/stanza/utils/datasets/constituency/prepare_con_dataset.py
@@ -122,8 +122,9 @@ def process_it_turin(paths):
convert_it_turin(input_dir, output_dir)
def process_it_vit(paths):
- # needs at least UD 2.10 or this will not work
- ud_dir = os.path.join(paths["UDBASE"], "UD_Italian-VIT")
+ # needs at least UD 2.11 or this will not work
+ # in the meantime, the git version of VIT will suffice
+ ud_dir = os.path.join(paths["UDBASE_GIT"], "UD_Italian-VIT")
output_dir = paths["CONSTITUENCY_DATA_DIR"]
convert_it_vit(paths["CONSTITUENCY_BASE"], ud_dir, output_dir, "it_vit")