github.com/stanfordnlp/stanza.git
author    John Bauer <horatio@gmail.com>  2022-09-01 01:38:18 +0300
committer John Bauer <horatio@gmail.com>  2022-09-01 05:32:41 +0300
commit    e4fcf003feb984f535371fb91c9e380dd187fd12
tree      f220a726d4ffdc6fee15867feaac72e504069282
parent    5e3ab62f0bbbc005f08d611657b42b61457ad01f
Add the capacity to build he_combined models from UD_Hebrew-IAHLTwiki and a fork of HTB. Addresses #1109 (branch: hebrew_combined)
-rw-r--r--  stanza/models/pos/xpos_vocab_factory.py              1
-rwxr-xr-x  stanza/utils/datasets/prepare_tokenizer_treebank.py 69
-rw-r--r--  stanza/utils/default_paths.py                         1
3 files changed, 59 insertions, 12 deletions
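
For orientation, here is a minimal sketch (not part of the patch) of how the new he_combined path can be driven once the data is in place, assuming UD_Hebrew-IAHLTwiki is available under UDBASE and the IAHLT fork of HTB has been cloned into UDBASE_GIT/UD_Hebrew:

    # Sketch only: exercises the he_combined path added in this patch.
    # Assumes UD_Hebrew-IAHLTwiki sits under paths["UDBASE"] and that
    # git@github.com:IAHLT/UD_Hebrew.git has been cloned to paths["UDBASE_GIT"]/UD_Hebrew.
    from stanza.utils.default_paths import get_default_paths
    from stanza.utils.datasets.prepare_tokenizer_treebank import build_combined_dataset

    paths = get_default_paths()
    # writes he_combined.{train,dev,test}.gold.conllu into paths["TOKENIZE_DATA_DIR"]
    build_combined_dataset(paths, "he_combined", augment=True)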
diff --git a/stanza/models/pos/xpos_vocab_factory.py b/stanza/models/pos/xpos_vocab_factory.py
index 7308d2f6..2738524d 100644
--- a/stanza/models/pos/xpos_vocab_factory.py
+++ b/stanza/models/pos/xpos_vocab_factory.py
@@ -83,6 +83,7 @@ XPOS_DESCRIPTIONS = {
'got_proiel' : XPOSDescription(XPOSType.WORD, None),
'grc_proiel' : XPOSDescription(XPOSType.WORD, None),
'hbo_ptnk' : XPOSDescription(XPOSType.WORD, None),
+ 'he_combined' : XPOSDescription(XPOSType.WORD, None),
'he_htb' : XPOSDescription(XPOSType.WORD, None),
'he_iahltwiki' : XPOSDescription(XPOSType.WORD, None),
'hi_hdtb' : XPOSDescription(XPOSType.WORD, None),
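
The one-line change above simply registers the new short name with the same whole-tag (WORD) xpos handling already used for he_htb and he_iahltwiki. A quick illustrative check, not part of the patch:

    # sketch: confirm the new short name is registered in the xpos factory table
    from stanza.models.pos.xpos_vocab_factory import XPOS_DESCRIPTIONS
    assert 'he_combined' in XPOS_DESCRIPTIONS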
diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
index 3e136e47..30f4a93b 100755
--- a/stanza/utils/datasets/prepare_tokenizer_treebank.py
+++ b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -802,7 +802,8 @@ def build_combined_korean(udbase_dir, tokenizer_dir, short_name):
output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu)
-def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_combined_italian_dataset(paths, dataset):
+ udbase_dir = paths["UDBASE"]
if dataset == 'train':
# could maybe add ParTUT, but that dataset has a slightly different xpos set
# (no DE or I)
@@ -826,10 +827,11 @@ def check_gum_ready(udbase_dir):
if common.mostly_underscores(gum_conllu):
raise ValueError("Cannot process UD_English-GUMReddit in its current form. There should be a download script available in the directory which will help integrate the missing proprietary values. Please run that script to update the data, then try again.")
-def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_combined_english_dataset(paths, dataset):
"""
en_combined is currently EWT, GUM, PUD, Pronouns, and handparsed
"""
+ udbase_dir = paths["UDBASE"]
check_gum_ready(udbase_dir)
if dataset == 'train':
@@ -842,10 +844,14 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
sents = []
for treebank in train_treebanks:
conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
- sents.extend(read_sentences_from_conllu(conllu_file))
+ new_sents = read_sentences_from_conllu(conllu_file)
+ print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+ sents.extend(new_sents)
for treebank in test_treebanks:
conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu", fail=True)
- sents.extend(read_sentences_from_conllu(conllu_file))
+ new_sents = read_sentences_from_conllu(conllu_file)
+ print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+ sents.extend(new_sents)
else:
ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
sents = read_sentences_from_conllu(ewt_conllu)
@@ -853,19 +859,21 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
sents = strip_mwt_from_sentences(sents)
return sents
-def build_extra_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_extra_combined_english_dataset(paths, dataset):
"""
Extra sentences we don't want augmented
"""
+ handparsed_dir = paths["HANDPARSED_DIR"]
sents = []
if dataset == 'train':
sents.extend(read_sentences_from_conllu(os.path.join(handparsed_dir, "english-handparsed", "english.conll")))
return sents
-def build_extra_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_extra_combined_italian_dataset(paths, dataset):
"""
Extra data - the MWT data for Italian
"""
+ handparsed_dir = paths["HANDPARSED_DIR"]
if dataset != 'train':
return []
@@ -907,13 +915,16 @@ def replace_semicolons(sentences):
print("Updated %d sentences to replace sentence-final ; with ." % count)
return new_sents
-def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_combined_spanish_dataset(paths, dataset):
"""
es_combined is AnCora and GSD put together
TODO: remove features which aren't shared between datasets
TODO: consider mixing in PUD?
"""
+ udbase_dir = paths["UDBASE"]
+ tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
+ handparsed_dir = paths["HANDPARSED_DIR"]
if dataset == 'train':
treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
sents = []
@@ -936,9 +947,42 @@ def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
return sents
+def build_combined_hebrew_dataset(paths, dataset):
+ """
+ Combines the IAHLT treebank with an updated form of HTB where the annotation style more closely matches IAHLT
+
+ Currently the updated HTB is not in UD, so you will need to clone
+ git@github.com:IAHLT/UD_Hebrew.git to $UDBASE_GIT
+
+ dev and test sets will be those from IAHLT
+ """
+ udbase_dir = paths["UDBASE"]
+ udbase_git_dir = paths["UDBASE_GIT"]
+
+ treebanks = ["UD_Hebrew-IAHLTwiki"]
+ if dataset == 'train':
+ sents = []
+ for treebank in treebanks:
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
+ new_sents = read_sentences_from_conllu(conllu_file)
+ print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+ sents.extend(new_sents)
+
+ # if/when this gets ported back to UD, switch to getting both datasets from UD
+ conllu_file = os.path.join(udbase_git_dir, "UD_Hebrew", "he_htb-ud-train.conllu")
+ new_sents = read_sentences_from_conllu(conllu_file)
+ print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+ sents.extend(new_sents)
+ else:
+ conllu_file = common.find_treebank_dataset_file(treebanks[0], udbase_dir, dataset, "conllu", fail=True)
+ sents = read_sentences_from_conllu(conllu_file)
+
+ return sents
+
COMBINED_FNS = {
"en_combined": build_combined_english_dataset,
"es_combined": build_combined_spanish_dataset,
+ "he_combined": build_combined_hebrew_dataset,
"it_combined": build_combined_italian_dataset,
}
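
Because build_combined_hebrew_dataset reads the HTB training split from outside UDBASE, a pre-flight check along these lines can catch a missing clone early (illustrative sketch, not part of the patch):

    # sketch: verify the IAHLT fork of HTB is cloned where the new code expects it
    import os
    from stanza.utils.default_paths import get_default_paths

    paths = get_default_paths()
    htb_train = os.path.join(paths["UDBASE_GIT"], "UD_Hebrew", "he_htb-ud-train.conllu")
    if not os.path.exists(htb_train):
        raise FileNotFoundError("Clone git@github.com:IAHLT/UD_Hebrew.git into %s first" % paths["UDBASE_GIT"])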
@@ -948,17 +992,18 @@ COMBINED_EXTRA_FNS = {
"it_combined": build_extra_combined_italian_dataset,
}
-def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, augment):
+def build_combined_dataset(paths, short_name, augment):
random.seed(1234)
+ tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
build_fn = COMBINED_FNS[short_name]
extra_fn = COMBINED_EXTRA_FNS.get(short_name, None)
for dataset in ("train", "dev", "test"):
output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
- sents = build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+ sents = build_fn(paths, dataset)
if dataset == 'train' and augment:
sents = augment_punct(sents)
if extra_fn is not None:
- sents.extend(extra_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset))
+ sents.extend(extra_fn(paths, dataset))
write_sentences_to_conllu(output_conllu, sents)
BIO_DATASETS = ("en_craft", "en_genia", "en_mimic")
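
With this refactor, every entry in COMBINED_FNS shares the same (paths, dataset) signature, so the driver reduces to a table lookup. Roughly, for the new Hebrew entry (illustration only, not part of the patch):

    # illustration of the dispatch build_combined_dataset now performs for he_combined
    from stanza.utils.default_paths import get_default_paths
    from stanza.utils.datasets.prepare_tokenizer_treebank import COMBINED_FNS, COMBINED_EXTRA_FNS

    paths = get_default_paths()
    build_fn = COMBINED_FNS["he_combined"]             # -> build_combined_hebrew_dataset
    extra_fn = COMBINED_EXTRA_FNS.get("he_combined")   # -> None: no extra hand-parsed sentences for Hebrew
    train_sents = build_fn(paths, "train")             # IAHLTwiki train + HTB fork train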
@@ -975,7 +1020,7 @@ def build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_na
for dataset in ("train", "dev", "test"):
output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
if dataset == 'train':
- sents = build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+ sents = build_combined_english_dataset(paths, dataset)
if dataset == 'train' and augment:
sents = augment_punct(sents)
else:
@@ -1122,7 +1167,7 @@ def process_treebank(treebank, paths, args):
elif short_name.startswith("ko_combined"):
build_combined_korean(udbase_dir, tokenizer_dir, short_name)
elif short_name in COMBINED_FNS: # eg "it_combined", "en_combined", etc
- build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
+ build_combined_dataset(paths, short_name, args.augment)
elif short_name in BIO_DATASETS:
build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
elif short_name.startswith("en_gum"):
diff --git a/stanza/utils/default_paths.py b/stanza/utils/default_paths.py
index 143ce893..26def1ed 100644
--- a/stanza/utils/default_paths.py
+++ b/stanza/utils/default_paths.py
@@ -27,6 +27,7 @@ def get_default_paths():
# TODO: not sure what other people actually have
# TODO: also, could make this automatically update to the latest
"UDBASE": "extern_data/ud2/ud-treebanks-v2.10",
+ "UDBASE_GIT": "extern_data/ud2/git",
"NERBASE": "extern_data/ner",
"CONSTITUENCY_BASE": "extern_data/constituency",