author     John Bauer <horatio@gmail.com>   2022-09-01 01:38:18 +0300
committer  John Bauer <horatio@gmail.com>   2022-09-01 05:32:41 +0300
commit     e4fcf003feb984f535371fb91c9e380dd187fd12
tree       f220a726d4ffdc6fee15867feaac72e504069282
parent     5e3ab62f0bbbc005f08d611657b42b61457ad01f
Add the capacity to build he_combined models from UD_Hebrew-IAHLTwiki and a fork of HTB. Addresses #1109 (branch: hebrew_combined)
-rw-r--r--   stanza/models/pos/xpos_vocab_factory.py              |  1
-rwxr-xr-x   stanza/utils/datasets/prepare_tokenizer_treebank.py  | 69
-rw-r--r--   stanza/utils/default_paths.py                        |  1
3 files changed, 59 insertions, 12 deletions
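As the new docstring in the diff below notes, build_combined_hebrew_dataset expects a clone of the IAHLT fork of HTB under UDBASE_GIT. A minimal setup sketch: the clone URL and directory layout come from this commit, while the final module invocation is an assumption about how the preparation driver is typically run, not a documented recipe.

```python
# Sketch: fetch the HTB fork that the he_combined builder expects, then run
# the treebank preparation.  The clone URL and UDBASE_GIT layout come from
# this commit; the driver invocation at the end is an assumption.
import os
import subprocess
import sys

from stanza.utils.default_paths import get_default_paths

paths = get_default_paths()
udbase_git = paths["UDBASE_GIT"]              # "extern_data/ud2/git" by default
os.makedirs(udbase_git, exist_ok=True)

repo = os.path.join(udbase_git, "UD_Hebrew")  # HTB fork matching IAHLT style
if not os.path.exists(repo):
    subprocess.run(["git", "clone", "git@github.com:IAHLT/UD_Hebrew.git", repo],
                   check=True)

# Assumed argument form for the preparation driver
subprocess.run([sys.executable, "-m",
                "stanza.utils.datasets.prepare_tokenizer_treebank",
                "he_combined"], check=True)
```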
diff --git a/stanza/models/pos/xpos_vocab_factory.py b/stanza/models/pos/xpos_vocab_factory.py
index 7308d2f6..2738524d 100644
--- a/stanza/models/pos/xpos_vocab_factory.py
+++ b/stanza/models/pos/xpos_vocab_factory.py
@@ -83,6 +83,7 @@ XPOS_DESCRIPTIONS = {
     'got_proiel' : XPOSDescription(XPOSType.WORD, None),
     'grc_proiel' : XPOSDescription(XPOSType.WORD, None),
     'hbo_ptnk' : XPOSDescription(XPOSType.WORD, None),
+    'he_combined' : XPOSDescription(XPOSType.WORD, None),
     'he_htb' : XPOSDescription(XPOSType.WORD, None),
     'he_iahltwiki' : XPOSDescription(XPOSType.WORD, None),
     'hi_hdtb' : XPOSDescription(XPOSType.WORD, None),
diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
index 3e136e47..30f4a93b 100755
--- a/stanza/utils/datasets/prepare_tokenizer_treebank.py
+++ b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -802,7 +802,8 @@ def build_combined_korean(udbase_dir, tokenizer_dir, short_name):
         output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
         build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu)

-def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_combined_italian_dataset(paths, dataset):
+    udbase_dir = paths["UDBASE"]
     if dataset == 'train':
         # could maybe add ParTUT, but that dataset has a slightly different xpos set
         # (no DE or I)
@@ -826,10 +827,11 @@ def check_gum_ready(udbase_dir):
     if common.mostly_underscores(gum_conllu):
         raise ValueError("Cannot process UD_English-GUMReddit in its current form. There should be a download script available in the directory which will help integrate the missing proprietary values. Please run that script to update the data, then try again.")

-def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_combined_english_dataset(paths, dataset):
     """
     en_combined is currently EWT, GUM, PUD, Pronouns, and handparsed
     """
+    udbase_dir = paths["UDBASE"]
     check_gum_ready(udbase_dir)

     if dataset == 'train':
@@ -842,10 +844,14 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
         sents = []
         for treebank in train_treebanks:
             conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
-            sents.extend(read_sentences_from_conllu(conllu_file))
+            new_sents = read_sentences_from_conllu(conllu_file)
+            print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+            sents.extend(new_sents)
         for treebank in test_treebanks:
             conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu", fail=True)
-            sents.extend(read_sentences_from_conllu(conllu_file))
+            new_sents = read_sentences_from_conllu(conllu_file)
+            print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+            sents.extend(new_sents)
     else:
         ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
         sents = read_sentences_from_conllu(ewt_conllu)
@@ -853,19 +859,21 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
     sents = strip_mwt_from_sentences(sents)
     return sents

-def build_extra_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_extra_combined_english_dataset(paths, dataset):
     """
     Extra sentences we don't want augmented
     """
+    handparsed_dir = paths["HANDPARSED_DIR"]
     sents = []
     if dataset == 'train':
         sents.extend(read_sentences_from_conllu(os.path.join(handparsed_dir, "english-handparsed", "english.conll")))

     return sents

-def build_extra_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_extra_combined_italian_dataset(paths, dataset):
     """
     Extra data - the MWT data for Italian
     """
+    handparsed_dir = paths["HANDPARSED_DIR"]
     if dataset != 'train':
         return []
@@ -907,13 +915,16 @@ def replace_semicolons(sentences):
     print("Updated %d sentences to replace sentence-final ; with ." % count)
     return new_sents

-def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_combined_spanish_dataset(paths, dataset):
     """
     es_combined is AnCora and GSD put together

     TODO: remove features which aren't shared between datasets
     TODO: consider mixing in PUD?
     """
+    udbase_dir = paths["UDBASE"]
+    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
+    handparsed_dir = paths["HANDPARSED_DIR"]
     if dataset == 'train':
         treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
         sents = []
@@ -936,9 +947,42 @@ def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh

     return sents

+def build_combined_hebrew_dataset(paths, dataset):
+    """
+    Combines the IAHLT treebank with an updated form of HTB where the annotation style more closely matches IAHLT
+
+    Currently the updated HTB is not in UD, so you will need to clone
+    git@github.com:IAHLT/UD_Hebrew.git to $UDBASE_GIT
+
+    dev and test sets will be those from IAHLT
+    """
+    udbase_dir = paths["UDBASE"]
+    udbase_git_dir = paths["UDBASE_GIT"]
+
+    treebanks = ["UD_Hebrew-IAHLTwiki"]
+    if dataset == 'train':
+        sents = []
+        for treebank in treebanks:
+            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
+            new_sents = read_sentences_from_conllu(conllu_file)
+            print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+            sents.extend(new_sents)
+
+        # if/when this gets ported back to UD, switch to getting both datasets from UD
+        conllu_file = os.path.join(udbase_git_dir, "UD_Hebrew", "he_htb-ud-train.conllu")
+        new_sents = read_sentences_from_conllu(conllu_file)
+        print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+        sents.extend(new_sents)
+    else:
+        conllu_file = common.find_treebank_dataset_file(treebanks[0], udbase_dir, dataset, "conllu", fail=True)
+        sents = read_sentences_from_conllu(conllu_file)
+
+    return sents
+
 COMBINED_FNS = {
     "en_combined": build_combined_english_dataset,
     "es_combined": build_combined_spanish_dataset,
+    "he_combined": build_combined_hebrew_dataset,
     "it_combined": build_combined_italian_dataset,
 }

@@ -948,17 +992,18 @@ COMBINED_EXTRA_FNS = {
     "it_combined": build_extra_combined_italian_dataset,
 }

-def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, augment):
+def build_combined_dataset(paths, short_name, augment):
     random.seed(1234)
+    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
     build_fn = COMBINED_FNS[short_name]
     extra_fn = COMBINED_EXTRA_FNS.get(short_name, None)
     for dataset in ("train", "dev", "test"):
         output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-        sents = build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+        sents = build_fn(paths, dataset)
         if dataset == 'train' and augment:
             sents = augment_punct(sents)
         if extra_fn is not None:
-            sents.extend(extra_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset))
+            sents.extend(extra_fn(paths, dataset))
         write_sentences_to_conllu(output_conllu, sents)

 BIO_DATASETS = ("en_craft", "en_genia", "en_mimic")
@@ -975,7 +1020,7 @@ def build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_na
     for dataset in ("train", "dev", "test"):
         output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
         if dataset == 'train':
-            sents = build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+            sents = build_combined_english_dataset(paths, dataset)
             if dataset == 'train' and augment:
                 sents = augment_punct(sents)
         else:
@@ -1122,7 +1167,7 @@ def process_treebank(treebank, paths, args):
     elif short_name.startswith("ko_combined"):
         build_combined_korean(udbase_dir, tokenizer_dir, short_name)
     elif short_name in COMBINED_FNS: # eg "it_combined", "en_combined", etc
-        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
+        build_combined_dataset(paths, short_name, args.augment)
     elif short_name in BIO_DATASETS:
         build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
     elif short_name.startswith("en_gum"):
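Beyond adding the Hebrew builder, the prepare_tokenizer_treebank.py changes are a signature refactor: every combined-dataset builder now takes a single paths dict plus the dataset split instead of five positional arguments, and unpacks only the directories it needs. A self-contained toy sketch of the pattern follows; the names mirror the diff, but the bodies and path values are stand-ins, not stanza code.

```python
# Toy illustration of the refactor above: builders share a uniform
# (paths, dataset) signature, so the driver dispatches through COMBINED_FNS
# without threading udbase_dir/tokenizer_dir/handparsed_dir/short_name around.

def build_combined_hebrew_dataset(paths, dataset):
    udbase_dir = paths["UDBASE"]            # each builder takes what it uses
    udbase_git_dir = paths["UDBASE_GIT"]
    return ["(%s sentences from %s and %s)" % (dataset, udbase_dir, udbase_git_dir)]

COMBINED_FNS = {
    "he_combined": build_combined_hebrew_dataset,
}

def build_combined_dataset(paths, short_name):
    build_fn = COMBINED_FNS[short_name]
    for dataset in ("train", "dev", "test"):
        sents = build_fn(paths, dataset)    # uniform call site for all builders
        print(dataset, sents)

build_combined_dataset({"UDBASE": "extern_data/ud2/ud-treebanks-v2.10",
                        "UDBASE_GIT": "extern_data/ud2/git"},
                       "he_combined")
```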
diff --git a/stanza/utils/default_paths.py b/stanza/utils/default_paths.py
index 143ce893..26def1ed 100644
--- a/stanza/utils/default_paths.py
+++ b/stanza/utils/default_paths.py
@@ -27,6 +27,7 @@ def get_default_paths():
         # TODO: not sure what other people actually have
         # TODO: also, could make this automatically update to the latest
         "UDBASE": "extern_data/ud2/ud-treebanks-v2.10",
+        "UDBASE_GIT": "extern_data/ud2/git",
         "NERBASE": "extern_data/ner",
         "CONSTITUENCY_BASE": "extern_data/constituency",
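Since default_paths.py only supplies a default location for UDBASE_GIT, a quick pre-flight check can confirm the layout build_combined_hebrew_dataset expects before a long data-preparation run. The filename he_htb-ud-train.conllu comes from the diff above; everything else here is a minimal sketch under that assumption.

```python
# Minimal sanity check: confirm the HTB fork sits where the new Hebrew
# builder will look for its training file.
import os

from stanza.utils.default_paths import get_default_paths

paths = get_default_paths()
htb_train = os.path.join(paths["UDBASE_GIT"], "UD_Hebrew", "he_htb-ud-train.conllu")
if not os.path.exists(htb_train):
    raise FileNotFoundError(
        f"{htb_train} not found; clone git@github.com:IAHLT/UD_Hebrew.git "
        f"into {paths['UDBASE_GIT']} before building he_combined")
print("he_combined prerequisites look good:", htb_train)
```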