author     John Bauer <horatio@gmail.com>   2022-09-01 01:38:18 +0300
committer  John Bauer <horatio@gmail.com>   2022-09-01 05:32:41 +0300
commit     e4fcf003feb984f535371fb91c9e380dd187fd12
tree       f220a726d4ffdc6fee15867feaac72e504069282
parent     5e3ab62f0bbbc005f08d611657b42b61457ad01f
Add the capacity to build he_combined models from UD_Hebrew-IAHLTwiki and a fork of HTB. Addresses #1109 (branch: hebrew_combined)
-rw-r--r--   stanza/models/pos/xpos_vocab_factory.py              |  1
-rwxr-xr-x   stanza/utils/datasets/prepare_tokenizer_treebank.py  | 69
-rw-r--r--   stanza/utils/default_paths.py                        |  1
3 files changed, 59 insertions, 12 deletions
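As the new docstring in the diff below notes, build_combined_hebrew_dataset expects a clone of the IAHLT fork of HTB under UDBASE_GIT. A minimal setup sketch: the clone URL and directory layout come from this commit, while the final module invocation is an assumption about how the preparation driver is typically run, not a documented recipe.

```python
# Sketch: fetch the HTB fork that the he_combined builder expects, then run
# the treebank preparation.  The clone URL and UDBASE_GIT layout come from
# this commit; the driver invocation at the end is an assumption.
import os
import subprocess
import sys

from stanza.utils.default_paths import get_default_paths

paths = get_default_paths()
udbase_git = paths["UDBASE_GIT"]              # "extern_data/ud2/git" by default
os.makedirs(udbase_git, exist_ok=True)

repo = os.path.join(udbase_git, "UD_Hebrew")  # HTB fork matching IAHLT style
if not os.path.exists(repo):
    subprocess.run(["git", "clone", "git@github.com:IAHLT/UD_Hebrew.git", repo],
                   check=True)

# Assumed argument form for the preparation driver
subprocess.run([sys.executable, "-m",
                "stanza.utils.datasets.prepare_tokenizer_treebank",
                "he_combined"], check=True)
```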
diff --git a/stanza/models/pos/xpos_vocab_factory.py b/stanza/models/pos/xpos_vocab_factory.py
index 7308d2f6..2738524d 100644
--- a/stanza/models/pos/xpos_vocab_factory.py
+++ b/stanza/models/pos/xpos_vocab_factory.py
@@ -83,6 +83,7 @@ XPOS_DESCRIPTIONS = {
     'got_proiel' : XPOSDescription(XPOSType.WORD, None),
     'grc_proiel' : XPOSDescription(XPOSType.WORD, None),
     'hbo_ptnk' : XPOSDescription(XPOSType.WORD, None),
+    'he_combined' : XPOSDescription(XPOSType.WORD, None),
     'he_htb' : XPOSDescription(XPOSType.WORD, None),
     'he_iahltwiki' : XPOSDescription(XPOSType.WORD, None),
     'hi_hdtb' : XPOSDescription(XPOSType.WORD, None),
diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
index 3e136e47..30f4a93b 100755
--- a/stanza/utils/datasets/prepare_tokenizer_treebank.py
+++ b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -802,7 +802,8 @@ def build_combined_korean(udbase_dir, tokenizer_dir, short_name):
         output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
         build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu)

-def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_combined_italian_dataset(paths, dataset):
+    udbase_dir = paths["UDBASE"]
     if dataset == 'train':
         # could maybe add ParTUT, but that dataset has a slightly different xpos set
         # (no DE or I)
@@ -826,10 +827,11 @@ def check_gum_ready(udbase_dir):
     if common.mostly_underscores(gum_conllu):
         raise ValueError("Cannot process UD_English-GUMReddit in its current form. There should be a download script available in the directory which will help integrate the missing proprietary values. Please run that script to update the data, then try again.")

-def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_combined_english_dataset(paths, dataset):
     """
     en_combined is currently EWT, GUM, PUD, Pronouns, and handparsed
     """
+    udbase_dir = paths["UDBASE"]
     check_gum_ready(udbase_dir)

     if dataset == 'train':
@@ -842,10 +844,14 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
         sents = []
         for treebank in train_treebanks:
             conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
-            sents.extend(read_sentences_from_conllu(conllu_file))
+            new_sents = read_sentences_from_conllu(conllu_file)
+            print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+            sents.extend(new_sents)
         for treebank in test_treebanks:
             conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu", fail=True)
-            sents.extend(read_sentences_from_conllu(conllu_file))
+            new_sents = read_sentences_from_conllu(conllu_file)
+            print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+            sents.extend(new_sents)
     else:
         ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
         sents = read_sentences_from_conllu(ewt_conllu)
@@ -853,19 +859,21 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
     sents = strip_mwt_from_sentences(sents)
     return sents

-def build_extra_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_extra_combined_english_dataset(paths, dataset):
     """
     Extra sentences we don't want augmented
     """
+    handparsed_dir = paths["HANDPARSED_DIR"]
     sents = []
     if dataset == 'train':
         sents.extend(read_sentences_from_conllu(os.path.join(handparsed_dir, "english-handparsed", "english.conll")))

     return sents

-def build_extra_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_extra_combined_italian_dataset(paths, dataset):
     """
     Extra data - the MWT data for Italian
     """
+    handparsed_dir = paths["HANDPARSED_DIR"]
     if dataset != 'train':
         return []
@@ -907,13 +915,16 @@ def replace_semicolons(sentences):
     print("Updated %d sentences to replace sentence-final ; with ." % count)
     return new_sents

-def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+def build_combined_spanish_dataset(paths, dataset):
     """
     es_combined is AnCora and GSD put together

     TODO: remove features which aren't shared between datasets
     TODO: consider mixing in PUD?
     """
+    udbase_dir = paths["UDBASE"]
+    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
+    handparsed_dir = paths["HANDPARSED_DIR"]
     if dataset == 'train':
         treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
         sents = []
@@ -936,9 +947,42 @@ def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh

     return sents

+def build_combined_hebrew_dataset(paths, dataset):
+    """
+    Combines the IAHLT treebank with an updated form of HTB where the annotation style more closely matches IAHLT
+
+    Currently the updated HTB is not in UD, so you will need to clone
+    git@github.com:IAHLT/UD_Hebrew.git to $UDBASE_GIT
+
+    dev and test sets will be those from IAHLT
+    """
+    udbase_dir = paths["UDBASE"]
+    udbase_git_dir = paths["UDBASE_GIT"]
+
+    treebanks = ["UD_Hebrew-IAHLTwiki"]
+    if dataset == 'train':
+        sents = []
+        for treebank in treebanks:
+            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
+            new_sents = read_sentences_from_conllu(conllu_file)
+            print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+            sents.extend(new_sents)
+
+        # if/when this gets ported back to UD, switch to getting both datasets from UD
+        conllu_file = os.path.join(udbase_git_dir, "UD_Hebrew", "he_htb-ud-train.conllu")
+        new_sents = read_sentences_from_conllu(conllu_file)
+        print("Read %d sentences from %s" % (len(new_sents), conllu_file))
+        sents.extend(new_sents)
+    else:
+        conllu_file = common.find_treebank_dataset_file(treebanks[0], udbase_dir, dataset, "conllu", fail=True)
+        sents = read_sentences_from_conllu(conllu_file)
+
+    return sents
+
 COMBINED_FNS = {
     "en_combined": build_combined_english_dataset,
     "es_combined": build_combined_spanish_dataset,
+    "he_combined": build_combined_hebrew_dataset,
     "it_combined": build_combined_italian_dataset,
 }

@@ -948,17 +992,18 @@ COMBINED_EXTRA_FNS = {
     "it_combined": build_extra_combined_italian_dataset,
 }

-def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, augment):
+def build_combined_dataset(paths, short_name, augment):
     random.seed(1234)
+    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
     build_fn = COMBINED_FNS[short_name]
     extra_fn = COMBINED_EXTRA_FNS.get(short_name, None)
     for dataset in ("train", "dev", "test"):
         output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-        sents = build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+        sents = build_fn(paths, dataset)
         if dataset == 'train' and augment:
             sents = augment_punct(sents)
         if extra_fn is not None:
-            sents.extend(extra_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset))
+            sents.extend(extra_fn(paths, dataset))
         write_sentences_to_conllu(output_conllu, sents)

 BIO_DATASETS = ("en_craft", "en_genia", "en_mimic")
@@ -975,7 +1020,7 @@ def build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_na
     for dataset in ("train", "dev", "test"):
         output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
         if dataset == 'train':
-            sents = build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+            sents = build_combined_english_dataset(paths, dataset)
             if dataset == 'train' and augment:
                 sents = augment_punct(sents)
         else:
@@ -1122,7 +1167,7 @@ def process_treebank(treebank, paths, args):
     elif short_name.startswith("ko_combined"):
         build_combined_korean(udbase_dir, tokenizer_dir, short_name)
     elif short_name in COMBINED_FNS: # eg "it_combined", "en_combined", etc
-        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
+        build_combined_dataset(paths, short_name, args.augment)
     elif short_name in BIO_DATASETS:
         build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
     elif short_name.startswith("en_gum"):
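Beyond adding the Hebrew builder, the prepare_tokenizer_treebank.py changes are a signature refactor: every combined-dataset builder now takes a single paths dict plus the dataset split instead of five positional arguments, and unpacks only the directories it needs. A self-contained toy sketch of the pattern follows; the names mirror the diff, but the bodies and path values are stand-ins, not stanza code.

```python
# Toy illustration of the refactor above: builders share a uniform
# (paths, dataset) signature, so the driver dispatches through COMBINED_FNS
# without threading udbase_dir/tokenizer_dir/handparsed_dir/short_name around.

def build_combined_hebrew_dataset(paths, dataset):
    udbase_dir = paths["UDBASE"]            # each builder takes what it uses
    udbase_git_dir = paths["UDBASE_GIT"]
    return ["(%s sentences from %s and %s)" % (dataset, udbase_dir, udbase_git_dir)]

COMBINED_FNS = {
    "he_combined": build_combined_hebrew_dataset,
}

def build_combined_dataset(paths, short_name):
    build_fn = COMBINED_FNS[short_name]
    for dataset in ("train", "dev", "test"):
        sents = build_fn(paths, dataset)    # uniform call site for all builders
        print(dataset, sents)

build_combined_dataset({"UDBASE": "extern_data/ud2/ud-treebanks-v2.10",
                        "UDBASE_GIT": "extern_data/ud2/git"},
                       "he_combined")
```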
diff --git a/stanza/utils/default_paths.py b/stanza/utils/default_paths.py
index 143ce893..26def1ed 100644
--- a/stanza/utils/default_paths.py
+++ b/stanza/utils/default_paths.py
@@ -27,6 +27,7 @@ def get_default_paths():
         # TODO: not sure what other people actually have
         # TODO: also, could make this automatically update to the latest
         "UDBASE": "extern_data/ud2/ud-treebanks-v2.10",
+        "UDBASE_GIT": "extern_data/ud2/git",
         "NERBASE": "extern_data/ner",
         "CONSTITUENCY_BASE": "extern_data/constituency",
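Since default_paths.py only supplies a default location for UDBASE_GIT, a quick pre-flight check can confirm the layout build_combined_hebrew_dataset expects before a long data-preparation run. The filename he_htb-ud-train.conllu comes from the diff above; everything else here is a minimal sketch under that assumption.

```python
# Minimal sanity check: confirm the HTB fork sits where the new Hebrew
# builder will look for its training file.
import os

from stanza.utils.default_paths import get_default_paths

paths = get_default_paths()
htb_train = os.path.join(paths["UDBASE_GIT"], "UD_Hebrew", "he_htb-ud-train.conllu")
if not os.path.exists(htb_train):
    raise FileNotFoundError(
        f"{htb_train} not found; clone git@github.com:IAHLT/UD_Hebrew.git "
        f"into {paths['UDBASE_GIT']} before building he_combined")
print("he_combined prerequisites look good:", htb_train)
```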