author     John Bauer <horatio@gmail.com>    2021-06-08 22:46:50 +0300
committer  GitHub <noreply@github.com>       2021-06-08 22:46:50 +0300
commit     ca0ff9c40609891da2ad5ef4765369cbd00ef5d5 (patch)
tree       e283f72143df199be15c36c2868acdbbd64d00ce
parent     405e516dee460c57efaa9b20c0728f7e0b6d0d61 (diff)
parent     0ac918983bb2210230ab2a3c229711c1ab22903e (diff)
Merge pull request #718 from stanfordnlp/ud28
Ud28
-rwxr-xr-x  scripts/download_vectors.sh                         | 14
-rwxr-xr-x  scripts/treebank_to_shorthand.sh                    |  6
-rw-r--r--  stanza/_version.py                                  |  4
-rw-r--r--  stanza/models/charlm.py                             | 35
-rw-r--r--  stanza/models/common/constant.py                    | 15
-rw-r--r--  stanza/models/common/short_name_to_treebank.py      | 23
-rw-r--r--  stanza/models/pos/xpos_vocab_factory.py             |  4
-rw-r--r--  stanza/resources/common.py                          | 12
-rw-r--r--  stanza/resources/prepare_resources.py               | 12
-rw-r--r--  stanza/tests/test_installation.py                   |  2
-rw-r--r--  stanza/utils/datasets/prepare_lemma_treebank.py     | 37
-rwxr-xr-x  stanza/utils/datasets/prepare_tokenizer_treebank.py | 58
-rw-r--r--  stanza/utils/datasets/preprocess_ssj_data.py        | 49
-rw-r--r--  stanza/utils/training/run_lemma.py                  | 28
14 files changed, 148 insertions, 151 deletions
diff --git a/scripts/download_vectors.sh b/scripts/download_vectors.sh
index 960f57ef..1705fdfe 100755
--- a/scripts/download_vectors.sh
+++ b/scripts/download_vectors.sh
@@ -20,9 +20,17 @@ FASTTEXT_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"
 # Welsh, Icelandic, Thai, Sanskrit
 # https://fasttext.cc/docs/en/crawl-vectors.html
-declare -a FASTTEXT_LANG=("Afrikaans" "Armenian" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
-declare -a FASTTEXT_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
-declare -a LOCAL_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
+# We get the Armenian word vectors from here:
+# https://github.com/ispras-texterra/word-embeddings-eval-hy
+# https://arxiv.org/ftp/arxiv/papers/1906/1906.03134.pdf
+# In particular, the glove model (dogfooding):
+# https://at.ispras.ru/owncloud/index.php/s/pUUiS1l1jGKNax3/download
+# These vectors improved F1 by about 1 on various tasks for Armenian
+# and had much better coverage of Western Armenian
+
+declare -a FASTTEXT_LANG=("Afrikaans" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
+declare -a FASTTEXT_CODE=("af" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
+declare -a LOCAL_CODE=("af" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
 
 color_green='\033[32;1m'
 color_clear='\033[0m' # No Color
diff --git a/scripts/treebank_to_shorthand.sh b/scripts/treebank_to_shorthand.sh
index 1d15b877..bb6f1793 100755
--- a/scripts/treebank_to_shorthand.sh
+++ b/scripts/treebank_to_shorthand.sh
@@ -7,9 +7,9 @@
 # Please keep synced with
 # stanza/models/common/constant.py
-declare -A lang2lcode=( ["Afrikaans"]="af" ["Akkadian"]="akk" ["Akuntsu"]="aqz" ["Albanian"]="sq" ["Amharic"]="am" ["Apurina"]="apu" ["Armenian"]="hy" ["Arabic"]="ar" ["Assyrian"]="aii" ["Bambara"]="bm" ["Bengali"]="bn" ["Bhojpuri"]="bho" ["Breton"]="br" ["Bulgarian"]="bg" ["Buryat"]="bxr" ["Cantonese"]="yue" ["Catalan"]="ca" ["Chukchi"]="ckt" ["Czech"]="cs" ["Old_Church_Slavonic"]="cu" ["Danish"]="da" ["German"]="de" ["Greek"]="el" ["English"]="en" ["Spanish"]="es" ["Estonian"]="et" ["Basque"]="eu" ["Persian"]="fa" ["Faroese"]="fo" ["Finnish"]="fi" ["French"]="fr" ["Irish"]="ga" ["Galician"]="gl" ["Gothic"]="got" ["Ancient_Greek"]="grc" ["Mbya_Guarani"]="gun" ["Hebrew"]="he" ["Hindi"]="hi" ["Hindi_English"]="qhe" ["Croatian"]="hr"
-["Hungarian"]="hu" ["Icelandic"]="is" ["Indonesian"]="id" ["Italian"]="it" ["Japanese"]="ja" ["Karelian"]="krl" ["Kazakh"]="kk" ["Khunsari"]="kfm" ["Komi_Permyak"]="koi" ["Komi_Zyrian"]="kpv" ["Korean"]="ko" ["Kurmanji"]="kmr" ["Latin"]="la" ["Latvian"]="lv" ["Malayalam"]="mal" ["Manx"]="gv" ["Moksha"]="mdf" ["Munduruku"]="myu" ["Erzya"]="myv" ["Nayini"]="nyq" ["Dutch"]="nl" ["Norwegian_Bokmaal"]="nb" ["Norwegian_Nynorsk"]="nn" ["Polish"]="pl" ["Portuguese"]="pt" ["Romanian"]="ro" ["Russian"]="ru" ["Sanskrit"]="sa" ["Slovak"]="sk" ["Slovenian"]="sl" ["Soi"]="soj" ["South_Levantine_Arabic"]="ajp" ["Swedish"]="sv" ["Swiss_German"]="gsw" ["Tagalog"]="tl" ["Turkish"]="tr" ["Turkish_German"]="qtd" ["Uyghur"]="ug" ["Ukrainian"]="uk" ["Urdu"]="ur" ["Vietnamese"]="vi" ["Traditional_Chinese"]="zh-hant" ["Welsh"]="cy" ["Altaic"]="bxr" ["Indo_Iranian"]="kmr" ["Uralic"]="sme" ["Slavic"]="hsb"
-["Naija"]="pcm" ["North_Sami"]="sme" ["Old_French"]="fro" ["Old_Turkish"]="otk" ["Serbian"]="sr" ["Skolt_Sami"]="sms" ["Thai"]="th" ["Tupinamba"]="tpn" ["Upper_Sorbian"]="hsb" ["Belarusian"]="be" ["Classical_Chinese"]="lzh" ["Coptic"]="cop" ["Lithuanian"]="lt" ["Livvi"]="olo" ["Maltese"]="mt" ["Marathi"]="mr" ["Old_Russian"]="orv" ["Scottish_Gaelic"]="gd" ["Simplified_Chinese"]="zh-hans" ["Swedish_Sign_Language"]="swl" ["Tamil"]="ta" ["Telugu"]="te" ["Warlpipi"]="wbp" ["Wolof"]="wo" ["Yoruba"]="yo" )
+declare -A lang2lcode=( ["Afrikaans"]="af" ["Akkadian"]="akk" ["Akuntsu"]="aqz" ["Albanian"]="sq" ["Amharic"]="am" ["Apurina"]="apu" ["Armenian"]="hy" ["Arabic"]="ar" ["Assyrian"]="aii" ["Bambara"]="bm" ["Beja"]="bej" ["Bengali"]="bn" ["Bhojpuri"]="bho" ["Breton"]="br" ["Bulgarian"]="bg" ["Buryat"]="bxr" ["Cantonese"]="yue" ["Catalan"]="ca" ["Chukchi"]="ckt" ["Czech"]="cs" ["Old_Church_Slavonic"]="cu" ["Danish"]="da" ["German"]="de" ["Greek"]="el" ["English"]="en" ["Spanish"]="es" ["Estonian"]="et" ["Basque"]="eu" ["Persian"]="fa" ["Faroese"]="fo" ["Finnish"]="fi" ["French"]="fr" ["Frisian_Dutch"]="qfn" ["Guajajara"]="gub" ["Irish"]="ga" ["Galician"]="gl" ["Gothic"]="got" ["Ancient_Greek"]="grc" ["Mbya_Guarani"]="gun" ["Hebrew"]="he" ["Hindi"]="hi" ["Hindi_English"]="qhe" ["Croatian"]="hr"
+["Hungarian"]="hu" ["Icelandic"]="is" ["Indonesian"]="id" ["Italian"]="it" ["Japanese"]="ja" ["Kaapor"]="urb" ["Kangri"]="xnr" ["Karelian"]="krl" ["Kazakh"]="kk" ["Khunsari"]="kfm" ["Kiche"]="quc" ["Komi_Permyak"]="koi" ["Komi_Zyrian"]="kpv" ["Korean"]="ko" ["Kurmanji"]="kmr" ["Latin"]="la" ["Latvian"]="lv" ["Low_Saxon"]="nds" ["Makurap"]="mpu" ["Malayalam"]="mal" ["Manx"]="gv" ["Moksha"]="mdf" ["Munduruku"]="myu" ["Erzya"]="myv" ["Nayini"]="nyq" ["Dutch"]="nl" ["Norwegian_Bokmaal"]="nb" ["Norwegian_Nynorsk"]="nn" ["Polish"]="pl" ["Portuguese"]="pt" ["Romanian"]="ro" ["Russian"]="ru" ["Sanskrit"]="sa" ["Slovak"]="sk" ["Slovenian"]="sl" ["Soi"]="soj" ["South_Levantine_Arabic"]="ajp" ["Swedish"]="sv" ["Swiss_German"]="gsw" ["Tagalog"]="tl" ["Turkish"]="tr" ["Turkish_German"]="qtd" ["Uyghur"]="ug" ["Ukrainian"]="uk" ["Urdu"]="ur" ["Vietnamese"]="vi" ["Traditional_Chinese"]="zh-hant" ["Welsh"]="cy" ["Altaic"]="bxr" ["Indo_Iranian"]="kmr" ["Uralic"]="sme" ["Slavic"]="hsb"
+["Naija"]="pcm" ["North_Sami"]="sme" ["Old_French"]="fro" ["Old_Turkish"]="otk" ["Serbian"]="sr" ["Skolt_Sami"]="sms" ["Thai"]="th" ["Tupinamba"]="tpn" ["Upper_Sorbian"]="hsb" ["Belarusian"]="be" ["Classical_Chinese"]="lzh" ["Coptic"]="cop" ["Lithuanian"]="lt" ["Livvi"]="olo" ["Maltese"]="mt" ["Marathi"]="mr" ["Old_Russian"]="orv" ["Scottish_Gaelic"]="gd" ["Simplified_Chinese"]="zh-hans" ["Swedish_Sign_Language"]="swl" ["Tamil"]="ta" ["Telugu"]="te" ["Warlpipi"]="wbp" ["Western_Armenian"]="hyw" ["Wolof"]="wo" ["Yoruba"]="yo" ["Yupik"]="ess" )
 
 format=$1
 shift
diff --git a/stanza/_version.py b/stanza/_version.py
index 5d611f5b..4bce72e3 100644
--- a/stanza/_version.py
+++ b/stanza/_version.py
@@ -1,4 +1,4 @@
 """ Single source of truth for version number """
 
-__version__ = "1.2"
-__resources_version__ = '1.2.0'
+__version__ = "1.2.1"
+__resources_version__ = '1.2.1'
diff --git a/stanza/models/charlm.py b/stanza/models/charlm.py
index 031f2139..36345121 100644
--- a/stanza/models/charlm.py
+++ b/stanza/models/charlm.py
@@ -56,27 +56,26 @@ def readlines(path):
 
 def build_vocab(path, cutoff=0):
     # Requires a large amount of memory, but only need to build once
+
+    # here we need some trick to deal with excessively large files
+    # for each file we accumulate the counter of characters, and
+    # at the end we simply pass a list of chars to the vocab builder
+    counter = Counter()
     if os.path.isdir(path):
-        # here we need some trick to deal with excessively large files
-        # for each file we accumulate the counter of characters, and
-        # at the end we simply pass a list of chars to the vocab builder
-        counter = Counter()
         filenames = sorted(os.listdir(path))
-        for filename in filenames:
-            lines = readlines(path + '/' + filename)
-            for line in lines:
-                counter.update(list(line))
-        # remove infrequent characters from vocab
-        for k in list(counter.keys()):
-            if counter[k] < cutoff:
-                del counter[k]
-        # a singleton list of all characters
-        data = [sorted([x[0] for x in counter.most_common()])]
-        vocab = CharVocab(data) # skip cutoff argument because this has been dealt with
     else:
-        lines = readlines(path)
-        data = [list(line) for line in lines]
-        vocab = CharVocab(data, cutoff=cutoff)
+        filenames = [path]
+    for filename in filenames:
+        lines = readlines(path + '/' + filename)
+        for line in lines:
+            counter.update(list(line))
+    # remove infrequent characters from vocab
+    for k in list(counter.keys()):
+        if counter[k] < cutoff:
+            del counter[k]
+    # a singleton list of all characters
+    data = [sorted([x[0] for x in counter.most_common()])]
+    vocab = CharVocab(data) # skip cutoff argument because this has been dealt with
     return vocab
 
 def load_file(path, vocab, direction):
diff --git a/stanza/models/common/constant.py b/stanza/models/common/constant.py
index cde8b597..3ba570ab 100644
--- a/stanza/models/common/constant.py
+++ b/stanza/models/common/constant.py
@@ -20,6 +20,7 @@ lcode2lang = {
     "aii": "Assyrian",
     "bm": "Bambara",
     "eu": "Basque",
+    "bej": "Beja",
     "be": "Belarusian",
     "bn": "Bengali",
     "bho": "Bhojpuri",
@@ -42,10 +43,12 @@ lcode2lang = {
     "fo": "Faroese",
     "fi": "Finnish",
     "fr": "French",
+    "qfn": "Frisian_Dutch",
     "gl": "Galician",
     "de": "German",
     "got": "Gothic",
     "el": "Greek",
+    "gub": "Guajajara",
     "he": "Hebrew",
     "hi": "Hindi",
     "qhe": "Hindi_English",
@@ -55,9 +58,12 @@ lcode2lang = {
     "ga": "Irish",
     "it": "Italian",
     "ja": "Japanese",
+    "urb": "Kaapor",
+    "xnr": "Kangri",
     "krl": "Karelian",
     "kk": "Kazakh",
     "kfm": "Khunsari",
+    "quc": "Kiche",
     "koi": "Komi_Permyak",
     "kpv": "Komi_Zyrian",
     "ko": "Korean",
@@ -66,6 +72,8 @@ lcode2lang = {
     "olo": "Livvi",
     "la": "Latin",
     "lv": "Latvian",
+    "nds": "Low_Saxon",
+    "mpu": "Makurap",
     "mal": "Malayalam",
     "mt": "Maltese",
     "gv": "Manx",
@@ -80,7 +88,7 @@ lcode2lang = {
     "nn": "Norwegian_Nynorsk",
     "cu": "Old_Church_Slavonic",
     "fro": "Old_French",
-    "orv": "Old_Russian",
+    "orv": "Old_East_Slavic",
     "otk": "Old_Turkish",
     "fa": "Persian",
     "pl": "Polish",
@@ -114,7 +122,9 @@ lcode2lang = {
     "vi": "Vietnamese",
     "wbp": "Warlpiri",
     "cy": "Welsh",
+    "hyw": "Western_Armenian",
     "wo": "Wolof",
+    "ess": "Yupik",
     "yo": "Yoruba",
 }
 
@@ -128,6 +138,9 @@ lcode2lang['zh'] = 'Simplified_Chinese'
 
 lang2lcode['Chinese'] = 'zh'
 
+# treebank names changed from Old Russian to Old East Slavic in 2.8
+lang2lcode['Old_Russian'] = 'orv'
+
 treebank_special_cases = {
     "UD_Chinese-GSDSimp": "zh_gsdsimp",
     "UD_Chinese-GSD": "zh-hant_gsd",
diff --git a/stanza/models/common/short_name_to_treebank.py b/stanza/models/common/short_name_to_treebank.py
index 9e681787..871f58da 100644
--- a/stanza/models/common/short_name_to_treebank.py
+++ b/stanza/models/common/short_name_to_treebank.py
@@ -18,6 +18,7 @@ SHORT_NAMES = {
    'aii_as': 'UD_Assyrian-AS',
    'bm_crb': 'UD_Bambara-CRB',
    'eu_bdt': 'UD_Basque-BDT',
+   'bej_nsc': 'UD_Beja-NSC',
    'be_hse': 'UD_Belarusian-HSE',
    'bho_bhtb': 'UD_Bhojpuri-BHTB',
    'br_keb': 'UD_Breton-KEB',
@@ -67,6 +68,7 @@ SHORT_NAMES = {
    'fr_partut': 'UD_French-ParTUT',
    'fr_sequoia': 'UD_French-Sequoia',
    'fr_spoken': 'UD_French-Spoken',
+   'qfn_fame': 'UD_Frisian_Dutch-Fame',
    'gl_ctg': 'UD_Galician-CTG',
    'gl_treegal': 'UD_Galician-TreeGal',
    'de_gsd': 'UD_German-GSD',
@@ -75,30 +77,37 @@ SHORT_NAMES = {
    'de_pud': 'UD_German-PUD',
    'got_proiel': 'UD_Gothic-PROIEL',
    'el_gdt': 'UD_Greek-GDT',
+   'gub_tudet': 'UD_Guajajara-TuDeT',
    'he_htb': 'UD_Hebrew-HTB',
    'hi_hdtb': 'UD_Hindi-HDTB',
    'hi_pud': 'UD_Hindi-PUD',
    'qhe_hiencs': 'UD_Hindi_English-HIENCS',
    'hu_szeged': 'UD_Hungarian-Szeged',
    'is_icepahc': 'UD_Icelandic-IcePaHC',
+   'is_modern': 'UD_Icelandic-Modern',
    'is_pud': 'UD_Icelandic-PUD',
    'id_csui': 'UD_Indonesian-CSUI',
    'id_gsd': 'UD_Indonesian-GSD',
    'id_pud': 'UD_Indonesian-PUD',
    'ga_idt': 'UD_Irish-IDT',
+   'ga_twittirish': 'UD_Irish-TwittIrish',
    'it_isdt': 'UD_Italian-ISDT',
    'it_pud': 'UD_Italian-PUD',
    'it_partut': 'UD_Italian-ParTUT',
    'it_postwita': 'UD_Italian-PoSTWITA',
    'it_twittiro': 'UD_Italian-TWITTIRO',
    'it_vit': 'UD_Italian-VIT',
+   'it_valico': 'UD_Italian-Valico',
    'ja_bccwj': 'UD_Japanese-BCCWJ',
    'ja_gsd': 'UD_Japanese-GSD',
    'ja_modern': 'UD_Japanese-Modern',
    'ja_pud': 'UD_Japanese-PUD',
+   'urb_tudet': 'UD_Kaapor-TuDeT',
+   'xnr_kdtb': 'UD_Kangri-KDTB',
    'krl_kkpp': 'UD_Karelian-KKPP',
    'kk_ktb': 'UD_Kazakh-KTB',
    'kfm_aha': 'UD_Khunsari-AHA',
+   'quc_iu': 'UD_Kiche-IU',
    'koi_uh': 'UD_Komi_Permyak-UH',
    'kpv_ikdp': 'UD_Komi_Zyrian-IKDP',
    'kpv_lattice': 'UD_Komi_Zyrian-Lattice',
@@ -110,10 +119,13 @@ SHORT_NAMES = {
    'la_llct': 'UD_Latin-LLCT',
    'la_proiel': 'UD_Latin-PROIEL',
    'la_perseus': 'UD_Latin-Perseus',
+   'la_udante': 'UD_Latin-UDante',
    'lv_lvtb': 'UD_Latvian-LVTB',
    'lt_alksnis': 'UD_Lithuanian-ALKSNIS',
    'lt_hse': 'UD_Lithuanian-HSE',
    'olo_kkpp': 'UD_Livvi-KKPP',
+   'nds_lsdc': 'UD_Low_Saxon-LSDC',
+   'mpu_tudet': 'UD_Makurap-TuDeT',
    'mt_mudt': 'UD_Maltese-MUDT',
    'gv_cadhan': 'UD_Manx-Cadhan',
    'mr_ufal': 'UD_Marathi-UFAL',
@@ -128,9 +140,9 @@ SHORT_NAMES = {
    'nn_nynorsk': 'UD_Norwegian-Nynorsk',
    'nn_nynorsklia': 'UD_Norwegian-NynorskLIA',
    'cu_proiel': 'UD_Old_Church_Slavonic-PROIEL',
+   'orv_rnc': 'UD_Old_East_Slavic-RNC',
+   'orv_torot': 'UD_Old_East_Slavic-TOROT',
    'fro_srcmf': 'UD_Old_French-SRCMF',
-   'orv_rnc': 'UD_Old_Russian-RNC',
-   'orv_torot': 'UD_Old_Russian-TOROT',
    'otk_tonqq': 'UD_Old_Turkish-Tonqq',
    'fa_perdt': 'UD_Persian-PerDT',
    'fa_seraji': 'UD_Persian-Seraji',
@@ -140,6 +152,7 @@ SHORT_NAMES = {
    'pt_bosque': 'UD_Portuguese-Bosque',
    'pt_gsd': 'UD_Portuguese-GSD',
    'pt_pud': 'UD_Portuguese-PUD',
+   'ro_art': 'UD_Romanian-ArT',
    'ro_nonstandard': 'UD_Romanian-Nonstandard',
    'ro_rrt': 'UD_Romanian-RRT',
    'ro_simonero': 'UD_Romanian-SiMoNERo',
@@ -173,9 +186,13 @@ SHORT_NAMES = {
    'th_pud': 'UD_Thai-PUD',
    'tpn_tudet': 'UD_Tupinamba-TuDeT',
    'tr_boun': 'UD_Turkish-BOUN',
+   'tr_framenet': 'UD_Turkish-FrameNet',
    'tr_gb': 'UD_Turkish-GB',
    'tr_imst': 'UD_Turkish-IMST',
+   'tr_kenet': 'UD_Turkish-Kenet',
    'tr_pud': 'UD_Turkish-PUD',
+   'tr_penn': 'UD_Turkish-Penn',
+   'tr_tourism': 'UD_Turkish-Tourism',
    'qtd_sagt': 'UD_Turkish_German-SAGT',
    'uk_iu': 'UD_Ukrainian-IU',
    'hsb_ufal': 'UD_Upper_Sorbian-UFAL',
@@ -184,8 +201,10 @@ SHORT_NAMES = {
    'vi_vtb': 'UD_Vietnamese-VTB',
    'wbp_ufal': 'UD_Warlpiri-UFAL',
    'cy_ccg': 'UD_Welsh-CCG',
+   'hyw_armtdp': 'UD_Western_Armenian-ArmTDP',
    'wo_wtb': 'UD_Wolof-WTB',
    'yo_ytb': 'UD_Yoruba-YTB',
+   'ess_sli': 'UD_Yupik-SLI',
 }
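Back in charlm.py above, the directory and single-file branches now share one counting path instead of two diverging ones. Below is a standalone sketch of that logic under the assumption of plain UTF-8 text files; the real function wraps the result in a CharVocab and reads through its own readlines helper, so this is an illustration of the control flow rather than the stanza code itself.

    from collections import Counter
    import os

    def build_char_counts(path, cutoff=0):
        # One code path for both cases: a directory becomes a list of its files,
        # a single file becomes a one-element list.
        is_dir = os.path.isdir(path)
        filenames = sorted(os.listdir(path)) if is_dir else [path]
        counter = Counter()
        for filename in filenames:
            full_path = os.path.join(path, filename) if is_dir else filename
            with open(full_path, encoding="utf-8") as fin:
                for line in fin:
                    counter.update(line.rstrip("\n"))
        # drop characters rarer than the cutoff, mirroring the vocab trimming above
        return Counter({ch: n for ch, n in counter.items() if n >= cutoff})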
diff --git a/stanza/models/pos/xpos_vocab_factory.py b/stanza/models/pos/xpos_vocab_factory.py
index 5397ca34..39da44fd 100644
--- a/stanza/models/pos/xpos_vocab_factory.py
+++ b/stanza/models/pos/xpos_vocab_factory.py
@@ -4,9 +4,9 @@ from stanza.models.pos.vocab import WordVocab, XPOSVocab
 
 def xpos_vocab_factory(data, shorthand):
-    if shorthand in ["af_afribooms", "ar_padt", "bg_btb", "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut", "fr_partut", "gd_arcosg", "gl_ctg", "gl_treegal", "grc_perseus", "hr_set", "is_icepahc", "it_isdt", "it_partut", "it_postwita", "it_twittiro", "it_vit", "it_combined", "la_perseus", "lt_alksnis", "lv_lvtb", "ro_nonstandard", "ro_rrt", "ro_simonero", "sk_snk", "sl_ssj", "sl_sst", "sr_set", "ta_ttb", "uk_iu"]:
+    if shorthand in ["af_afribooms", "ar_padt", "bg_btb", "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut", "fr_partut", "gd_arcosg", "gl_ctg", "gl_treegal", "grc_perseus", "hr_set", "is_icepahc", "is_modern", "it_combined", "it_isdt", "it_partut", "it_postwita", "it_twittiro", "it_vit", "la_perseus", "la_udante", "lt_alksnis", "lv_lvtb", "ro_nonstandard", "ro_rrt", "ro_simonero", "sk_snk", "sl_ssj", "sl_sst", "sr_set", "ta_ttb", "uk_iu"]:
         return XPOSVocab(data, shorthand, idx=2, sep="")
-    elif shorthand in ["be_hse", "ca_ancora", "cop_scriptorium", "cu_proiel", "cy_ccg", "da_ddt", "de_gsd", "de_hdt", "el_gdt", "en_ewt", "en_gum", "en_combined", "es_ancora", "es_gsd", "et_edt", "et_ewt", "eu_bdt", "fa_perdt", "fa_seraji", "fi_tdt", "fr_ftb", "fr_gsd", "fro_srcmf", "fr_sequoia", "fr_spoken", "ga_idt", "got_proiel", "grc_proiel", "he_htb", "hi_hdtb", "hu_szeged", "hy_armtdp", "id_csui", "ja_gsd", "la_proiel", "lt_hse", "lzh_kyoto", "mr_ufal", "mt_mudt", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia", "orv_rnc", "orv_torot", "pcm_nsc", "pt_bosque", "pt_gsd", "qtd_sagt", "ru_gsd", "ru_syntagrus", "ru_taiga", "sa_vedic", "sme_giella", "swl_sslc", "te_mtg", "tr_boun", "tr_imst", "ug_udt", "vi_vtb", "wo_wtb", "zh_gsdsimp", "zh-hans_gsdsimp", "zh-hant_gsd", "bxr_bdt", "hsb_ufal", "ja_bccwj", "kk_ktb", "kmr_mg", "olo_kkpp"]:
+    elif shorthand in ["be_hse", "ca_ancora", "cop_scriptorium", "cu_proiel", "cy_ccg", "da_ddt", "de_gsd", "de_hdt", "el_gdt", "en_combined", "en_ewt", "en_gum", "es_ancora", "es_gsd", "et_edt", "et_ewt", "eu_bdt", "fa_perdt", "fa_seraji", "fi_tdt", "fr_gsd", "fro_srcmf", "fr_sequoia", "fr_spoken", "ga_idt", "got_proiel", "grc_proiel", "he_htb", "hi_hdtb", "hu_szeged", "hy_armtdp", "hyw_armtdp", "id_csui", "ja_gsd", "la_proiel", "lt_hse", "lzh_kyoto", "mr_ufal", "mt_mudt", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia", "orv_rnc", "orv_torot", "pcm_nsc", "pt_bosque", "pt_gsd", "qtd_sagt", "ru_gsd", "ru_syntagrus", "ru_taiga", "sa_vedic", "sme_giella", "swl_sslc", "te_mtg", "tr_boun", "tr_framenet", "tr_imst", "tr_kenet", "tr_penn", "tr_tourism", "ug_udt", "vi_vtb", "wo_wtb", "zh_gsdsimp", "zh-hant_gsd", "bxr_bdt", "hsb_ufal", "ja_bccwj", "kk_ktb", "kmr_mg", "olo_kkpp"]:
         return WordVocab(data, shorthand, idx=2, ignore=["_"])
     elif shorthand in ["en_lines", "fo_farpahc", "sv_lines", "ur_udtb"]:
         return XPOSVocab(data, shorthand, idx=2, sep="-")
diff --git a/stanza/resources/common.py b/stanza/resources/common.py
index ad9d722c..8e70e861 100644
--- a/stanza/resources/common.py
+++ b/stanza/resources/common.py
@@ -103,7 +103,7 @@ def file_exists(path, md5):
     """
     return os.path.exists(path) and get_md5(path) == md5
 
-def download_file(url, path, proxies):
+def download_file(url, path, proxies, raise_for_status=False):
     """
     Download a URL into a file as specified by `path`.
     """
@@ -120,8 +120,11 @@ def download_file(url, path, proxies):
                 f.write(chunk)
                 f.flush()
                 pbar.update(len(chunk))
+    if raise_for_status:
+        r.raise_for_status()
+    return r.status_code
 
-def request_file(url, path, proxies=None, md5=None):
+def request_file(url, path, proxies=None, md5=None, raise_for_status=False):
     """
     A complete wrapper over download_file() that also make sure the directory of
     `path` exists, and that a file matching the md5 value does not exist.
@@ -130,7 +133,7 @@ def request_file(url, path, proxies=None, md5=None):
     if file_exists(path, md5):
         logger.info(f'File exists: {path}.')
         return
-    download_file(url, path, proxies)
+    download_file(url, path, proxies, raise_for_status)
     assert(not md5 or file_exists(path, md5))
 
 def sort_processors(processor_list):
@@ -332,7 +335,8 @@ def download_resources_json(model_dir, resources_url, resources_branch,
     request_file(
         f'{resources_url}/resources_{resources_version}.json',
         os.path.join(model_dir, 'resources.json'),
-        proxies
+        proxies,
+        raise_for_status=True
     )
diff --git a/stanza/resources/prepare_resources.py b/stanza/resources/prepare_resources.py
index f3fef48e..31791e21 100644
--- a/stanza/resources/prepare_resources.py
+++ b/stanza/resources/prepare_resources.py
@@ -68,6 +68,7 @@ default_treebanks = {
     "ug": "udt",
     "vi": "vtb",
     "lt": "alksnis",
+    "hyw": "armtdp",
     "wo": "wtb",
     "nb": "bokmaal",
     "mt": "mudt",
@@ -89,11 +90,13 @@ default_treebanks = {
 
 # default ner for languages
 default_ners = {
     "ar": "aqmar",
+    "bg": "bsnlp19",
     "de": "conll03",
     "en": "ontonotes",
     "es": "conll02",
     "fi": "turku",
     "fr": "wikiner",
+    "hu": "combined",
     "nl": "conll02",
     "ru": "wikiner",
     "uk": "languk",
@@ -104,6 +107,7 @@ default_ners = {
 # default charlms for languages
 default_charlms = {
     "ar": "ccwiki",
+    "bg": "conll17",
     "de": "newswiki",
     "en": "1billion",
     "es": "newswiki",
@@ -131,7 +135,10 @@ ner_charlms = {
     },
     "uk": {
         "languk": None,
-    }
+    },
+    "hu": {
+        "combined": None,
+    },
 }
 
 # a few languages have sentiment classifier models
@@ -213,7 +220,7 @@ lcode2lang = {
     "nn": "Norwegian_Nynorsk",
     "cu": "Old_Church_Slavonic",
     "fro": "Old_French",
-    "orv": "Old_Russian",
+    "orv": "Old_East_Slavic",
     "fa": "Persian",
     "pl": "Polish",
     "pt": "Portuguese",
@@ -239,6 +246,7 @@ lcode2lang = {
     "ug": "Uyghur",
     "vi": "Vietnamese",
     "cy": "Welsh",
+    "hyw": "Western_Armenian",
     "wo": "Wolof"
 }
diff --git a/stanza/tests/test_installation.py b/stanza/tests/test_installation.py
index 05e5b650..13b8c4f9 100644
--- a/stanza/tests/test_installation.py
+++ b/stanza/tests/test_installation.py
@@ -30,7 +30,7 @@ def test_install_corenlp():
 
 def test_download_corenlp_models():
     model_name = "arabic"
-    version = "4.2.0"
+    version = "4.2.2"
 
     with tempfile.TemporaryDirectory(dir=".") as test_dir:
         stanza.download_corenlp_models(model=model_name, version=version, dir=test_dir)
diff --git a/stanza/utils/datasets/prepare_lemma_treebank.py b/stanza/utils/datasets/prepare_lemma_treebank.py
index 3f90fcf5..a754c4fe 100644
--- a/stanza/utils/datasets/prepare_lemma_treebank.py
+++ b/stanza/utils/datasets/prepare_lemma_treebank.py
@@ -12,8 +12,43 @@ and it will prepare each of train, dev, test
 import stanza.utils.datasets.common as common
 import stanza.utils.datasets.prepare_tokenizer_treebank as prepare_tokenizer_treebank
 
+def check_lemmas(train_file):
+    """
+    Check if a treebank has any lemmas in it
+
+    For example, in Vietnamese-VTB, all the words and lemmas are exactly the same
+    in Telugu-MTG, all the lemmas are blank
+    """
+    # could eliminate a few languages immediately based on UD 2.7
+    # but what if a later dataset includes lemmas?
+    #if short_language in ('vi', 'fro', 'th'):
+    #    return False
+    with open(train_file) as fin:
+        for line in fin:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            pieces = line.split("\t")
+            word = pieces[1].lower().strip()
+            lemma = pieces[2].lower().strip()
+            if not lemma or lemma == '_' or lemma == '-':
+                continue
+            if word == lemma:
+                continue
+            return True
+    return False
+
 def process_treebank(treebank, paths, args):
-    prepare_tokenizer_treebank.copy_conllu_treebank(treebank, paths, paths["LEMMA_DATA_DIR"])
+    if treebank.startswith("UD_"):
+        udbase_dir = paths["UDBASE"]
+        train_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
+        augment = check_lemmas(train_conllu)
+        if not augment:
+            print("No lemma information found in %s. Not augmenting the dataset" % train_conllu)
+    else:
+        # TODO: check the data to see if there are lemmas or not
+        augment = True
+    prepare_tokenizer_treebank.copy_conllu_treebank(treebank, paths, paths["LEMMA_DATA_DIR"], augment=augment)
 
 def main():
     common.main(process_treebank)
diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
index 7237e722..459a6a74 100755
--- a/stanza/utils/datasets/prepare_tokenizer_treebank.py
+++ b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -34,7 +34,6 @@ from collections import Counter
 
 import stanza.utils.datasets.common as common
 import stanza.utils.datasets.prepare_tokenizer_data as prepare_tokenizer_data
-import stanza.utils.datasets.preprocess_ssj_data as preprocess_ssj_data
 
 def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name):
@@ -43,7 +42,7 @@ def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_n
 
     shutil.copyfile(original, copied)
 
-def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None):
+def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None, augment=True):
     """
     This utility method copies only the conllu files to the given destination directory.
 
@@ -60,7 +59,7 @@ def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None):
     # first we process the tokenization data
     args = argparse.Namespace()
-    args.augment = False
+    args.augment = augment
     args.prepare_labels = False
     process_treebank(treebank, paths, args)
 
@@ -783,8 +782,6 @@ def build_combined_korean(udbase_dir, tokenizer_dir, short_name):
         build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu)
 
 def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
-    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-
     if dataset == 'train':
         # could maybe add ParTUT, but that dataset has a slightly different xpos set
         # (no DE or I)
@@ -806,13 +803,11 @@ def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
                 raise AssertionError("Unexpected format of the italian.mwt file. Has it already be modified to have SpaceAfter=No everywhere?")
             sentence[2] = sentence[2][:-1] + "SpaceAfter=No"
         sents = sents + extra_sents
-
-        sents = augment_punct(sents)
     else:
         istd_conllu = common.find_treebank_dataset_file("UD_Italian-ISDT", udbase_dir, dataset, "conllu")
         sents = read_sentences_from_conllu(istd_conllu)
 
-    write_sentences_to_conllu(output_conllu, sents)
+    return sents
 
 def check_gum_ready(udbase_dir):
     gum_conllu = common.find_treebank_dataset_file("UD_English-GUMReddit", udbase_dir, "train", "conllu")
@@ -827,8 +822,6 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
     """
     check_gum_ready(udbase_dir)
 
-    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-
     if dataset == 'train':
         # TODO: include more UD treebanks, possibly with xpos removed
         #  UD_English-ParTUT - xpos are different
@@ -843,15 +836,13 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
         for treebank in test_treebanks:
             conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu", fail=True)
             sents.extend(read_sentences_from_conllu(conllu_file))
-
-        # TODO: refactor things like the augment_punct call
-        sents = augment_punct(sents)
     else:
         ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
         sents = read_sentences_from_conllu(ewt_conllu)
 
     sents = strip_mwt_from_sentences(sents)
-    write_sentences_to_conllu(output_conllu, sents)
+
+    return sents
 
 def replace_semicolons(sentences):
     """
@@ -888,8 +879,6 @@ def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
     TODO: remove features which aren't shared between datasets
     TODO: consider mixing in PUD?
     """
-    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-
     if dataset == 'train':
         treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
         sents = []
@@ -905,15 +894,11 @@ def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
             raise FileNotFoundError("Cannot find the extra dataset 'spanish.mwt' which includes various multi-words retokenized, expected {}".format(extra_italian))
         extra_sents = read_sentences_from_conllu(extra_spanish)
         sents.extend(extra_sents)
-
-        # TODO: refactor things like the augment_punct call
-        sents = augment_punct(sents)
     else:
         conllu_file = common.find_treebank_dataset_file("UD_Spanish-AnCora", udbase_dir, dataset, "conllu", fail=True)
         sents = read_sentences_from_conllu(conllu_file)
 
-    write_sentences_to_conllu(output_conllu, sents)
-
+    return sents
 
 COMBINED_FNS = {
@@ -922,19 +907,24 @@ COMBINED_FNS = {
     "it_combined": build_combined_italian_dataset,
 }
 
-def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name):
+def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, augment):
     random.seed(1234)
     build_fn = COMBINED_FNS[short_name]
     for dataset in ("train", "dev", "test"):
-        build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+        output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+        sents = build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+        if dataset == 'train' and augment:
+            sents = augment_punct(sents)
+        write_sentences_to_conllu(output_conllu, sents)
 
-def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset):
+def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment):
     """
     Build the GUM dataset by combining GUMReddit
 
     It checks to make sure GUMReddit is filled out using the included script
     """
     check_gum_ready(udbase_dir)
+    random.seed(1234)
 
     output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
@@ -944,36 +934,30 @@ def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, da
         conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
         sents.extend(read_sentences_from_conllu(conllu_file))
 
-    if dataset == 'train':
+    if dataset == 'train' and augment:
         sents = augment_punct(sents)
 
     write_sentences_to_conllu(output_conllu, sents)
 
-def build_combined_english_gum(udbase_dir, tokenizer_dir, short_name):
+def build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, augment):
     for dataset in ("train", "dev", "test"):
-        build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset)
+        build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment)
 
 def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True):
     input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu")
     output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
 
-    if short_name == "sl_ssj":
-        preprocess_ssj_data.process(input_conllu, output_conllu)
-    elif short_name == "te_mtg" and dataset == 'train' and augment:
+    if short_name == "te_mtg" and dataset == 'train' and augment:
         write_augmented_dataset(input_conllu, output_conllu, augment_telugu)
     elif short_name == "ar_padt" and dataset == 'train' and augment:
         write_augmented_dataset(input_conllu, output_conllu, augment_arabic_padt)
     elif short_name.startswith("ko_") and short_name.endswith("_seg"):
         remove_spaces(input_conllu, output_conllu)
-    elif dataset == 'train':
-        # we treat the additional punct as something that always needs to be there
-        # this will teach the tagger & depparse about unicode apos, for example
+    elif dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, output_conllu, augment_punct)
     else:
         shutil.copyfile(input_conllu, output_conllu)
 
-    # TODO: refactor this call everywhere
-
 def process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, augment=True):
     """
     Process a normal UD treebank with train/dev/test splits
@@ -1049,11 +1033,11 @@ def process_treebank(treebank, paths, args):
     if short_name.startswith("ko_combined"):
         build_combined_korean(udbase_dir, tokenizer_dir, short_name)
     elif short_name in ("it_combined", "en_combined", "es_combined"):
-        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name)
+        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
     elif short_name.startswith("en_gum"):
         # we special case GUM because it should include a filled-out GUMReddit
         print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
-        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name)
+        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, args.augment)
     else:
         # check that we can find the train file where we expect it
         train_conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
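The two dataset scripts above cooperate: check_lemmas decides whether a UD treebank actually contains lemma information, and copy_conllu_treebank now threads that decision through as an augment flag instead of hard-coding args.augment = False. As a concrete illustration of what check_lemmas treats as real lemma data, consider a hypothetical tab-separated CoNLL-U fragment (columns ID, FORM, LEMMA, ...); the values below are made up for the example.

    # Hypothetical lines; only the last one makes check_lemmas return True,
    # because the form "ran" carries a genuinely different lemma "run".
    lines = [
        "1\txin\t_\t...",       # underscore lemma: skipped
        "2\tchao\tchao\t...",   # lemma identical to the word: skipped
        "3\tran\trun\t...",     # distinct lemma: counts as lemma information
    ]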
diff --git a/stanza/utils/datasets/preprocess_ssj_data.py b/stanza/utils/datasets/preprocess_ssj_data.py
deleted file mode 100644
index 4ce7a8b9..00000000
--- a/stanza/utils/datasets/preprocess_ssj_data.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
-The SSJ dataset has an unusual bug: all of the sentences end with SpaceAfter=no
-
-This script fixes them and writes the fixed files to the given location.
-"""
-
-def process(input_conllu, input_conllu_copy):
-    conllu_lines = open(input_conllu).readlines()
-
-    new_conllu_lines = list(conllu_lines)
-
-    line_idx = 0
-    text_idx = 0
-    # invariant: conllu_lines[line_idx] is
-    #   # sent_id
-    # at the start of a loop
-    while line_idx < len(conllu_lines):
-        # extract the text from the comments before each sentence
-        line_idx = line_idx + 1
-        text_line = conllu_lines[line_idx]
-        assert text_line.startswith("# text = "), "Unexpected format: %s,%d is not # text" % (input_conllu, line_idx)
-        text_line = text_line[9:-1]
-        # use that text to keep track of an index in the text where we might need to put new spaces
-        text_idx = text_idx + len(text_line)
-
-        # advance to the end of the sentence
-        line_idx = line_idx + 1
-        assert conllu_lines[line_idx].startswith("1"), "Unexpected format: %s,%d is not a word" % (input_conllu, line_idx)
-        while conllu_lines[line_idx].strip():
-            line_idx = line_idx + 1
-        last_word_idx = line_idx - 1
-
-        # check if the end of the sentence has SpaceAfter or not
-        new_line = conllu_lines[last_word_idx].replace("SpaceAfter=No|", "")
-        assert new_line.find("SpaceAfter=") < 0, "Unexpected format: %s,%d has unusual SpaceAfter" % (input_conllu, line_idx)
-
-        # if not, need to add a new space
-        if new_line != conllu_lines[last_word_idx]:
-            conllu_lines[last_word_idx] = new_line
-            text_idx = text_idx + 1
-
-        # done with a sentence. skip to the start of the next sentence
-        # or the end of the document
-        while line_idx < len(conllu_lines) and not conllu_lines[line_idx].strip():
-            line_idx = line_idx + 1
-
-    with open(input_conllu_copy, "w") as fout:
-        for line in conllu_lines:
-            fout.write(line)
diff --git a/stanza/utils/training/run_lemma.py b/stanza/utils/training/run_lemma.py
index 96b0f1cf..f86648e5 100644
--- a/stanza/utils/training/run_lemma.py
+++ b/stanza/utils/training/run_lemma.py
@@ -24,33 +24,9 @@ from stanza.models import lemmatizer
 from stanza.utils.training import common
 from stanza.utils.training.common import Mode
 
-logger = logging.getLogger('stanza')
+from stanza.utils.datasets.prepare_lemma_treebank import check_lemmas
 
-def check_lemmas(train_file):
-    """
-    Check if a treebank has any lemmas in it
-
-    For example, in Vietnamese-VTB, all the words and lemmas are exactly the same
-    in Telugu-MTG, all the lemmas are blank
-    """
-    # could eliminate a few languages immediately based on UD 2.7
-    # but what if a later dataset includes lemmas?
-    #if short_language in ('vi', 'fro', 'th'):
-    #    return False
-    with open(train_file) as fin:
-        for line in fin:
-            line = line.strip()
-            if not line or line.startswith("#"):
-                continue
-            pieces = line.split("\t")
-            word = pieces[1].lower().strip()
-            lemma = pieces[2].lower().strip()
-            if not lemma or lemma == '_' or lemma == '-':
-                continue
-            if word == lemma:
-                continue
-            return True
-    return False
+logger = logging.getLogger('stanza')
 
 def run_treebank(mode, paths, treebank, short_name, temp_output_file, command_args, extra_args):
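With the duplicate removed, run_lemma.py imports the single shared implementation of check_lemmas rather than keeping its own copy, and other scripts can do the same. A minimal usage sketch follows; the file path is a placeholder and should point at a real UD training conllu file.

    from stanza.utils.datasets.prepare_lemma_treebank import check_lemmas

    # Placeholder path; substitute an actual UD training file.
    if not check_lemmas("path/to/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"):
        print("this treebank has no usable lemma information")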