author     John Bauer <horatio@gmail.com>    2021-06-08 22:46:50 +0300
committer  GitHub <noreply@github.com>       2021-06-08 22:46:50 +0300
commit     ca0ff9c40609891da2ad5ef4765369cbd00ef5d5 (patch)
tree       e283f72143df199be15c36c2868acdbbd64d00ce
parent     405e516dee460c57efaa9b20c0728f7e0b6d0d61 (diff)
parent     0ac918983bb2210230ab2a3c229711c1ab22903e (diff)
Merge pull request #718 from stanfordnlp/ud28
Ud28
-rwxr-xr-x  scripts/download_vectors.sh                         | 14
-rwxr-xr-x  scripts/treebank_to_shorthand.sh                    |  6
-rw-r--r--  stanza/_version.py                                  |  4
-rw-r--r--  stanza/models/charlm.py                             | 35
-rw-r--r--  stanza/models/common/constant.py                    | 15
-rw-r--r--  stanza/models/common/short_name_to_treebank.py      | 23
-rw-r--r--  stanza/models/pos/xpos_vocab_factory.py             |  4
-rw-r--r--  stanza/resources/common.py                          | 12
-rw-r--r--  stanza/resources/prepare_resources.py               | 12
-rw-r--r--  stanza/tests/test_installation.py                   |  2
-rw-r--r--  stanza/utils/datasets/prepare_lemma_treebank.py     | 37
-rwxr-xr-x  stanza/utils/datasets/prepare_tokenizer_treebank.py | 58
-rw-r--r--  stanza/utils/datasets/preprocess_ssj_data.py        | 49
-rw-r--r--  stanza/utils/training/run_lemma.py                  | 28
14 files changed, 148 insertions, 151 deletions
diff --git a/scripts/download_vectors.sh b/scripts/download_vectors.sh
index 960f57ef..1705fdfe 100755
--- a/scripts/download_vectors.sh
+++ b/scripts/download_vectors.sh
@@ -20,9 +20,17 @@ FASTTEXT_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"
 # Welsh, Icelandic, Thai, Sanskrit
 # https://fasttext.cc/docs/en/crawl-vectors.html
-declare -a FASTTEXT_LANG=("Afrikaans" "Armenian" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
-declare -a FASTTEXT_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
-declare -a LOCAL_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
+# We get the Armenian word vectors from here:
+# https://github.com/ispras-texterra/word-embeddings-eval-hy
+# https://arxiv.org/ftp/arxiv/papers/1906/1906.03134.pdf
+# In particular, the glove model (dogfooding):
+# https://at.ispras.ru/owncloud/index.php/s/pUUiS1l1jGKNax3/download
+# These vectors improved F1 by about 1 on various tasks for Armenian
+# and had much better coverage of Western Armenian
+
+declare -a FASTTEXT_LANG=("Afrikaans" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
+declare -a FASTTEXT_CODE=("af" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
+declare -a LOCAL_CODE=("af" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
 
 color_green='\033[32;1m'
 color_clear='\033[0m' # No Color
diff --git a/scripts/treebank_to_shorthand.sh b/scripts/treebank_to_shorthand.sh
index 1d15b877..bb6f1793 100755
--- a/scripts/treebank_to_shorthand.sh
+++ b/scripts/treebank_to_shorthand.sh
@@ -7,9 +7,9 @@
 # Please keep synced with
 # stanza/models/common/constant.py
-declare -A lang2lcode=( ["Afrikaans"]="af" ["Akkadian"]="akk" ["Akuntsu"]="aqz" ["Albanian"]="sq" ["Amharic"]="am" ["Apurina"]="apu" ["Armenian"]="hy" ["Arabic"]="ar" ["Assyrian"]="aii" ["Bambara"]="bm" ["Bengali"]="bn" ["Bhojpuri"]="bho" ["Breton"]="br" ["Bulgarian"]="bg" ["Buryat"]="bxr" ["Cantonese"]="yue" ["Catalan"]="ca" ["Chukchi"]="ckt" ["Czech"]="cs" ["Old_Church_Slavonic"]="cu" ["Danish"]="da" ["German"]="de" ["Greek"]="el" ["English"]="en" ["Spanish"]="es" ["Estonian"]="et" ["Basque"]="eu" ["Persian"]="fa" ["Faroese"]="fo" ["Finnish"]="fi" ["French"]="fr" ["Irish"]="ga" ["Galician"]="gl" ["Gothic"]="got" ["Ancient_Greek"]="grc" ["Mbya_Guarani"]="gun" ["Hebrew"]="he" ["Hindi"]="hi" ["Hindi_English"]="qhe" ["Croatian"]="hr"
-["Hungarian"]="hu" ["Icelandic"]="is" ["Indonesian"]="id" ["Italian"]="it" ["Japanese"]="ja" ["Karelian"]="krl" ["Kazakh"]="kk" ["Khunsari"]="kfm" ["Komi_Permyak"]="koi" ["Komi_Zyrian"]="kpv" ["Korean"]="ko" ["Kurmanji"]="kmr" ["Latin"]="la" ["Latvian"]="lv" ["Malayalam"]="mal" ["Manx"]="gv" ["Moksha"]="mdf" ["Munduruku"]="myu" ["Erzya"]="myv" ["Nayini"]="nyq" ["Dutch"]="nl" ["Norwegian_Bokmaal"]="nb" ["Norwegian_Nynorsk"]="nn" ["Polish"]="pl" ["Portuguese"]="pt" ["Romanian"]="ro" ["Russian"]="ru" ["Sanskrit"]="sa" ["Slovak"]="sk" ["Slovenian"]="sl" ["Soi"]="soj" ["South_Levantine_Arabic"]="ajp" ["Swedish"]="sv" ["Swiss_German"]="gsw" ["Tagalog"]="tl" ["Turkish"]="tr" ["Turkish_German"]="qtd" ["Uyghur"]="ug" ["Ukrainian"]="uk" ["Urdu"]="ur" ["Vietnamese"]="vi" ["Traditional_Chinese"]="zh-hant" ["Welsh"]="cy" ["Altaic"]="bxr" ["Indo_Iranian"]="kmr" ["Uralic"]="sme" ["Slavic"]="hsb"
-["Naija"]="pcm" ["North_Sami"]="sme" ["Old_French"]="fro" ["Old_Turkish"]="otk" ["Serbian"]="sr" ["Skolt_Sami"]="sms" ["Thai"]="th" ["Tupinamba"]="tpn" ["Upper_Sorbian"]="hsb" ["Belarusian"]="be" ["Classical_Chinese"]="lzh" ["Coptic"]="cop" ["Lithuanian"]="lt" ["Livvi"]="olo" ["Maltese"]="mt" ["Marathi"]="mr" ["Old_Russian"]="orv" ["Scottish_Gaelic"]="gd" ["Simplified_Chinese"]="zh-hans" ["Swedish_Sign_Language"]="swl" ["Tamil"]="ta" ["Telugu"]="te" ["Warlpipi"]="wbp" ["Wolof"]="wo" ["Yoruba"]="yo" )
+declare -A lang2lcode=( ["Afrikaans"]="af" ["Akkadian"]="akk" ["Akuntsu"]="aqz" ["Albanian"]="sq" ["Amharic"]="am" ["Apurina"]="apu" ["Armenian"]="hy" ["Arabic"]="ar" ["Assyrian"]="aii" ["Bambara"]="bm" ["Beja"]="bej" ["Bengali"]="bn" ["Bhojpuri"]="bho" ["Breton"]="br" ["Bulgarian"]="bg" ["Buryat"]="bxr" ["Cantonese"]="yue" ["Catalan"]="ca" ["Chukchi"]="ckt" ["Czech"]="cs" ["Old_Church_Slavonic"]="cu" ["Danish"]="da" ["German"]="de" ["Greek"]="el" ["English"]="en" ["Spanish"]="es" ["Estonian"]="et" ["Basque"]="eu" ["Persian"]="fa" ["Faroese"]="fo" ["Finnish"]="fi" ["French"]="fr" ["Frisian_Dutch"]="qfn" ["Guajajara"]="gub" ["Irish"]="ga" ["Galician"]="gl" ["Gothic"]="got" ["Ancient_Greek"]="grc" ["Mbya_Guarani"]="gun" ["Hebrew"]="he" ["Hindi"]="hi" ["Hindi_English"]="qhe" ["Croatian"]="hr"
+["Hungarian"]="hu" ["Icelandic"]="is" ["Indonesian"]="id" ["Italian"]="it" ["Japanese"]="ja" ["Kaapor"]="urb" ["Kangri"]="xnr" ["Karelian"]="krl" ["Kazakh"]="kk" ["Khunsari"]="kfm" ["Kiche"]="quc" ["Komi_Permyak"]="koi" ["Komi_Zyrian"]="kpv" ["Korean"]="ko" ["Kurmanji"]="kmr" ["Latin"]="la" ["Latvian"]="lv" ["Low_Saxon"]="nds" ["Makurap"]="mpu" ["Malayalam"]="mal" ["Manx"]="gv" ["Moksha"]="mdf" ["Munduruku"]="myu" ["Erzya"]="myv" ["Nayini"]="nyq" ["Dutch"]="nl" ["Norwegian_Bokmaal"]="nb" ["Norwegian_Nynorsk"]="nn" ["Polish"]="pl" ["Portuguese"]="pt" ["Romanian"]="ro" ["Russian"]="ru" ["Sanskrit"]="sa" ["Slovak"]="sk" ["Slovenian"]="sl" ["Soi"]="soj" ["South_Levantine_Arabic"]="ajp" ["Swedish"]="sv" ["Swiss_German"]="gsw" ["Tagalog"]="tl" ["Turkish"]="tr" ["Turkish_German"]="qtd" ["Uyghur"]="ug" ["Ukrainian"]="uk" ["Urdu"]="ur" ["Vietnamese"]="vi" ["Traditional_Chinese"]="zh-hant" ["Welsh"]="cy" ["Altaic"]="bxr" ["Indo_Iranian"]="kmr" ["Uralic"]="sme" ["Slavic"]="hsb"
+["Naija"]="pcm" ["North_Sami"]="sme" ["Old_French"]="fro" ["Old_Turkish"]="otk" ["Serbian"]="sr" ["Skolt_Sami"]="sms" ["Thai"]="th" ["Tupinamba"]="tpn" ["Upper_Sorbian"]="hsb" ["Belarusian"]="be" ["Classical_Chinese"]="lzh" ["Coptic"]="cop" ["Lithuanian"]="lt" ["Livvi"]="olo" ["Maltese"]="mt" ["Marathi"]="mr" ["Old_Russian"]="orv" ["Scottish_Gaelic"]="gd" ["Simplified_Chinese"]="zh-hans" ["Swedish_Sign_Language"]="swl" ["Tamil"]="ta" ["Telugu"]="te" ["Warlpipi"]="wbp" ["Western_Armenian"]="hyw" ["Wolof"]="wo" ["Yoruba"]="yo" ["Yupik"]="ess" )
 
 format=$1
 shift
diff --git a/stanza/_version.py b/stanza/_version.py
index 5d611f5b..4bce72e3 100644
--- a/stanza/_version.py
+++ b/stanza/_version.py
@@ -1,4 +1,4 @@
 """ Single source of truth for version number """
 
-__version__ = "1.2"
-__resources_version__ = '1.2.0'
+__version__ = "1.2.1"
+__resources_version__ = '1.2.1'
diff --git a/stanza/models/charlm.py b/stanza/models/charlm.py
index 031f2139..36345121 100644
--- a/stanza/models/charlm.py
+++ b/stanza/models/charlm.py
@@ -56,27 +56,26 @@ def readlines(path):
 
 def build_vocab(path, cutoff=0):
     # Requires a large amount of memory, but only need to build once
+
+    # here we need some trick to deal with excessively large files
+    # for each file we accumulate the counter of characters, and
+    # at the end we simply pass a list of chars to the vocab builder
+    counter = Counter()
     if os.path.isdir(path):
-        # here we need some trick to deal with excessively large files
-        # for each file we accumulate the counter of characters, and
-        # at the end we simply pass a list of chars to the vocab builder
-        counter = Counter()
         filenames = sorted(os.listdir(path))
-        for filename in filenames:
-            lines = readlines(path + '/' + filename)
-            for line in lines:
-                counter.update(list(line))
-        # remove infrequent characters from vocab
-        for k in list(counter.keys()):
-            if counter[k] < cutoff:
-                del counter[k]
-        # a singleton list of all characters
-        data = [sorted([x[0] for x in counter.most_common()])]
-        vocab = CharVocab(data) # skip cutoff argument because this has been dealt with
     else:
-        lines = readlines(path)
-        data = [list(line) for line in lines]
-        vocab = CharVocab(data, cutoff=cutoff)
+        filenames = [path]
+    for filename in filenames:
+        lines = readlines(path + '/' + filename)
+        for line in lines:
+            counter.update(list(line))
+    # remove infrequent characters from vocab
+    for k in list(counter.keys()):
+        if counter[k] < cutoff:
+            del counter[k]
+    # a singleton list of all characters
+    data = [sorted([x[0] for x in counter.most_common()])]
+    vocab = CharVocab(data) # skip cutoff argument because this has been dealt with
     return vocab
 
 def load_file(path, vocab, direction):
diff --git a/stanza/models/common/constant.py b/stanza/models/common/constant.py
index cde8b597..3ba570ab 100644
--- a/stanza/models/common/constant.py
+++ b/stanza/models/common/constant.py
@@ -20,6 +20,7 @@ lcode2lang = {
     "aii": "Assyrian",
     "bm": "Bambara",
     "eu": "Basque",
+    "bej": "Beja",
     "be": "Belarusian",
     "bn": "Bengali",
     "bho": "Bhojpuri",
@@ -42,10 +43,12 @@ lcode2lang = {
     "fo": "Faroese",
     "fi": "Finnish",
     "fr": "French",
+    "qfn": "Frisian_Dutch",
     "gl": "Galician",
     "de": "German",
     "got": "Gothic",
     "el": "Greek",
+    "gub": "Guajajara",
     "he": "Hebrew",
     "hi": "Hindi",
     "qhe": "Hindi_English",
@@ -55,9 +58,12 @@ lcode2lang = {
     "ga": "Irish",
     "it": "Italian",
     "ja": "Japanese",
+    "urb": "Kaapor",
+    "xnr": "Kangri",
     "krl": "Karelian",
     "kk": "Kazakh",
     "kfm": "Khunsari",
+    "quc": "Kiche",
     "koi": "Komi_Permyak",
     "kpv": "Komi_Zyrian",
     "ko": "Korean",
@@ -66,6 +72,8 @@ lcode2lang = {
     "olo": "Livvi",
     "la": "Latin",
     "lv": "Latvian",
+    "nds": "Low_Saxon",
+    "mpu": "Makurap",
     "mal": "Malayalam",
     "mt": "Maltese",
     "gv": "Manx",
@@ -80,7 +88,7 @@ lcode2lang = {
     "nn": "Norwegian_Nynorsk",
     "cu": "Old_Church_Slavonic",
     "fro": "Old_French",
-    "orv": "Old_Russian",
+    "orv": "Old_East_Slavic",
     "otk": "Old_Turkish",
     "fa": "Persian",
     "pl": "Polish",
@@ -114,7 +122,9 @@ lcode2lang = {
     "vi": "Vietnamese",
     "wbp": "Warlpiri",
     "cy": "Welsh",
+    "hyw": "Western_Armenian",
     "wo": "Wolof",
+    "ess": "Yupik",
     "yo": "Yoruba",
 }
 
@@ -128,6 +138,9 @@ lcode2lang['zh'] = 'Simplified_Chinese'
 
 lang2lcode['Chinese'] = 'zh'
 
+# treebank names changed from Old Russian to Old East Slavic in 2.8
+lang2lcode['Old_Russian'] = 'orv'
+
 treebank_special_cases = {
     "UD_Chinese-GSDSimp": "zh_gsdsimp",
     "UD_Chinese-GSD": "zh-hant_gsd",
diff --git a/stanza/models/common/short_name_to_treebank.py b/stanza/models/common/short_name_to_treebank.py
index 9e681787..871f58da 100644
--- a/stanza/models/common/short_name_to_treebank.py
+++ b/stanza/models/common/short_name_to_treebank.py
@@ -18,6 +18,7 @@ SHORT_NAMES = {
    'aii_as': 'UD_Assyrian-AS',
    'bm_crb': 'UD_Bambara-CRB',
    'eu_bdt': 'UD_Basque-BDT',
+   'bej_nsc': 'UD_Beja-NSC',
    'be_hse': 'UD_Belarusian-HSE',
    'bho_bhtb': 'UD_Bhojpuri-BHTB',
    'br_keb': 'UD_Breton-KEB',
@@ -67,6 +68,7 @@ SHORT_NAMES = {
    'fr_partut': 'UD_French-ParTUT',
    'fr_sequoia': 'UD_French-Sequoia',
    'fr_spoken': 'UD_French-Spoken',
+   'qfn_fame': 'UD_Frisian_Dutch-Fame',
    'gl_ctg': 'UD_Galician-CTG',
    'gl_treegal': 'UD_Galician-TreeGal',
    'de_gsd': 'UD_German-GSD',
@@ -75,30 +77,37 @@ SHORT_NAMES = {
    'de_pud': 'UD_German-PUD',
    'got_proiel': 'UD_Gothic-PROIEL',
    'el_gdt': 'UD_Greek-GDT',
+   'gub_tudet': 'UD_Guajajara-TuDeT',
    'he_htb': 'UD_Hebrew-HTB',
    'hi_hdtb': 'UD_Hindi-HDTB',
    'hi_pud': 'UD_Hindi-PUD',
    'qhe_hiencs': 'UD_Hindi_English-HIENCS',
    'hu_szeged': 'UD_Hungarian-Szeged',
    'is_icepahc': 'UD_Icelandic-IcePaHC',
+   'is_modern': 'UD_Icelandic-Modern',
    'is_pud': 'UD_Icelandic-PUD',
    'id_csui': 'UD_Indonesian-CSUI',
    'id_gsd': 'UD_Indonesian-GSD',
    'id_pud': 'UD_Indonesian-PUD',
    'ga_idt': 'UD_Irish-IDT',
+   'ga_twittirish': 'UD_Irish-TwittIrish',
    'it_isdt': 'UD_Italian-ISDT',
    'it_pud': 'UD_Italian-PUD',
    'it_partut': 'UD_Italian-ParTUT',
    'it_postwita': 'UD_Italian-PoSTWITA',
    'it_twittiro': 'UD_Italian-TWITTIRO',
    'it_vit': 'UD_Italian-VIT',
+   'it_valico': 'UD_Italian-Valico',
    'ja_bccwj': 'UD_Japanese-BCCWJ',
    'ja_gsd': 'UD_Japanese-GSD',
    'ja_modern': 'UD_Japanese-Modern',
    'ja_pud': 'UD_Japanese-PUD',
+   'urb_tudet': 'UD_Kaapor-TuDeT',
+   'xnr_kdtb': 'UD_Kangri-KDTB',
    'krl_kkpp': 'UD_Karelian-KKPP',
    'kk_ktb': 'UD_Kazakh-KTB',
    'kfm_aha': 'UD_Khunsari-AHA',
+   'quc_iu': 'UD_Kiche-IU',
    'koi_uh': 'UD_Komi_Permyak-UH',
    'kpv_ikdp': 'UD_Komi_Zyrian-IKDP',
    'kpv_lattice': 'UD_Komi_Zyrian-Lattice',
@@ -110,10 +119,13 @@ SHORT_NAMES = {
    'la_llct': 'UD_Latin-LLCT',
    'la_proiel': 'UD_Latin-PROIEL',
    'la_perseus': 'UD_Latin-Perseus',
+   'la_udante': 'UD_Latin-UDante',
    'lv_lvtb': 'UD_Latvian-LVTB',
    'lt_alksnis': 'UD_Lithuanian-ALKSNIS',
    'lt_hse': 'UD_Lithuanian-HSE',
    'olo_kkpp': 'UD_Livvi-KKPP',
+   'nds_lsdc': 'UD_Low_Saxon-LSDC',
+   'mpu_tudet': 'UD_Makurap-TuDeT',
    'mt_mudt': 'UD_Maltese-MUDT',
    'gv_cadhan': 'UD_Manx-Cadhan',
    'mr_ufal': 'UD_Marathi-UFAL',
@@ -128,9 +140,9 @@ SHORT_NAMES = {
    'nn_nynorsk': 'UD_Norwegian-Nynorsk',
    'nn_nynorsklia': 'UD_Norwegian-NynorskLIA',
    'cu_proiel': 'UD_Old_Church_Slavonic-PROIEL',
+   'orv_rnc': 'UD_Old_East_Slavic-RNC',
+   'orv_torot': 'UD_Old_East_Slavic-TOROT',
    'fro_srcmf': 'UD_Old_French-SRCMF',
-   'orv_rnc': 'UD_Old_Russian-RNC',
-   'orv_torot': 'UD_Old_Russian-TOROT',
    'otk_tonqq': 'UD_Old_Turkish-Tonqq',
    'fa_perdt': 'UD_Persian-PerDT',
    'fa_seraji': 'UD_Persian-Seraji',
@@ -140,6 +152,7 @@ SHORT_NAMES = {
    'pt_bosque': 'UD_Portuguese-Bosque',
    'pt_gsd': 'UD_Portuguese-GSD',
    'pt_pud': 'UD_Portuguese-PUD',
+   'ro_art': 'UD_Romanian-ArT',
    'ro_nonstandard': 'UD_Romanian-Nonstandard',
    'ro_rrt': 'UD_Romanian-RRT',
    'ro_simonero': 'UD_Romanian-SiMoNERo',
@@ -173,9 +186,13 @@ SHORT_NAMES = {
    'th_pud': 'UD_Thai-PUD',
    'tpn_tudet': 'UD_Tupinamba-TuDeT',
    'tr_boun': 'UD_Turkish-BOUN',
+   'tr_framenet': 'UD_Turkish-FrameNet',
    'tr_gb': 'UD_Turkish-GB',
    'tr_imst': 'UD_Turkish-IMST',
+   'tr_kenet': 'UD_Turkish-Kenet',
    'tr_pud': 'UD_Turkish-PUD',
+   'tr_penn': 'UD_Turkish-Penn',
+   'tr_tourism': 'UD_Turkish-Tourism',
    'qtd_sagt': 'UD_Turkish_German-SAGT',
    'uk_iu': 'UD_Ukrainian-IU',
    'hsb_ufal': 'UD_Upper_Sorbian-UFAL',
@@ -184,8 +201,10 @@ SHORT_NAMES = {
    'vi_vtb': 'UD_Vietnamese-VTB',
    'wbp_ufal': 'UD_Warlpiri-UFAL',
    'cy_ccg': 'UD_Welsh-CCG',
+   'hyw_armtdp': 'UD_Western_Armenian-ArmTDP',
    'wo_wtb': 'UD_Wolof-WTB',
    'yo_ytb': 'UD_Yoruba-YTB',
+   'ess_sli': 'UD_Yupik-SLI',
 }
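Back in charlm.py above, the directory and single-file branches now share one counting path instead of two diverging ones. Below is a standalone sketch of that logic under the assumption of plain UTF-8 text files; the real function wraps the result in a CharVocab and reads through its own readlines helper, so this is an illustration of the control flow rather than the stanza code itself.

    from collections import Counter
    import os

    def build_char_counts(path, cutoff=0):
        # One code path for both cases: a directory becomes a list of its files,
        # a single file becomes a one-element list.
        is_dir = os.path.isdir(path)
        filenames = sorted(os.listdir(path)) if is_dir else [path]
        counter = Counter()
        for filename in filenames:
            full_path = os.path.join(path, filename) if is_dir else filename
            with open(full_path, encoding="utf-8") as fin:
                for line in fin:
                    counter.update(line.rstrip("\n"))
        # drop characters rarer than the cutoff, mirroring the vocab trimming above
        return Counter({ch: n for ch, n in counter.items() if n >= cutoff})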
diff --git a/stanza/models/pos/xpos_vocab_factory.py b/stanza/models/pos/xpos_vocab_factory.py
index 5397ca34..39da44fd 100644
--- a/stanza/models/pos/xpos_vocab_factory.py
+++ b/stanza/models/pos/xpos_vocab_factory.py
@@ -4,9 +4,9 @@ from stanza.models.pos.vocab import WordVocab, XPOSVocab
 
 def xpos_vocab_factory(data, shorthand):
-    if shorthand in ["af_afribooms", "ar_padt", "bg_btb", "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut", "fr_partut", "gd_arcosg", "gl_ctg", "gl_treegal", "grc_perseus", "hr_set", "is_icepahc", "it_isdt", "it_partut", "it_postwita", "it_twittiro", "it_vit", "it_combined", "la_perseus", "lt_alksnis", "lv_lvtb", "ro_nonstandard", "ro_rrt", "ro_simonero", "sk_snk", "sl_ssj", "sl_sst", "sr_set", "ta_ttb", "uk_iu"]:
+    if shorthand in ["af_afribooms", "ar_padt", "bg_btb", "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut", "fr_partut", "gd_arcosg", "gl_ctg", "gl_treegal", "grc_perseus", "hr_set", "is_icepahc", "is_modern", "it_combined", "it_isdt", "it_partut", "it_postwita", "it_twittiro", "it_vit", "la_perseus", "la_udante", "lt_alksnis", "lv_lvtb", "ro_nonstandard", "ro_rrt", "ro_simonero", "sk_snk", "sl_ssj", "sl_sst", "sr_set", "ta_ttb", "uk_iu"]:
         return XPOSVocab(data, shorthand, idx=2, sep="")
-    elif shorthand in ["be_hse", "ca_ancora", "cop_scriptorium", "cu_proiel", "cy_ccg", "da_ddt", "de_gsd", "de_hdt", "el_gdt", "en_ewt", "en_gum", "en_combined", "es_ancora", "es_gsd", "et_edt", "et_ewt", "eu_bdt", "fa_perdt", "fa_seraji", "fi_tdt", "fr_ftb", "fr_gsd", "fro_srcmf", "fr_sequoia", "fr_spoken", "ga_idt", "got_proiel", "grc_proiel", "he_htb", "hi_hdtb", "hu_szeged", "hy_armtdp", "id_csui", "ja_gsd", "la_proiel", "lt_hse", "lzh_kyoto", "mr_ufal", "mt_mudt", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia", "orv_rnc", "orv_torot", "pcm_nsc", "pt_bosque", "pt_gsd", "qtd_sagt", "ru_gsd", "ru_syntagrus", "ru_taiga", "sa_vedic", "sme_giella", "swl_sslc", "te_mtg", "tr_boun", "tr_imst", "ug_udt", "vi_vtb", "wo_wtb", "zh_gsdsimp", "zh-hans_gsdsimp", "zh-hant_gsd", "bxr_bdt", "hsb_ufal", "ja_bccwj", "kk_ktb", "kmr_mg", "olo_kkpp"]:
+    elif shorthand in ["be_hse", "ca_ancora", "cop_scriptorium", "cu_proiel", "cy_ccg", "da_ddt", "de_gsd", "de_hdt", "el_gdt", "en_combined", "en_ewt", "en_gum", "es_ancora", "es_gsd", "et_edt", "et_ewt", "eu_bdt", "fa_perdt", "fa_seraji", "fi_tdt", "fr_gsd", "fro_srcmf", "fr_sequoia", "fr_spoken", "ga_idt", "got_proiel", "grc_proiel", "he_htb", "hi_hdtb", "hu_szeged", "hy_armtdp", "hyw_armtdp", "id_csui", "ja_gsd", "la_proiel", "lt_hse", "lzh_kyoto", "mr_ufal", "mt_mudt", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia", "orv_rnc", "orv_torot", "pcm_nsc", "pt_bosque", "pt_gsd", "qtd_sagt", "ru_gsd", "ru_syntagrus", "ru_taiga", "sa_vedic", "sme_giella", "swl_sslc", "te_mtg", "tr_boun", "tr_framenet", "tr_imst", "tr_kenet", "tr_penn", "tr_tourism", "ug_udt", "vi_vtb", "wo_wtb", "zh_gsdsimp", "zh-hant_gsd", "bxr_bdt", "hsb_ufal", "ja_bccwj", "kk_ktb", "kmr_mg", "olo_kkpp"]:
         return WordVocab(data, shorthand, idx=2, ignore=["_"])
     elif shorthand in ["en_lines", "fo_farpahc", "sv_lines", "ur_udtb"]:
         return XPOSVocab(data, shorthand, idx=2, sep="-")
diff --git a/stanza/resources/common.py b/stanza/resources/common.py
index ad9d722c..8e70e861 100644
--- a/stanza/resources/common.py
+++ b/stanza/resources/common.py
@@ -103,7 +103,7 @@ def file_exists(path, md5):
     """
     return os.path.exists(path) and get_md5(path) == md5
 
-def download_file(url, path, proxies):
+def download_file(url, path, proxies, raise_for_status=False):
     """
     Download a URL into a file as specified by `path`.
     """
@@ -120,8 +120,11 @@ def download_file(url, path, proxies):
                 f.write(chunk)
                 f.flush()
                 pbar.update(len(chunk))
+    if raise_for_status:
+        r.raise_for_status()
+    return r.status_code
 
-def request_file(url, path, proxies=None, md5=None):
+def request_file(url, path, proxies=None, md5=None, raise_for_status=False):
     """
     A complete wrapper over download_file() that also make sure the directory of
     `path` exists, and that a file matching the md5 value does not exist.
@@ -130,7 +133,7 @@ def request_file(url, path, proxies=None, md5=None):
     if file_exists(path, md5):
         logger.info(f'File exists: {path}.')
         return
-    download_file(url, path, proxies)
+    download_file(url, path, proxies, raise_for_status)
     assert(not md5 or file_exists(path, md5))
 
 def sort_processors(processor_list):
@@ -332,7 +335,8 @@ def download_resources_json(model_dir, resources_url, resources_branch,
     request_file(
         f'{resources_url}/resources_{resources_version}.json',
         os.path.join(model_dir, 'resources.json'),
-        proxies
+        proxies,
+        raise_for_status=True
     )
diff --git a/stanza/resources/prepare_resources.py b/stanza/resources/prepare_resources.py
index f3fef48e..31791e21 100644
--- a/stanza/resources/prepare_resources.py
+++ b/stanza/resources/prepare_resources.py
@@ -68,6 +68,7 @@ default_treebanks = {
     "ug": "udt",
     "vi": "vtb",
     "lt": "alksnis",
+    "hyw": "armtdp",
     "wo": "wtb",
     "nb": "bokmaal",
     "mt": "mudt",
@@ -89,11 +90,13 @@ default_treebanks = {
 
 # default ner for languages
 default_ners = {
     "ar": "aqmar",
+    "bg": "bsnlp19",
     "de": "conll03",
     "en": "ontonotes",
     "es": "conll02",
     "fi": "turku",
     "fr": "wikiner",
+    "hu": "combined",
     "nl": "conll02",
     "ru": "wikiner",
     "uk": "languk",
@@ -104,6 +107,7 @@ default_ners = {
 # default charlms for languages
 default_charlms = {
     "ar": "ccwiki",
+    "bg": "conll17",
     "de": "newswiki",
     "en": "1billion",
     "es": "newswiki",
@@ -131,7 +135,10 @@ ner_charlms = {
     },
     "uk": {
         "languk": None,
-    }
+    },
+    "hu": {
+        "combined": None,
+    },
 }
 
 # a few languages have sentiment classifier models
@@ -213,7 +220,7 @@ lcode2lang = {
     "nn": "Norwegian_Nynorsk",
     "cu": "Old_Church_Slavonic",
     "fro": "Old_French",
-    "orv": "Old_Russian",
+    "orv": "Old_East_Slavic",
     "fa": "Persian",
     "pl": "Polish",
     "pt": "Portuguese",
@@ -239,6 +246,7 @@ lcode2lang = {
     "ug": "Uyghur",
     "vi": "Vietnamese",
     "cy": "Welsh",
+    "hyw": "Western_Armenian",
     "wo": "Wolof"
 }
diff --git a/stanza/tests/test_installation.py b/stanza/tests/test_installation.py
index 05e5b650..13b8c4f9 100644
--- a/stanza/tests/test_installation.py
+++ b/stanza/tests/test_installation.py
@@ -30,7 +30,7 @@ def test_install_corenlp():
 
 def test_download_corenlp_models():
     model_name = "arabic"
-    version = "4.2.0"
+    version = "4.2.2"
 
     with tempfile.TemporaryDirectory(dir=".") as test_dir:
         stanza.download_corenlp_models(model=model_name, version=version, dir=test_dir)
diff --git a/stanza/utils/datasets/prepare_lemma_treebank.py b/stanza/utils/datasets/prepare_lemma_treebank.py
index 3f90fcf5..a754c4fe 100644
--- a/stanza/utils/datasets/prepare_lemma_treebank.py
+++ b/stanza/utils/datasets/prepare_lemma_treebank.py
@@ -12,8 +12,43 @@ and it will prepare each of train, dev, test
 import stanza.utils.datasets.common as common
 import stanza.utils.datasets.prepare_tokenizer_treebank as prepare_tokenizer_treebank
 
+def check_lemmas(train_file):
+    """
+    Check if a treebank has any lemmas in it
+
+    For example, in Vietnamese-VTB, all the words and lemmas are exactly the same
+    in Telugu-MTG, all the lemmas are blank
+    """
+    # could eliminate a few languages immediately based on UD 2.7
+    # but what if a later dataset includes lemmas?
+    #if short_language in ('vi', 'fro', 'th'):
+    #    return False
+    with open(train_file) as fin:
+        for line in fin:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            pieces = line.split("\t")
+            word = pieces[1].lower().strip()
+            lemma = pieces[2].lower().strip()
+            if not lemma or lemma == '_' or lemma == '-':
+                continue
+            if word == lemma:
+                continue
+            return True
+    return False
+
 def process_treebank(treebank, paths, args):
-    prepare_tokenizer_treebank.copy_conllu_treebank(treebank, paths, paths["LEMMA_DATA_DIR"])
+    if treebank.startswith("UD_"):
+        udbase_dir = paths["UDBASE"]
+        train_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
+        augment = check_lemmas(train_conllu)
+        if not augment:
+            print("No lemma information found in %s. Not augmenting the dataset" % train_conllu)
+    else:
+        # TODO: check the data to see if there are lemmas or not
+        augment = True
+    prepare_tokenizer_treebank.copy_conllu_treebank(treebank, paths, paths["LEMMA_DATA_DIR"], augment=augment)
 
 def main():
     common.main(process_treebank)
diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
index 7237e722..459a6a74 100755
--- a/stanza/utils/datasets/prepare_tokenizer_treebank.py
+++ b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -34,7 +34,6 @@ from collections import Counter
 
 import stanza.utils.datasets.common as common
 import stanza.utils.datasets.prepare_tokenizer_data as prepare_tokenizer_data
-import stanza.utils.datasets.preprocess_ssj_data as preprocess_ssj_data
 
 def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name):
@@ -43,7 +42,7 @@ def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_n
 
     shutil.copyfile(original, copied)
 
-def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None):
+def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None, augment=True):
     """
     This utility method copies only the conllu files to the given destination directory.
 
@@ -60,7 +59,7 @@ def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None):
     # first we process the tokenization data
     args = argparse.Namespace()
-    args.augment = False
+    args.augment = augment
     args.prepare_labels = False
     process_treebank(treebank, paths, args)
 
@@ -783,8 +782,6 @@ def build_combined_korean(udbase_dir, tokenizer_dir, short_name):
         build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu)
 
 def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
-    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-
     if dataset == 'train':
         # could maybe add ParTUT, but that dataset has a slightly different xpos set
         # (no DE or I)
@@ -806,13 +803,11 @@ def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
                 raise AssertionError("Unexpected format of the italian.mwt file. Has it already be modified to have SpaceAfter=No everywhere?")
             sentence[2] = sentence[2][:-1] + "SpaceAfter=No"
         sents = sents + extra_sents
-
-        sents = augment_punct(sents)
     else:
         istd_conllu = common.find_treebank_dataset_file("UD_Italian-ISDT", udbase_dir, dataset, "conllu")
         sents = read_sentences_from_conllu(istd_conllu)
 
-    write_sentences_to_conllu(output_conllu, sents)
+    return sents
 
 def check_gum_ready(udbase_dir):
     gum_conllu = common.find_treebank_dataset_file("UD_English-GUMReddit", udbase_dir, "train", "conllu")
@@ -827,8 +822,6 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
     """
     check_gum_ready(udbase_dir)
 
-    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-
     if dataset == 'train':
         # TODO: include more UD treebanks, possibly with xpos removed
         #  UD_English-ParTUT - xpos are different
@@ -843,15 +836,13 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
         for treebank in test_treebanks:
             conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu", fail=True)
             sents.extend(read_sentences_from_conllu(conllu_file))
-
-        # TODO: refactor things like the augment_punct call
-        sents = augment_punct(sents)
     else:
         ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
         sents = read_sentences_from_conllu(ewt_conllu)
 
     sents = strip_mwt_from_sentences(sents)
-    write_sentences_to_conllu(output_conllu, sents)
+
+    return sents
 
 def replace_semicolons(sentences):
     """
@@ -888,8 +879,6 @@ def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
     TODO: remove features which aren't shared between datasets
     TODO: consider mixing in PUD?
     """
-    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-
     if dataset == 'train':
         treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
         sents = []
@@ -905,15 +894,11 @@ def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
             raise FileNotFoundError("Cannot find the extra dataset 'spanish.mwt' which includes various multi-words retokenized, expected {}".format(extra_italian))
         extra_sents = read_sentences_from_conllu(extra_spanish)
         sents.extend(extra_sents)
-
-        # TODO: refactor things like the augment_punct call
-        sents = augment_punct(sents)
     else:
         conllu_file = common.find_treebank_dataset_file("UD_Spanish-AnCora", udbase_dir, dataset, "conllu", fail=True)
         sents = read_sentences_from_conllu(conllu_file)
 
-    write_sentences_to_conllu(output_conllu, sents)
-
+    return sents
 
 COMBINED_FNS = {
@@ -922,19 +907,24 @@ COMBINED_FNS = {
     "it_combined": build_combined_italian_dataset,
 }
 
-def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name):
+def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, augment):
     random.seed(1234)
     build_fn = COMBINED_FNS[short_name]
     for dataset in ("train", "dev", "test"):
-        build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+        output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+        sents = build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+        if dataset == 'train' and augment:
+            sents = augment_punct(sents)
+        write_sentences_to_conllu(output_conllu, sents)
 
-def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset):
+def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment):
     """
     Build the GUM dataset by combining GUMReddit
 
     It checks to make sure GUMReddit is filled out using the included script
     """
     check_gum_ready(udbase_dir)
+    random.seed(1234)
 
     output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
@@ -944,36 +934,30 @@ def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, da
         conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
         sents.extend(read_sentences_from_conllu(conllu_file))
 
-    if dataset == 'train':
+    if dataset == 'train' and augment:
         sents = augment_punct(sents)
 
     write_sentences_to_conllu(output_conllu, sents)
 
-def build_combined_english_gum(udbase_dir, tokenizer_dir, short_name):
+def build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, augment):
     for dataset in ("train", "dev", "test"):
-        build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset)
+        build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment)
 
 def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True):
     input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu")
     output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
 
-    if short_name == "sl_ssj":
-        preprocess_ssj_data.process(input_conllu, output_conllu)
-    elif short_name == "te_mtg" and dataset == 'train' and augment:
+    if short_name == "te_mtg" and dataset == 'train' and augment:
         write_augmented_dataset(input_conllu, output_conllu, augment_telugu)
     elif short_name == "ar_padt" and dataset == 'train' and augment:
         write_augmented_dataset(input_conllu, output_conllu, augment_arabic_padt)
     elif short_name.startswith("ko_") and short_name.endswith("_seg"):
         remove_spaces(input_conllu, output_conllu)
-    elif dataset == 'train':
-        # we treat the additional punct as something that always needs to be there
-        # this will teach the tagger & depparse about unicode apos, for example
+    elif dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, output_conllu, augment_punct)
     else:
         shutil.copyfile(input_conllu, output_conllu)
 
-    # TODO: refactor this call everywhere
-
 def process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, augment=True):
     """
     Process a normal UD treebank with train/dev/test splits
@@ -1049,11 +1033,11 @@ def process_treebank(treebank, paths, args):
     if short_name.startswith("ko_combined"):
         build_combined_korean(udbase_dir, tokenizer_dir, short_name)
     elif short_name in ("it_combined", "en_combined", "es_combined"):
-        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name)
+        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
     elif short_name.startswith("en_gum"):
         # we special case GUM because it should include a filled-out GUMReddit
         print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
-        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name)
+        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, args.augment)
     else:
         # check that we can find the train file where we expect it
         train_conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
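The two dataset scripts above cooperate: check_lemmas decides whether a UD treebank actually contains lemma information, and copy_conllu_treebank now threads that decision through as an augment flag instead of hard-coding args.augment = False. As a concrete illustration of what check_lemmas treats as real lemma data, consider a hypothetical tab-separated CoNLL-U fragment (columns ID, FORM, LEMMA, ...); the values below are made up for the example.

    # Hypothetical lines; only the last one makes check_lemmas return True,
    # because the form "ran" carries a genuinely different lemma "run".
    lines = [
        "1\txin\t_\t...",       # underscore lemma: skipped
        "2\tchao\tchao\t...",   # lemma identical to the word: skipped
        "3\tran\trun\t...",     # distinct lemma: counts as lemma information
    ]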
diff --git a/stanza/utils/datasets/preprocess_ssj_data.py b/stanza/utils/datasets/preprocess_ssj_data.py
deleted file mode 100644
index 4ce7a8b9..00000000
--- a/stanza/utils/datasets/preprocess_ssj_data.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
-The SSJ dataset has an unusual bug: all of the sentences end with SpaceAfter=no
-
-This script fixes them and writes the fixed files to the given location.
-"""
-
-def process(input_conllu, input_conllu_copy):
-    conllu_lines = open(input_conllu).readlines()
-
-    new_conllu_lines = list(conllu_lines)
-
-    line_idx = 0
-    text_idx = 0
-    # invariant: conllu_lines[line_idx] is
-    #   # sent_id
-    # at the start of a loop
-    while line_idx < len(conllu_lines):
-        # extract the text from the comments before each sentence
-        line_idx = line_idx + 1
-        text_line = conllu_lines[line_idx]
-        assert text_line.startswith("# text = "), "Unexpected format: %s,%d is not # text" % (input_conllu, line_idx)
-        text_line = text_line[9:-1]
-        # use that text to keep track of an index in the text where we might need to put new spaces
-        text_idx = text_idx + len(text_line)
-
-        # advance to the end of the sentence
-        line_idx = line_idx + 1
-        assert conllu_lines[line_idx].startswith("1"), "Unexpected format: %s,%d is not a word" % (input_conllu, line_idx)
-        while conllu_lines[line_idx].strip():
-            line_idx = line_idx + 1
-        last_word_idx = line_idx - 1
-
-        # check if the end of the sentence has SpaceAfter or not
-        new_line = conllu_lines[last_word_idx].replace("SpaceAfter=No|", "")
-        assert new_line.find("SpaceAfter=") < 0, "Unexpected format: %s,%d has unusual SpaceAfter" % (input_conllu, line_idx)
-
-        # if not, need to add a new space
-        if new_line != conllu_lines[last_word_idx]:
-            conllu_lines[last_word_idx] = new_line
-            text_idx = text_idx + 1
-
-        # done with a sentence. skip to the start of the next sentence
-        # or the end of the document
-        while line_idx < len(conllu_lines) and not conllu_lines[line_idx].strip():
-            line_idx = line_idx + 1
-
-    with open(input_conllu_copy, "w") as fout:
-        for line in conllu_lines:
-            fout.write(line)
diff --git a/stanza/utils/training/run_lemma.py b/stanza/utils/training/run_lemma.py
index 96b0f1cf..f86648e5 100644
--- a/stanza/utils/training/run_lemma.py
+++ b/stanza/utils/training/run_lemma.py
@@ -24,33 +24,9 @@ from stanza.models import lemmatizer
 from stanza.utils.training import common
 from stanza.utils.training.common import Mode
 
-logger = logging.getLogger('stanza')
+from stanza.utils.datasets.prepare_lemma_treebank import check_lemmas
 
-def check_lemmas(train_file):
-    """
-    Check if a treebank has any lemmas in it
-
-    For example, in Vietnamese-VTB, all the words and lemmas are exactly the same
-    in Telugu-MTG, all the lemmas are blank
-    """
-    # could eliminate a few languages immediately based on UD 2.7
-    # but what if a later dataset includes lemmas?
-    #if short_language in ('vi', 'fro', 'th'):
-    #    return False
-    with open(train_file) as fin:
-        for line in fin:
-            line = line.strip()
-            if not line or line.startswith("#"):
-                continue
-            pieces = line.split("\t")
-            word = pieces[1].lower().strip()
-            lemma = pieces[2].lower().strip()
-            if not lemma or lemma == '_' or lemma == '-':
-                continue
-            if word == lemma:
-                continue
-            return True
-    return False
+logger = logging.getLogger('stanza')
 
 def run_treebank(mode, paths, treebank, short_name, temp_output_file, command_args, extra_args):
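With the duplicate removed, run_lemma.py imports the single shared implementation of check_lemmas rather than keeping its own copy, and other scripts can do the same. A minimal usage sketch follows; the file path is a placeholder and should point at a real UD training conllu file.

    from stanza.utils.datasets.prepare_lemma_treebank import check_lemmas

    # Placeholder path; substitute an actual UD training file.
    if not check_lemmas("path/to/UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"):
        print("this treebank has no usable lemma information")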