Mirror of github.com/stanfordnlp/stanza.git
author     John Bauer <horatio@gmail.com>            2021-06-08 22:46:50 +0300
committer  GitHub <noreply@github.com>               2021-06-08 22:46:50 +0300
commit     ca0ff9c40609891da2ad5ef4765369cbd00ef5d5 (patch)
tree       e283f72143df199be15c36c2868acdbbd64d00ce
parent     405e516dee460c57efaa9b20c0728f7e0b6d0d61 (diff)
parent     0ac918983bb2210230ab2a3c229711c1ab22903e (diff)
Merge pull request #718 from stanfordnlp/ud28
Ud28
-rwxr-xr-x  scripts/download_vectors.sh                          | 14
-rwxr-xr-x  scripts/treebank_to_shorthand.sh                     |  6
-rw-r--r--  stanza/_version.py                                   |  4
-rw-r--r--  stanza/models/charlm.py                              | 35
-rw-r--r--  stanza/models/common/constant.py                     | 15
-rw-r--r--  stanza/models/common/short_name_to_treebank.py       | 23
-rw-r--r--  stanza/models/pos/xpos_vocab_factory.py              |  4
-rw-r--r--  stanza/resources/common.py                           | 12
-rw-r--r--  stanza/resources/prepare_resources.py                | 12
-rw-r--r--  stanza/tests/test_installation.py                    |  2
-rw-r--r--  stanza/utils/datasets/prepare_lemma_treebank.py      | 37
-rwxr-xr-x  stanza/utils/datasets/prepare_tokenizer_treebank.py  | 58
-rw-r--r--  stanza/utils/datasets/preprocess_ssj_data.py         | 49
-rw-r--r--  stanza/utils/training/run_lemma.py                   | 28
14 files changed, 148 insertions, 151 deletions
diff --git a/scripts/download_vectors.sh b/scripts/download_vectors.sh
index 960f57ef..1705fdfe 100755
--- a/scripts/download_vectors.sh
+++ b/scripts/download_vectors.sh
@@ -20,9 +20,17 @@ FASTTEXT_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"
# Welsh, Icelandic, Thai, Sanskrit
# https://fasttext.cc/docs/en/crawl-vectors.html
-declare -a FASTTEXT_LANG=("Afrikaans" "Armenian" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
-declare -a FASTTEXT_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
-declare -a LOCAL_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
+# We get the Armenian word vectors from here:
+# https://github.com/ispras-texterra/word-embeddings-eval-hy
+# https://arxiv.org/ftp/arxiv/papers/1906/1906.03134.pdf
+# In particular, the glove model (dogfooding):
+# https://at.ispras.ru/owncloud/index.php/s/pUUiS1l1jGKNax3/download
+# These vectors improved F1 by about 1 on various tasks for Armenian
+# and had much better coverage of Western Armenian
+
+declare -a FASTTEXT_LANG=("Afrikaans" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
+declare -a FASTTEXT_CODE=("af" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
+declare -a LOCAL_CODE=("af" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
color_green='\033[32;1m'
color_clear='\033[0m' # No Color
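Note: Armenian ("hy") is dropped from all three parallel arrays because its vectors now come from the ISPRAS release linked in the new comment rather than from fasttext. A hedged Python sketch of how such parallel name/code arrays are typically consumed to build fasttext URLs; the script itself does this in bash, and the "<local>.vectors.xz" target name is illustrative only.

# Hedged sketch (not the script itself): consuming the parallel arrays above.
FASTTEXT_BASE_URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"
FASTTEXT_LANG = ["Afrikaans", "Breton", "Buryat"]
FASTTEXT_CODE = ["af", "br", "bxr"]
LOCAL_CODE = ["af", "br", "bxr"]

for lang, code, local in zip(FASTTEXT_LANG, FASTTEXT_CODE, LOCAL_CODE):
    url = f"{FASTTEXT_BASE_URL}/wiki.{code}.vec"
    print(f"{lang}: fetch {url} -> {local}.vectors.xz")   # target name is an assumption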
diff --git a/scripts/treebank_to_shorthand.sh b/scripts/treebank_to_shorthand.sh
index 1d15b877..bb6f1793 100755
--- a/scripts/treebank_to_shorthand.sh
+++ b/scripts/treebank_to_shorthand.sh
@@ -7,9 +7,9 @@
# Please keep synced with
# stanza/models/common/constant.py
-declare -A lang2lcode=( ["Afrikaans"]="af" ["Akkadian"]="akk" ["Akuntsu"]="aqz" ["Albanian"]="sq" ["Amharic"]="am" ["Apurina"]="apu" ["Armenian"]="hy" ["Arabic"]="ar" ["Assyrian"]="aii" ["Bambara"]="bm" ["Bengali"]="bn" ["Bhojpuri"]="bho" ["Breton"]="br" ["Bulgarian"]="bg" ["Buryat"]="bxr" ["Cantonese"]="yue" ["Catalan"]="ca" ["Chukchi"]="ckt" ["Czech"]="cs" ["Old_Church_Slavonic"]="cu" ["Danish"]="da" ["German"]="de" ["Greek"]="el" ["English"]="en" ["Spanish"]="es" ["Estonian"]="et" ["Basque"]="eu" ["Persian"]="fa" ["Faroese"]="fo" ["Finnish"]="fi" ["French"]="fr" ["Irish"]="ga" ["Galician"]="gl" ["Gothic"]="got" ["Ancient_Greek"]="grc" ["Mbya_Guarani"]="gun" ["Hebrew"]="he" ["Hindi"]="hi" ["Hindi_English"]="qhe" ["Croatian"]="hr"
-["Hungarian"]="hu" ["Icelandic"]="is" ["Indonesian"]="id" ["Italian"]="it" ["Japanese"]="ja" ["Karelian"]="krl" ["Kazakh"]="kk" ["Khunsari"]="kfm" ["Komi_Permyak"]="koi" ["Komi_Zyrian"]="kpv" ["Korean"]="ko" ["Kurmanji"]="kmr" ["Latin"]="la" ["Latvian"]="lv" ["Malayalam"]="mal" ["Manx"]="gv" ["Moksha"]="mdf" ["Munduruku"]="myu" ["Erzya"]="myv" ["Nayini"]="nyq" ["Dutch"]="nl" ["Norwegian_Bokmaal"]="nb" ["Norwegian_Nynorsk"]="nn" ["Polish"]="pl" ["Portuguese"]="pt" ["Romanian"]="ro" ["Russian"]="ru" ["Sanskrit"]="sa" ["Slovak"]="sk" ["Slovenian"]="sl" ["Soi"]="soj" ["South_Levantine_Arabic"]="ajp" ["Swedish"]="sv" ["Swiss_German"]="gsw" ["Tagalog"]="tl" ["Turkish"]="tr" ["Turkish_German"]="qtd" ["Uyghur"]="ug" ["Ukrainian"]="uk" ["Urdu"]="ur" ["Vietnamese"]="vi" ["Traditional_Chinese"]="zh-hant" ["Welsh"]="cy" ["Altaic"]="bxr" ["Indo_Iranian"]="kmr" ["Uralic"]="sme" ["Slavic"]="hsb"
-["Naija"]="pcm" ["North_Sami"]="sme" ["Old_French"]="fro" ["Old_Turkish"]="otk" ["Serbian"]="sr" ["Skolt_Sami"]="sms" ["Thai"]="th" ["Tupinamba"]="tpn" ["Upper_Sorbian"]="hsb" ["Belarusian"]="be" ["Classical_Chinese"]="lzh" ["Coptic"]="cop" ["Lithuanian"]="lt" ["Livvi"]="olo" ["Maltese"]="mt" ["Marathi"]="mr" ["Old_Russian"]="orv" ["Scottish_Gaelic"]="gd" ["Simplified_Chinese"]="zh-hans" ["Swedish_Sign_Language"]="swl" ["Tamil"]="ta" ["Telugu"]="te" ["Warlpipi"]="wbp" ["Wolof"]="wo" ["Yoruba"]="yo" )
+declare -A lang2lcode=( ["Afrikaans"]="af" ["Akkadian"]="akk" ["Akuntsu"]="aqz" ["Albanian"]="sq" ["Amharic"]="am" ["Apurina"]="apu" ["Armenian"]="hy" ["Arabic"]="ar" ["Assyrian"]="aii" ["Bambara"]="bm" ["Beja"]="bej" ["Bengali"]="bn" ["Bhojpuri"]="bho" ["Breton"]="br" ["Bulgarian"]="bg" ["Buryat"]="bxr" ["Cantonese"]="yue" ["Catalan"]="ca" ["Chukchi"]="ckt" ["Czech"]="cs" ["Old_Church_Slavonic"]="cu" ["Danish"]="da" ["German"]="de" ["Greek"]="el" ["English"]="en" ["Spanish"]="es" ["Estonian"]="et" ["Basque"]="eu" ["Persian"]="fa" ["Faroese"]="fo" ["Finnish"]="fi" ["French"]="fr" ["Frisian_Dutch"]="qfn" ["Guajajara"]="gub" ["Irish"]="ga" ["Galician"]="gl" ["Gothic"]="got" ["Ancient_Greek"]="grc" ["Mbya_Guarani"]="gun" ["Hebrew"]="he" ["Hindi"]="hi" ["Hindi_English"]="qhe" ["Croatian"]="hr"
+["Hungarian"]="hu" ["Icelandic"]="is" ["Indonesian"]="id" ["Italian"]="it" ["Japanese"]="ja" ["Kaapor"]="urb" ["Kangri"]="xnr" ["Karelian"]="krl" ["Kazakh"]="kk" ["Khunsari"]="kfm" ["Kiche"]="quc" ["Komi_Permyak"]="koi" ["Komi_Zyrian"]="kpv" ["Korean"]="ko" ["Kurmanji"]="kmr" ["Latin"]="la" ["Latvian"]="lv" ["Low_Saxon"]="nds" ["Makurap"]="mpu" ["Malayalam"]="mal" ["Manx"]="gv" ["Moksha"]="mdf" ["Munduruku"]="myu" ["Erzya"]="myv" ["Nayini"]="nyq" ["Dutch"]="nl" ["Norwegian_Bokmaal"]="nb" ["Norwegian_Nynorsk"]="nn" ["Polish"]="pl" ["Portuguese"]="pt" ["Romanian"]="ro" ["Russian"]="ru" ["Sanskrit"]="sa" ["Slovak"]="sk" ["Slovenian"]="sl" ["Soi"]="soj" ["South_Levantine_Arabic"]="ajp" ["Swedish"]="sv" ["Swiss_German"]="gsw" ["Tagalog"]="tl" ["Turkish"]="tr" ["Turkish_German"]="qtd" ["Uyghur"]="ug" ["Ukrainian"]="uk" ["Urdu"]="ur" ["Vietnamese"]="vi" ["Traditional_Chinese"]="zh-hant" ["Welsh"]="cy" ["Altaic"]="bxr" ["Indo_Iranian"]="kmr" ["Uralic"]="sme" ["Slavic"]="hsb"
+["Naija"]="pcm" ["North_Sami"]="sme" ["Old_French"]="fro" ["Old_Turkish"]="otk" ["Serbian"]="sr" ["Skolt_Sami"]="sms" ["Thai"]="th" ["Tupinamba"]="tpn" ["Upper_Sorbian"]="hsb" ["Belarusian"]="be" ["Classical_Chinese"]="lzh" ["Coptic"]="cop" ["Lithuanian"]="lt" ["Livvi"]="olo" ["Maltese"]="mt" ["Marathi"]="mr" ["Old_Russian"]="orv" ["Scottish_Gaelic"]="gd" ["Simplified_Chinese"]="zh-hans" ["Swedish_Sign_Language"]="swl" ["Tamil"]="ta" ["Telugu"]="te" ["Warlpipi"]="wbp" ["Western_Armenian"]="hyw" ["Wolof"]="wo" ["Yoruba"]="yo" ["Yupik"]="ess" )
format=$1
shift
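Note: the associative array maps UD language names to language codes so the script can turn a treebank directory name into its shorthand. A hedged Python equivalent of that conversion, using a few of the entries added in this patch:

# Hedged Python equivalent of what treebank_to_shorthand.sh computes:
# "UD_<Language>-<Dataset>" -> "<lcode>_<dataset>".
lang2lcode = {"Western_Armenian": "hyw", "Beja": "bej", "Yupik": "ess"}

def treebank_to_shorthand(treebank):
    lang, dataset = treebank[3:].split("-", 1)   # strip the "UD_" prefix
    return f"{lang2lcode[lang]}_{dataset.lower()}"

print(treebank_to_shorthand("UD_Western_Armenian-ArmTDP"))  # hyw_armtdp
print(treebank_to_shorthand("UD_Yupik-SLI"))                # ess_sli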
diff --git a/stanza/_version.py b/stanza/_version.py
index 5d611f5b..4bce72e3 100644
--- a/stanza/_version.py
+++ b/stanza/_version.py
@@ -1,4 +1,4 @@
""" Single source of truth for version number """
-__version__ = "1.2"
-__resources_version__ = '1.2.0'
+__version__ = "1.2.1"
+__resources_version__ = '1.2.1'
diff --git a/stanza/models/charlm.py b/stanza/models/charlm.py
index 031f2139..36345121 100644
--- a/stanza/models/charlm.py
+++ b/stanza/models/charlm.py
@@ -56,27 +56,26 @@ def readlines(path):
def build_vocab(path, cutoff=0):
# Requires a large amount of memory, but only need to build once
+
+ # here we need some trick to deal with excessively large files
+ # for each file we accumulate the counter of characters, and
+ # at the end we simply pass a list of chars to the vocab builder
+ counter = Counter()
if os.path.isdir(path):
- # here we need some trick to deal with excessively large files
- # for each file we accumulate the counter of characters, and
- # at the end we simply pass a list of chars to the vocab builder
- counter = Counter()
filenames = sorted(os.listdir(path))
- for filename in filenames:
- lines = readlines(path + '/' + filename)
- for line in lines:
- counter.update(list(line))
- # remove infrequent characters from vocab
- for k in list(counter.keys()):
- if counter[k] < cutoff:
- del counter[k]
- # a singleton list of all characters
- data = [sorted([x[0] for x in counter.most_common()])]
- vocab = CharVocab(data) # skip cutoff argument because this has been dealt with
else:
- lines = readlines(path)
- data = [list(line) for line in lines]
- vocab = CharVocab(data, cutoff=cutoff)
+ filenames = [path]
+ for filename in filenames:
+ lines = readlines(path + '/' + filename)
+ for line in lines:
+ counter.update(list(line))
+ # remove infrequent characters from vocab
+ for k in list(counter.keys()):
+ if counter[k] < cutoff:
+ del counter[k]
+ # a singleton list of all characters
+ data = [sorted([x[0] for x in counter.most_common()])]
+ vocab = CharVocab(data) # skip cutoff argument because this has been dealt with
return vocab
def load_file(path, vocab, direction):
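Note: the refactor above folds the directory case and the single-file case into one loop over a shared character Counter. A standalone sketch of that counting strategy, with hypothetical stand-ins for stanza's readlines() and CharVocab (this is an illustration of the idea, not the exact code in the diff):

# Minimal sketch, assuming plain UTF-8 text files.
import os
from collections import Counter

def read_lines(filename):
    # hypothetical stand-in for stanza's readlines()
    with open(filename, encoding="utf-8") as fin:
        return fin.readlines()

class SimpleCharVocab:
    # hypothetical stand-in for stanza.models.common.vocab.CharVocab
    def __init__(self, data):
        self.id2unit = data[0]                    # data holds one sorted char list
        self.unit2id = {c: i for i, c in enumerate(self.id2unit)}

def build_char_vocab(path, cutoff=0):
    counter = Counter()
    if os.path.isdir(path):
        filenames = [os.path.join(path, f) for f in sorted(os.listdir(path))]
    else:
        filenames = [path]
    for filename in filenames:                    # same loop for both cases
        for line in read_lines(filename):
            counter.update(line)
    for ch in list(counter):                      # apply the frequency cutoff once
        if counter[ch] < cutoff:
            del counter[ch]
    return SimpleCharVocab([sorted(counter)])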
diff --git a/stanza/models/common/constant.py b/stanza/models/common/constant.py
index cde8b597..3ba570ab 100644
--- a/stanza/models/common/constant.py
+++ b/stanza/models/common/constant.py
@@ -20,6 +20,7 @@ lcode2lang = {
"aii": "Assyrian",
"bm": "Bambara",
"eu": "Basque",
+ "bej": "Beja",
"be": "Belarusian",
"bn": "Bengali",
"bho": "Bhojpuri",
@@ -42,10 +43,12 @@ lcode2lang = {
"fo": "Faroese",
"fi": "Finnish",
"fr": "French",
+ "qfn": "Frisian_Dutch",
"gl": "Galician",
"de": "German",
"got": "Gothic",
"el": "Greek",
+ "gub": "Guajajara",
"he": "Hebrew",
"hi": "Hindi",
"qhe": "Hindi_English",
@@ -55,9 +58,12 @@ lcode2lang = {
"ga": "Irish",
"it": "Italian",
"ja": "Japanese",
+ "urb": "Kaapor",
+ "xnr": "Kangri",
"krl": "Karelian",
"kk": "Kazakh",
"kfm": "Khunsari",
+ "quc": "Kiche",
"koi": "Komi_Permyak",
"kpv": "Komi_Zyrian",
"ko": "Korean",
@@ -66,6 +72,8 @@ lcode2lang = {
"olo": "Livvi",
"la": "Latin",
"lv": "Latvian",
+ "nds": "Low_Saxon",
+ "mpu": "Makurap",
"mal": "Malayalam",
"mt": "Maltese",
"gv": "Manx",
@@ -80,7 +88,7 @@ lcode2lang = {
"nn": "Norwegian_Nynorsk",
"cu": "Old_Church_Slavonic",
"fro": "Old_French",
- "orv": "Old_Russian",
+ "orv": "Old_East_Slavic",
"otk": "Old_Turkish",
"fa": "Persian",
"pl": "Polish",
@@ -114,7 +122,9 @@ lcode2lang = {
"vi": "Vietnamese",
"wbp": "Warlpiri",
"cy": "Welsh",
+ "hyw": "Western_Armenian",
"wo": "Wolof",
+ "ess": "Yupik",
"yo": "Yoruba",
}
@@ -128,6 +138,9 @@ lcode2lang['zh'] = 'Simplified_Chinese'
lang2lcode['Chinese'] = 'zh'
+# treebank names changed from Old Russian to Old East Slavic in 2.8
+lang2lcode['Old_Russian'] = 'orv'
+
treebank_special_cases = {
"UD_Chinese-GSDSimp": "zh_gsdsimp",
"UD_Chinese-GSD": "zh-hant_gsd",
diff --git a/stanza/models/common/short_name_to_treebank.py b/stanza/models/common/short_name_to_treebank.py
index 9e681787..871f58da 100644
--- a/stanza/models/common/short_name_to_treebank.py
+++ b/stanza/models/common/short_name_to_treebank.py
@@ -18,6 +18,7 @@ SHORT_NAMES = {
'aii_as': 'UD_Assyrian-AS',
'bm_crb': 'UD_Bambara-CRB',
'eu_bdt': 'UD_Basque-BDT',
+ 'bej_nsc': 'UD_Beja-NSC',
'be_hse': 'UD_Belarusian-HSE',
'bho_bhtb': 'UD_Bhojpuri-BHTB',
'br_keb': 'UD_Breton-KEB',
@@ -67,6 +68,7 @@ SHORT_NAMES = {
'fr_partut': 'UD_French-ParTUT',
'fr_sequoia': 'UD_French-Sequoia',
'fr_spoken': 'UD_French-Spoken',
+ 'qfn_fame': 'UD_Frisian_Dutch-Fame',
'gl_ctg': 'UD_Galician-CTG',
'gl_treegal': 'UD_Galician-TreeGal',
'de_gsd': 'UD_German-GSD',
@@ -75,30 +77,37 @@ SHORT_NAMES = {
'de_pud': 'UD_German-PUD',
'got_proiel': 'UD_Gothic-PROIEL',
'el_gdt': 'UD_Greek-GDT',
+ 'gub_tudet': 'UD_Guajajara-TuDeT',
'he_htb': 'UD_Hebrew-HTB',
'hi_hdtb': 'UD_Hindi-HDTB',
'hi_pud': 'UD_Hindi-PUD',
'qhe_hiencs': 'UD_Hindi_English-HIENCS',
'hu_szeged': 'UD_Hungarian-Szeged',
'is_icepahc': 'UD_Icelandic-IcePaHC',
+ 'is_modern': 'UD_Icelandic-Modern',
'is_pud': 'UD_Icelandic-PUD',
'id_csui': 'UD_Indonesian-CSUI',
'id_gsd': 'UD_Indonesian-GSD',
'id_pud': 'UD_Indonesian-PUD',
'ga_idt': 'UD_Irish-IDT',
+ 'ga_twittirish': 'UD_Irish-TwittIrish',
'it_isdt': 'UD_Italian-ISDT',
'it_pud': 'UD_Italian-PUD',
'it_partut': 'UD_Italian-ParTUT',
'it_postwita': 'UD_Italian-PoSTWITA',
'it_twittiro': 'UD_Italian-TWITTIRO',
'it_vit': 'UD_Italian-VIT',
+ 'it_valico': 'UD_Italian-Valico',
'ja_bccwj': 'UD_Japanese-BCCWJ',
'ja_gsd': 'UD_Japanese-GSD',
'ja_modern': 'UD_Japanese-Modern',
'ja_pud': 'UD_Japanese-PUD',
+ 'urb_tudet': 'UD_Kaapor-TuDeT',
+ 'xnr_kdtb': 'UD_Kangri-KDTB',
'krl_kkpp': 'UD_Karelian-KKPP',
'kk_ktb': 'UD_Kazakh-KTB',
'kfm_aha': 'UD_Khunsari-AHA',
+ 'quc_iu': 'UD_Kiche-IU',
'koi_uh': 'UD_Komi_Permyak-UH',
'kpv_ikdp': 'UD_Komi_Zyrian-IKDP',
'kpv_lattice': 'UD_Komi_Zyrian-Lattice',
@@ -110,10 +119,13 @@ SHORT_NAMES = {
'la_llct': 'UD_Latin-LLCT',
'la_proiel': 'UD_Latin-PROIEL',
'la_perseus': 'UD_Latin-Perseus',
+ 'la_udante': 'UD_Latin-UDante',
'lv_lvtb': 'UD_Latvian-LVTB',
'lt_alksnis': 'UD_Lithuanian-ALKSNIS',
'lt_hse': 'UD_Lithuanian-HSE',
'olo_kkpp': 'UD_Livvi-KKPP',
+ 'nds_lsdc': 'UD_Low_Saxon-LSDC',
+ 'mpu_tudet': 'UD_Makurap-TuDeT',
'mt_mudt': 'UD_Maltese-MUDT',
'gv_cadhan': 'UD_Manx-Cadhan',
'mr_ufal': 'UD_Marathi-UFAL',
@@ -128,9 +140,9 @@ SHORT_NAMES = {
'nn_nynorsk': 'UD_Norwegian-Nynorsk',
'nn_nynorsklia': 'UD_Norwegian-NynorskLIA',
'cu_proiel': 'UD_Old_Church_Slavonic-PROIEL',
+ 'orv_rnc': 'UD_Old_East_Slavic-RNC',
+ 'orv_torot': 'UD_Old_East_Slavic-TOROT',
'fro_srcmf': 'UD_Old_French-SRCMF',
- 'orv_rnc': 'UD_Old_Russian-RNC',
- 'orv_torot': 'UD_Old_Russian-TOROT',
'otk_tonqq': 'UD_Old_Turkish-Tonqq',
'fa_perdt': 'UD_Persian-PerDT',
'fa_seraji': 'UD_Persian-Seraji',
@@ -140,6 +152,7 @@ SHORT_NAMES = {
'pt_bosque': 'UD_Portuguese-Bosque',
'pt_gsd': 'UD_Portuguese-GSD',
'pt_pud': 'UD_Portuguese-PUD',
+ 'ro_art': 'UD_Romanian-ArT',
'ro_nonstandard': 'UD_Romanian-Nonstandard',
'ro_rrt': 'UD_Romanian-RRT',
'ro_simonero': 'UD_Romanian-SiMoNERo',
@@ -173,9 +186,13 @@ SHORT_NAMES = {
'th_pud': 'UD_Thai-PUD',
'tpn_tudet': 'UD_Tupinamba-TuDeT',
'tr_boun': 'UD_Turkish-BOUN',
+ 'tr_framenet': 'UD_Turkish-FrameNet',
'tr_gb': 'UD_Turkish-GB',
'tr_imst': 'UD_Turkish-IMST',
+ 'tr_kenet': 'UD_Turkish-Kenet',
'tr_pud': 'UD_Turkish-PUD',
+ 'tr_penn': 'UD_Turkish-Penn',
+ 'tr_tourism': 'UD_Turkish-Tourism',
'qtd_sagt': 'UD_Turkish_German-SAGT',
'uk_iu': 'UD_Ukrainian-IU',
'hsb_ufal': 'UD_Upper_Sorbian-UFAL',
@@ -184,8 +201,10 @@ SHORT_NAMES = {
'vi_vtb': 'UD_Vietnamese-VTB',
'wbp_ufal': 'UD_Warlpiri-UFAL',
'cy_ccg': 'UD_Welsh-CCG',
+ 'hyw_armtdp': 'UD_Western_Armenian-ArmTDP',
'wo_wtb': 'UD_Wolof-WTB',
'yo_ytb': 'UD_Yoruba-YTB',
+ 'ess_sli': 'UD_Yupik-SLI',
}
diff --git a/stanza/models/pos/xpos_vocab_factory.py b/stanza/models/pos/xpos_vocab_factory.py
index 5397ca34..39da44fd 100644
--- a/stanza/models/pos/xpos_vocab_factory.py
+++ b/stanza/models/pos/xpos_vocab_factory.py
@@ -4,9 +4,9 @@
from stanza.models.pos.vocab import WordVocab, XPOSVocab
def xpos_vocab_factory(data, shorthand):
- if shorthand in ["af_afribooms", "ar_padt", "bg_btb", "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut", "fr_partut", "gd_arcosg", "gl_ctg", "gl_treegal", "grc_perseus", "hr_set", "is_icepahc", "it_isdt", "it_partut", "it_postwita", "it_twittiro", "it_vit", "it_combined", "la_perseus", "lt_alksnis", "lv_lvtb", "ro_nonstandard", "ro_rrt", "ro_simonero", "sk_snk", "sl_ssj", "sl_sst", "sr_set", "ta_ttb", "uk_iu"]:
+ if shorthand in ["af_afribooms", "ar_padt", "bg_btb", "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut", "fr_partut", "gd_arcosg", "gl_ctg", "gl_treegal", "grc_perseus", "hr_set", "is_icepahc", "is_modern", "it_combined", "it_isdt", "it_partut", "it_postwita", "it_twittiro", "it_vit", "la_perseus", "la_udante", "lt_alksnis", "lv_lvtb", "ro_nonstandard", "ro_rrt", "ro_simonero", "sk_snk", "sl_ssj", "sl_sst", "sr_set", "ta_ttb", "uk_iu"]:
return XPOSVocab(data, shorthand, idx=2, sep="")
- elif shorthand in ["be_hse", "ca_ancora", "cop_scriptorium", "cu_proiel", "cy_ccg", "da_ddt", "de_gsd", "de_hdt", "el_gdt", "en_ewt", "en_gum", "en_combined", "es_ancora", "es_gsd", "et_edt", "et_ewt", "eu_bdt", "fa_perdt", "fa_seraji", "fi_tdt", "fr_ftb", "fr_gsd", "fro_srcmf", "fr_sequoia", "fr_spoken", "ga_idt", "got_proiel", "grc_proiel", "he_htb", "hi_hdtb", "hu_szeged", "hy_armtdp", "id_csui", "ja_gsd", "la_proiel", "lt_hse", "lzh_kyoto", "mr_ufal", "mt_mudt", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia", "orv_rnc", "orv_torot", "pcm_nsc", "pt_bosque", "pt_gsd", "qtd_sagt", "ru_gsd", "ru_syntagrus", "ru_taiga", "sa_vedic", "sme_giella", "swl_sslc", "te_mtg", "tr_boun", "tr_imst", "ug_udt", "vi_vtb", "wo_wtb", "zh_gsdsimp", "zh-hans_gsdsimp", "zh-hant_gsd", "bxr_bdt", "hsb_ufal", "ja_bccwj", "kk_ktb", "kmr_mg", "olo_kkpp"]:
+ elif shorthand in ["be_hse", "ca_ancora", "cop_scriptorium", "cu_proiel", "cy_ccg", "da_ddt", "de_gsd", "de_hdt", "el_gdt", "en_combined", "en_ewt", "en_gum", "es_ancora", "es_gsd", "et_edt", "et_ewt", "eu_bdt", "fa_perdt", "fa_seraji", "fi_tdt", "fr_gsd", "fro_srcmf", "fr_sequoia", "fr_spoken", "ga_idt", "got_proiel", "grc_proiel", "he_htb", "hi_hdtb", "hu_szeged", "hy_armtdp", "hyw_armtdp", "id_csui", "ja_gsd", "la_proiel", "lt_hse", "lzh_kyoto", "mr_ufal", "mt_mudt", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia", "orv_rnc", "orv_torot", "pcm_nsc", "pt_bosque", "pt_gsd", "qtd_sagt", "ru_gsd", "ru_syntagrus", "ru_taiga", "sa_vedic", "sme_giella", "swl_sslc", "te_mtg", "tr_boun", "tr_framenet", "tr_imst", "tr_kenet", "tr_penn", "tr_tourism", "ug_udt", "vi_vtb", "wo_wtb", "zh_gsdsimp", "zh-hant_gsd", "bxr_bdt", "hsb_ufal", "ja_bccwj", "kk_ktb", "kmr_mg", "olo_kkpp"]:
return WordVocab(data, shorthand, idx=2, ignore=["_"])
elif shorthand in ["en_lines", "fo_farpahc", "sv_lines", "ur_udtb"]:
return XPOSVocab(data, shorthand, idx=2, sep="-")
diff --git a/stanza/resources/common.py b/stanza/resources/common.py
index ad9d722c..8e70e861 100644
--- a/stanza/resources/common.py
+++ b/stanza/resources/common.py
@@ -103,7 +103,7 @@ def file_exists(path, md5):
"""
return os.path.exists(path) and get_md5(path) == md5
-def download_file(url, path, proxies):
+def download_file(url, path, proxies, raise_for_status=False):
"""
Download a URL into a file as specified by `path`.
"""
@@ -120,8 +120,11 @@ def download_file(url, path, proxies):
f.write(chunk)
f.flush()
pbar.update(len(chunk))
+ if raise_for_status:
+ r.raise_for_status()
+ return r.status_code
-def request_file(url, path, proxies=None, md5=None):
+def request_file(url, path, proxies=None, md5=None, raise_for_status=False):
"""
A complete wrapper over download_file() that also make sure the directory of
`path` exists, and that a file matching the md5 value does not exist.
@@ -130,7 +133,7 @@ def request_file(url, path, proxies=None, md5=None):
if file_exists(path, md5):
logger.info(f'File exists: {path}.')
return
- download_file(url, path, proxies)
+ download_file(url, path, proxies, raise_for_status)
assert(not md5 or file_exists(path, md5))
def sort_processors(processor_list):
@@ -332,7 +335,8 @@ def download_resources_json(model_dir, resources_url, resources_branch,
request_file(
f'{resources_url}/resources_{resources_version}.json',
os.path.join(model_dir, 'resources.json'),
- proxies
+ proxies,
+ raise_for_status=True
)
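Note: download_file() can now call raise_for_status() and returns the HTTP status code, so a missing resources.json surfaces as an error instead of a silently saved error page. A standalone sketch of the same streaming-download pattern (not stanza's exact code; the tqdm progress bar mirrors what the real function does):

import requests
from tqdm import tqdm

def fetch(url, path, proxies=None, raise_for_status=False):
    r = requests.get(url, stream=True, proxies=proxies)
    total = int(r.headers.get("content-length", 0))
    with open(path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True) as pbar:
        for chunk in r.iter_content(chunk_size=1 << 14):
            if chunk:
                f.write(chunk)
                pbar.update(len(chunk))
    if raise_for_status:
        r.raise_for_status()        # e.g. raises on a 404 for a missing resources file
    return r.status_code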
diff --git a/stanza/resources/prepare_resources.py b/stanza/resources/prepare_resources.py
index f3fef48e..31791e21 100644
--- a/stanza/resources/prepare_resources.py
+++ b/stanza/resources/prepare_resources.py
@@ -68,6 +68,7 @@ default_treebanks = {
"ug": "udt",
"vi": "vtb",
"lt": "alksnis",
+ "hyw": "armtdp",
"wo": "wtb",
"nb": "bokmaal",
"mt": "mudt",
@@ -89,11 +90,13 @@ default_treebanks = {
# default ner for languages
default_ners = {
"ar": "aqmar",
+ "bg": "bsnlp19",
"de": "conll03",
"en": "ontonotes",
"es": "conll02",
"fi": "turku",
"fr": "wikiner",
+ "hu": "combined",
"nl": "conll02",
"ru": "wikiner",
"uk": "languk",
@@ -104,6 +107,7 @@ default_ners = {
# default charlms for languages
default_charlms = {
"ar": "ccwiki",
+ "bg": "conll17",
"de": "newswiki",
"en": "1billion",
"es": "newswiki",
@@ -131,7 +135,10 @@ ner_charlms = {
},
"uk": {
"languk": None,
- }
+ },
+ "hu": {
+ "combined": None,
+ },
}
# a few languages have sentiment classifier models
@@ -213,7 +220,7 @@ lcode2lang = {
"nn": "Norwegian_Nynorsk",
"cu": "Old_Church_Slavonic",
"fro": "Old_French",
- "orv": "Old_Russian",
+ "orv": "Old_East_Slavic",
"fa": "Persian",
"pl": "Polish",
"pt": "Portuguese",
@@ -239,6 +246,7 @@ lcode2lang = {
"ug": "Uyghur",
"vi": "Vietnamese",
"cy": "Welsh",
+ "hyw": "Western_Armenian",
"wo": "Wolof"
}
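Note: the new default_ners/default_charlms entries register Bulgarian and Hungarian NER as defaults, so they resolve without naming a package. A hedged usage sketch, assuming the 1.2.1 resources have been published:

import stanza

stanza.download("bg")                                  # pulls the default models, incl. bsnlp19 NER
nlp = stanza.Pipeline("bg", processors="tokenize,ner")
doc = nlp("София е столицата на България.")
print([(ent.text, ent.type) for ent in doc.ents])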
diff --git a/stanza/tests/test_installation.py b/stanza/tests/test_installation.py
index 05e5b650..13b8c4f9 100644
--- a/stanza/tests/test_installation.py
+++ b/stanza/tests/test_installation.py
@@ -30,7 +30,7 @@ def test_install_corenlp():
def test_download_corenlp_models():
model_name = "arabic"
- version = "4.2.0"
+ version = "4.2.2"
with tempfile.TemporaryDirectory(dir=".") as test_dir:
stanza.download_corenlp_models(model=model_name, version=version, dir=test_dir)
diff --git a/stanza/utils/datasets/prepare_lemma_treebank.py b/stanza/utils/datasets/prepare_lemma_treebank.py
index 3f90fcf5..a754c4fe 100644
--- a/stanza/utils/datasets/prepare_lemma_treebank.py
+++ b/stanza/utils/datasets/prepare_lemma_treebank.py
@@ -12,8 +12,43 @@ and it will prepare each of train, dev, test
import stanza.utils.datasets.common as common
import stanza.utils.datasets.prepare_tokenizer_treebank as prepare_tokenizer_treebank
+def check_lemmas(train_file):
+ """
+ Check if a treebank has any lemmas in it
+
+ For example, in Vietnamese-VTB, all the words and lemmas are exactly the same
+ in Telugu-MTG, all the lemmas are blank
+ """
+ # could eliminate a few languages immediately based on UD 2.7
+ # but what if a later dataset includes lemmas?
+ #if short_language in ('vi', 'fro', 'th'):
+ # return False
+ with open(train_file) as fin:
+ for line in fin:
+ line = line.strip()
+ if not line or line.startswith("#"):
+ continue
+ pieces = line.split("\t")
+ word = pieces[1].lower().strip()
+ lemma = pieces[2].lower().strip()
+ if not lemma or lemma == '_' or lemma == '-':
+ continue
+ if word == lemma:
+ continue
+ return True
+ return False
+
def process_treebank(treebank, paths, args):
- prepare_tokenizer_treebank.copy_conllu_treebank(treebank, paths, paths["LEMMA_DATA_DIR"])
+ if treebank.startswith("UD_"):
+ udbase_dir = paths["UDBASE"]
+ train_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
+ augment = check_lemmas(train_conllu)
+ if not augment:
+ print("No lemma information found in %s. Not augmenting the dataset" % train_conllu)
+ else:
+ # TODO: check the data to see if there are lemmas or not
+ augment = True
+ prepare_tokenizer_treebank.copy_conllu_treebank(treebank, paths, paths["LEMMA_DATA_DIR"], augment=augment)
def main():
common.main(process_treebank)
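Note: check_lemmas() is what now decides whether a lemma treebank gets the augmented copy; it scans CoNLL-U rows and returns True only if some word differs from its lemma. A usage example on a made-up CoNLL-U fragment (assumes a stanza checkout containing this change is importable):

import tempfile
from stanza.utils.datasets.prepare_lemma_treebank import check_lemmas

# Every lemma below equals its word, so the treebank would not be augmented.
sample = (
    "# sent_id = 1\n"
    "# text = chay chay\n"
    "1\tchay\tchay\tVERB\t_\t_\t0\troot\t_\t_\n"
    "2\tchay\tchay\tVERB\t_\t_\t1\tcompound\t_\t_\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".conllu", delete=False) as tmp:
    tmp.write(sample)
print(check_lemmas(tmp.name))   # False: no real lemma information found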
diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
index 7237e722..459a6a74 100755
--- a/stanza/utils/datasets/prepare_tokenizer_treebank.py
+++ b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -34,7 +34,6 @@ from collections import Counter
import stanza.utils.datasets.common as common
import stanza.utils.datasets.prepare_tokenizer_data as prepare_tokenizer_data
-import stanza.utils.datasets.preprocess_ssj_data as preprocess_ssj_data
def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name):
@@ -43,7 +42,7 @@ def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_n
shutil.copyfile(original, copied)
-def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None):
+def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None, augment=True):
"""
This utility method copies only the conllu files to the given destination directory.
@@ -60,7 +59,7 @@ def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None):
# first we process the tokenization data
args = argparse.Namespace()
- args.augment = False
+ args.augment = augment
args.prepare_labels = False
process_treebank(treebank, paths, args)
@@ -783,8 +782,6 @@ def build_combined_korean(udbase_dir, tokenizer_dir, short_name):
build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu)
def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
- output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-
if dataset == 'train':
# could maybe add ParTUT, but that dataset has a slightly different xpos set
# (no DE or I)
@@ -806,13 +803,11 @@ def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
raise AssertionError("Unexpected format of the italian.mwt file. Has it already be modified to have SpaceAfter=No everywhere?")
sentence[2] = sentence[2][:-1] + "SpaceAfter=No"
sents = sents + extra_sents
-
- sents = augment_punct(sents)
else:
istd_conllu = common.find_treebank_dataset_file("UD_Italian-ISDT", udbase_dir, dataset, "conllu")
sents = read_sentences_from_conllu(istd_conllu)
- write_sentences_to_conllu(output_conllu, sents)
+ return sents
def check_gum_ready(udbase_dir):
gum_conllu = common.find_treebank_dataset_file("UD_English-GUMReddit", udbase_dir, "train", "conllu")
@@ -827,8 +822,6 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
"""
check_gum_ready(udbase_dir)
- output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-
if dataset == 'train':
# TODO: include more UD treebanks, possibly with xpos removed
# UD_English-ParTUT - xpos are different
@@ -843,15 +836,13 @@ def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
for treebank in test_treebanks:
conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu", fail=True)
sents.extend(read_sentences_from_conllu(conllu_file))
-
- # TODO: refactor things like the augment_punct call
- sents = augment_punct(sents)
else:
ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
sents = read_sentences_from_conllu(ewt_conllu)
sents = strip_mwt_from_sentences(sents)
- write_sentences_to_conllu(output_conllu, sents)
+ return sents
+
def replace_semicolons(sentences):
"""
@@ -888,8 +879,6 @@ def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
TODO: remove features which aren't shared between datasets
TODO: consider mixing in PUD?
"""
- output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
-
if dataset == 'train':
treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
sents = []
@@ -905,15 +894,11 @@ def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, sh
raise FileNotFoundError("Cannot find the extra dataset 'spanish.mwt' which includes various multi-words retokenized, expected {}".format(extra_italian))
extra_sents = read_sentences_from_conllu(extra_spanish)
sents.extend(extra_sents)
-
- # TODO: refactor things like the augment_punct call
- sents = augment_punct(sents)
else:
conllu_file = common.find_treebank_dataset_file("UD_Spanish-AnCora", udbase_dir, dataset, "conllu", fail=True)
sents = read_sentences_from_conllu(conllu_file)
- write_sentences_to_conllu(output_conllu, sents)
-
+ return sents
COMBINED_FNS = {
@@ -922,19 +907,24 @@ COMBINED_FNS = {
"it_combined": build_combined_italian_dataset,
}
-def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name):
+def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, augment):
random.seed(1234)
build_fn = COMBINED_FNS[short_name]
for dataset in ("train", "dev", "test"):
- build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+ output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+ sents = build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+ if dataset == 'train' and augment:
+ sents = augment_punct(sents)
+ write_sentences_to_conllu(output_conllu, sents)
-def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset):
+def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment):
"""
Build the GUM dataset by combining GUMReddit
It checks to make sure GUMReddit is filled out using the included script
"""
check_gum_ready(udbase_dir)
+ random.seed(1234)
output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
@@ -944,36 +934,30 @@ def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, da
conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
sents.extend(read_sentences_from_conllu(conllu_file))
- if dataset == 'train':
+ if dataset == 'train' and augment:
sents = augment_punct(sents)
write_sentences_to_conllu(output_conllu, sents)
-def build_combined_english_gum(udbase_dir, tokenizer_dir, short_name):
+def build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, augment):
for dataset in ("train", "dev", "test"):
- build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset)
+ build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment)
def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True):
input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu")
output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
- if short_name == "sl_ssj":
- preprocess_ssj_data.process(input_conllu, output_conllu)
- elif short_name == "te_mtg" and dataset == 'train' and augment:
+ if short_name == "te_mtg" and dataset == 'train' and augment:
write_augmented_dataset(input_conllu, output_conllu, augment_telugu)
elif short_name == "ar_padt" and dataset == 'train' and augment:
write_augmented_dataset(input_conllu, output_conllu, augment_arabic_padt)
elif short_name.startswith("ko_") and short_name.endswith("_seg"):
remove_spaces(input_conllu, output_conllu)
- elif dataset == 'train':
- # we treat the additional punct as something that always needs to be there
- # this will teach the tagger & depparse about unicode apos, for example
+ elif dataset == 'train' and augment:
write_augmented_dataset(input_conllu, output_conllu, augment_punct)
else:
shutil.copyfile(input_conllu, output_conllu)
- # TODO: refactor this call everywhere
-
def process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, augment=True):
"""
Process a normal UD treebank with train/dev/test splits
@@ -1049,11 +1033,11 @@ def process_treebank(treebank, paths, args):
if short_name.startswith("ko_combined"):
build_combined_korean(udbase_dir, tokenizer_dir, short_name)
elif short_name in ("it_combined", "en_combined", "es_combined"):
- build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name)
+ build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
elif short_name.startswith("en_gum"):
# we special case GUM because it should include a filled-out GUMReddit
print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
- build_combined_english_gum(udbase_dir, tokenizer_dir, short_name)
+ build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, args.augment)
else:
# check that we can find the train file where we expect it
train_conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
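Note: the thrust of this refactor is that the per-language combined builders now just return sentence lists, while punctuation augmentation and the output write happen in one place, gated by the augment flag. A condensed, runnable sketch of that control flow; augment_punct and the writer here are stand-ins, not stanza's real implementations:

import random

def augment_punct(sents):
    # stand-in: the real augmentation rewrites punctuation (e.g. unicode quotes)
    return sents + [s.replace("'", "\u2019") for s in sents]

def write_sentences(path, sents):
    print(f"{path}: {len(sents)} sentences")

def build_combined_dataset(short_name, build_fn, augment):
    random.seed(1234)
    for dataset in ("train", "dev", "test"):
        sents = build_fn(dataset)                  # builders just return sentences now
        if dataset == "train" and augment:
            sents = augment_punct(sents)           # augmentation applied in one place
        write_sentences(f"{short_name}.{dataset}.gold.conllu", sents)

build_combined_dataset("en_combined", lambda ds: [f"{ds} sentence 'x'"], augment=True)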
diff --git a/stanza/utils/datasets/preprocess_ssj_data.py b/stanza/utils/datasets/preprocess_ssj_data.py
deleted file mode 100644
index 4ce7a8b9..00000000
--- a/stanza/utils/datasets/preprocess_ssj_data.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
-The SSJ dataset has an unusual bug: all of the sentences end with SpaceAfter=no
-
-This script fixes them and writes the fixed files to the given location.
-"""
-
-def process(input_conllu, input_conllu_copy):
- conllu_lines = open(input_conllu).readlines()
-
- new_conllu_lines = list(conllu_lines)
-
- line_idx = 0
- text_idx = 0
- # invariant: conllu_lines[line_idx] is
- # # sent_id
- # at the start of a loop
- while line_idx < len(conllu_lines):
- # extract the text from the comments before each sentence
- line_idx = line_idx + 1
- text_line = conllu_lines[line_idx]
- assert text_line.startswith("# text = "), "Unexpected format: %s,%d is not # text" % (input_conllu, line_idx)
- text_line = text_line[9:-1]
- # use that text to keep track of an index in the text where we might need to put new spaces
- text_idx = text_idx + len(text_line)
-
- # advance to the end of the sentence
- line_idx = line_idx + 1
- assert conllu_lines[line_idx].startswith("1"), "Unexpected format: %s,%d is not a word" % (input_conllu, line_idx)
- while conllu_lines[line_idx].strip():
- line_idx = line_idx + 1
- last_word_idx = line_idx - 1
-
- # check if the end of the sentence has SpaceAfter or not
- new_line = conllu_lines[last_word_idx].replace("SpaceAfter=No|", "")
- assert new_line.find("SpaceAfter=") < 0, "Unexpected format: %s,%d has unusual SpaceAfter" % (input_conllu, line_idx)
-
- # if not, need to add a new space
- if new_line != conllu_lines[last_word_idx]:
- conllu_lines[last_word_idx] = new_line
- text_idx = text_idx + 1
-
- # done with a sentence. skip to the start of the next sentence
- # or the end of the document
- while line_idx < len(conllu_lines) and not conllu_lines[line_idx].strip():
- line_idx = line_idx + 1
-
- with open(input_conllu_copy, "w") as fout:
- for line in conllu_lines:
- fout.write(line)
diff --git a/stanza/utils/training/run_lemma.py b/stanza/utils/training/run_lemma.py
index 96b0f1cf..f86648e5 100644
--- a/stanza/utils/training/run_lemma.py
+++ b/stanza/utils/training/run_lemma.py
@@ -24,33 +24,9 @@ from stanza.models import lemmatizer
from stanza.utils.training import common
from stanza.utils.training.common import Mode
-logger = logging.getLogger('stanza')
+from stanza.utils.datasets.prepare_lemma_treebank import check_lemmas
-def check_lemmas(train_file):
- """
- Check if a treebank has any lemmas in it
-
- For example, in Vietnamese-VTB, all the words and lemmas are exactly the same
- in Telugu-MTG, all the lemmas are blank
- """
- # could eliminate a few languages immediately based on UD 2.7
- # but what if a later dataset includes lemmas?
- #if short_language in ('vi', 'fro', 'th'):
- # return False
- with open(train_file) as fin:
- for line in fin:
- line = line.strip()
- if not line or line.startswith("#"):
- continue
- pieces = line.split("\t")
- word = pieces[1].lower().strip()
- lemma = pieces[2].lower().strip()
- if not lemma or lemma == '_' or lemma == '-':
- continue
- if word == lemma:
- continue
- return True
- return False
+logger = logging.getLogger('stanza')
def run_treebank(mode, paths, treebank, short_name,
temp_output_file, command_args, extra_args):