Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: John Bauer <horatio@gmail.com> 2022-07-19 08:50:03 +0300
committer: John Bauer <horatio@gmail.com> 2022-07-20 06:14:20 +0300
commit: e097c3efc2247d9953600a67847bee508835a284 (patch)
tree: a64e1f8c7fbc2aa115b6c94b8ef5ab245810726d
parent: e90b0dc2784cbbb45af4371581a84840f2e9c6ce (diff)
Add a pos-specific charlm map for the medical EN datasets and the one dataset which appears to be hurt by the charlm (tr_boun) [pos_charlm]
craft, genia -> None
-rw-r--r-- stanza/resources/prepare_resources.py 12
-rw-r--r-- stanza/utils/training/common.py 4
-rw-r--r-- stanza/utils/training/run_pos.py 4
3 files changed, 17 insertions, 3 deletions
diff --git a/stanza/resources/prepare_resources.py b/stanza/resources/prepare_resources.py
index 2facd144..98007a10 100644
--- a/stanza/resources/prepare_resources.py
+++ b/stanza/resources/prepare_resources.py
@@ -177,6 +177,18 @@ default_charlms = {
"zh-hans": "gigaword"
}
+pos_charlms = {
+ "en": {
+ # none of the English charlms help with craft or genia
+ "craft": None,
+ "genia": None,
+ "mimic": "mimic",
+ },
+ "tr": { # no idea why, but this particular one goes down in dev score
+ "boun": None,
+ },
+}
+
ner_charlms = {
"en": {
"conll03": "1billion",
diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py
index 0969d20a..901c9662 100644
--- a/stanza/utils/training/common.py
+++ b/stanza/utils/training/common.py
@@ -336,7 +336,9 @@ def choose_charlm(language, dataset, charlm, language_charlms, dataset_charlms):
return None
elif charlm != "default":
return charlm
- elif specific_charlm:
+ elif dataset in dataset_charlms.get(language, {}):
+ # this way, a "" or None result gets honored
+ # thus treating "not in the map" as a way for dataset_charlms to signal to use the default
return specific_charlm
elif default_charlm:
return default_charlm
diff --git a/stanza/utils/training/run_pos.py b/stanza/utils/training/run_pos.py
index 2afe76b6..6e5ede44 100644
--- a/stanza/utils/training/run_pos.py
+++ b/stanza/utils/training/run_pos.py
@@ -9,7 +9,7 @@ from stanza.resources.prepare_resources import no_pretrain_languages
from stanza.utils.training import common
from stanza.utils.training.common import Mode, add_charlm_args, build_charlm_args, choose_charlm
-from stanza.resources.prepare_resources import default_charlms
+from stanza.resources.prepare_resources import default_charlms, pos_charlms
logger = logging.getLogger('stanza')
@@ -46,7 +46,7 @@ def run_treebank(mode, paths, treebank, short_name,
test_gold_file = f"{pos_dir}/{short_name}.test.gold.conllu"
test_pred_file = temp_output_file if temp_output_file else f"{pos_dir}/{short_name}.test.pred.conllu"
- charlm = choose_charlm(short_language, dataset, command_args.charlm, default_charlms, {})
+ charlm = choose_charlm(short_language, dataset, command_args.charlm, default_charlms, pos_charlms)
charlm_args = build_charlm_args(short_language, charlm)
if mode == Mode.TRAIN: