diff options
author | John Bauer <horatio@gmail.com> | 2022-07-19 08:50:03 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-07-20 06:14:20 +0300 |
commit | e097c3efc2247d9953600a67847bee508835a284 (patch) | |
tree | a64e1f8c7fbc2aa115b6c94b8ef5ab245810726d | |
parent | e90b0dc2784cbbb45af4371581a84840f2e9c6ce (diff) |
Add a pos-specific charlm map for the medical EN datasets and the one dataset which appears to be hurt by the charlm (tr_boun) pos_charlm
craft, genia -> None
-rw-r--r-- | stanza/resources/prepare_resources.py | 12 | ||||
-rw-r--r-- | stanza/utils/training/common.py | 4 | ||||
-rw-r--r-- | stanza/utils/training/run_pos.py | 4 |
3 files changed, 17 insertions, 3 deletions
```diff
diff --git a/stanza/resources/prepare_resources.py b/stanza/resources/prepare_resources.py
index 2facd144..98007a10 100644
--- a/stanza/resources/prepare_resources.py
+++ b/stanza/resources/prepare_resources.py
@@ -177,6 +177,18 @@ default_charlms = {
     "zh-hans": "gigaword"
 }
 
+pos_charlms = {
+    "en": {
+        # none of the English charlms help with craft or genia
+        "craft": None,
+        "genia": None,
+        "mimic": "mimic",
+    },
+    "tr": { # no idea why, but this particular one goes down in dev score
+        "boun": None,
+    },
+}
+
 ner_charlms = {
     "en": {
         "conll03": "1billion",
diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py
index 0969d20a..901c9662 100644
--- a/stanza/utils/training/common.py
+++ b/stanza/utils/training/common.py
@@ -336,7 +336,9 @@ def choose_charlm(language, dataset, charlm, language_charlms, dataset_charlms):
         return None
     elif charlm != "default":
         return charlm
-    elif specific_charlm:
+    elif dataset in dataset_charlms.get(language, {}):
+        # this way, a "" or None result gets honored
+        # thus treating "not in the map" as a way for dataset_charlms to signal to use the default
         return specific_charlm
     elif default_charlm:
         return default_charlm
diff --git a/stanza/utils/training/run_pos.py b/stanza/utils/training/run_pos.py
index 2afe76b6..6e5ede44 100644
--- a/stanza/utils/training/run_pos.py
+++ b/stanza/utils/training/run_pos.py
@@ -9,7 +9,7 @@ from stanza.resources.prepare_resources import no_pretrain_languages
 from stanza.utils.training import common
 from stanza.utils.training.common import Mode, add_charlm_args, build_charlm_args, choose_charlm
 
-from stanza.resources.prepare_resources import default_charlms
+from stanza.resources.prepare_resources import default_charlms, pos_charlms
 
 logger = logging.getLogger('stanza')
 
@@ -46,7 +46,7 @@ def run_treebank(mode, paths, treebank, short_name,
     test_gold_file = f"{pos_dir}/{short_name}.test.gold.conllu"
     test_pred_file = temp_output_file if temp_output_file else f"{pos_dir}/{short_name}.test.pred.conllu"
 
-    charlm = choose_charlm(short_language, dataset, command_args.charlm, default_charlms, {})
+    charlm = choose_charlm(short_language, dataset, command_args.charlm, default_charlms, pos_charlms)
     charlm_args = build_charlm_args(short_language, charlm)
 
     if mode == Mode.TRAIN:
```