From 7891dc42c39b43121cd2c230d0eb220cf93515ae Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 14 Jun 2021 07:23:37 -0700 Subject: Add a specialized bulk_process for NER, along with a small unit test --- stanza/pipeline/ner_processor.py | 9 ++++ stanza/tests/test_pipeline_ner_processor.py | 81 +++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 stanza/tests/test_pipeline_ner_processor.py diff --git a/stanza/pipeline/ner_processor.py b/stanza/pipeline/ner_processor.py index eab66b62..52003961 100644 --- a/stanza/pipeline/ner_processor.py +++ b/stanza/pipeline/ner_processor.py @@ -38,3 +38,12 @@ class NERProcessor(UDProcessor): total = len(batch.doc.build_ents()) logger.debug(f'{total} entities found in document.') return batch.doc + + def bulk_process(self, docs): + """ + NER processor has a collation step after running inference + """ + docs = super().bulk_process(docs) + for doc in docs: + doc.build_ents() + return docs diff --git a/stanza/tests/test_pipeline_ner_processor.py b/stanza/tests/test_pipeline_ner_processor.py new file mode 100644 index 00000000..3f88a8d0 --- /dev/null +++ b/stanza/tests/test_pipeline_ner_processor.py @@ -0,0 +1,81 @@ + +import pytest +import stanza +from stanza.utils.conll import CoNLL +from stanza.models.common.doc import Document + +from stanza.tests import * + +pytestmark = [pytest.mark.pipeline, pytest.mark.travis] + +# data for testing +EN_DOCS = ["Barack Obama was born in Hawaii.", "He was elected president in 2008.", "Obama attended Harvard."] + +EXPECTED_ENTS = [[{ + "text": "Barack Obama", + "type": "PERSON", + "start_char": 0, + "end_char": 12 +}, { + "text": "Hawaii", + "type": "GPE", + "start_char": 25, + "end_char": 31 +}], +[{ + "text": "2008", + "type": "DATE", + "start_char": 28, + "end_char": 32 +}], +[{ + "text": "Obama", + "type": "PERSON", + "start_char": 0, + "end_char": 5 +}, { + "text": "Harvard", + "type": "ORG", + "start_char": 15, + "end_char": 22 +}]] + + +@pytest.fixture(scope="module") +def pipeline(): + """ + A reusable pipeline with the NER module + """ + return stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize,ner") + + +@pytest.fixture(scope="module") +def processed_doc(pipeline): + """ Document created by running full English pipeline on a few sentences """ + return [pipeline(text) for text in EN_DOCS] + + +@pytest.fixture(scope="module") +def processed_bulk(pipeline): + """ Document created by running full English pipeline on a few sentences """ + docs = [Document([], text=t) for t in EN_DOCS] + return pipeline(docs) + +def check_entities_equal(doc, expected): + """ + Checks that the entities of a doc are equal to the given list of maps + """ + assert len(doc.ents) == len(expected) + for doc_entity, expected_entity in zip(doc.ents, expected): + for k in expected_entity: + assert getattr(doc_entity, k) == expected_entity[k] + +def test_bulk_ents(processed_bulk): + assert len(processed_bulk) == len(EXPECTED_ENTS) + for doc, expected in zip(processed_bulk, EXPECTED_ENTS): + check_entities_equal(doc, expected) + +def test_ents(processed_doc): + assert len(processed_doc) == len(EXPECTED_ENTS) + for doc, expected in zip(processed_doc, EXPECTED_ENTS): + check_entities_equal(doc, expected) -- cgit v1.2.3 From 86ba9a12b8763406e922fc2666bc6e738e365757 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 14 Jun 2021 07:30:50 -0700 Subject: Gotta save two seconds when running the tests --- stanza/tests/test_english_pipeline.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/stanza/tests/test_english_pipeline.py b/stanza/tests/test_english_pipeline.py index 8c89774b..73569a9a 100644 --- a/stanza/tests/test_english_pipeline.py +++ b/stanza/tests/test_english_pipeline.py @@ -135,11 +135,13 @@ EN_DOC_CONLLU_GOLD_MULTIDOC = """ @pytest.fixture(scope="module") -def processed_doc(): - """ Document created by running full English pipeline on a few sentences """ - nlp = stanza.Pipeline(dir=TEST_MODELS_DIR) - return nlp(EN_DOC) +def pipeline(): + return stanza.Pipeline(dir=TEST_MODELS_DIR) +@pytest.fixture(scope="module") +def processed_doc(pipeline): + """ Document created by running full English pipeline on a few sentences """ + return pipeline(EN_DOC) def test_text(processed_doc): assert processed_doc.text == EN_DOC @@ -163,11 +165,10 @@ def test_dependency_parse(processed_doc): @pytest.fixture(scope="module") -def processed_multidoc(): +def processed_multidoc(pipeline): """ Document created by running full English pipeline on a few sentences """ docs = [Document([], text=t) for t in EN_DOCS] - nlp = stanza.Pipeline(dir=TEST_MODELS_DIR) - return nlp(docs) + return pipeline(docs) def test_conllu_multidoc(processed_multidoc): -- cgit v1.2.3 From e4e7a284af5643ff58568ab6ddd76d1e175ee1fd Mon Sep 17 00:00:00 2001 From: John Bauer Date: Fri, 25 Jun 2021 14:23:39 -0700 Subject: Only normalize spaces -> nbsp in the pretrain vocabs --- stanza/models/common/pretrain.py | 6 ++++++ stanza/models/common/vocab.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/stanza/models/common/pretrain.py b/stanza/models/common/pretrain.py index e18accbf..193cc71d 100644 --- a/stanza/models/common/pretrain.py +++ b/stanza/models/common/pretrain.py @@ -20,6 +20,12 @@ class PretrainedWordVocab(BaseVocab): self._id2unit = VOCAB_PREFIX + self.data self._unit2id = {w:i for i, w in enumerate(self._id2unit)} + def normalize_unit(self, unit): + unit = super().normalize_unit(unit) + if unit: + unit = unit.replace(" ","\xa0") + return unit + class Pretrain: """ A loader and saver for pretrained embeddings. """ diff --git a/stanza/models/common/vocab.py b/stanza/models/common/vocab.py index e3e2c300..cade67c3 100644 --- a/stanza/models/common/vocab.py +++ b/stanza/models/common/vocab.py @@ -47,9 +47,10 @@ class BaseVocab: return new def normalize_unit(self, unit): + # be sure to look in subclasses for other normalization being done + # especially PretrainWordVocab if unit is None: return unit - unit = unit.replace(" ","\xa0") if self.lower: return unit.lower() return unit -- cgit v1.2.3 From aaea154db67dc604ee44372e4a9dde5f172b0253 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 28 Jun 2021 17:47:49 -0700 Subject: Sort and then unsort the text by length in the sentiment processor. Use this to set a limit on processing length by text length. Set the default batch limit to 5000. Makes it so that a huge document doesn't default to using up the entire GPU --- stanza/models/classifiers/cnn_classifier.py | 10 ++++- stanza/models/common/utils.py | 47 +++++++++++++++++++++++ stanza/pipeline/sentiment_processor.py | 6 ++- stanza/tests/test_pipeline_sentiment_processor.py | 38 ++++++++++++++++++ stanza/tests/test_utils.py | 39 +++++++++++++++++++ 5 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 stanza/tests/test_pipeline_sentiment_processor.py diff --git a/stanza/models/classifiers/cnn_classifier.py b/stanza/models/classifiers/cnn_classifier.py index 3a329db8..2759086d 100644 --- a/stanza/models/classifiers/cnn_classifier.py +++ b/stanza/models/classifiers/cnn_classifier.py @@ -12,6 +12,7 @@ import stanza.models.classifiers.classifier_args as classifier_args import stanza.models.classifiers.data as data from stanza.models.common.vocab import PAD_ID, UNK_ID from stanza.models.common.data import get_long_tensor, sort_all +from stanza.models.common.utils import split_into_batches, sort_with_indices, unsort # TODO: move CharVocab to common from stanza.models.pos.vocab import CharVocab @@ -445,9 +446,10 @@ def label_text(model, text, batch_size=None, reverse_label_map=None, device=None if batch_size is None: intervals = [(0, len(text))] + orig_idx = None else: - # TODO: results would be better if we sort by length and then unsort - intervals = [(i, min(i+batch_size, len(text))) for i in range(0, len(text), batch_size)] + text, orig_idx = sort_with_indices(text, key=len, reverse=True) + intervals = split_into_batches(text, batch_size) labels = [] for interval in intervals: if interval[1] - interval[0] == 0: @@ -457,6 +459,10 @@ def label_text(model, text, batch_size=None, reverse_label_map=None, device=None predicted = torch.argmax(output, dim=1) labels.extend(predicted.tolist()) + if orig_idx: + text = unsort(text, orig_idx) + labels = unsort(labels, orig_idx) + logger.debug("Found labels") for (label, sentence) in zip(labels, text): logger.debug((label, sentence)) diff --git a/stanza/models/common/utils.py b/stanza/models/common/utils.py index a739d366..32f1b2f8 100644 --- a/stanza/models/common/utils.py +++ b/stanza/models/common/utils.py @@ -207,6 +207,53 @@ def unsort(sorted_list, oidx): _, unsorted = [list(t) for t in zip(*sorted(zip(oidx, sorted_list)))] return unsorted +def sort_with_indices(data, key=None, reverse=False): + """ + Sort data and return both the data and the original indices. + + One useful application is to sort by length, which can be done with key=len + Returns the data as a sorted list, then the indices of the original list. + """ + if key: + ordered = sorted(enumerate(data), key=lambda x: key(x[1]), reverse=reverse) + else: + ordered = sorted(enumerate(data), key=lambda x: x[1], reverse=reverse) + + result = tuple(zip(*ordered)) + return result[1], result[0] + +def split_into_batches(data, batch_size): + """ + Returns a list of intervals so that each interval is either <= batch_size or one element long. + + Long elements are not dropped from the intervals. + data is a list of lists + batch_size is how long to make each batch + return value is a list of pairs, start_idx end_idx + """ + intervals = [] + interval_start = 0 + interval_size = 0 + for idx, line in enumerate(data): + if len(line) > batch_size: + # guess we'll just hope the model can handle a batch of this size after all + if interval_size > 0: + intervals.append((interval_start, idx)) + intervals.append((idx, idx+1)) + interval_start = idx+1 + interval_size = 0 + elif len(line) + interval_size > batch_size: + # this line puts us over batch_size + intervals.append((interval_start, idx)) + interval_start = idx + interval_size = len(line) + else: + interval_size = interval_size + len(line) + if interval_size > 0: + # there's some leftover + intervals.append((interval_start, len(data))) + return intervals + def tensor_unsort(sorted_tensor, oidx): """ Unsort a sorted tensor on its 0-th dimension, based on the original idx. diff --git a/stanza/pipeline/sentiment_processor.py b/stanza/pipeline/sentiment_processor.py index a96c80a0..48117142 100644 --- a/stanza/pipeline/sentiment_processor.py +++ b/stanza/pipeline/sentiment_processor.py @@ -24,6 +24,9 @@ class SentimentProcessor(UDProcessor): # set of processor requirements for this processor REQUIRES_DEFAULT = set([TOKENIZE]) + # default batch size, measured in words per batch + DEFAULT_BATCH_SIZE = 5000 + def _set_up_model(self, config, use_gpu): # get pretrained word vectors pretrain_path = config.get('pretrain_path', None) @@ -37,7 +40,8 @@ class SentimentProcessor(UDProcessor): pretrain=self._pretrain, charmodel_forward=charmodel_forward, charmodel_backward=charmodel_backward) - self._batch_size = config.get('batch_size', None) + # batch size counted as words + self._batch_size = config.get('batch_size', SentimentProcessor.DEFAULT_BATCH_SIZE) # TODO: move this call to load() if use_gpu: diff --git a/stanza/tests/test_pipeline_sentiment_processor.py b/stanza/tests/test_pipeline_sentiment_processor.py new file mode 100644 index 00000000..b46eedf4 --- /dev/null +++ b/stanza/tests/test_pipeline_sentiment_processor.py @@ -0,0 +1,38 @@ + +import pytest +import stanza +from stanza.utils.conll import CoNLL +from stanza.models.common.doc import Document + +from stanza.tests import * + +pytestmark = [pytest.mark.pipeline, pytest.mark.travis] + +# data for testing +EN_DOCS = ["Ragavan is terrible and should go away.", "Today is okay.", "Urza's Saga is great."] + +EN_DOC = " ".join(EN_DOCS) + +EXPECTED = [0, 1, 2] + +@pytest.fixture(scope="module") +def pipeline(): + """ + A reusable pipeline with the NER module + """ + return stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize,sentiment") + +def test_simple(pipeline): + results = [] + for text in EN_DOCS: + doc = pipeline(text) + assert len(doc.sentences) == 1 + results.append(doc.sentences[0].sentiment) + assert EXPECTED == results + +def test_multiple_sentences(pipeline): + doc = pipeline(EN_DOC) + assert len(doc.sentences) == 3 + results = [sentence.sentiment for sentence in doc.sentences] + assert EXPECTED == results + diff --git a/stanza/tests/test_utils.py b/stanza/tests/test_utils.py index 7b654492..bc5cf4e4 100644 --- a/stanza/tests/test_utils.py +++ b/stanza/tests/test_utils.py @@ -75,3 +75,42 @@ def test_wordvec_type(): with pytest.raises(FileNotFoundError): utils.get_wordvec_file(wordvec_dir=temp_dir, shorthand='en_foo') +def test_sort_with_indices(): + data = [[1, 2, 3], [4, 5], [6]] + ordered, orig_idx = utils.sort_with_indices(data, key=len) + assert ordered == ([6], [4, 5], [1, 2, 3]) + assert orig_idx == (2, 1, 0) + + unsorted = utils.unsort(ordered, orig_idx) + assert data == unsorted + +def test_split_into_batches(): + data = [] + for i in range(5): + data.append(["Unban", "mox", "opal", str(i)]) + + data.append(["Do", "n't", "ban", "Urza", "'s", "Saga", "that", "card", "is", "great"]) + data.append(["Ban", "Ragavan"]) + + # small batches will put one element in each interval + batches = utils.split_into_batches(data, 5) + assert batches == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)] + + # this one has a batch interrupted in the middle by a large element + batches = utils.split_into_batches(data, 8) + assert batches == [(0, 2), (2, 4), (4, 5), (5, 6), (6, 7)] + + # this one has the large element at the start of its own batch + batches = utils.split_into_batches(data[1:], 8) + assert batches == [(0, 2), (2, 4), (4, 5), (5, 6)] + + # overloading the test! assert that the key & reverse is working + ordered, orig_idx = utils.sort_with_indices(data, key=len, reverse=True) + assert [len(x) for x in ordered] == [10, 4, 4, 4, 4, 4, 2] + + # this has the large element at the start + batches = utils.split_into_batches(ordered, 8) + assert batches == [(0, 1), (1, 3), (3, 5), (5, 7)] + + # double check that unsort is working as expected + assert data == utils.unsort(ordered, orig_idx) -- cgit v1.2.3 From e61c15f079b620d6433565c0aaba2c4850f47cef Mon Sep 17 00:00:00 2001 From: John Bauer Date: Tue, 29 Jun 2021 00:28:16 -0700 Subject: Add explicit flags for specifying where the charlm files are. Will still guess the location if those flags are not given --- stanza/models/ner_tagger.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/stanza/models/ner_tagger.py b/stanza/models/ner_tagger.py index f1525ca7..d223a37e 100644 --- a/stanza/models/ner_tagger.py +++ b/stanza/models/ner_tagger.py @@ -62,6 +62,8 @@ def parse_args(args=None): parser.add_argument('--charlm', action='store_true', help="Turn on contextualized char embedding using pretrained character-level language model.") parser.add_argument('--charlm_save_dir', type=str, default='saved_models/charlm', help="Root dir for pretrained character-level language model.") parser.add_argument('--charlm_shorthand', type=str, default=None, help="Shorthand for character-level language model training corpus.") + parser.add_argument('--charlm_forward_file', type=str, default=None, help="Exact path to use for forward charlm") + parser.add_argument('--charlm_backward_file', type=str, default=None, help="Exact path to use for backward charlm") parser.add_argument('--char_lowercase', dest='char_lowercase', action='store_true', help="Use lowercased characters in character model.") parser.add_argument('--no_lowercase', dest='lowercase', action='store_false', help="Use cased word vectors.") parser.add_argument('--no_emb_finetune', dest='emb_finetune', action='store_false', help="Turn off finetuning of the embedding matrix.") @@ -137,8 +139,10 @@ def train(args): if args['charlm_shorthand'] is None: raise ValueError("CharLM Shorthand is required for loading pretrained CharLM model...") logger.info('Using pretrained contextualized char embedding') - args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand']) - args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand']) + if not args['charlm_forward_file']: + args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand']) + if not args['charlm_backward_file']: + args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand']) # load data logger.info("Loading data with batch size {}...".format(args['batch_size'])) -- cgit v1.2.3 From acb77bf8c8bec8879309938355eef40a3b743009 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Tue, 29 Jun 2021 00:39:37 -0700 Subject: Make prepare_ner_dataset runnable as a module --- stanza/utils/datasets/ner/prepare_ner_dataset.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py index 9d7e089a..54a2c7e3 100644 --- a/stanza/utils/datasets/ner/prepare_ner_dataset.py +++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py @@ -316,10 +316,9 @@ def process_bsnlp(paths, short_name): output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard)) prepare_ner_file.process_dataset(csv_file, output_filename) -def main(): +def main(dataset_name): paths = default_paths.get_default_paths() - dataset_name = sys.argv[1] random.seed(1234) if dataset_name == 'fi_turku': @@ -344,4 +343,4 @@ def main(): raise ValueError(f"dataset {dataset_name} currently not handled") if __name__ == '__main__': - main() + main(sys.argv[1]) -- cgit v1.2.3 From f91ecec41e7b8cb393fcfd8fb680c9db9f713633 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Tue, 29 Jun 2021 00:28:45 -0700 Subject: Convert the run_ner.sh script to python --- scripts/run_ner.sh | 33 -------- stanza/utils/training/run_ner.py | 163 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 33 deletions(-) delete mode 100755 scripts/run_ner.sh create mode 100644 stanza/utils/training/run_ner.py diff --git a/scripts/run_ner.sh b/scripts/run_ner.sh deleted file mode 100755 index 4edaf931..00000000 --- a/scripts/run_ner.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# -# Train and evaluate NER tagger. Run as: -# ./run_ner.sh CORPUS OTHER_ARGS -# where CORPUS is the full corpus name (e.g., English-CoNLL03) and OTHER_ARGS are additional training arguments (see tagger code) or empty. -# This script assumes UDBASE and NER_DATA_DIR are correctly set in config.sh. - -source scripts/config.sh - -corpus=$1; shift -args=$@ - -lang=`echo $corpus | sed -e 's#-.*$##g'` -lcode=`python scripts/lang2code.py $lang` -corpus_name=`echo $corpus | sed -e 's#^.*-##g' | tr '[:upper:]' '[:lower:]'` -short=${lcode}_${corpus_name} - -train_file=${NER_DATA_DIR}/${short}.train.json -dev_file=${NER_DATA_DIR}/${short}.dev.json -test_file=${NER_DATA_DIR}/${short}.test.json - -if [ ! -e $train_file ]; then - bash scripts/prep_ner_data.sh $corpus -fi - -echo "Running ner with $args..." -python -m stanza.models.ner_tagger --wordvec_dir $WORDVEC_DIR --train_file $train_file --eval_file $dev_file \ - --lang $lang --shorthand $short --mode train $args -python -m stanza.models.ner_tagger --wordvec_dir $WORDVEC_DIR --eval_file $dev_file \ - --lang $lang --shorthand $short --mode predict $args -python -m stanza.models.ner_tagger --wordvec_dir $WORDVEC_DIR --eval_file $test_file \ - --lang $lang --shorthand $short --mode predict $args - diff --git a/stanza/utils/training/run_ner.py b/stanza/utils/training/run_ner.py new file mode 100644 index 00000000..506f6e4c --- /dev/null +++ b/stanza/utils/training/run_ner.py @@ -0,0 +1,163 @@ +""" +Trains or scores an NER model. + +Will attempt to guess the appropriate word vector file if none is +specified, and will use the charlms specified in the resources +for a given dataset or language if possible. + +Example command line: + python3 -m stanza.utils.training.run_ner.py hu_combined + +This script expects the prepared data to be in + data/ner/dataset.train.json, dev.json, test.json + +If those files don't exist, it will make an attempt to rebuild them +using the prepare_ner_dataset script. However, this will fail if the +data is not already downloaded. More information on where to find +most of the datasets online is in that script. Some of the datasets +have licenses which must be agreed to, so no attempt is made to +automatically download the data. +""" + +import glob +import logging +import os + +from stanza.models import ner_tagger +from stanza.utils.datasets.ner import prepare_ner_dataset +from stanza.utils.training import common +from stanza.utils.training.common import Mode + +from stanza.resources.prepare_resources import default_charlms, ner_charlms +from stanza.resources.common import DEFAULT_MODEL_DIR + +# extra arguments specific to a particular dataset +DATASET_EXTRA_ARGS = { + "vi_vlsp": [ "--dropout", "0.6", + "--word_dropout", "0.1", + "--locked_dropout", "0.1", + "--char_dropout", "0.1" ], +} + +logger = logging.getLogger('stanza') + +def add_ner_args(parser): + parser.add_argument('--charlm', default=None, type=str, help='Which charlm to run on. Will use the default charlm for this language/model if not set. Set to None to turn off charlm for languages with a default charlm') + +def find_charlm(direction, language, charlm): + saved_path = 'saved_models/charlm/{}_{}_{}_charlm.pt'.format(language, charlm, direction) + if os.path.exists(saved_path): + logger.info('Using model %s for %s charlm' % (saved_path, direction)) + return saved_path + + resource_path = '{}/{}/{}_charlm/{}.pt'.format(DEFAULT_MODEL_DIR, language, direction, charlm) + if os.path.exists(resource_path): + logger.info('Using model %s for %s charlm' % (resource_path, direction)) + return resource_path + + raise FileNotFoundError("Cannot find %s charlm in either %s or %s" % (direction, saved_path, resource_path)) + +def find_wordvec_pretrain(language): + # TODO: try to extract/remember the specific pretrain for the given model + # That would be a good way to archive which pretrains are used for which NER models, anyway + pretrain_path = '{}/{}/pretrain/*.pt'.format(DEFAULT_MODEL_DIR, language) + pretrains = glob.glob(pretrain_path) + if len(pretrains) == 0: + raise FileNotFoundError("Cannot find any pretrains in %s Try 'stanza.download(\"%s\")' to get a default pretrain or use --wordvec_pretrain_path to specify a .pt file to use" % (pretrain_path, language)) + if len(pretrains) > 1: + raise FileNotFoundError("Too many pretrains to choose from in %s Must specify an exact path to a --wordvec_pretrain_file" % pretrain_path) + logger.info("Using pretrain found in %s To use a different pretrain, specify --wordvec_pretrain_file" % pretrains[0]) + return pretrains[0] + +def run_treebank(mode, paths, treebank, short_name, + temp_output_file, command_args, extra_args): + ner_dir = paths["NER_DATA_DIR"] + language, dataset = short_name.split("_") + + train_file = os.path.join(ner_dir, "%s.train.json" % short_name) + dev_file = os.path.join(ner_dir, "%s.dev.json" % short_name) + test_file = os.path.join(ner_dir, "%s.test.json" % short_name) + + if not os.path.exists(train_file) or not os.path.exists(dev_file) or not os.path.exists(test_file): + logger.warning("The data for %s is missing or incomplete. Attempting to rebuild..." % short_name) + try: + prepare_ner_dataset.main(short_name) + except: + logger.error("Unable to build the data. Please correctly build the files in %s, %s, %s and then try again." % (train_file, dev_file, test_file)) + raise + + default_charlm = default_charlms.get(language, None) + specific_charlm = ner_charlms.get(language, {}).get(dataset, None) + if command_args.charlm: + charlm = command_args.charlm + if charlm == 'None': + charlm = None + elif specific_charlm: + charlm = specific_charlm + elif default_charlm: + charlm = default_charlm + else: + charlm = None + + if charlm: + # TODO: the --char_hidden_dim arg should be extractable from the charlm file + forward = find_charlm('forward', language, charlm) + backward = find_charlm('backward', language, charlm) + charlm_args = ['--charlm', + '--charlm_shorthand', '%s_%s' % (language, charlm), + '--char_hidden_dim', '1024', + '--charlm_forward_file', forward, + '--charlm_backward_file', backward] + else: + charlm_args = [] + + if mode == Mode.TRAIN: + # VI example arguments: + # --wordvec_pretrain_file ~/stanza_resources/vi/pretrain/vtb.pt + # --train_file data/ner/vi_vlsp.train.json + # --eval_file data/ner/vi_vlsp.dev.json + # --lang vi + # --shorthand vi_vlsp + # --mode train + # --charlm --charlm_shorthand vi_conll17 --char_hidden_dim 1024 + # --dropout 0.6 --word_dropout 0.1 --locked_dropout 0.1 --char_dropout 0.1 + dataset_args = DATASET_EXTRA_ARGS.get(short_name, []) + + train_args = ['--train_file', train_file, + '--eval_file', dev_file, + '--lang', language, + '--shorthand', short_name, + '--mode', 'train'] + train_args = train_args + charlm_args + dataset_args + extra_args + if '--wordvec_pretrain_file' not in train_args: + # will throw an error if the pretrain can't be found + wordvec_pretrain = find_wordvec_pretrain(language) + train_args = train_args + ['--wordvec_pretrain_file', wordvec_pretrain] + logger.info("Running train step with args: {}".format(train_args)) + ner_tagger.main(train_args) + + if mode == Mode.SCORE_DEV or mode == Mode.TRAIN: + dev_args = ['--eval_file', dev_file, + '--lang', language, + '--shorthand', short_name, + '--mode', 'predict'] + dev_args = dev_args + charlm_args + extra_args + logger.info("Running dev step with args: {}".format(dev_args)) + ner_tagger.main(dev_args) + + if mode == Mode.SCORE_TEST or mode == Mode.TRAIN: + test_args = ['--eval_file', test_file, + '--lang', language, + '--shorthand', short_name, + '--mode', 'predict'] + test_args = test_args + charlm_args + extra_args + logger.info("Running test step with args: {}".format(test_args)) + ner_tagger.main(test_args) + + +def main(): + common.main(run_treebank, "ner", "nertagger", add_ner_args) + +if __name__ == "__main__": + main() + -- cgit v1.2.3 From a3b7ff4c34cf74961a3713cc83d68729841df678 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Tue, 29 Jun 2021 23:52:38 -0700 Subject: Pass the charlm filenames to the trainer so that different charlm locations can be used when testing --- stanza/models/ner_tagger.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/stanza/models/ner_tagger.py b/stanza/models/ner_tagger.py index d223a37e..2407dc78 100644 --- a/stanza/models/ner_tagger.py +++ b/stanza/models/ner_tagger.py @@ -116,6 +116,7 @@ def train(args): pretrain = None vocab = None trainer = None + if args['finetune'] and os.path.exists(model_file): logger.warning('Finetune is ON. Using model from "{}"'.format(model_file)) _, trainer, vocab = load_model(args, model_file) @@ -262,7 +263,12 @@ def evaluate(args): def load_model(args, model_file): # load model use_cuda = args['cuda'] and not args['cpu'] - trainer = Trainer(model_file=model_file, use_cuda=use_cuda, train_classifier_only=args['train_classifier_only']) + charlm_args = {} + if 'charlm_forward_file' in args: + charlm_args['charlm_forward_file'] = args['charlm_forward_file'] + if 'charlm_backward_file' in args: + charlm_args['charlm_backward_file'] = args['charlm_backward_file'] + trainer = Trainer(args=charlm_args, model_file=model_file, use_cuda=use_cuda, train_classifier_only=args['train_classifier_only']) loaded_args, vocab = trainer.args, trainer.vocab # load config -- cgit v1.2.3 From 7cd4c11f6202dbdc3edb57cb8f2e7dd4d53876cf Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 30 Jun 2021 09:18:22 -0700 Subject: Get the char_hidden_dim from the charlm if a charlm is used --- stanza/models/ner/model.py | 5 +++-- stanza/utils/training/run_ner.py | 4 +--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/stanza/models/ner/model.py b/stanza/models/ner/model.py index bf8e25b1..5d00645e 100644 --- a/stanza/models/ner/model.py +++ b/stanza/models/ner/model.py @@ -37,10 +37,11 @@ class NERTagger(nn.Module): if self.args['charlm']: add_unsaved_module('charmodel_forward', CharacterLanguageModel.load(args['charlm_forward_file'], finetune=False)) add_unsaved_module('charmodel_backward', CharacterLanguageModel.load(args['charlm_backward_file'], finetune=False)) + input_size += self.charmodel_forward.hidden_dim() + self.charmodel_backward.hidden_dim() else: self.charmodel = CharacterModel(args, vocab, bidirectional=True, attention=False) - input_size += self.args['char_hidden_dim'] * 2 - + input_size += self.args['char_hidden_dim'] * 2 + # optionally add a input transformation layer if self.args.get('input_transform', False): self.input_transform = nn.Linear(input_size, input_size) diff --git a/stanza/utils/training/run_ner.py b/stanza/utils/training/run_ner.py index 506f6e4c..ed063326 100644 --- a/stanza/utils/training/run_ner.py +++ b/stanza/utils/training/run_ner.py @@ -100,12 +100,10 @@ def run_treebank(mode, paths, treebank, short_name, charlm = None if charlm: - # TODO: the --char_hidden_dim arg should be extractable from the charlm file forward = find_charlm('forward', language, charlm) backward = find_charlm('backward', language, charlm) charlm_args = ['--charlm', '--charlm_shorthand', '%s_%s' % (language, charlm), - '--char_hidden_dim', '1024', '--charlm_forward_file', forward, '--charlm_backward_file', backward] else: @@ -119,7 +117,7 @@ def run_treebank(mode, paths, treebank, short_name, # --lang vi # --shorthand vi_vlsp # --mode train - # --charlm --charlm_shorthand vi_conll17 --char_hidden_dim 1024 + # --charlm --charlm_shorthand vi_conll17 # --dropout 0.6 --word_dropout 0.1 --locked_dropout 0.1 --char_dropout 0.1 dataset_args = DATASET_EXTRA_ARGS.get(short_name, []) -- cgit v1.2.3 From 697a8c307bc640b531c9dec3baf265bd19bce7f6 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 30 Jun 2021 15:19:57 -0700 Subject: Switch a bunch of % to f-strings --- stanza/utils/training/run_ner.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/stanza/utils/training/run_ner.py b/stanza/utils/training/run_ner.py index ed063326..9e0f3ce2 100644 --- a/stanza/utils/training/run_ner.py +++ b/stanza/utils/training/run_ner.py @@ -47,15 +47,15 @@ def add_ner_args(parser): def find_charlm(direction, language, charlm): saved_path = 'saved_models/charlm/{}_{}_{}_charlm.pt'.format(language, charlm, direction) if os.path.exists(saved_path): - logger.info('Using model %s for %s charlm' % (saved_path, direction)) + logger.info(f'Using model {saved_path} for {direction} charlm') return saved_path resource_path = '{}/{}/{}_charlm/{}.pt'.format(DEFAULT_MODEL_DIR, language, direction, charlm) if os.path.exists(resource_path): - logger.info('Using model %s for %s charlm' % (resource_path, direction)) + logger.info(f'Using model {resource_path} for {direction} charlm') return resource_path - raise FileNotFoundError("Cannot find %s charlm in either %s or %s" % (direction, saved_path, resource_path)) + raise FileNotFoundError(f"Cannot find {direction} charlm in either {saved_path} or {resource_path}") def find_wordvec_pretrain(language): # TODO: try to extract/remember the specific pretrain for the given model @@ -63,27 +63,28 @@ def find_wordvec_pretrain(language): pretrain_path = '{}/{}/pretrain/*.pt'.format(DEFAULT_MODEL_DIR, language) pretrains = glob.glob(pretrain_path) if len(pretrains) == 0: - raise FileNotFoundError("Cannot find any pretrains in %s Try 'stanza.download(\"%s\")' to get a default pretrain or use --wordvec_pretrain_path to specify a .pt file to use" % (pretrain_path, language)) + raise FileNotFoundError(f"Cannot find any pretrains in {pretrain_path} Try 'stanza.download(\"{language}\")' to get a default pretrain or use --wordvec_pretrain_path to specify a .pt file to use") if len(pretrains) > 1: - raise FileNotFoundError("Too many pretrains to choose from in %s Must specify an exact path to a --wordvec_pretrain_file" % pretrain_path) - logger.info("Using pretrain found in %s To use a different pretrain, specify --wordvec_pretrain_file" % pretrains[0]) - return pretrains[0] + raise FileNotFoundError(f"Too many pretrains to choose from in {pretrain_path} Must specify an exact path to a --wordvec_pretrain_file") + pretrain = pretrains[0] + logger.info(f"Using pretrain found in {pretrain} To use a different pretrain, specify --wordvec_pretrain_file") + return pretrain def run_treebank(mode, paths, treebank, short_name, temp_output_file, command_args, extra_args): ner_dir = paths["NER_DATA_DIR"] language, dataset = short_name.split("_") - train_file = os.path.join(ner_dir, "%s.train.json" % short_name) - dev_file = os.path.join(ner_dir, "%s.dev.json" % short_name) - test_file = os.path.join(ner_dir, "%s.test.json" % short_name) + train_file = os.path.join(ner_dir, f"{short_name}.train.json") + dev_file = os.path.join(ner_dir, f"{short_name}.dev.json") + test_file = os.path.join(ner_dir, f"{short_name}.test.json") if not os.path.exists(train_file) or not os.path.exists(dev_file) or not os.path.exists(test_file): - logger.warning("The data for %s is missing or incomplete. Attempting to rebuild..." % short_name) + logger.warning(f"The data for {short_name} is missing or incomplete. Attempting to rebuild...") try: prepare_ner_dataset.main(short_name) except: - logger.error("Unable to build the data. Please correctly build the files in %s, %s, %s and then try again." % (train_file, dev_file, test_file)) + logger.error(f"Unable to build the data. Please correctly build the files in {train_file}, {dev_file}, {test_file} and then try again.") raise default_charlm = default_charlms.get(language, None) @@ -103,7 +104,7 @@ def run_treebank(mode, paths, treebank, short_name, forward = find_charlm('forward', language, charlm) backward = find_charlm('backward', language, charlm) charlm_args = ['--charlm', - '--charlm_shorthand', '%s_%s' % (language, charlm), + '--charlm_shorthand', f'{language}_{charlm}', '--charlm_forward_file', forward, '--charlm_backward_file', backward] else: -- cgit v1.2.3 From 9ea5553bc047dcf0cc332e6a690b068c1d91370d Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 30 Jun 2021 15:22:47 -0700 Subject: Add & clarify some comments on run_ner.py --- stanza/utils/training/run_ner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/stanza/utils/training/run_ner.py b/stanza/utils/training/run_ner.py index 9e0f3ce2..1ee9979f 100644 --- a/stanza/utils/training/run_ner.py +++ b/stanza/utils/training/run_ner.py @@ -9,7 +9,7 @@ Example command line: python3 -m stanza.utils.training.run_ner.py hu_combined This script expects the prepared data to be in - data/ner/dataset.train.json, dev.json, test.json + data/ner/{lang}_{dataset}.train.json, {lang}_{dataset}.dev.json, {lang}_{dataset}.test.json If those files don't exist, it will make an attempt to rebuild them using the prepare_ner_dataset script. However, this will fail if the @@ -70,6 +70,11 @@ def find_wordvec_pretrain(language): logger.info(f"Using pretrain found in {pretrain} To use a different pretrain, specify --wordvec_pretrain_file") return pretrain +# Technically NER datasets are not necessarily treebanks +# (usually not, in fact) +# However, to keep the naming consistent, we leave the +# method which does the training as run_treebank +# TODO: rename treebank -> dataset everywhere def run_treebank(mode, paths, treebank, short_name, temp_output_file, command_args, extra_args): ner_dir = paths["NER_DATA_DIR"] -- cgit v1.2.3 From e992aa07dbbcf554d29cdffb3f1b951803cedd43 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 30 Jun 2021 15:25:37 -0700 Subject: Add some comments on the common.py main method --- stanza/utils/training/common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py index b414bf56..c3635bbb 100644 --- a/stanza/utils/training/common.py +++ b/stanza/utils/training/common.py @@ -40,6 +40,12 @@ def build_argparse(): SHORTNAME_RE = re.compile("[a-z-]+_[a-z0-9]+") def main(run_treebank, model_dir, model_name, add_specific_args=None): + """ + A main program for each of the run_xyz scripts + + It collects the arguments and runs the main method for each dataset provided. + It also tries to look for an existing model and not overwrite it unless --force is provided + """ logger.info("Training program called with:\n" + " ".join(sys.argv)) paths = default_paths.get_default_paths() -- cgit v1.2.3 From 255daf700ea46e4ca8fc66ba47ce98563b5a89c5 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Fri, 11 Jun 2021 15:30:34 -0700 Subject: Make an error message more useful --- stanza/models/ner/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stanza/models/ner/model.py b/stanza/models/ner/model.py index 5d00645e..efad8d51 100644 --- a/stanza/models/ner/model.py +++ b/stanza/models/ner/model.py @@ -74,7 +74,7 @@ class NERTagger(nn.Module): vocab_size = len(self.vocab['word']) dim = self.args['word_emb_dim'] assert emb_matrix.size() == (vocab_size, dim), \ - "Input embedding matrix must match size: {} x {}".format(vocab_size, dim) + "Input embedding matrix must match size: {} x {}, found {}".format(vocab_size, dim, emb_matrix.size()) self.word_emb.weight.data.copy_(emb_matrix) def forward(self, word, word_mask, wordchars, wordchars_mask, tags, word_orig_idx, sentlens, wordlens, chars, charoffsets, charlens, char_orig_idx): -- cgit v1.2.3 From ebe4c99875c2cb9dc76b990c21eadf8ffe60fad5 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Fri, 2 Jul 2021 13:01:57 -0700 Subject: Fix a typo --- stanza/models/ner_tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stanza/models/ner_tagger.py b/stanza/models/ner_tagger.py index 2407dc78..b4b0a09f 100644 --- a/stanza/models/ner_tagger.py +++ b/stanza/models/ner_tagger.py @@ -31,7 +31,7 @@ logger = logging.getLogger('stanza') def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('--data_dir', type=str, default='data/ner', help='Root dir for saving models.') + parser.add_argument('--data_dir', type=str, default='data/ner', help='Directory of NER data.') parser.add_argument('--wordvec_dir', type=str, default='extern_data/word2vec', help='Directory of word vectors') parser.add_argument('--wordvec_file', type=str, default='', help='File that contains word vectors') parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read') -- cgit v1.2.3 From f7ffaeae6d3e965e59cb636fedeafd617dfb1f3c Mon Sep 17 00:00:00 2001 From: Bram Vanroy Date: Mon, 5 Jul 2021 10:13:15 +0200 Subject: Fix pickling issue on Windows with lambdas --- stanza/models/classifiers/cnn_classifier.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/stanza/models/classifiers/cnn_classifier.py b/stanza/models/classifiers/cnn_classifier.py index 2759086d..fa5160bf 100644 --- a/stanza/models/classifiers/cnn_classifier.py +++ b/stanza/models/classifiers/cnn_classifier.py @@ -74,10 +74,7 @@ class CNNClassifier(nn.Module): charlm_projection = args.charlm_projection, model_type = 'CNNClassifier') - if args.char_lowercase: - self.char_case = lambda x: x.lower() - else: - self.char_case = lambda x: x + self.char_lowercase = args.char_lowercase self.unsaved_modules = [] @@ -170,7 +167,6 @@ class CNNClassifier(nn.Module): self.dropout = nn.Dropout(self.config.dropout) - def add_unsaved_module(self, name, module): self.unsaved_modules += [name] setattr(self, name, module) @@ -201,6 +197,8 @@ class CNNClassifier(nn.Module): return char_reps + def char_case(self, x: str) -> str: + return x.lower() if self.char_lowercase else x def forward(self, inputs, device=None): if not device: -- cgit v1.2.3 From c57fee7d606bc42748549f6d30c3946114df700b Mon Sep 17 00:00:00 2001 From: John Bauer Date: Tue, 22 Jun 2021 08:19:30 -0700 Subject: Give a reason for failing if the md5sum doesn't match --- stanza/resources/common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/stanza/resources/common.py b/stanza/resources/common.py index 8e70e861..d9837468 100644 --- a/stanza/resources/common.py +++ b/stanza/resources/common.py @@ -103,6 +103,12 @@ def file_exists(path, md5): """ return os.path.exists(path) and get_md5(path) == md5 +def assert_file_exists(path, md5=None): + assert os.path.exists(path), "Could not find file at %s" % path + if md5: + file_md5 = get_md5(path) + assert file_md5 == md5, "md5 for %s is %s, expected %s" % (path, file_md5, md5) + def download_file(url, path, proxies, raise_for_status=False): """ Download a URL into a file as specified by `path`. @@ -134,7 +140,7 @@ def request_file(url, path, proxies=None, md5=None, raise_for_status=False): logger.info(f'File exists: {path}.') return download_file(url, path, proxies, raise_for_status) - assert(not md5 or file_exists(path, md5)) + assert_file_exists(path, md5) def sort_processors(processor_list): sorted_list = [] -- cgit v1.2.3 From 2f1f8ccf9a0bef741f99b66571a9e66414ea501d Mon Sep 17 00:00:00 2001 From: John Bauer Date: Tue, 22 Jun 2021 19:16:43 -0700 Subject: Minor whitespace change --- stanza/models/pos/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stanza/models/pos/model.py b/stanza/models/pos/model.py index f7af1ffc..452f7dda 100644 --- a/stanza/models/pos/model.py +++ b/stanza/models/pos/model.py @@ -39,7 +39,7 @@ class Tagger(nn.Module): self.trans_char = nn.Linear(self.args['char_hidden_dim'], self.args['transformed_dim'], bias=False) input_size += self.args['transformed_dim'] - if self.args['pretrain']: + if self.args['pretrain']: # pretrained embeddings, by default this won't be saved into model file add_unsaved_module('pretrained_emb', nn.Embedding.from_pretrained(torch.from_numpy(emb_matrix), freeze=True)) self.trans_pretrained = nn.Linear(emb_matrix.shape[1], self.args['transformed_dim'], bias=False) -- cgit v1.2.3 From f82c4a220d826f1dfa34e164f977bce14d05b67a Mon Sep 17 00:00:00 2001 From: John Bauer Date: Mon, 14 Jun 2021 17:21:31 -0700 Subject: Add a VI NER model --- stanza/resources/prepare_resources.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stanza/resources/prepare_resources.py b/stanza/resources/prepare_resources.py index 31791e21..31177863 100644 --- a/stanza/resources/prepare_resources.py +++ b/stanza/resources/prepare_resources.py @@ -100,6 +100,7 @@ default_ners = { "nl": "conll02", "ru": "wikiner", "uk": "languk", + "vi": "vlsp", "zh-hans": "ontonotes", } @@ -115,6 +116,7 @@ default_charlms = { "fr": "newswiki", "nl": "ccwiki", "ru": "newswiki", + "vi": "conll17", "zh-hans": "gigaword" } -- cgit v1.2.3 From d193a5a34a2e055f16ac69e96568759e059006e7 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Sat, 10 Jul 2021 23:47:15 -0700 Subject: Update version and resources to 1.2.2 --- stanza/_version.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stanza/_version.py b/stanza/_version.py index 4bce72e3..100ecb6e 100644 --- a/stanza/_version.py +++ b/stanza/_version.py @@ -1,4 +1,4 @@ """ Single source of truth for version number """ -__version__ = "1.2.1" -__resources_version__ = '1.2.1' +__version__ = "1.2.2" +__resources_version__ = '1.2.2' -- cgit v1.2.3