From 2511f1f8d96416d047696303f53bd5b12f1abba2 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Tue, 13 Sep 2022 16:26:06 -0700 Subject: Try to reduce the scope on various pipelines to make the test suite less likely to run out of GPU memory. Not sure this is the correct approach --- stanza/tests/classifiers/test_classifier.py | 345 +++++++++++---------- stanza/tests/lemma/test_lemma_trainer.py | 2 +- .../pipeline/test_pipeline_depparse_processor.py | 37 +-- .../tests/pipeline/test_pipeline_ner_processor.py | 6 +- 4 files changed, 196 insertions(+), 194 deletions(-) diff --git a/stanza/tests/classifiers/test_classifier.py b/stanza/tests/classifiers/test_classifier.py index 6743b5f2..36beb785 100644 --- a/stanza/tests/classifiers/test_classifier.py +++ b/stanza/tests/classifiers/test_classifier.py @@ -30,175 +30,176 @@ DATASET = [ EMB_DIM = 5 -@pytest.fixture(scope="module") -def train_file(tmp_path_factory): - train_set = DATASET * 20 - train_filename = tmp_path_factory.mktemp("data") / "train.json" - with open(train_filename, "w", encoding="utf-8") as fout: - json.dump(train_set, fout, ensure_ascii=False) - return train_filename - -@pytest.fixture(scope="module") -def dev_file(tmp_path_factory): - dev_set = DATASET * 2 - dev_filename = tmp_path_factory.mktemp("data") / "dev.json" - with open(dev_filename, "w", encoding="utf-8") as fout: - json.dump(dev_set, fout, ensure_ascii=False) - return dev_filename - -@pytest.fixture(scope="module") -def test_file(tmp_path_factory): - test_set = DATASET - test_filename = tmp_path_factory.mktemp("data") / "test.json" - with open(test_filename, "w", encoding="utf-8") as fout: - json.dump(test_set, fout, ensure_ascii=False) - return test_filename - -@pytest.fixture(scope="module") -def fake_embeddings(tmp_path_factory): - # could set np random seed here - words = sorted(set([x.lower() for y in SENTENCES for x in y])) - words = words[:-1] - embedding_txt = tmp_path_factory.mktemp("data") / "embedding.txt" - embedding_pt = tmp_path_factory.mktemp("data") / "embedding.pt" - embedding = np.random.random((len(words), EMB_DIM)) - - with open(embedding_txt, "w", encoding="utf-8") as fout: - for word, emb in zip(words, embedding): - fout.write(word) - fout.write("\t") - fout.write("\t".join(str(x) for x in emb)) - fout.write("\n") - - pt = pretrain.Pretrain(str(embedding_pt), str(embedding_txt)) - pt.load() - assert os.path.exists(embedding_pt) - return embedding_pt - -def test_read_data(train_file): - """ - Test reading of the json format - """ - train_set = data.read_dataset(str(train_file), WVType.OTHER, 1) - assert len(train_set) == 60 - -def test_dataset_vocab(train_file): - """ - Converting a dataset to vocab should have a specific set of words along with PAD and UNK - """ - train_set = data.read_dataset(str(train_file), WVType.OTHER, 1) - vocab = data.dataset_vocab(train_set) - expected = set([PAD, UNK] + [x.lower() for y in SENTENCES for x in y]) - assert set(vocab) == expected - -def test_dataset_labels(train_file): - """ - Test the extraction of labels from a dataset - """ - train_set = data.read_dataset(str(train_file), WVType.OTHER, 1) - labels = data.dataset_labels(train_set) - assert labels == ["0", "1", "2"] - -def build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=None): - """ - Build a model to be used by one of the later tests - """ - save_dir = str(tmp_path / "classifier") - save_name = "model.pt" - args = ["--save_dir", save_dir, - "--save_name", save_name, - "--wordvec_pretrain_file", str(fake_embeddings), - 
"--filter_channels", "20", - "--fc_shapes", "20,10", - "--train_file", str(train_file), - "--dev_file", str(dev_file), - "--max_epochs", "2", - "--batch_size", "60"] - if extra_args is not None: - args = args + extra_args - args = classifier.parse_args(args) - train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len) - trainer = Trainer.build_new_model(args, train_set) - return trainer, train_set, args - -def run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=None): - """ - Iterate a couple times over a model - """ - trainer, train_set, args = build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args) - dev_set = data.read_dataset(args.dev_file, args.wordvec_type, args.min_train_len) - labels = data.dataset_labels(train_set) - - save_filename = os.path.join(args.save_dir, args.save_name) - checkpoint_file = utils.checkpoint_name(args.save_dir, save_filename, args.checkpoint_save_name) - classifier.train_model(trainer, save_filename, checkpoint_file, args, train_set, dev_set, labels) - return trainer - -def test_build_model(tmp_path, fake_embeddings, train_file, dev_file): - """ - Test that building a basic model works - """ - build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"]) - -def test_save_load(tmp_path, fake_embeddings, train_file, dev_file): - """ - Test that a basic model can save & load - """ - trainer, _, args = build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"]) - - save_filename = os.path.join(args.save_dir, args.save_name) - trainer.save(save_filename) - - args.load_name = args.save_name - trainer = Trainer.load(args.load_name, args) - args.load_name = save_filename - trainer = Trainer.load(args.load_name, args) - -def test_train_basic(tmp_path, fake_embeddings, train_file, dev_file): - run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"]) - -def test_train_bilstm(tmp_path, fake_embeddings, train_file, dev_file): - """ - Test w/ and w/o bilstm variations of the classifier - """ - args = ["--bilstm", "--bilstm_hidden_dim", "20"] - run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - - args = ["--no_bilstm"] - run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - -def test_train_maxpool_width(tmp_path, fake_embeddings, train_file, dev_file): - """ - Test various maxpool widths - - Also sets --filter_channels to a multiple of 2 but not of 3 for - the test to make sure the math is done correctly on a non-divisible width - """ - args = ["--maxpool_width", "1", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] - run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - - args = ["--maxpool_width", "2", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] - run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - - args = ["--maxpool_width", "3", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] - run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - -def test_train_conv_2d(tmp_path, fake_embeddings, train_file, dev_file): - args = ["--filter_sizes", "(3,4,5)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] - run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - - args = ["--filter_sizes", "((3,2),)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] - run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - - args = ["--filter_sizes", 
"((3,2),3)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] - run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - -def test_train_filter_channels(tmp_path, fake_embeddings, train_file, dev_file): - args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--no_bilstm"] - trainer = run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - assert trainer.model.fc_input_size == 40 - - args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "15,20", "--no_bilstm"] - trainer = run_training(tmp_path, fake_embeddings, train_file, dev_file, args) - # 50 = 2x15 for the 2d conv (over 5 dim embeddings) + 20 - assert trainer.model.fc_input_size == 50 +class TestClassifier: + @pytest.fixture(scope="class") + def train_file(self, tmp_path_factory): + train_set = DATASET * 20 + train_filename = tmp_path_factory.mktemp("data") / "train.json" + with open(train_filename, "w", encoding="utf-8") as fout: + json.dump(train_set, fout, ensure_ascii=False) + return train_filename + + @pytest.fixture(scope="class") + def dev_file(self, tmp_path_factory): + dev_set = DATASET * 2 + dev_filename = tmp_path_factory.mktemp("data") / "dev.json" + with open(dev_filename, "w", encoding="utf-8") as fout: + json.dump(dev_set, fout, ensure_ascii=False) + return dev_filename + + @pytest.fixture(scope="class") + def test_file(self, tmp_path_factory): + test_set = DATASET + test_filename = tmp_path_factory.mktemp("data") / "test.json" + with open(test_filename, "w", encoding="utf-8") as fout: + json.dump(test_set, fout, ensure_ascii=False) + return test_filename + + @pytest.fixture(scope="class") + def fake_embeddings(self, tmp_path_factory): + # could set np random seed here + words = sorted(set([x.lower() for y in SENTENCES for x in y])) + words = words[:-1] + embedding_txt = tmp_path_factory.mktemp("data") / "embedding.txt" + embedding_pt = tmp_path_factory.mktemp("data") / "embedding.pt" + embedding = np.random.random((len(words), EMB_DIM)) + + with open(embedding_txt, "w", encoding="utf-8") as fout: + for word, emb in zip(words, embedding): + fout.write(word) + fout.write("\t") + fout.write("\t".join(str(x) for x in emb)) + fout.write("\n") + + pt = pretrain.Pretrain(str(embedding_pt), str(embedding_txt)) + pt.load() + assert os.path.exists(embedding_pt) + return embedding_pt + + def test_read_data(self, train_file): + """ + Test reading of the json format + """ + train_set = data.read_dataset(str(train_file), WVType.OTHER, 1) + assert len(train_set) == 60 + + def test_dataset_vocab(self, train_file): + """ + Converting a dataset to vocab should have a specific set of words along with PAD and UNK + """ + train_set = data.read_dataset(str(train_file), WVType.OTHER, 1) + vocab = data.dataset_vocab(train_set) + expected = set([PAD, UNK] + [x.lower() for y in SENTENCES for x in y]) + assert set(vocab) == expected + + def test_dataset_labels(self, train_file): + """ + Test the extraction of labels from a dataset + """ + train_set = data.read_dataset(str(train_file), WVType.OTHER, 1) + labels = data.dataset_labels(train_set) + assert labels == ["0", "1", "2"] + + def build_model(self, tmp_path, fake_embeddings, train_file, dev_file, extra_args=None): + """ + Build a model to be used by one of the later tests + """ + save_dir = str(tmp_path / "classifier") + save_name = "model.pt" + args = ["--save_dir", save_dir, + "--save_name", save_name, + "--wordvec_pretrain_file", str(fake_embeddings), + "--filter_channels", "20", + "--fc_shapes", "20,10", + "--train_file", 
str(train_file), + "--dev_file", str(dev_file), + "--max_epochs", "2", + "--batch_size", "60"] + if extra_args is not None: + args = args + extra_args + args = classifier.parse_args(args) + train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len) + trainer = Trainer.build_new_model(args, train_set) + return trainer, train_set, args + + def run_training(self, tmp_path, fake_embeddings, train_file, dev_file, extra_args=None): + """ + Iterate a couple times over a model + """ + trainer, train_set, args = self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args) + dev_set = data.read_dataset(args.dev_file, args.wordvec_type, args.min_train_len) + labels = data.dataset_labels(train_set) + + save_filename = os.path.join(args.save_dir, args.save_name) + checkpoint_file = utils.checkpoint_name(args.save_dir, save_filename, args.checkpoint_save_name) + classifier.train_model(trainer, save_filename, checkpoint_file, args, train_set, dev_set, labels) + return trainer + + def test_build_model(self, tmp_path, fake_embeddings, train_file, dev_file): + """ + Test that building a basic model works + """ + self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"]) + + def test_save_load(self, tmp_path, fake_embeddings, train_file, dev_file): + """ + Test that a basic model can save & load + """ + trainer, _, args = self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"]) + + save_filename = os.path.join(args.save_dir, args.save_name) + trainer.save(save_filename) + + args.load_name = args.save_name + trainer = Trainer.load(args.load_name, args) + args.load_name = save_filename + trainer = Trainer.load(args.load_name, args) + + def test_train_basic(self, tmp_path, fake_embeddings, train_file, dev_file): + self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"]) + + def test_train_bilstm(self, tmp_path, fake_embeddings, train_file, dev_file): + """ + Test w/ and w/o bilstm variations of the classifier + """ + args = ["--bilstm", "--bilstm_hidden_dim", "20"] + self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + + args = ["--no_bilstm"] + self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + + def test_train_maxpool_width(self, tmp_path, fake_embeddings, train_file, dev_file): + """ + Test various maxpool widths + + Also sets --filter_channels to a multiple of 2 but not of 3 for + the test to make sure the math is done correctly on a non-divisible width + """ + args = ["--maxpool_width", "1", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] + self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + + args = ["--maxpool_width", "2", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] + self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + + args = ["--maxpool_width", "3", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] + self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + + def test_train_conv_2d(self, tmp_path, fake_embeddings, train_file, dev_file): + args = ["--filter_sizes", "(3,4,5)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] + self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + + args = ["--filter_sizes", "((3,2),)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] + self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + + 
args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"] + self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + + def test_train_filter_channels(self, tmp_path, fake_embeddings, train_file, dev_file): + args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--no_bilstm"] + trainer = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + assert trainer.model.fc_input_size == 40 + + args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "15,20", "--no_bilstm"] + trainer = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args) + # 50 = 2x15 for the 2d conv (over 5 dim embeddings) + 20 + assert trainer.model.fc_input_size == 50 diff --git a/stanza/tests/lemma/test_lemma_trainer.py b/stanza/tests/lemma/test_lemma_trainer.py index 24a58b57..dd6a1777 100644 --- a/stanza/tests/lemma/test_lemma_trainer.py +++ b/stanza/tests/lemma/test_lemma_trainer.py @@ -13,7 +13,7 @@ from stanza.tests import * pytestmark = [pytest.mark.pipeline, pytest.mark.travis] -@pytest.fixture +@pytest.fixture(scope="module") def english_model(): models_path = os.path.join(TEST_MODELS_DIR, "en", "lemma", "*") models = glob.glob(models_path) diff --git a/stanza/tests/pipeline/test_pipeline_depparse_processor.py b/stanza/tests/pipeline/test_pipeline_depparse_processor.py index 83535cb8..ad820643 100644 --- a/stanza/tests/pipeline/test_pipeline_depparse_processor.py +++ b/stanza/tests/pipeline/test_pipeline_depparse_processor.py @@ -10,23 +10,24 @@ from stanza.tests import TEST_MODELS_DIR pytestmark = [pytest.mark.pipeline, pytest.mark.travis] -@pytest.fixture(scope="module") -def english_depparse(): - """ - Get a depparse_processor for English - """ - nlp = stanza.Pipeline(**{'processors': 'tokenize,pos,lemma,depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en'}) - assert 'depparse' in nlp.processors - return nlp.processors['depparse'] +class TestClassifier: + @pytest.fixture(scope="class") + def english_depparse(self): + """ + Get a depparse_processor for English + """ + nlp = stanza.Pipeline(**{'processors': 'tokenize,pos,lemma,depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en'}) + assert 'depparse' in nlp.processors + return nlp.processors['depparse'] -def test_get_known_relations(english_depparse): - """ - Test getting the known relations from a processor. + def test_get_known_relations(self, english_depparse): + """ + Test getting the known relations from a processor. 
- Doesn't test that all the relations exist, since who knows what will change in the future - """ - relations = english_depparse.get_known_relations() - assert len(relations) > 5 - assert 'case' in relations - for i in VOCAB_PREFIX: - assert i not in relations + Doesn't test that all the relations exist, since who knows what will change in the future + """ + relations = english_depparse.get_known_relations() + assert len(relations) > 5 + assert 'case' in relations + for i in VOCAB_PREFIX: + assert i not in relations diff --git a/stanza/tests/pipeline/test_pipeline_ner_processor.py b/stanza/tests/pipeline/test_pipeline_ner_processor.py index c67089cf..c88dbe6e 100644 --- a/stanza/tests/pipeline/test_pipeline_ner_processor.py +++ b/stanza/tests/pipeline/test_pipeline_ner_processor.py @@ -58,13 +58,13 @@ class TestNERProcessor: """ return stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize,ner") - @pytest.fixture + @pytest.fixture(scope="class") def processed_doc(self, pipeline): """ Document created by running full English pipeline on a few sentences """ - return [pipeline(text) for text in EN_DOCS] + return [pipeline(text) for text in EN_DOCS] - @pytest.fixture + @pytest.fixture(scope="class") def processed_bulk(self, pipeline): """ Document created by running full English pipeline on a few sentences """ docs = [Document([], text=t) for t in EN_DOCS] -- cgit v1.2.3
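
For reference, the pattern this patch applies is: move expensive pytest fixtures from module scope to class scope and group the tests that share them into a class, so the heavyweight objects (embeddings, pipelines, trainers) become unreachable once the class finishes instead of living for the whole test module. The sketch below is a minimal, self-contained illustration of that pattern, not code from the patch; FakeGpuModel is a hypothetical stand-in for a memory-hungry object such as a stanza.Pipeline.

    import pytest

    class FakeGpuModel:
        # Hypothetical stand-in for an object that holds a lot of (GPU) memory,
        # e.g. a stanza.Pipeline or a classifier Trainer.
        def __init__(self):
            self.loaded = True

        def annotate(self, text):
            return text.upper()

    class TestWithClassScopedFixture:
        # scope="class": the model is built once, shared by the tests in this
        # class, and eligible for garbage collection after the last of them
        # runs, rather than being kept alive for the rest of the module.
        @pytest.fixture(scope="class")
        def model(self):
            return FakeGpuModel()

        def test_loaded(self, model):
            assert model.loaded

        def test_annotate(self, model):
            assert model.annotate("abc") == "ABC"

Under this arrangement each test file's expensive fixture goes out of scope as soon as its class completes, which is the effect the commit message is aiming for; whether the underlying framework actually releases GPU memory at that point depends on the library and may need additional cleanup, hence the "not sure this is the correct approach" caveat above.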