
github.com/stanfordnlp/stanza.git
author     John Bauer <horatio@gmail.com>  2022-09-14 02:26:06 +0300
committer  John Bauer <horatio@gmail.com>  2022-09-14 02:26:06 +0300
commit     2511f1f8d96416d047696303f53bd5b12f1abba2 (patch)
tree       2be06d69ab9ae8eb87e211353a23d6c8d6df2d9e
parent     b7fda48e773c17580d5615fdfaafd13d05f09290 (diff)
Try to reduce the scope of various pipelines to make the test suite less likely to run out of GPU memory. Not sure this is the correct approach.
-rw-r--r--  stanza/tests/classifiers/test_classifier.py               345
-rw-r--r--  stanza/tests/lemma/test_lemma_trainer.py                     2
-rw-r--r--  stanza/tests/pipeline/test_pipeline_depparse_processor.py    37
-rw-r--r--  stanza/tests/pipeline/test_pipeline_ner_processor.py          6
4 files changed, 196 insertions, 194 deletions
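
The commit relies on pytest fixture scoping: a fixture's scope bounds how long the object it builds stays alive, so a GPU-heavy Pipeline created by a class-scoped fixture can be released once the last test in that class has run, instead of surviving for the whole module or session. A minimal sketch of the idea (not part of the commit; it assumes English models are already downloaded and uses hypothetical test names):

import pytest
import stanza

class TestPipelineLifetime:
    @pytest.fixture(scope="class")
    def pipeline(self):
        # Built when the first test in this class requests it and dropped
        # from pytest's fixture cache after the last test in the class
        # finishes, so the GPU memory it holds can be reclaimed before
        # other test classes run.
        return stanza.Pipeline(lang="en", processors="tokenize,pos")

    def test_tokenize(self, pipeline):
        doc = pipeline("This is a test.")
        assert len(doc.sentences) == 1

    def test_pos(self, pipeline):
        doc = pipeline("This is a test.")
        assert all(word.upos for word in doc.sentences[0].words)

Whether the memory is actually freed still depends on nothing else holding a reference to the fixture value; ending the scope only drops pytest's own cached copy.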
diff --git a/stanza/tests/classifiers/test_classifier.py b/stanza/tests/classifiers/test_classifier.py
index 6743b5f2..36beb785 100644
--- a/stanza/tests/classifiers/test_classifier.py
+++ b/stanza/tests/classifiers/test_classifier.py
@@ -30,175 +30,176 @@ DATASET = [
EMB_DIM = 5
-@pytest.fixture(scope="module")
-def train_file(tmp_path_factory):
- train_set = DATASET * 20
- train_filename = tmp_path_factory.mktemp("data") / "train.json"
- with open(train_filename, "w", encoding="utf-8") as fout:
- json.dump(train_set, fout, ensure_ascii=False)
- return train_filename
-
-@pytest.fixture(scope="module")
-def dev_file(tmp_path_factory):
- dev_set = DATASET * 2
- dev_filename = tmp_path_factory.mktemp("data") / "dev.json"
- with open(dev_filename, "w", encoding="utf-8") as fout:
- json.dump(dev_set, fout, ensure_ascii=False)
- return dev_filename
-
-@pytest.fixture(scope="module")
-def test_file(tmp_path_factory):
- test_set = DATASET
- test_filename = tmp_path_factory.mktemp("data") / "test.json"
- with open(test_filename, "w", encoding="utf-8") as fout:
- json.dump(test_set, fout, ensure_ascii=False)
- return test_filename
-
-@pytest.fixture(scope="module")
-def fake_embeddings(tmp_path_factory):
- # could set np random seed here
- words = sorted(set([x.lower() for y in SENTENCES for x in y]))
- words = words[:-1]
- embedding_txt = tmp_path_factory.mktemp("data") / "embedding.txt"
- embedding_pt = tmp_path_factory.mktemp("data") / "embedding.pt"
- embedding = np.random.random((len(words), EMB_DIM))
-
- with open(embedding_txt, "w", encoding="utf-8") as fout:
- for word, emb in zip(words, embedding):
- fout.write(word)
- fout.write("\t")
- fout.write("\t".join(str(x) for x in emb))
- fout.write("\n")
-
- pt = pretrain.Pretrain(str(embedding_pt), str(embedding_txt))
- pt.load()
- assert os.path.exists(embedding_pt)
- return embedding_pt
-
-def test_read_data(train_file):
- """
- Test reading of the json format
- """
- train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
- assert len(train_set) == 60
-
-def test_dataset_vocab(train_file):
- """
- Converting a dataset to vocab should have a specific set of words along with PAD and UNK
- """
- train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
- vocab = data.dataset_vocab(train_set)
- expected = set([PAD, UNK] + [x.lower() for y in SENTENCES for x in y])
- assert set(vocab) == expected
-
-def test_dataset_labels(train_file):
- """
- Test the extraction of labels from a dataset
- """
- train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
- labels = data.dataset_labels(train_set)
- assert labels == ["0", "1", "2"]
-
-def build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=None):
- """
- Build a model to be used by one of the later tests
- """
- save_dir = str(tmp_path / "classifier")
- save_name = "model.pt"
- args = ["--save_dir", save_dir,
- "--save_name", save_name,
- "--wordvec_pretrain_file", str(fake_embeddings),
- "--filter_channels", "20",
- "--fc_shapes", "20,10",
- "--train_file", str(train_file),
- "--dev_file", str(dev_file),
- "--max_epochs", "2",
- "--batch_size", "60"]
- if extra_args is not None:
- args = args + extra_args
- args = classifier.parse_args(args)
- train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
- trainer = Trainer.build_new_model(args, train_set)
- return trainer, train_set, args
-
-def run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=None):
- """
- Iterate a couple times over a model
- """
- trainer, train_set, args = build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args)
- dev_set = data.read_dataset(args.dev_file, args.wordvec_type, args.min_train_len)
- labels = data.dataset_labels(train_set)
-
- save_filename = os.path.join(args.save_dir, args.save_name)
- checkpoint_file = utils.checkpoint_name(args.save_dir, save_filename, args.checkpoint_save_name)
- classifier.train_model(trainer, save_filename, checkpoint_file, args, train_set, dev_set, labels)
- return trainer
-
-def test_build_model(tmp_path, fake_embeddings, train_file, dev_file):
- """
- Test that building a basic model works
- """
- build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
-
-def test_save_load(tmp_path, fake_embeddings, train_file, dev_file):
- """
- Test that a basic model can save & load
- """
- trainer, _, args = build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
-
- save_filename = os.path.join(args.save_dir, args.save_name)
- trainer.save(save_filename)
-
- args.load_name = args.save_name
- trainer = Trainer.load(args.load_name, args)
- args.load_name = save_filename
- trainer = Trainer.load(args.load_name, args)
-
-def test_train_basic(tmp_path, fake_embeddings, train_file, dev_file):
- run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
-
-def test_train_bilstm(tmp_path, fake_embeddings, train_file, dev_file):
- """
- Test w/ and w/o bilstm variations of the classifier
- """
- args = ["--bilstm", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--no_bilstm"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
-def test_train_maxpool_width(tmp_path, fake_embeddings, train_file, dev_file):
- """
- Test various maxpool widths
-
- Also sets --filter_channels to a multiple of 2 but not of 3 for
- the test to make sure the math is done correctly on a non-divisible width
- """
- args = ["--maxpool_width", "1", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--maxpool_width", "2", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--maxpool_width", "3", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
-def test_train_conv_2d(tmp_path, fake_embeddings, train_file, dev_file):
- args = ["--filter_sizes", "(3,4,5)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--filter_sizes", "((3,2),)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
-def test_train_filter_channels(tmp_path, fake_embeddings, train_file, dev_file):
- args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--no_bilstm"]
- trainer = run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
- assert trainer.model.fc_input_size == 40
-
- args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "15,20", "--no_bilstm"]
- trainer = run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
- # 50 = 2x15 for the 2d conv (over 5 dim embeddings) + 20
- assert trainer.model.fc_input_size == 50
+class TestClassifier:
+ @pytest.fixture(scope="class")
+ def train_file(self, tmp_path_factory):
+ train_set = DATASET * 20
+ train_filename = tmp_path_factory.mktemp("data") / "train.json"
+ with open(train_filename, "w", encoding="utf-8") as fout:
+ json.dump(train_set, fout, ensure_ascii=False)
+ return train_filename
+
+ @pytest.fixture(scope="class")
+ def dev_file(self, tmp_path_factory):
+ dev_set = DATASET * 2
+ dev_filename = tmp_path_factory.mktemp("data") / "dev.json"
+ with open(dev_filename, "w", encoding="utf-8") as fout:
+ json.dump(dev_set, fout, ensure_ascii=False)
+ return dev_filename
+
+ @pytest.fixture(scope="class")
+ def test_file(self, tmp_path_factory):
+ test_set = DATASET
+ test_filename = tmp_path_factory.mktemp("data") / "test.json"
+ with open(test_filename, "w", encoding="utf-8") as fout:
+ json.dump(test_set, fout, ensure_ascii=False)
+ return test_filename
+
+ @pytest.fixture(scope="class")
+ def fake_embeddings(self, tmp_path_factory):
+ # could set np random seed here
+ words = sorted(set([x.lower() for y in SENTENCES for x in y]))
+ words = words[:-1]
+ embedding_txt = tmp_path_factory.mktemp("data") / "embedding.txt"
+ embedding_pt = tmp_path_factory.mktemp("data") / "embedding.pt"
+ embedding = np.random.random((len(words), EMB_DIM))
+
+ with open(embedding_txt, "w", encoding="utf-8") as fout:
+ for word, emb in zip(words, embedding):
+ fout.write(word)
+ fout.write("\t")
+ fout.write("\t".join(str(x) for x in emb))
+ fout.write("\n")
+
+ pt = pretrain.Pretrain(str(embedding_pt), str(embedding_txt))
+ pt.load()
+ assert os.path.exists(embedding_pt)
+ return embedding_pt
+
+ def test_read_data(self, train_file):
+ """
+ Test reading of the json format
+ """
+ train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
+ assert len(train_set) == 60
+
+ def test_dataset_vocab(self, train_file):
+ """
+ Converting a dataset to vocab should have a specific set of words along with PAD and UNK
+ """
+ train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
+ vocab = data.dataset_vocab(train_set)
+ expected = set([PAD, UNK] + [x.lower() for y in SENTENCES for x in y])
+ assert set(vocab) == expected
+
+ def test_dataset_labels(self, train_file):
+ """
+ Test the extraction of labels from a dataset
+ """
+ train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
+ labels = data.dataset_labels(train_set)
+ assert labels == ["0", "1", "2"]
+
+ def build_model(self, tmp_path, fake_embeddings, train_file, dev_file, extra_args=None):
+ """
+ Build a model to be used by one of the later tests
+ """
+ save_dir = str(tmp_path / "classifier")
+ save_name = "model.pt"
+ args = ["--save_dir", save_dir,
+ "--save_name", save_name,
+ "--wordvec_pretrain_file", str(fake_embeddings),
+ "--filter_channels", "20",
+ "--fc_shapes", "20,10",
+ "--train_file", str(train_file),
+ "--dev_file", str(dev_file),
+ "--max_epochs", "2",
+ "--batch_size", "60"]
+ if extra_args is not None:
+ args = args + extra_args
+ args = classifier.parse_args(args)
+ train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
+ trainer = Trainer.build_new_model(args, train_set)
+ return trainer, train_set, args
+
+ def run_training(self, tmp_path, fake_embeddings, train_file, dev_file, extra_args=None):
+ """
+ Iterate a couple times over a model
+ """
+ trainer, train_set, args = self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args)
+ dev_set = data.read_dataset(args.dev_file, args.wordvec_type, args.min_train_len)
+ labels = data.dataset_labels(train_set)
+
+ save_filename = os.path.join(args.save_dir, args.save_name)
+ checkpoint_file = utils.checkpoint_name(args.save_dir, save_filename, args.checkpoint_save_name)
+ classifier.train_model(trainer, save_filename, checkpoint_file, args, train_set, dev_set, labels)
+ return trainer
+
+ def test_build_model(self, tmp_path, fake_embeddings, train_file, dev_file):
+ """
+ Test that building a basic model works
+ """
+ self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
+
+ def test_save_load(self, tmp_path, fake_embeddings, train_file, dev_file):
+ """
+ Test that a basic model can save & load
+ """
+ trainer, _, args = self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
+
+ save_filename = os.path.join(args.save_dir, args.save_name)
+ trainer.save(save_filename)
+
+ args.load_name = args.save_name
+ trainer = Trainer.load(args.load_name, args)
+ args.load_name = save_filename
+ trainer = Trainer.load(args.load_name, args)
+
+ def test_train_basic(self, tmp_path, fake_embeddings, train_file, dev_file):
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
+
+ def test_train_bilstm(self, tmp_path, fake_embeddings, train_file, dev_file):
+ """
+ Test w/ and w/o bilstm variations of the classifier
+ """
+ args = ["--bilstm", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--no_bilstm"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ def test_train_maxpool_width(self, tmp_path, fake_embeddings, train_file, dev_file):
+ """
+ Test various maxpool widths
+
+ Also sets --filter_channels to a multiple of 2 but not of 3 for
+ the test to make sure the math is done correctly on a non-divisible width
+ """
+ args = ["--maxpool_width", "1", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--maxpool_width", "2", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--maxpool_width", "3", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ def test_train_conv_2d(self, tmp_path, fake_embeddings, train_file, dev_file):
+ args = ["--filter_sizes", "(3,4,5)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--filter_sizes", "((3,2),)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ def test_train_filter_channels(self, tmp_path, fake_embeddings, train_file, dev_file):
+ args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--no_bilstm"]
+ trainer = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+ assert trainer.model.fc_input_size == 40
+
+ args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "15,20", "--no_bilstm"]
+ trainer = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+ # 50 = 2x15 for the 2d conv (over 5 dim embeddings) + 20
+ assert trainer.model.fc_input_size == 50
diff --git a/stanza/tests/lemma/test_lemma_trainer.py b/stanza/tests/lemma/test_lemma_trainer.py
index 24a58b57..dd6a1777 100644
--- a/stanza/tests/lemma/test_lemma_trainer.py
+++ b/stanza/tests/lemma/test_lemma_trainer.py
@@ -13,7 +13,7 @@ from stanza.tests import *
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
-@pytest.fixture
+@pytest.fixture(scope="module")
def english_model():
models_path = os.path.join(TEST_MODELS_DIR, "en", "lemma", "*")
models = glob.glob(models_path)
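
In test_lemma_trainer.py the change goes the other direction: english_model is widened from the default function scope to module scope, presumably so the saved lemma model is located and loaded once per test module rather than once per test. A rough sketch of what that scope difference means in pytest (hypothetical names, not from the commit):

import pytest

calls = {"function": 0, "module": 0}

@pytest.fixture  # default scope: rebuilt for every test that requests it
def per_test_resource():
    calls["function"] += 1
    return object()

@pytest.fixture(scope="module")  # built once, shared by every test in this file
def per_module_resource():
    calls["module"] += 1
    return object()

def test_first(per_test_resource, per_module_resource):
    assert calls == {"function": 1, "module": 1}

def test_second(per_test_resource, per_module_resource):
    assert calls == {"function": 2, "module": 1}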
diff --git a/stanza/tests/pipeline/test_pipeline_depparse_processor.py b/stanza/tests/pipeline/test_pipeline_depparse_processor.py
index 83535cb8..ad820643 100644
--- a/stanza/tests/pipeline/test_pipeline_depparse_processor.py
+++ b/stanza/tests/pipeline/test_pipeline_depparse_processor.py
@@ -10,23 +10,24 @@ from stanza.tests import TEST_MODELS_DIR
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
-@pytest.fixture(scope="module")
-def english_depparse():
- """
- Get a depparse_processor for English
- """
- nlp = stanza.Pipeline(**{'processors': 'tokenize,pos,lemma,depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en'})
- assert 'depparse' in nlp.processors
- return nlp.processors['depparse']
+class TestClassifier:
+ @pytest.fixture(scope="class")
+ def english_depparse(self):
+ """
+ Get a depparse_processor for English
+ """
+ nlp = stanza.Pipeline(**{'processors': 'tokenize,pos,lemma,depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en'})
+ assert 'depparse' in nlp.processors
+ return nlp.processors['depparse']
-def test_get_known_relations(english_depparse):
- """
- Test getting the known relations from a processor.
+ def test_get_known_relations(self, english_depparse):
+ """
+ Test getting the known relations from a processor.
- Doesn't test that all the relations exist, since who knows what will change in the future
- """
- relations = english_depparse.get_known_relations()
- assert len(relations) > 5
- assert 'case' in relations
- for i in VOCAB_PREFIX:
- assert i not in relations
+ Doesn't test that all the relations exist, since who knows what will change in the future
+ """
+ relations = english_depparse.get_known_relations()
+ assert len(relations) > 5
+ assert 'case' in relations
+ for i in VOCAB_PREFIX:
+ assert i not in relations
diff --git a/stanza/tests/pipeline/test_pipeline_ner_processor.py b/stanza/tests/pipeline/test_pipeline_ner_processor.py
index c67089cf..c88dbe6e 100644
--- a/stanza/tests/pipeline/test_pipeline_ner_processor.py
+++ b/stanza/tests/pipeline/test_pipeline_ner_processor.py
@@ -58,13 +58,13 @@ class TestNERProcessor:
"""
return stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize,ner")
- @pytest.fixture
+ @pytest.fixture(scope="class")
def processed_doc(self, pipeline):
""" Document created by running full English pipeline on a few sentences """
- return [pipeline(text) for text in EN_DOCS]
+ return [pipeline(text) for text in EN_DOCS]
- @pytest.fixture
+ @pytest.fixture(scope="class")
def processed_bulk(self, pipeline):
""" Document created by running full English pipeline on a few sentences """
docs = [Document([], text=t) for t in EN_DOCS]