github.com/stanfordnlp/stanza.git

author    John Bauer <horatio@gmail.com>  2022-09-14 05:19:49 +0300
committer John Bauer <horatio@gmail.com>  2022-09-14 05:20:10 +0300
commit    def8cb86e61bfb178f75de12fa189ee57f0a9cc7 (patch)
tree      9d2e7b87162b1ae1f2cd549a40d2cb5db1547114 /stanza
parent    a74678e36c2ecb933016620ebfd2a8ab50bd55c3 (diff)
Turn some pipelines that were getting built over and over into fixtures. This will make them take up less GPU memory, even if the cleanup isn't reliable.
Diffstat (limited to 'stanza')
-rw-r--r--  stanza/tests/common/test_char_model.py   | 319
-rw-r--r--  stanza/tests/common/test_data_objects.py |  25
2 files changed, 173 insertions(+), 171 deletions(-)
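
The commit applies pytest's fixture scoping: a fixture declared with scope="class" or scope="module" is constructed once and shared by every test in that class or module, rather than rebuilt per test, so only one copy of the model weights sits in GPU memory at a time. A minimal sketch of the pattern, using the same stanza API as the tests below (the test body is illustrative, not part of the commit):

    import pytest
    import stanza

    # TEST_MODELS_DIR is the models directory used throughout stanza's tests;
    # the exact import path is assumed here.
    from stanza.tests import TEST_MODELS_DIR

    @pytest.fixture(scope="module")
    def nlp_pipeline():
        # Built once for the whole module; every test that names this fixture
        # as a parameter receives the same Pipeline object.
        return stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')

    def test_uses_shared_pipeline(nlp_pipeline):
        doc = nlp_pipeline("This is a sentence.")
        assert len(doc.sentences) == 1

The "cleanup isn't reliable" caveat in the message reflects that pytest merely drops its reference to the fixture value when the scope ends; whether GPU memory is actually returned at that point depends on garbage collection and, for PyTorch, on the caching allocator.
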
diff --git a/stanza/tests/common/test_char_model.py b/stanza/tests/common/test_char_model.py
index 5ab080e5..30b34851 100644
--- a/stanza/tests/common/test_char_model.py
+++ b/stanza/tests/common/test_char_model.py
@@ -27,164 +27,165 @@ fake_text_2 = """
This is plastic cheese
"""
-def test_single_file_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "text.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- vocab = char_model.build_charlm_vocab(sample_file)
-
- for i in fake_text_1:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_single_file_xz_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "text.txt.xz")
- with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- vocab = char_model.build_charlm_vocab(sample_file)
-
- for i in fake_text_1:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_single_file_dir_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "text.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- vocab = char_model.build_charlm_vocab(tempdir)
-
- for i in fake_text_1:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_multiple_files_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "t1.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- sample_file = os.path.join(tempdir, "t2.txt.xz")
- with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
- fout.write(fake_text_2)
- vocab = char_model.build_charlm_vocab(tempdir)
-
- for i in fake_text_1:
- assert i in vocab
- for i in fake_text_2:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_cutoff_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "t1.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- sample_file = os.path.join(tempdir, "t2.txt.xz")
- with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
- fout.write(fake_text_2)
-
- vocab = char_model.build_charlm_vocab(tempdir, cutoff=2)
-
- counts = Counter(fake_text_1) + Counter(fake_text_2)
- for letter, count in counts.most_common():
- if count < 2:
- assert letter not in vocab
- else:
- assert letter in vocab
-
-def test_build_model():
- """
- Test the whole thing on a small dataset for an iteration or two
- """
- with tempfile.TemporaryDirectory() as tempdir:
- eval_file = os.path.join(tempdir, "en_test.dev.txt")
- with open(eval_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- train_file = os.path.join(tempdir, "en_test.train.txt")
- with open(train_file, "w", encoding="utf-8") as fout:
- for i in range(1000):
+class TestCharModel:
+ def test_single_file_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "text.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
fout.write(fake_text_1)
- fout.write("\n")
+ vocab = char_model.build_charlm_vocab(sample_file)
+
+ for i in fake_text_1:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_single_file_xz_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "text.txt.xz")
+ with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ vocab = char_model.build_charlm_vocab(sample_file)
+
+ for i in fake_text_1:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_single_file_dir_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "text.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ vocab = char_model.build_charlm_vocab(tempdir)
+
+ for i in fake_text_1:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_multiple_files_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "t1.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ sample_file = os.path.join(tempdir, "t2.txt.xz")
+ with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
+ fout.write(fake_text_2)
+ vocab = char_model.build_charlm_vocab(tempdir)
+
+ for i in fake_text_1:
+ assert i in vocab
+ for i in fake_text_2:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_cutoff_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "t1.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ sample_file = os.path.join(tempdir, "t2.txt.xz")
+ with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
fout.write(fake_text_2)
- fout.write("\n")
- save_name = 'en_test.forward.pt'
- vocab_save_name = 'en_text.vocab.pt'
- checkpoint_save_name = 'en_text.checkpoint.pt'
- args = ['--train_file', train_file,
- '--eval_file', eval_file,
-            '--eval_steps', '0', # eval once per epoch
- '--epochs', '2',
- '--cutoff', '1',
- '--batch_size', '%d' % len(fake_text_1),
- '--lang', 'en',
- '--shorthand', 'en_test',
- '--save_dir', tempdir,
- '--save_name', save_name,
- '--vocab_save_name', vocab_save_name,
- '--checkpoint_save_name', checkpoint_save_name]
- args = charlm.parse_args(args)
- charlm.train(args)
-
- assert os.path.exists(os.path.join(tempdir, vocab_save_name))
-
- # test that saving & loading of the model worked
- assert os.path.exists(os.path.join(tempdir, save_name))
- model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, save_name))
-
- # test that saving & loading of the checkpoint worked
- assert os.path.exists(os.path.join(tempdir, checkpoint_save_name))
- model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, checkpoint_save_name))
- trainer = char_model.CharacterLanguageModelTrainer.load(args, os.path.join(tempdir, checkpoint_save_name))
-
- assert trainer.global_step > 0
- assert trainer.epoch == 2
-
- # quick test to verify this method works with a trained model
- charlm.get_current_lr(trainer, args)
-
- # test loading a vocab built by the training method...
- vocab = charlm.load_char_vocab(os.path.join(tempdir, vocab_save_name))
- trainer = char_model.CharacterLanguageModelTrainer.from_new_model(args, vocab)
- # ... and test the get_current_lr for an untrained model as well
- # this test is super "eager"
- assert charlm.get_current_lr(trainer, args) == args['lr0']
-
-@pytest.fixture
-def english_forward():
- # eg, stanza_test/models/en/forward_charlm/1billion.pt
- models_path = os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*")
- models = glob.glob(models_path)
- # we expect at least one English model downloaded for the tests
- assert len(models) >= 1
- model_file = models[0]
- return char_model.CharacterLanguageModel.load(model_file)
-
-@pytest.fixture
-def english_backward():
-    # eg, stanza_test/models/en/backward_charlm/1billion.pt
- models_path = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
- models = glob.glob(models_path)
- # we expect at least one English model downloaded for the tests
- assert len(models) >= 1
- model_file = models[0]
- return char_model.CharacterLanguageModel.load(model_file)
-
-def test_load_model(english_forward, english_backward):
- """
- Check that basic loading functions work
- """
- assert english_forward.is_forward_lm
- assert not english_backward.is_forward_lm
-
-def test_save_load_model(english_forward, english_backward):
- """
- Load, save, and load again
- """
- with tempfile.TemporaryDirectory() as tempdir:
- for model in (english_forward, english_backward):
- save_file = os.path.join(tempdir, "resaved", "charlm.pt")
- model.save(save_file)
- reloaded = char_model.CharacterLanguageModel.load(save_file)
- assert model.is_forward_lm == reloaded.is_forward_lm
+
+ vocab = char_model.build_charlm_vocab(tempdir, cutoff=2)
+
+ counts = Counter(fake_text_1) + Counter(fake_text_2)
+ for letter, count in counts.most_common():
+ if count < 2:
+ assert letter not in vocab
+ else:
+ assert letter in vocab
+
+ def test_build_model(self):
+ """
+ Test the whole thing on a small dataset for an iteration or two
+ """
+ with tempfile.TemporaryDirectory() as tempdir:
+ eval_file = os.path.join(tempdir, "en_test.dev.txt")
+ with open(eval_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ train_file = os.path.join(tempdir, "en_test.train.txt")
+ with open(train_file, "w", encoding="utf-8") as fout:
+ for i in range(1000):
+ fout.write(fake_text_1)
+ fout.write("\n")
+ fout.write(fake_text_2)
+ fout.write("\n")
+ save_name = 'en_test.forward.pt'
+ vocab_save_name = 'en_text.vocab.pt'
+ checkpoint_save_name = 'en_text.checkpoint.pt'
+ args = ['--train_file', train_file,
+ '--eval_file', eval_file,
+                '--eval_steps', '0', # eval once per epoch
+ '--epochs', '2',
+ '--cutoff', '1',
+ '--batch_size', '%d' % len(fake_text_1),
+ '--lang', 'en',
+ '--shorthand', 'en_test',
+ '--save_dir', tempdir,
+ '--save_name', save_name,
+ '--vocab_save_name', vocab_save_name,
+ '--checkpoint_save_name', checkpoint_save_name]
+ args = charlm.parse_args(args)
+ charlm.train(args)
+
+ assert os.path.exists(os.path.join(tempdir, vocab_save_name))
+
+ # test that saving & loading of the model worked
+ assert os.path.exists(os.path.join(tempdir, save_name))
+ model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, save_name))
+
+ # test that saving & loading of the checkpoint worked
+ assert os.path.exists(os.path.join(tempdir, checkpoint_save_name))
+ model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, checkpoint_save_name))
+ trainer = char_model.CharacterLanguageModelTrainer.load(args, os.path.join(tempdir, checkpoint_save_name))
+
+ assert trainer.global_step > 0
+ assert trainer.epoch == 2
+
+ # quick test to verify this method works with a trained model
+ charlm.get_current_lr(trainer, args)
+
+ # test loading a vocab built by the training method...
+ vocab = charlm.load_char_vocab(os.path.join(tempdir, vocab_save_name))
+ trainer = char_model.CharacterLanguageModelTrainer.from_new_model(args, vocab)
+ # ... and test the get_current_lr for an untrained model as well
+ # this test is super "eager"
+ assert charlm.get_current_lr(trainer, args) == args['lr0']
+
+ @pytest.fixture(scope="class")
+ def english_forward(self):
+ # eg, stanza_test/models/en/forward_charlm/1billion.pt
+ models_path = os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*")
+ models = glob.glob(models_path)
+ # we expect at least one English model downloaded for the tests
+ assert len(models) >= 1
+ model_file = models[0]
+ return char_model.CharacterLanguageModel.load(model_file)
+
+ @pytest.fixture(scope="class")
+ def english_backward(self):
+        # eg, stanza_test/models/en/backward_charlm/1billion.pt
+ models_path = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
+ models = glob.glob(models_path)
+ # we expect at least one English model downloaded for the tests
+ assert len(models) >= 1
+ model_file = models[0]
+ return char_model.CharacterLanguageModel.load(model_file)
+
+ def test_load_model(self, english_forward, english_backward):
+ """
+ Check that basic loading functions work
+ """
+ assert english_forward.is_forward_lm
+ assert not english_backward.is_forward_lm
+
+ def test_save_load_model(self, english_forward, english_backward):
+ """
+ Load, save, and load again
+ """
+ with tempfile.TemporaryDirectory() as tempdir:
+ for model in (english_forward, english_backward):
+ save_file = os.path.join(tempdir, "resaved", "charlm.pt")
+ model.save(save_file)
+ reloaded = char_model.CharacterLanguageModel.load(save_file)
+ assert model.is_forward_lm == reloaded.is_forward_lm
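
The class-scoped fixtures above rely on pytest discarding the loaded models once the last test in the class has run. A more deterministic teardown, which this commit does not attempt, would be a yield fixture that drops its reference explicitly and then asks PyTorch's caching allocator to release unused blocks. A sketch, assuming the same char_model and TEST_MODELS_DIR imports as the test file (torch.cuda.empty_cache is a no-op on CPU-only builds):

    import gc
    import glob
    import os

    import pytest
    import torch

    @pytest.fixture(scope="class")
    def english_forward():
        models = glob.glob(os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*"))
        model = char_model.CharacterLanguageModel.load(models[0])
        yield model
        # Teardown after the last test in the class: drop our reference,
        # force a collection, then release cached GPU blocks.
        del model
        gc.collect()
        torch.cuda.empty_cache()

Even this is best-effort: if a test stashes the model somewhere that outlives the class, the memory stays allocated.
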
diff --git a/stanza/tests/common/test_data_objects.py b/stanza/tests/common/test_data_objects.py
index 765fb748..42e6cf1b 100644
--- a/stanza/tests/common/test_data_objects.py
+++ b/stanza/tests/common/test_data_objects.py
@@ -16,32 +16,34 @@ EN_DOC_UPOS_XPOS = (('PRON_DT', 'AUX_VBZ', 'DET_DT', 'NOUN_NN', 'NOUN_NN', 'PUNC
EN_DOC2 = "Chris wrote a sentence. Then another."
-def test_readonly():
- Document.add_property('some_property', 123)
+@pytest.fixture(scope="module")
+def nlp_pipeline():
nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
- doc = nlp(EN_DOC)
+ return nlp
+
+def test_readonly(nlp_pipeline):
+ Document.add_property('some_property', 123)
+ doc = nlp_pipeline(EN_DOC)
assert doc.some_property == 123
with pytest.raises(ValueError):
doc.some_property = 456
-def test_getter():
+def test_getter(nlp_pipeline):
Word.add_property('upos_xpos', getter=lambda self: f"{self.upos}_{self.xpos}")
- nlp = stanza.Pipeline(**{'dir': TEST_MODELS_DIR, 'lang': 'en'})
- doc = nlp(EN_DOC)
+ doc = nlp_pipeline(EN_DOC)
assert EN_DOC_UPOS_XPOS == tuple(tuple(word.upos_xpos for word in sentence.words) for sentence in doc.sentences)
-def test_setter_getter():
+def test_setter_getter(nlp_pipeline):
int2str = {0: 'ok', 1: 'good', 2: 'bad'}
str2int = {'ok': 0, 'good': 1, 'bad': 2}
def setter(self, value):
self._classname = str2int[value]
Sentence.add_property('classname', getter=lambda self: int2str[self._classname] if self._classname is not None else None, setter=setter)
- nlp = stanza.Pipeline(**{'dir': TEST_MODELS_DIR, 'lang': 'en'})
- doc = nlp(EN_DOC)
+ doc = nlp_pipeline(EN_DOC)
sentence = doc.sentences[0]
sentence.classname = 'good'
assert sentence._classname == 1
@@ -50,9 +52,8 @@ def test_setter_getter():
sentence._classname = 2
assert sentence.classname == 'bad'
-def test_backpointer():
- nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
- doc = nlp(EN_DOC2)
+def test_backpointer(nlp_pipeline):
+ doc = nlp_pipeline(EN_DOC2)
ent = doc.ents[0]
assert ent.sent is doc.sentences[0]
assert list(doc.iter_words())[0].sent is doc.sentences[0]
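
A note on why sharing one Pipeline across these tests is safe: each call re-annotates its input from scratch and returns a fresh Document, so the tests share model weights but not per-document state. An illustrative check (not part of the commit):

    def test_documents_are_independent(nlp_pipeline):
        # The pipeline object is shared module-wide, but every call produces
        # a new Document, so mutations made by one test cannot leak into another.
        doc_a = nlp_pipeline(EN_DOC)
        doc_b = nlp_pipeline(EN_DOC)
        assert doc_a is not doc_b

The add_property calls are different: they mutate the Document, Word, and Sentence classes themselves, so their effects persist for the whole test process; that is tolerable here because each property is added only once.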