author     John Bauer <horatio@gmail.com>  2022-09-14 05:19:49 +0300
committer  John Bauer <horatio@gmail.com>  2022-09-14 05:20:10 +0300
commit     def8cb86e61bfb178f75de12fa189ee57f0a9cc7 (patch)
tree       9d2e7b87162b1ae1f2cd549a40d2cb5db1547114
parent     a74678e36c2ecb933016620ebfd2a8ab50bd55c3 (diff)
Turn some pipelines that were getting built over and over into fixtures. This will make them take up less GPU memory, even if the cleanup isn't reliable.
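The pattern applied in this commit is pytest's scoped fixtures: a fixture declared with scope="class" or scope="module" is built once, cached by pytest, and handed to every test that requests it, instead of each test constructing its own copy of an expensive object. A minimal sketch of the idea, assuming a downloaded English model; the expensive_pipeline name is illustrative and not part of this commit:

import pytest
import stanza

@pytest.fixture(scope="module")
def expensive_pipeline():
    # Constructed once per test module and cached by pytest;
    # without the fixture, each test would build its own Pipeline,
    # holding duplicate copies of the model weights in (GPU) memory.
    return stanza.Pipeline(lang="en")

def test_short_doc(expensive_pipeline):
    doc = expensive_pipeline("This is a test.")
    assert len(doc.sentences) == 1

def test_two_sentences(expensive_pipeline):
    # Receives the same cached Pipeline object as the test above
    doc = expensive_pipeline("Chris wrote a sentence. Then another.")
    assert len(doc.sentences) == 2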
-rw-r--r--  stanza/tests/common/test_char_model.py    | 319
-rw-r--r--  stanza/tests/common/test_data_objects.py  |  25
2 files changed, 173 insertions(+), 171 deletions(-)
diff --git a/stanza/tests/common/test_char_model.py b/stanza/tests/common/test_char_model.py
index 5ab080e5..30b34851 100644
--- a/stanza/tests/common/test_char_model.py
+++ b/stanza/tests/common/test_char_model.py
@@ -27,164 +27,165 @@ fake_text_2 = """
 This is plastic cheese
 """
 
-def test_single_file_vocab():
-    with tempfile.TemporaryDirectory() as tempdir:
-        sample_file = os.path.join(tempdir, "text.txt")
-        with open(sample_file, "w", encoding="utf-8") as fout:
-            fout.write(fake_text_1)
-        vocab = char_model.build_charlm_vocab(sample_file)
-
-        for i in fake_text_1:
-            assert i in vocab
-        assert "Q" not in vocab
-
-def test_single_file_xz_vocab():
-    with tempfile.TemporaryDirectory() as tempdir:
-        sample_file = os.path.join(tempdir, "text.txt.xz")
-        with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
-            fout.write(fake_text_1)
-        vocab = char_model.build_charlm_vocab(sample_file)
-
-        for i in fake_text_1:
-            assert i in vocab
-        assert "Q" not in vocab
-
-def test_single_file_dir_vocab():
-    with tempfile.TemporaryDirectory() as tempdir:
-        sample_file = os.path.join(tempdir, "text.txt")
-        with open(sample_file, "w", encoding="utf-8") as fout:
-            fout.write(fake_text_1)
-        vocab = char_model.build_charlm_vocab(tempdir)
-
-        for i in fake_text_1:
-            assert i in vocab
-        assert "Q" not in vocab
-
-def test_multiple_files_vocab():
-    with tempfile.TemporaryDirectory() as tempdir:
-        sample_file = os.path.join(tempdir, "t1.txt")
-        with open(sample_file, "w", encoding="utf-8") as fout:
-            fout.write(fake_text_1)
-        sample_file = os.path.join(tempdir, "t2.txt.xz")
-        with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
-            fout.write(fake_text_2)
-        vocab = char_model.build_charlm_vocab(tempdir)
-
-        for i in fake_text_1:
-            assert i in vocab
-        for i in fake_text_2:
-            assert i in vocab
-        assert "Q" not in vocab
-
-def test_cutoff_vocab():
-    with tempfile.TemporaryDirectory() as tempdir:
-        sample_file = os.path.join(tempdir, "t1.txt")
-        with open(sample_file, "w", encoding="utf-8") as fout:
-            fout.write(fake_text_1)
-        sample_file = os.path.join(tempdir, "t2.txt.xz")
-        with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
-            fout.write(fake_text_2)
-
-        vocab = char_model.build_charlm_vocab(tempdir, cutoff=2)
-
-        counts = Counter(fake_text_1) + Counter(fake_text_2)
-        for letter, count in counts.most_common():
-            if count < 2:
-                assert letter not in vocab
-            else:
-                assert letter in vocab
-
-def test_build_model():
-    """
-    Test the whole thing on a small dataset for an iteration or two
-    """
-    with tempfile.TemporaryDirectory() as tempdir:
-        eval_file = os.path.join(tempdir, "en_test.dev.txt")
-        with open(eval_file, "w", encoding="utf-8") as fout:
-            fout.write(fake_text_1)
-        train_file = os.path.join(tempdir, "en_test.train.txt")
-        with open(train_file, "w", encoding="utf-8") as fout:
-            for i in range(1000):
+class TestCharModel:
+    def test_single_file_vocab(self):
+        with tempfile.TemporaryDirectory() as tempdir:
+            sample_file = os.path.join(tempdir, "text.txt")
+            with open(sample_file, "w", encoding="utf-8") as fout:
                 fout.write(fake_text_1)
-                fout.write("\n")
+            vocab = char_model.build_charlm_vocab(sample_file)
+
+            for i in fake_text_1:
+                assert i in vocab
+            assert "Q" not in vocab
+
+    def test_single_file_xz_vocab(self):
+        with tempfile.TemporaryDirectory() as tempdir:
+            sample_file = os.path.join(tempdir, "text.txt.xz")
+            with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
+                fout.write(fake_text_1)
+            vocab = char_model.build_charlm_vocab(sample_file)
+
+            for i in fake_text_1:
+                assert i in vocab
+            assert "Q" not in vocab
+
+    def test_single_file_dir_vocab(self):
+        with tempfile.TemporaryDirectory() as tempdir:
+            sample_file = os.path.join(tempdir, "text.txt")
+            with open(sample_file, "w", encoding="utf-8") as fout:
+                fout.write(fake_text_1)
+            vocab = char_model.build_charlm_vocab(tempdir)
+
+            for i in fake_text_1:
+                assert i in vocab
+            assert "Q" not in vocab
+
+    def test_multiple_files_vocab(self):
+        with tempfile.TemporaryDirectory() as tempdir:
+            sample_file = os.path.join(tempdir, "t1.txt")
+            with open(sample_file, "w", encoding="utf-8") as fout:
+                fout.write(fake_text_1)
+            sample_file = os.path.join(tempdir, "t2.txt.xz")
+            with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
+                fout.write(fake_text_2)
+            vocab = char_model.build_charlm_vocab(tempdir)
+
+            for i in fake_text_1:
+                assert i in vocab
+            for i in fake_text_2:
+                assert i in vocab
+            assert "Q" not in vocab
+
+    def test_cutoff_vocab(self):
+        with tempfile.TemporaryDirectory() as tempdir:
+            sample_file = os.path.join(tempdir, "t1.txt")
+            with open(sample_file, "w", encoding="utf-8") as fout:
+                fout.write(fake_text_1)
+            sample_file = os.path.join(tempdir, "t2.txt.xz")
+            with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
                 fout.write(fake_text_2)
-                fout.write("\n")
-        save_name = 'en_test.forward.pt'
-        vocab_save_name = 'en_text.vocab.pt'
-        checkpoint_save_name = 'en_text.checkpoint.pt'
-        args = ['--train_file', train_file,
-                '--eval_file', eval_file,
-                '--eval_steps', '0', # eval once per opoch
-                '--epochs', '2',
-                '--cutoff', '1',
-                '--batch_size', '%d' % len(fake_text_1),
-                '--lang', 'en',
-                '--shorthand', 'en_test',
-                '--save_dir', tempdir,
-                '--save_name', save_name,
-                '--vocab_save_name', vocab_save_name,
-                '--checkpoint_save_name', checkpoint_save_name]
-        args = charlm.parse_args(args)
-        charlm.train(args)
-
-        assert os.path.exists(os.path.join(tempdir, vocab_save_name))
-
-        # test that saving & loading of the model worked
-        assert os.path.exists(os.path.join(tempdir, save_name))
-        model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, save_name))
-
-        # test that saving & loading of the checkpoint worked
-        assert os.path.exists(os.path.join(tempdir, checkpoint_save_name))
-        model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, checkpoint_save_name))
-        trainer = char_model.CharacterLanguageModelTrainer.load(args, os.path.join(tempdir, checkpoint_save_name))
-
-        assert trainer.global_step > 0
-        assert trainer.epoch == 2
-
-        # quick test to verify this method works with a trained model
-        charlm.get_current_lr(trainer, args)
-
-        # test loading a vocab built by the training method...
-        vocab = charlm.load_char_vocab(os.path.join(tempdir, vocab_save_name))
-        trainer = char_model.CharacterLanguageModelTrainer.from_new_model(args, vocab)
-        # ... and test the get_current_lr for an untrained model as well
-        # this test is super "eager"
-        assert charlm.get_current_lr(trainer, args) == args['lr0']
-
-@pytest.fixture
-def english_forward():
-    # eg, stanza_test/models/en/forward_charlm/1billion.pt
-    models_path = os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*")
-    models = glob.glob(models_path)
-    # we expect at least one English model downloaded for the tests
-    assert len(models) >= 1
-    model_file = models[0]
-    return char_model.CharacterLanguageModel.load(model_file)
-
-@pytest.fixture
-def english_backward():
-    # eg, stanza_test/models/en/forward_charlm/1billion.pt
-    models_path = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
-    models = glob.glob(models_path)
-    # we expect at least one English model downloaded for the tests
-    assert len(models) >= 1
-    model_file = models[0]
-    return char_model.CharacterLanguageModel.load(model_file)
-
-def test_load_model(english_forward, english_backward):
-    """
-    Check that basic loading functions work
-    """
-    assert english_forward.is_forward_lm
-    assert not english_backward.is_forward_lm
-
-def test_save_load_model(english_forward, english_backward):
-    """
-    Load, save, and load again
-    """
-    with tempfile.TemporaryDirectory() as tempdir:
-        for model in (english_forward, english_backward):
-            save_file = os.path.join(tempdir, "resaved", "charlm.pt")
-            model.save(save_file)
-            reloaded = char_model.CharacterLanguageModel.load(save_file)
-            assert model.is_forward_lm == reloaded.is_forward_lm
+
+            vocab = char_model.build_charlm_vocab(tempdir, cutoff=2)
+
+            counts = Counter(fake_text_1) + Counter(fake_text_2)
+            for letter, count in counts.most_common():
+                if count < 2:
+                    assert letter not in vocab
+                else:
+                    assert letter in vocab
+
+    def test_build_model(self):
+        """
+        Test the whole thing on a small dataset for an iteration or two
+        """
+        with tempfile.TemporaryDirectory() as tempdir:
+            eval_file = os.path.join(tempdir, "en_test.dev.txt")
+            with open(eval_file, "w", encoding="utf-8") as fout:
+                fout.write(fake_text_1)
+            train_file = os.path.join(tempdir, "en_test.train.txt")
+            with open(train_file, "w", encoding="utf-8") as fout:
+                for i in range(1000):
+                    fout.write(fake_text_1)
+                    fout.write("\n")
+                    fout.write(fake_text_2)
+                    fout.write("\n")
+            save_name = 'en_test.forward.pt'
+            vocab_save_name = 'en_text.vocab.pt'
+            checkpoint_save_name = 'en_text.checkpoint.pt'
+            args = ['--train_file', train_file,
+                    '--eval_file', eval_file,
+                    '--eval_steps', '0', # eval once per opoch
+                    '--epochs', '2',
+                    '--cutoff', '1',
+                    '--batch_size', '%d' % len(fake_text_1),
+                    '--lang', 'en',
+                    '--shorthand', 'en_test',
+                    '--save_dir', tempdir,
+                    '--save_name', save_name,
+                    '--vocab_save_name', vocab_save_name,
+                    '--checkpoint_save_name', checkpoint_save_name]
+            args = charlm.parse_args(args)
+            charlm.train(args)
+
+            assert os.path.exists(os.path.join(tempdir, vocab_save_name))
+
+            # test that saving & loading of the model worked
+            assert os.path.exists(os.path.join(tempdir, save_name))
+            model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, save_name))
+
+            # test that saving & loading of the checkpoint worked
+            assert os.path.exists(os.path.join(tempdir, checkpoint_save_name))
+            model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, checkpoint_save_name))
+            trainer = char_model.CharacterLanguageModelTrainer.load(args, os.path.join(tempdir, checkpoint_save_name))
+
+            assert trainer.global_step > 0
+            assert trainer.epoch == 2
+
+            # quick test to verify this method works with a trained model
+            charlm.get_current_lr(trainer, args)
+
+            # test loading a vocab built by the training method...
+            vocab = charlm.load_char_vocab(os.path.join(tempdir, vocab_save_name))
+            trainer = char_model.CharacterLanguageModelTrainer.from_new_model(args, vocab)
+            # ... and test the get_current_lr for an untrained model as well
+            # this test is super "eager"
+            assert charlm.get_current_lr(trainer, args) == args['lr0']
+
+    @pytest.fixture(scope="class")
+    def english_forward(self):
+        # eg, stanza_test/models/en/forward_charlm/1billion.pt
+        models_path = os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*")
+        models = glob.glob(models_path)
+        # we expect at least one English model downloaded for the tests
+        assert len(models) >= 1
+        model_file = models[0]
+        return char_model.CharacterLanguageModel.load(model_file)
+
+    @pytest.fixture(scope="class")
+    def english_backward(self):
+        # eg, stanza_test/models/en/forward_charlm/1billion.pt
+        models_path = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
+        models = glob.glob(models_path)
+        # we expect at least one English model downloaded for the tests
+        assert len(models) >= 1
+        model_file = models[0]
+        return char_model.CharacterLanguageModel.load(model_file)
+
+    def test_load_model(self, english_forward, english_backward):
+        """
+        Check that basic loading functions work
+        """
+        assert english_forward.is_forward_lm
+        assert not english_backward.is_forward_lm
+
+    def test_save_load_model(self, english_forward, english_backward):
+        """
+        Load, save, and load again
+        """
+        with tempfile.TemporaryDirectory() as tempdir:
+            for model in (english_forward, english_backward):
+                save_file = os.path.join(tempdir, "resaved", "charlm.pt")
+                model.save(save_file)
+                reloaded = char_model.CharacterLanguageModel.load(save_file)
+                assert model.is_forward_lm == reloaded.is_forward_lm
diff --git a/stanza/tests/common/test_data_objects.py b/stanza/tests/common/test_data_objects.py
index 765fb748..42e6cf1b 100644
--- a/stanza/tests/common/test_data_objects.py
+++ b/stanza/tests/common/test_data_objects.py
@@ -16,32 +16,34 @@ EN_DOC_UPOS_XPOS = (('PRON_DT', 'AUX_VBZ', 'DET_DT', 'NOUN_NN', 'NOUN_NN', 'PUNC
 
 EN_DOC2 = "Chris wrote a sentence. Then another."
 
-def test_readonly():
-    Document.add_property('some_property', 123)
+@pytest.fixture(scope="module")
+def nlp_pipeline():
     nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
-    doc = nlp(EN_DOC)
+    return nlp
+
+def test_readonly(nlp_pipeline):
+    Document.add_property('some_property', 123)
+    doc = nlp_pipeline(EN_DOC)
     assert doc.some_property == 123
     with pytest.raises(ValueError):
         doc.some_property = 456
 
-def test_getter():
+def test_getter(nlp_pipeline):
     Word.add_property('upos_xpos', getter=lambda self: f"{self.upos}_{self.xpos}")
-    nlp = stanza.Pipeline(**{'dir': TEST_MODELS_DIR, 'lang': 'en'})
-    doc = nlp(EN_DOC)
+    doc = nlp_pipeline(EN_DOC)
     assert EN_DOC_UPOS_XPOS == tuple(tuple(word.upos_xpos for word in sentence.words) for sentence in doc.sentences)
 
-def test_setter_getter():
+def test_setter_getter(nlp_pipeline):
     int2str = {0: 'ok', 1: 'good', 2: 'bad'}
     str2int = {'ok': 0, 'good': 1, 'bad': 2}
     def setter(self, value):
         self._classname = str2int[value]
     Sentence.add_property('classname',
                           getter=lambda self: int2str[self._classname] if self._classname is not None else None,
                           setter=setter)
-    nlp = stanza.Pipeline(**{'dir': TEST_MODELS_DIR, 'lang': 'en'})
-    doc = nlp(EN_DOC)
+    doc = nlp_pipeline(EN_DOC)
     sentence = doc.sentences[0]
     sentence.classname = 'good'
     assert sentence._classname == 1
@@ -50,9 +52,8 @@ def test_setter_getter():
     sentence._classname = 2
     assert sentence.classname == 'bad'
 
-def test_backpointer():
-    nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
-    doc = nlp(EN_DOC2)
+def test_backpointer(nlp_pipeline):
+    doc = nlp_pipeline(EN_DOC2)
     ent = doc.ents[0]
     assert ent.sent is doc.sentences[0]
     assert list(doc.iter_words())[0].sent is doc.sentences[0]
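On the caveat that "the cleanup isn't reliable": pytest only drops its cached reference to a fixture's value when the fixture's scope ends, and even then PyTorch's caching allocator may hold on to the freed GPU blocks. A yield-style fixture can at least make the teardown explicit. This is a sketch of one option under those assumptions, not something this commit does:

import gc

import pytest
import stanza
import torch

@pytest.fixture(scope="module")
def nlp_pipeline():
    nlp = stanza.Pipeline(lang="en")
    yield nlp
    # Teardown runs after the last test in the module: drop the
    # reference, collect garbage, and ask torch to return cached
    # blocks to the driver. None of this guarantees the memory is
    # released immediately.
    del nlp
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()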