github.com/stanfordnlp/stanza.git
author    John Bauer <horatio@gmail.com>  2022-09-14 05:19:49 +0300
committer John Bauer <horatio@gmail.com>  2022-09-14 05:20:10 +0300
commit    def8cb86e61bfb178f75de12fa189ee57f0a9cc7 (patch)
tree      9d2e7b87162b1ae1f2cd549a40d2cb5db1547114
parent    a74678e36c2ecb933016620ebfd2a8ab50bd55c3 (diff)
Turn some pipelines getting built over and over into fixtures. Will make them take up less GPU memory, even if the cleanup isn't reliable
-rw-r--r--  stanza/tests/common/test_char_model.py   | 319
-rw-r--r--  stanza/tests/common/test_data_objects.py |  25
2 files changed, 173 insertions, 171 deletions
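
The change below applies a standard pytest technique: objects that are expensive to construct (here, character language models and full stanza pipelines) become scoped fixtures, so pytest builds them once per scope and hands the cached instance to every test that requests it, instead of rebuilding per test. A minimal sketch of the idea; the names are illustrative and not taken from the stanza code:

import pytest

class ExpensiveModel:
    # Stand-in for something costly to load, e.g. a model held on the GPU.
    pass

@pytest.fixture(scope="module")
def shared_model():
    # Constructed once for the whole module and cached by pytest; a default
    # function-scoped fixture would rebuild it for every single test.
    return ExpensiveModel()

def test_first(shared_model):
    assert shared_model is not None

def test_second(shared_model):
    # Receives the exact same cached object as test_first.
    assert shared_model is not None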
diff --git a/stanza/tests/common/test_char_model.py b/stanza/tests/common/test_char_model.py
index 5ab080e5..30b34851 100644
--- a/stanza/tests/common/test_char_model.py
+++ b/stanza/tests/common/test_char_model.py
@@ -27,164 +27,165 @@ fake_text_2 = """
This is plastic cheese
"""
-def test_single_file_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "text.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- vocab = char_model.build_charlm_vocab(sample_file)
-
- for i in fake_text_1:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_single_file_xz_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "text.txt.xz")
- with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- vocab = char_model.build_charlm_vocab(sample_file)
-
- for i in fake_text_1:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_single_file_dir_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "text.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- vocab = char_model.build_charlm_vocab(tempdir)
-
- for i in fake_text_1:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_multiple_files_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "t1.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- sample_file = os.path.join(tempdir, "t2.txt.xz")
- with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
- fout.write(fake_text_2)
- vocab = char_model.build_charlm_vocab(tempdir)
-
- for i in fake_text_1:
- assert i in vocab
- for i in fake_text_2:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_cutoff_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "t1.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- sample_file = os.path.join(tempdir, "t2.txt.xz")
- with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
- fout.write(fake_text_2)
-
- vocab = char_model.build_charlm_vocab(tempdir, cutoff=2)
-
- counts = Counter(fake_text_1) + Counter(fake_text_2)
- for letter, count in counts.most_common():
- if count < 2:
- assert letter not in vocab
- else:
- assert letter in vocab
-
-def test_build_model():
- """
- Test the whole thing on a small dataset for an iteration or two
- """
- with tempfile.TemporaryDirectory() as tempdir:
- eval_file = os.path.join(tempdir, "en_test.dev.txt")
- with open(eval_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- train_file = os.path.join(tempdir, "en_test.train.txt")
- with open(train_file, "w", encoding="utf-8") as fout:
- for i in range(1000):
+class TestCharModel:
+ def test_single_file_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "text.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
fout.write(fake_text_1)
- fout.write("\n")
+ vocab = char_model.build_charlm_vocab(sample_file)
+
+ for i in fake_text_1:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_single_file_xz_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "text.txt.xz")
+ with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ vocab = char_model.build_charlm_vocab(sample_file)
+
+ for i in fake_text_1:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_single_file_dir_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "text.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ vocab = char_model.build_charlm_vocab(tempdir)
+
+ for i in fake_text_1:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_multiple_files_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "t1.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ sample_file = os.path.join(tempdir, "t2.txt.xz")
+ with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
+ fout.write(fake_text_2)
+ vocab = char_model.build_charlm_vocab(tempdir)
+
+ for i in fake_text_1:
+ assert i in vocab
+ for i in fake_text_2:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_cutoff_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "t1.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ sample_file = os.path.join(tempdir, "t2.txt.xz")
+ with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
fout.write(fake_text_2)
- fout.write("\n")
- save_name = 'en_test.forward.pt'
- vocab_save_name = 'en_text.vocab.pt'
- checkpoint_save_name = 'en_text.checkpoint.pt'
- args = ['--train_file', train_file,
- '--eval_file', eval_file,
- '--eval_steps', '0', # eval once per epoch
- '--epochs', '2',
- '--cutoff', '1',
- '--batch_size', '%d' % len(fake_text_1),
- '--lang', 'en',
- '--shorthand', 'en_test',
- '--save_dir', tempdir,
- '--save_name', save_name,
- '--vocab_save_name', vocab_save_name,
- '--checkpoint_save_name', checkpoint_save_name]
- args = charlm.parse_args(args)
- charlm.train(args)
-
- assert os.path.exists(os.path.join(tempdir, vocab_save_name))
-
- # test that saving & loading of the model worked
- assert os.path.exists(os.path.join(tempdir, save_name))
- model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, save_name))
-
- # test that saving & loading of the checkpoint worked
- assert os.path.exists(os.path.join(tempdir, checkpoint_save_name))
- model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, checkpoint_save_name))
- trainer = char_model.CharacterLanguageModelTrainer.load(args, os.path.join(tempdir, checkpoint_save_name))
-
- assert trainer.global_step > 0
- assert trainer.epoch == 2
-
- # quick test to verify this method works with a trained model
- charlm.get_current_lr(trainer, args)
-
- # test loading a vocab built by the training method...
- vocab = charlm.load_char_vocab(os.path.join(tempdir, vocab_save_name))
- trainer = char_model.CharacterLanguageModelTrainer.from_new_model(args, vocab)
- # ... and test the get_current_lr for an untrained model as well
- # this test is super "eager"
- assert charlm.get_current_lr(trainer, args) == args['lr0']
-
-@pytest.fixture
-def english_forward():
- # eg, stanza_test/models/en/forward_charlm/1billion.pt
- models_path = os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*")
- models = glob.glob(models_path)
- # we expect at least one English model downloaded for the tests
- assert len(models) >= 1
- model_file = models[0]
- return char_model.CharacterLanguageModel.load(model_file)
-
-@pytest.fixture
-def english_backward():
- # eg, stanza_test/models/en/backward_charlm/1billion.pt
- models_path = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
- models = glob.glob(models_path)
- # we expect at least one English model downloaded for the tests
- assert len(models) >= 1
- model_file = models[0]
- return char_model.CharacterLanguageModel.load(model_file)
-
-def test_load_model(english_forward, english_backward):
- """
- Check that basic loading functions work
- """
- assert english_forward.is_forward_lm
- assert not english_backward.is_forward_lm
-
-def test_save_load_model(english_forward, english_backward):
- """
- Load, save, and load again
- """
- with tempfile.TemporaryDirectory() as tempdir:
- for model in (english_forward, english_backward):
- save_file = os.path.join(tempdir, "resaved", "charlm.pt")
- model.save(save_file)
- reloaded = char_model.CharacterLanguageModel.load(save_file)
- assert model.is_forward_lm == reloaded.is_forward_lm
+
+ vocab = char_model.build_charlm_vocab(tempdir, cutoff=2)
+
+ counts = Counter(fake_text_1) + Counter(fake_text_2)
+ for letter, count in counts.most_common():
+ if count < 2:
+ assert letter not in vocab
+ else:
+ assert letter in vocab
+
+ def test_build_model(self):
+ """
+ Test the whole thing on a small dataset for an iteration or two
+ """
+ with tempfile.TemporaryDirectory() as tempdir:
+ eval_file = os.path.join(tempdir, "en_test.dev.txt")
+ with open(eval_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ train_file = os.path.join(tempdir, "en_test.train.txt")
+ with open(train_file, "w", encoding="utf-8") as fout:
+ for i in range(1000):
+ fout.write(fake_text_1)
+ fout.write("\n")
+ fout.write(fake_text_2)
+ fout.write("\n")
+ save_name = 'en_test.forward.pt'
+ vocab_save_name = 'en_text.vocab.pt'
+ checkpoint_save_name = 'en_text.checkpoint.pt'
+ args = ['--train_file', train_file,
+ '--eval_file', eval_file,
+ '--eval_steps', '0', # eval once per epoch
+ '--epochs', '2',
+ '--cutoff', '1',
+ '--batch_size', '%d' % len(fake_text_1),
+ '--lang', 'en',
+ '--shorthand', 'en_test',
+ '--save_dir', tempdir,
+ '--save_name', save_name,
+ '--vocab_save_name', vocab_save_name,
+ '--checkpoint_save_name', checkpoint_save_name]
+ args = charlm.parse_args(args)
+ charlm.train(args)
+
+ assert os.path.exists(os.path.join(tempdir, vocab_save_name))
+
+ # test that saving & loading of the model worked
+ assert os.path.exists(os.path.join(tempdir, save_name))
+ model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, save_name))
+
+ # test that saving & loading of the checkpoint worked
+ assert os.path.exists(os.path.join(tempdir, checkpoint_save_name))
+ model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, checkpoint_save_name))
+ trainer = char_model.CharacterLanguageModelTrainer.load(args, os.path.join(tempdir, checkpoint_save_name))
+
+ assert trainer.global_step > 0
+ assert trainer.epoch == 2
+
+ # quick test to verify this method works with a trained model
+ charlm.get_current_lr(trainer, args)
+
+ # test loading a vocab built by the training method...
+ vocab = charlm.load_char_vocab(os.path.join(tempdir, vocab_save_name))
+ trainer = char_model.CharacterLanguageModelTrainer.from_new_model(args, vocab)
+ # ... and test the get_current_lr for an untrained model as well
+ # this test is super "eager"
+ assert charlm.get_current_lr(trainer, args) == args['lr0']
+
+ @pytest.fixture(scope="class")
+ def english_forward(self):
+ # eg, stanza_test/models/en/forward_charlm/1billion.pt
+ models_path = os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*")
+ models = glob.glob(models_path)
+ # we expect at least one English model downloaded for the tests
+ assert len(models) >= 1
+ model_file = models[0]
+ return char_model.CharacterLanguageModel.load(model_file)
+
+ @pytest.fixture(scope="class")
+ def english_backward(self):
+ # eg, stanza_test/models/en/backward_charlm/1billion.pt
+ models_path = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
+ models = glob.glob(models_path)
+ # we expect at least one English model downloaded for the tests
+ assert len(models) >= 1
+ model_file = models[0]
+ return char_model.CharacterLanguageModel.load(model_file)
+
+ def test_load_model(self, english_forward, english_backward):
+ """
+ Check that basic loading functions work
+ """
+ assert english_forward.is_forward_lm
+ assert not english_backward.is_forward_lm
+
+ def test_save_load_model(self, english_forward, english_backward):
+ """
+ Load, save, and load again
+ """
+ with tempfile.TemporaryDirectory() as tempdir:
+ for model in (english_forward, english_backward):
+ save_file = os.path.join(tempdir, "resaved", "charlm.pt")
+ model.save(save_file)
+ reloaded = char_model.CharacterLanguageModel.load(save_file)
+ assert model.is_forward_lm == reloaded.is_forward_lm
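
In the rewritten test_char_model.py, english_forward and english_backward are declared with scope="class" inside TestCharModel, so each model is loaded once and shared by test_load_model and test_save_load_model. A self-contained sketch (illustrative, not from the diff) showing that a class-scoped fixture runs only once per class:

import pytest

class TestSharedFixture:
    build_count = 0

    @pytest.fixture(scope="class")
    def model(self):
        # Incremented once per class, however many tests use the fixture.
        type(self).build_count += 1
        return object()

    def test_first_use(self, model):
        assert TestSharedFixture.build_count == 1

    def test_second_use(self, model):
        # Still 1: pytest reused the cached fixture value.
        assert TestSharedFixture.build_count == 1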
diff --git a/stanza/tests/common/test_data_objects.py b/stanza/tests/common/test_data_objects.py
index 765fb748..42e6cf1b 100644
--- a/stanza/tests/common/test_data_objects.py
+++ b/stanza/tests/common/test_data_objects.py
@@ -16,32 +16,34 @@ EN_DOC_UPOS_XPOS = (('PRON_DT', 'AUX_VBZ', 'DET_DT', 'NOUN_NN', 'NOUN_NN', 'PUNC
EN_DOC2 = "Chris wrote a sentence. Then another."
-def test_readonly():
- Document.add_property('some_property', 123)
+@pytest.fixture(scope="module")
+def nlp_pipeline():
nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
- doc = nlp(EN_DOC)
+ return nlp
+
+def test_readonly(nlp_pipeline):
+ Document.add_property('some_property', 123)
+ doc = nlp_pipeline(EN_DOC)
assert doc.some_property == 123
with pytest.raises(ValueError):
doc.some_property = 456
-def test_getter():
+def test_getter(nlp_pipeline):
Word.add_property('upos_xpos', getter=lambda self: f"{self.upos}_{self.xpos}")
- nlp = stanza.Pipeline(**{'dir': TEST_MODELS_DIR, 'lang': 'en'})
- doc = nlp(EN_DOC)
+ doc = nlp_pipeline(EN_DOC)
assert EN_DOC_UPOS_XPOS == tuple(tuple(word.upos_xpos for word in sentence.words) for sentence in doc.sentences)
-def test_setter_getter():
+def test_setter_getter(nlp_pipeline):
int2str = {0: 'ok', 1: 'good', 2: 'bad'}
str2int = {'ok': 0, 'good': 1, 'bad': 2}
def setter(self, value):
self._classname = str2int[value]
Sentence.add_property('classname', getter=lambda self: int2str[self._classname] if self._classname is not None else None, setter=setter)
- nlp = stanza.Pipeline(**{'dir': TEST_MODELS_DIR, 'lang': 'en'})
- doc = nlp(EN_DOC)
+ doc = nlp_pipeline(EN_DOC)
sentence = doc.sentences[0]
sentence.classname = 'good'
assert sentence._classname == 1
@@ -50,9 +52,8 @@ def test_setter_getter():
sentence._classname = 2
assert sentence.classname == 'bad'
-def test_backpointer():
- nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
- doc = nlp(EN_DOC2)
+def test_backpointer(nlp_pipeline):
+ doc = nlp_pipeline(EN_DOC2)
ent = doc.ents[0]
assert ent.sent is doc.sentences[0]
assert list(doc.iter_words())[0].sent is doc.sentences[0]
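
The commit message's caveat that "the cleanup isn't reliable" refers to releasing these cached objects once their scope ends: dropping the last Python reference does not guarantee the GPU memory is returned promptly. A hedged sketch of a best-effort teardown, assuming a torch backend and an installed English model; this teardown code is not part of the commit:

import pytest

@pytest.fixture(scope="module")
def pipeline():
    import stanza  # assumes stanza and an English model are available
    nlp = stanza.Pipeline(lang="en")
    yield nlp
    # Best-effort teardown: drop the reference and ask torch to release
    # its cached GPU memory; neither step is guaranteed to free everything.
    del nlp
    try:
        import torch
        torch.cuda.empty_cache()
    except ImportError:
        pass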