github.com/stanfordnlp/stanza.git

author    John Bauer <horatio@gmail.com>  2022-09-14 05:19:49 +0300
committer John Bauer <horatio@gmail.com>  2022-09-14 05:20:10 +0300
commit    def8cb86e61bfb178f75de12fa189ee57f0a9cc7 (patch)
tree      9d2e7b87162b1ae1f2cd549a40d2cb5db1547114 /stanza
parent    a74678e36c2ecb933016620ebfd2a8ab50bd55c3 (diff)
Turn some pipelines that were getting built over and over into fixtures. This will make them take up less GPU memory, even if the cleanup isn't reliable.
Diffstat (limited to 'stanza')
-rw-r--r--  stanza/tests/common/test_char_model.py   | 319
-rw-r--r--  stanza/tests/common/test_data_objects.py |  25
2 files changed, 173 insertions(+), 171 deletions(-)
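
The commit applies pytest's fixture scoping: a fixture declared with scope="class" or scope="module" is constructed once and shared by every test in that class or module, rather than rebuilt per test, so only one copy of the model weights sits in GPU memory at a time. A minimal sketch of the pattern, using the same stanza API as the tests below (the test body is illustrative, not part of the commit):

    import pytest
    import stanza

    # TEST_MODELS_DIR is the models directory used throughout stanza's tests;
    # the exact import path is assumed here.
    from stanza.tests import TEST_MODELS_DIR

    @pytest.fixture(scope="module")
    def nlp_pipeline():
        # Built once for the whole module; every test that names this fixture
        # as a parameter receives the same Pipeline object.
        return stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')

    def test_uses_shared_pipeline(nlp_pipeline):
        doc = nlp_pipeline("This is a sentence.")
        assert len(doc.sentences) == 1

The "cleanup isn't reliable" caveat in the message reflects that pytest merely drops its reference to the fixture value when the scope ends; whether GPU memory is actually returned at that point depends on garbage collection and, for PyTorch, on the caching allocator.
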
diff --git a/stanza/tests/common/test_char_model.py b/stanza/tests/common/test_char_model.py
index 5ab080e5..30b34851 100644
--- a/stanza/tests/common/test_char_model.py
+++ b/stanza/tests/common/test_char_model.py
@@ -27,164 +27,165 @@ fake_text_2 = """
This is plastic cheese
"""
-def test_single_file_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "text.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- vocab = char_model.build_charlm_vocab(sample_file)
-
- for i in fake_text_1:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_single_file_xz_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "text.txt.xz")
- with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- vocab = char_model.build_charlm_vocab(sample_file)
-
- for i in fake_text_1:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_single_file_dir_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "text.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- vocab = char_model.build_charlm_vocab(tempdir)
-
- for i in fake_text_1:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_multiple_files_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "t1.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- sample_file = os.path.join(tempdir, "t2.txt.xz")
- with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
- fout.write(fake_text_2)
- vocab = char_model.build_charlm_vocab(tempdir)
-
- for i in fake_text_1:
- assert i in vocab
- for i in fake_text_2:
- assert i in vocab
- assert "Q" not in vocab
-
-def test_cutoff_vocab():
- with tempfile.TemporaryDirectory() as tempdir:
- sample_file = os.path.join(tempdir, "t1.txt")
- with open(sample_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- sample_file = os.path.join(tempdir, "t2.txt.xz")
- with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
- fout.write(fake_text_2)
-
- vocab = char_model.build_charlm_vocab(tempdir, cutoff=2)
-
- counts = Counter(fake_text_1) + Counter(fake_text_2)
- for letter, count in counts.most_common():
- if count < 2:
- assert letter not in vocab
- else:
- assert letter in vocab
-
-def test_build_model():
- """
- Test the whole thing on a small dataset for an iteration or two
- """
- with tempfile.TemporaryDirectory() as tempdir:
- eval_file = os.path.join(tempdir, "en_test.dev.txt")
- with open(eval_file, "w", encoding="utf-8") as fout:
- fout.write(fake_text_1)
- train_file = os.path.join(tempdir, "en_test.train.txt")
- with open(train_file, "w", encoding="utf-8") as fout:
- for i in range(1000):
+class TestCharModel:
+ def test_single_file_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "text.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
fout.write(fake_text_1)
- fout.write("\n")
+ vocab = char_model.build_charlm_vocab(sample_file)
+
+ for i in fake_text_1:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_single_file_xz_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "text.txt.xz")
+ with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ vocab = char_model.build_charlm_vocab(sample_file)
+
+ for i in fake_text_1:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_single_file_dir_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "text.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ vocab = char_model.build_charlm_vocab(tempdir)
+
+ for i in fake_text_1:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_multiple_files_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "t1.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ sample_file = os.path.join(tempdir, "t2.txt.xz")
+ with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
+ fout.write(fake_text_2)
+ vocab = char_model.build_charlm_vocab(tempdir)
+
+ for i in fake_text_1:
+ assert i in vocab
+ for i in fake_text_2:
+ assert i in vocab
+ assert "Q" not in vocab
+
+ def test_cutoff_vocab(self):
+ with tempfile.TemporaryDirectory() as tempdir:
+ sample_file = os.path.join(tempdir, "t1.txt")
+ with open(sample_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ sample_file = os.path.join(tempdir, "t2.txt.xz")
+ with lzma.open(sample_file, "wt", encoding="utf-8") as fout:
fout.write(fake_text_2)
- fout.write("\n")
- save_name = 'en_test.forward.pt'
- vocab_save_name = 'en_text.vocab.pt'
- checkpoint_save_name = 'en_text.checkpoint.pt'
- args = ['--train_file', train_file,
- '--eval_file', eval_file,
-            '--eval_steps', '0', # eval once per epoch
- '--epochs', '2',
- '--cutoff', '1',
- '--batch_size', '%d' % len(fake_text_1),
- '--lang', 'en',
- '--shorthand', 'en_test',
- '--save_dir', tempdir,
- '--save_name', save_name,
- '--vocab_save_name', vocab_save_name,
- '--checkpoint_save_name', checkpoint_save_name]
- args = charlm.parse_args(args)
- charlm.train(args)
-
- assert os.path.exists(os.path.join(tempdir, vocab_save_name))
-
- # test that saving & loading of the model worked
- assert os.path.exists(os.path.join(tempdir, save_name))
- model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, save_name))
-
- # test that saving & loading of the checkpoint worked
- assert os.path.exists(os.path.join(tempdir, checkpoint_save_name))
- model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, checkpoint_save_name))
- trainer = char_model.CharacterLanguageModelTrainer.load(args, os.path.join(tempdir, checkpoint_save_name))
-
- assert trainer.global_step > 0
- assert trainer.epoch == 2
-
- # quick test to verify this method works with a trained model
- charlm.get_current_lr(trainer, args)
-
- # test loading a vocab built by the training method...
- vocab = charlm.load_char_vocab(os.path.join(tempdir, vocab_save_name))
- trainer = char_model.CharacterLanguageModelTrainer.from_new_model(args, vocab)
- # ... and test the get_current_lr for an untrained model as well
- # this test is super "eager"
- assert charlm.get_current_lr(trainer, args) == args['lr0']
-
-@pytest.fixture
-def english_forward():
- # eg, stanza_test/models/en/forward_charlm/1billion.pt
- models_path = os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*")
- models = glob.glob(models_path)
- # we expect at least one English model downloaded for the tests
- assert len(models) >= 1
- model_file = models[0]
- return char_model.CharacterLanguageModel.load(model_file)
-
-@pytest.fixture
-def english_backward():
-    # eg, stanza_test/models/en/backward_charlm/1billion.pt
- models_path = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
- models = glob.glob(models_path)
- # we expect at least one English model downloaded for the tests
- assert len(models) >= 1
- model_file = models[0]
- return char_model.CharacterLanguageModel.load(model_file)
-
-def test_load_model(english_forward, english_backward):
- """
- Check that basic loading functions work
- """
- assert english_forward.is_forward_lm
- assert not english_backward.is_forward_lm
-
-def test_save_load_model(english_forward, english_backward):
- """
- Load, save, and load again
- """
- with tempfile.TemporaryDirectory() as tempdir:
- for model in (english_forward, english_backward):
- save_file = os.path.join(tempdir, "resaved", "charlm.pt")
- model.save(save_file)
- reloaded = char_model.CharacterLanguageModel.load(save_file)
- assert model.is_forward_lm == reloaded.is_forward_lm
+
+ vocab = char_model.build_charlm_vocab(tempdir, cutoff=2)
+
+ counts = Counter(fake_text_1) + Counter(fake_text_2)
+ for letter, count in counts.most_common():
+ if count < 2:
+ assert letter not in vocab
+ else:
+ assert letter in vocab
+
+ def test_build_model(self):
+ """
+ Test the whole thing on a small dataset for an iteration or two
+ """
+ with tempfile.TemporaryDirectory() as tempdir:
+ eval_file = os.path.join(tempdir, "en_test.dev.txt")
+ with open(eval_file, "w", encoding="utf-8") as fout:
+ fout.write(fake_text_1)
+ train_file = os.path.join(tempdir, "en_test.train.txt")
+ with open(train_file, "w", encoding="utf-8") as fout:
+ for i in range(1000):
+ fout.write(fake_text_1)
+ fout.write("\n")
+ fout.write(fake_text_2)
+ fout.write("\n")
+ save_name = 'en_test.forward.pt'
+ vocab_save_name = 'en_text.vocab.pt'
+ checkpoint_save_name = 'en_text.checkpoint.pt'
+ args = ['--train_file', train_file,
+ '--eval_file', eval_file,
+                '--eval_steps', '0', # eval once per epoch
+ '--epochs', '2',
+ '--cutoff', '1',
+ '--batch_size', '%d' % len(fake_text_1),
+ '--lang', 'en',
+ '--shorthand', 'en_test',
+ '--save_dir', tempdir,
+ '--save_name', save_name,
+ '--vocab_save_name', vocab_save_name,
+ '--checkpoint_save_name', checkpoint_save_name]
+ args = charlm.parse_args(args)
+ charlm.train(args)
+
+ assert os.path.exists(os.path.join(tempdir, vocab_save_name))
+
+ # test that saving & loading of the model worked
+ assert os.path.exists(os.path.join(tempdir, save_name))
+ model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, save_name))
+
+ # test that saving & loading of the checkpoint worked
+ assert os.path.exists(os.path.join(tempdir, checkpoint_save_name))
+ model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, checkpoint_save_name))
+ trainer = char_model.CharacterLanguageModelTrainer.load(args, os.path.join(tempdir, checkpoint_save_name))
+
+ assert trainer.global_step > 0
+ assert trainer.epoch == 2
+
+ # quick test to verify this method works with a trained model
+ charlm.get_current_lr(trainer, args)
+
+ # test loading a vocab built by the training method...
+ vocab = charlm.load_char_vocab(os.path.join(tempdir, vocab_save_name))
+ trainer = char_model.CharacterLanguageModelTrainer.from_new_model(args, vocab)
+ # ... and test the get_current_lr for an untrained model as well
+ # this test is super "eager"
+ assert charlm.get_current_lr(trainer, args) == args['lr0']
+
+ @pytest.fixture(scope="class")
+ def english_forward(self):
+ # eg, stanza_test/models/en/forward_charlm/1billion.pt
+ models_path = os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*")
+ models = glob.glob(models_path)
+ # we expect at least one English model downloaded for the tests
+ assert len(models) >= 1
+ model_file = models[0]
+ return char_model.CharacterLanguageModel.load(model_file)
+
+ @pytest.fixture(scope="class")
+ def english_backward(self):
+        # eg, stanza_test/models/en/backward_charlm/1billion.pt
+ models_path = os.path.join(TEST_MODELS_DIR, "en", "backward_charlm", "*")
+ models = glob.glob(models_path)
+ # we expect at least one English model downloaded for the tests
+ assert len(models) >= 1
+ model_file = models[0]
+ return char_model.CharacterLanguageModel.load(model_file)
+
+ def test_load_model(self, english_forward, english_backward):
+ """
+ Check that basic loading functions work
+ """
+ assert english_forward.is_forward_lm
+ assert not english_backward.is_forward_lm
+
+ def test_save_load_model(self, english_forward, english_backward):
+ """
+ Load, save, and load again
+ """
+ with tempfile.TemporaryDirectory() as tempdir:
+ for model in (english_forward, english_backward):
+ save_file = os.path.join(tempdir, "resaved", "charlm.pt")
+ model.save(save_file)
+ reloaded = char_model.CharacterLanguageModel.load(save_file)
+ assert model.is_forward_lm == reloaded.is_forward_lm
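
The class-scoped fixtures above rely on pytest discarding the loaded models once the last test in the class has run. A more deterministic teardown, which this commit does not attempt, would be a yield fixture that drops its reference explicitly and then asks PyTorch's caching allocator to release unused blocks. A sketch, assuming the same char_model and TEST_MODELS_DIR imports as the test file (torch.cuda.empty_cache is a no-op on CPU-only builds):

    import gc
    import glob
    import os

    import pytest
    import torch

    @pytest.fixture(scope="class")
    def english_forward():
        models = glob.glob(os.path.join(TEST_MODELS_DIR, "en", "forward_charlm", "*"))
        model = char_model.CharacterLanguageModel.load(models[0])
        yield model
        # Teardown after the last test in the class: drop our reference,
        # force a collection, then release cached GPU blocks.
        del model
        gc.collect()
        torch.cuda.empty_cache()

Even this is best-effort: if a test stashes the model somewhere that outlives the class, the memory stays allocated.
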
diff --git a/stanza/tests/common/test_data_objects.py b/stanza/tests/common/test_data_objects.py
index 765fb748..42e6cf1b 100644
--- a/stanza/tests/common/test_data_objects.py
+++ b/stanza/tests/common/test_data_objects.py
@@ -16,32 +16,34 @@ EN_DOC_UPOS_XPOS = (('PRON_DT', 'AUX_VBZ', 'DET_DT', 'NOUN_NN', 'NOUN_NN', 'PUNC
EN_DOC2 = "Chris wrote a sentence. Then another."
-def test_readonly():
- Document.add_property('some_property', 123)
+@pytest.fixture(scope="module")
+def nlp_pipeline():
nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
- doc = nlp(EN_DOC)
+ return nlp
+
+def test_readonly(nlp_pipeline):
+ Document.add_property('some_property', 123)
+ doc = nlp_pipeline(EN_DOC)
assert doc.some_property == 123
with pytest.raises(ValueError):
doc.some_property = 456
-def test_getter():
+def test_getter(nlp_pipeline):
Word.add_property('upos_xpos', getter=lambda self: f"{self.upos}_{self.xpos}")
- nlp = stanza.Pipeline(**{'dir': TEST_MODELS_DIR, 'lang': 'en'})
- doc = nlp(EN_DOC)
+ doc = nlp_pipeline(EN_DOC)
assert EN_DOC_UPOS_XPOS == tuple(tuple(word.upos_xpos for word in sentence.words) for sentence in doc.sentences)
-def test_setter_getter():
+def test_setter_getter(nlp_pipeline):
int2str = {0: 'ok', 1: 'good', 2: 'bad'}
str2int = {'ok': 0, 'good': 1, 'bad': 2}
def setter(self, value):
self._classname = str2int[value]
Sentence.add_property('classname', getter=lambda self: int2str[self._classname] if self._classname is not None else None, setter=setter)
- nlp = stanza.Pipeline(**{'dir': TEST_MODELS_DIR, 'lang': 'en'})
- doc = nlp(EN_DOC)
+ doc = nlp_pipeline(EN_DOC)
sentence = doc.sentences[0]
sentence.classname = 'good'
assert sentence._classname == 1
@@ -50,9 +52,8 @@ def test_setter_getter():
sentence._classname = 2
assert sentence.classname == 'bad'
-def test_backpointer():
- nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
- doc = nlp(EN_DOC2)
+def test_backpointer(nlp_pipeline):
+ doc = nlp_pipeline(EN_DOC2)
ent = doc.ents[0]
assert ent.sent is doc.sentences[0]
assert list(doc.iter_words())[0].sent is doc.sentences[0]
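
A note on why sharing one Pipeline across these tests is safe: each call re-annotates its input from scratch and returns a fresh Document, so the tests share model weights but not per-document state. An illustrative check (not part of the commit):

    def test_documents_are_independent(nlp_pipeline):
        # The pipeline object is shared module-wide, but every call produces
        # a new Document, so mutations made by one test cannot leak into another.
        doc_a = nlp_pipeline(EN_DOC)
        doc_b = nlp_pipeline(EN_DOC)
        assert doc_a is not doc_b

The add_property calls are different: they mutate the Document, Word, and Sentence classes themselves, so their effects persist for the whole test process; that is tolerable here because each property is added only once.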