
github.com/stanfordnlp/stanza.git
author     John Bauer <horatio@gmail.com>  2022-09-14 02:26:06 +0300
committer  John Bauer <horatio@gmail.com>  2022-09-14 02:26:06 +0300
commit     2511f1f8d96416d047696303f53bd5b12f1abba2 (patch)
tree       2be06d69ab9ae8eb87e211353a23d6c8d6df2d9e
parent     b7fda48e773c17580d5615fdfaafd13d05f09290 (diff)
Try to reduce the scope of various pipelines to make the test suite less likely to run out of GPU memory. Not sure this is the correct approach.
-rw-r--r--  stanza/tests/classifiers/test_classifier.py               345
-rw-r--r--  stanza/tests/lemma/test_lemma_trainer.py                     2
-rw-r--r--  stanza/tests/pipeline/test_pipeline_depparse_processor.py    37
-rw-r--r--  stanza/tests/pipeline/test_pipeline_ner_processor.py          6
4 files changed, 196 insertions, 194 deletions
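
The commit relies on pytest fixture scoping: a fixture's scope bounds how long the object it builds stays alive, so a GPU-heavy Pipeline created by a class-scoped fixture can be released once the last test in that class has run, instead of surviving for the whole module or session. A minimal sketch of the idea (not part of the commit; it assumes English models are already downloaded and uses hypothetical test names):

import pytest
import stanza

class TestPipelineLifetime:
    @pytest.fixture(scope="class")
    def pipeline(self):
        # Built when the first test in this class requests it and dropped
        # from pytest's fixture cache after the last test in the class
        # finishes, so the GPU memory it holds can be reclaimed before
        # other test classes run.
        return stanza.Pipeline(lang="en", processors="tokenize,pos")

    def test_tokenize(self, pipeline):
        doc = pipeline("This is a test.")
        assert len(doc.sentences) == 1

    def test_pos(self, pipeline):
        doc = pipeline("This is a test.")
        assert all(word.upos for word in doc.sentences[0].words)

Whether the memory is actually freed still depends on nothing else holding a reference to the fixture value; ending the scope only drops pytest's own cached copy.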
diff --git a/stanza/tests/classifiers/test_classifier.py b/stanza/tests/classifiers/test_classifier.py
index 6743b5f2..36beb785 100644
--- a/stanza/tests/classifiers/test_classifier.py
+++ b/stanza/tests/classifiers/test_classifier.py
@@ -30,175 +30,176 @@ DATASET = [
EMB_DIM = 5
-@pytest.fixture(scope="module")
-def train_file(tmp_path_factory):
- train_set = DATASET * 20
- train_filename = tmp_path_factory.mktemp("data") / "train.json"
- with open(train_filename, "w", encoding="utf-8") as fout:
- json.dump(train_set, fout, ensure_ascii=False)
- return train_filename
-
-@pytest.fixture(scope="module")
-def dev_file(tmp_path_factory):
- dev_set = DATASET * 2
- dev_filename = tmp_path_factory.mktemp("data") / "dev.json"
- with open(dev_filename, "w", encoding="utf-8") as fout:
- json.dump(dev_set, fout, ensure_ascii=False)
- return dev_filename
-
-@pytest.fixture(scope="module")
-def test_file(tmp_path_factory):
- test_set = DATASET
- test_filename = tmp_path_factory.mktemp("data") / "test.json"
- with open(test_filename, "w", encoding="utf-8") as fout:
- json.dump(test_set, fout, ensure_ascii=False)
- return test_filename
-
-@pytest.fixture(scope="module")
-def fake_embeddings(tmp_path_factory):
- # could set np random seed here
- words = sorted(set([x.lower() for y in SENTENCES for x in y]))
- words = words[:-1]
- embedding_txt = tmp_path_factory.mktemp("data") / "embedding.txt"
- embedding_pt = tmp_path_factory.mktemp("data") / "embedding.pt"
- embedding = np.random.random((len(words), EMB_DIM))
-
- with open(embedding_txt, "w", encoding="utf-8") as fout:
- for word, emb in zip(words, embedding):
- fout.write(word)
- fout.write("\t")
- fout.write("\t".join(str(x) for x in emb))
- fout.write("\n")
-
- pt = pretrain.Pretrain(str(embedding_pt), str(embedding_txt))
- pt.load()
- assert os.path.exists(embedding_pt)
- return embedding_pt
-
-def test_read_data(train_file):
- """
- Test reading of the json format
- """
- train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
- assert len(train_set) == 60
-
-def test_dataset_vocab(train_file):
- """
- Converting a dataset to vocab should have a specific set of words along with PAD and UNK
- """
- train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
- vocab = data.dataset_vocab(train_set)
- expected = set([PAD, UNK] + [x.lower() for y in SENTENCES for x in y])
- assert set(vocab) == expected
-
-def test_dataset_labels(train_file):
- """
- Test the extraction of labels from a dataset
- """
- train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
- labels = data.dataset_labels(train_set)
- assert labels == ["0", "1", "2"]
-
-def build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=None):
- """
- Build a model to be used by one of the later tests
- """
- save_dir = str(tmp_path / "classifier")
- save_name = "model.pt"
- args = ["--save_dir", save_dir,
- "--save_name", save_name,
- "--wordvec_pretrain_file", str(fake_embeddings),
- "--filter_channels", "20",
- "--fc_shapes", "20,10",
- "--train_file", str(train_file),
- "--dev_file", str(dev_file),
- "--max_epochs", "2",
- "--batch_size", "60"]
- if extra_args is not None:
- args = args + extra_args
- args = classifier.parse_args(args)
- train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
- trainer = Trainer.build_new_model(args, train_set)
- return trainer, train_set, args
-
-def run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=None):
- """
- Iterate a couple times over a model
- """
- trainer, train_set, args = build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args)
- dev_set = data.read_dataset(args.dev_file, args.wordvec_type, args.min_train_len)
- labels = data.dataset_labels(train_set)
-
- save_filename = os.path.join(args.save_dir, args.save_name)
- checkpoint_file = utils.checkpoint_name(args.save_dir, save_filename, args.checkpoint_save_name)
- classifier.train_model(trainer, save_filename, checkpoint_file, args, train_set, dev_set, labels)
- return trainer
-
-def test_build_model(tmp_path, fake_embeddings, train_file, dev_file):
- """
- Test that building a basic model works
- """
- build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
-
-def test_save_load(tmp_path, fake_embeddings, train_file, dev_file):
- """
- Test that a basic model can save & load
- """
- trainer, _, args = build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
-
- save_filename = os.path.join(args.save_dir, args.save_name)
- trainer.save(save_filename)
-
- args.load_name = args.save_name
- trainer = Trainer.load(args.load_name, args)
- args.load_name = save_filename
- trainer = Trainer.load(args.load_name, args)
-
-def test_train_basic(tmp_path, fake_embeddings, train_file, dev_file):
- run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
-
-def test_train_bilstm(tmp_path, fake_embeddings, train_file, dev_file):
- """
- Test w/ and w/o bilstm variations of the classifier
- """
- args = ["--bilstm", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--no_bilstm"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
-def test_train_maxpool_width(tmp_path, fake_embeddings, train_file, dev_file):
- """
- Test various maxpool widths
-
- Also sets --filter_channels to a multiple of 2 but not of 3 for
- the test to make sure the math is done correctly on a non-divisible width
- """
- args = ["--maxpool_width", "1", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--maxpool_width", "2", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--maxpool_width", "3", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
-def test_train_conv_2d(tmp_path, fake_embeddings, train_file, dev_file):
- args = ["--filter_sizes", "(3,4,5)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--filter_sizes", "((3,2),)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
- args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
- run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
-
-def test_train_filter_channels(tmp_path, fake_embeddings, train_file, dev_file):
- args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--no_bilstm"]
- trainer = run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
- assert trainer.model.fc_input_size == 40
-
- args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "15,20", "--no_bilstm"]
- trainer = run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
- # 50 = 2x15 for the 2d conv (over 5 dim embeddings) + 20
- assert trainer.model.fc_input_size == 50
+class TestClassifier:
+ @pytest.fixture(scope="class")
+ def train_file(self, tmp_path_factory):
+ train_set = DATASET * 20
+ train_filename = tmp_path_factory.mktemp("data") / "train.json"
+ with open(train_filename, "w", encoding="utf-8") as fout:
+ json.dump(train_set, fout, ensure_ascii=False)
+ return train_filename
+
+ @pytest.fixture(scope="class")
+ def dev_file(self, tmp_path_factory):
+ dev_set = DATASET * 2
+ dev_filename = tmp_path_factory.mktemp("data") / "dev.json"
+ with open(dev_filename, "w", encoding="utf-8") as fout:
+ json.dump(dev_set, fout, ensure_ascii=False)
+ return dev_filename
+
+ @pytest.fixture(scope="class")
+ def test_file(self, tmp_path_factory):
+ test_set = DATASET
+ test_filename = tmp_path_factory.mktemp("data") / "test.json"
+ with open(test_filename, "w", encoding="utf-8") as fout:
+ json.dump(test_set, fout, ensure_ascii=False)
+ return test_filename
+
+ @pytest.fixture(scope="class")
+ def fake_embeddings(self, tmp_path_factory):
+ # could set np random seed here
+ words = sorted(set([x.lower() for y in SENTENCES for x in y]))
+ words = words[:-1]
+ embedding_txt = tmp_path_factory.mktemp("data") / "embedding.txt"
+ embedding_pt = tmp_path_factory.mktemp("data") / "embedding.pt"
+ embedding = np.random.random((len(words), EMB_DIM))
+
+ with open(embedding_txt, "w", encoding="utf-8") as fout:
+ for word, emb in zip(words, embedding):
+ fout.write(word)
+ fout.write("\t")
+ fout.write("\t".join(str(x) for x in emb))
+ fout.write("\n")
+
+ pt = pretrain.Pretrain(str(embedding_pt), str(embedding_txt))
+ pt.load()
+ assert os.path.exists(embedding_pt)
+ return embedding_pt
+
+ def test_read_data(self, train_file):
+ """
+ Test reading of the json format
+ """
+ train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
+ assert len(train_set) == 60
+
+ def test_dataset_vocab(self, train_file):
+ """
+ Converting a dataset to vocab should have a specific set of words along with PAD and UNK
+ """
+ train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
+ vocab = data.dataset_vocab(train_set)
+ expected = set([PAD, UNK] + [x.lower() for y in SENTENCES for x in y])
+ assert set(vocab) == expected
+
+ def test_dataset_labels(self, train_file):
+ """
+ Test the extraction of labels from a dataset
+ """
+ train_set = data.read_dataset(str(train_file), WVType.OTHER, 1)
+ labels = data.dataset_labels(train_set)
+ assert labels == ["0", "1", "2"]
+
+ def build_model(self, tmp_path, fake_embeddings, train_file, dev_file, extra_args=None):
+ """
+ Build a model to be used by one of the later tests
+ """
+ save_dir = str(tmp_path / "classifier")
+ save_name = "model.pt"
+ args = ["--save_dir", save_dir,
+ "--save_name", save_name,
+ "--wordvec_pretrain_file", str(fake_embeddings),
+ "--filter_channels", "20",
+ "--fc_shapes", "20,10",
+ "--train_file", str(train_file),
+ "--dev_file", str(dev_file),
+ "--max_epochs", "2",
+ "--batch_size", "60"]
+ if extra_args is not None:
+ args = args + extra_args
+ args = classifier.parse_args(args)
+ train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
+ trainer = Trainer.build_new_model(args, train_set)
+ return trainer, train_set, args
+
+ def run_training(self, tmp_path, fake_embeddings, train_file, dev_file, extra_args=None):
+ """
+ Iterate a couple times over a model
+ """
+ trainer, train_set, args = self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args)
+ dev_set = data.read_dataset(args.dev_file, args.wordvec_type, args.min_train_len)
+ labels = data.dataset_labels(train_set)
+
+ save_filename = os.path.join(args.save_dir, args.save_name)
+ checkpoint_file = utils.checkpoint_name(args.save_dir, save_filename, args.checkpoint_save_name)
+ classifier.train_model(trainer, save_filename, checkpoint_file, args, train_set, dev_set, labels)
+ return trainer
+
+ def test_build_model(self, tmp_path, fake_embeddings, train_file, dev_file):
+ """
+ Test that building a basic model works
+ """
+ self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
+
+ def test_save_load(self, tmp_path, fake_embeddings, train_file, dev_file):
+ """
+ Test that a basic model can save & load
+ """
+ trainer, _, args = self.build_model(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
+
+ save_filename = os.path.join(args.save_dir, args.save_name)
+ trainer.save(save_filename)
+
+ args.load_name = args.save_name
+ trainer = Trainer.load(args.load_name, args)
+ args.load_name = save_filename
+ trainer = Trainer.load(args.load_name, args)
+
+ def test_train_basic(self, tmp_path, fake_embeddings, train_file, dev_file):
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, extra_args=["--bilstm_hidden_dim", "20"])
+
+ def test_train_bilstm(self, tmp_path, fake_embeddings, train_file, dev_file):
+ """
+ Test w/ and w/o bilstm variations of the classifier
+ """
+ args = ["--bilstm", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--no_bilstm"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ def test_train_maxpool_width(self, tmp_path, fake_embeddings, train_file, dev_file):
+ """
+ Test various maxpool widths
+
+ Also sets --filter_channels to a multiple of 2 but not of 3 for
+ the test to make sure the math is done correctly on a non-divisible width
+ """
+ args = ["--maxpool_width", "1", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--maxpool_width", "2", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--maxpool_width", "3", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ def test_train_conv_2d(self, tmp_path, fake_embeddings, train_file, dev_file):
+ args = ["--filter_sizes", "(3,4,5)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--filter_sizes", "((3,2),)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--bilstm_hidden_dim", "20"]
+ self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+
+ def test_train_filter_channels(self, tmp_path, fake_embeddings, train_file, dev_file):
+ args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "20", "--no_bilstm"]
+ trainer = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+ assert trainer.model.fc_input_size == 40
+
+ args = ["--filter_sizes", "((3,2),3)", "--filter_channels", "15,20", "--no_bilstm"]
+ trainer = self.run_training(tmp_path, fake_embeddings, train_file, dev_file, args)
+ # 50 = 2x15 for the 2d conv (over 5 dim embeddings) + 20
+ assert trainer.model.fc_input_size == 50
diff --git a/stanza/tests/lemma/test_lemma_trainer.py b/stanza/tests/lemma/test_lemma_trainer.py
index 24a58b57..dd6a1777 100644
--- a/stanza/tests/lemma/test_lemma_trainer.py
+++ b/stanza/tests/lemma/test_lemma_trainer.py
@@ -13,7 +13,7 @@ from stanza.tests import *
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
-@pytest.fixture
+@pytest.fixture(scope="module")
def english_model():
models_path = os.path.join(TEST_MODELS_DIR, "en", "lemma", "*")
models = glob.glob(models_path)
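
In test_lemma_trainer.py the change goes the other direction: english_model is widened from the default function scope to module scope, presumably so the saved lemma model is located and loaded once per test module rather than once per test. A rough sketch of what that scope difference means in pytest (hypothetical names, not from the commit):

import pytest

calls = {"function": 0, "module": 0}

@pytest.fixture  # default scope: rebuilt for every test that requests it
def per_test_resource():
    calls["function"] += 1
    return object()

@pytest.fixture(scope="module")  # built once, shared by every test in this file
def per_module_resource():
    calls["module"] += 1
    return object()

def test_first(per_test_resource, per_module_resource):
    assert calls == {"function": 1, "module": 1}

def test_second(per_test_resource, per_module_resource):
    assert calls == {"function": 2, "module": 1}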
diff --git a/stanza/tests/pipeline/test_pipeline_depparse_processor.py b/stanza/tests/pipeline/test_pipeline_depparse_processor.py
index 83535cb8..ad820643 100644
--- a/stanza/tests/pipeline/test_pipeline_depparse_processor.py
+++ b/stanza/tests/pipeline/test_pipeline_depparse_processor.py
@@ -10,23 +10,24 @@ from stanza.tests import TEST_MODELS_DIR
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
-@pytest.fixture(scope="module")
-def english_depparse():
- """
- Get a depparse_processor for English
- """
- nlp = stanza.Pipeline(**{'processors': 'tokenize,pos,lemma,depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en'})
- assert 'depparse' in nlp.processors
- return nlp.processors['depparse']
+class TestClassifier:
+ @pytest.fixture(scope="class")
+ def english_depparse(self):
+ """
+ Get a depparse_processor for English
+ """
+ nlp = stanza.Pipeline(**{'processors': 'tokenize,pos,lemma,depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en'})
+ assert 'depparse' in nlp.processors
+ return nlp.processors['depparse']
-def test_get_known_relations(english_depparse):
- """
- Test getting the known relations from a processor.
+ def test_get_known_relations(self, english_depparse):
+ """
+ Test getting the known relations from a processor.
- Doesn't test that all the relations exist, since who knows what will change in the future
- """
- relations = english_depparse.get_known_relations()
- assert len(relations) > 5
- assert 'case' in relations
- for i in VOCAB_PREFIX:
- assert i not in relations
+ Doesn't test that all the relations exist, since who knows what will change in the future
+ """
+ relations = english_depparse.get_known_relations()
+ assert len(relations) > 5
+ assert 'case' in relations
+ for i in VOCAB_PREFIX:
+ assert i not in relations
diff --git a/stanza/tests/pipeline/test_pipeline_ner_processor.py b/stanza/tests/pipeline/test_pipeline_ner_processor.py
index c67089cf..c88dbe6e 100644
--- a/stanza/tests/pipeline/test_pipeline_ner_processor.py
+++ b/stanza/tests/pipeline/test_pipeline_ner_processor.py
@@ -58,13 +58,13 @@ class TestNERProcessor:
"""
return stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize,ner")
- @pytest.fixture
+ @pytest.fixture(scope="class")
def processed_doc(self, pipeline):
""" Document created by running full English pipeline on a few sentences """
- return [pipeline(text) for text in EN_DOCS]
+ return [pipeline(text) for text in EN_DOCS]
- @pytest.fixture
+ @pytest.fixture(scope="class")
def processed_bulk(self, pipeline):
""" Document created by running full English pipeline on a few sentences """
docs = [Document([], text=t) for t in EN_DOCS]