Turn some multilingual pipeline tests into fixtures. Again, should save memory

author: John Bauer <horatio@gmail.com> 2022-09-14 05:36:39 +0300
committer: John Bauer <horatio@gmail.com> 2022-09-14 05:36:39 +0300
commit: 93191328e85b9cfe5365b54e8a02e416fb2cf727 (patch)
tree: c3208ba5455feed813e50d2c845891d194d2c441
parent: def8cb86e61bfb178f75de12fa189ee57f0a9cc7 (diff)
1 files changed, 36 insertions, 28 deletions
diff --git a/stanza/tests/langid/test_langid.py b/stanza/tests/langid/test_langid.py
index 19531bd8..c80e04b7 100644
--- a/stanza/tests/langid/test_langid.py
+++ b/stanza/tests/langid/test_langid.py
@@ -12,7 +12,23 @@ pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
 
 #pytestmark = pytest.mark.skip
 
-def test_langid():
+@pytest.fixture(scope="module")
+def basic_multilingual():
+    return Pipeline(dir=TEST_MODELS_DIR, lang='multilingual', processors="langid")
+
+@pytest.fixture(scope="module")
+def enfr_multilingual():
+    return Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid", langid_lang_subset=["en", "fr"])
+
+@pytest.fixture(scope="module")
+def en_multilingual():
+    return Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid", langid_lang_subset=["en"])
+
+@pytest.fixture(scope="module")
+def clean_multilingual():
+    return Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid", langid_clean_text=True)
+
+def test_langid(basic_multilingual):
     """
     Basic test of language identification
     """
@@ -20,13 +36,12 @@ def test_langid():
     french_text = "C'est une phrase française."
     docs = [english_text, french_text]
 
-    nlp = Pipeline(dir=TEST_MODELS_DIR, lang='multilingual', processors="langid")
     docs = [Document([], text=text) for text in docs]
-    nlp(docs)
+    basic_multilingual(docs)
     predictions = [doc.lang for doc in docs]
     assert predictions == ["en", "fr"]
 
-def test_langid_benchmark():
+def test_langid_benchmark(basic_multilingual):
     """
     Run lang id model on 500 examples, confirm reasonable accuracy.
     """
@@ -532,15 +547,14 @@ def test_langid_benchmark():
     {"text": "Například Pedagogická fakulta Univerzity Karlovy", "label": "cs"},
     {"text": "nostris ut eriperet nos de praesenti saeculo", "label": "la"}]
     
-    nlp = Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid")
     docs = [Document([], text=example["text"]) for example in examples]
     gold_labels = [example["label"] for example in examples]
-    nlp(docs)
+    basic_multilingual(docs)
     accuracy = sum([(doc.lang == label) for doc,label in zip(docs,gold_labels)])/len(docs)
     assert accuracy >= 0.98
 
 
-def test_text_cleaning():
+def test_text_cleaning(basic_multilingual, clean_multilingual):
     """
     Basic test of cleaning text
     """
@@ -548,48 +562,42 @@ def test_text_cleaning():
             "Bonjour le monde! https://t.co/U0Zjp3tusD"]
     docs = [Document([], text=text) for text in docs]
     
-    nlp = Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid")
-    nlp(docs)
+    basic_multilingual(docs)
     assert [doc.lang for doc in docs] == ["it", "it"]
     
-    nlp = Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid", langid_clean_text=True)
-    assert nlp.processors["langid"]._clean_text
-    nlp(docs)
+    assert clean_multilingual.processors["langid"]._clean_text
+    clean_multilingual(docs)
     assert [doc.lang for doc in docs] == ["fr", "fr"]
 
-def test_lang_subset():
+def test_lang_subset(basic_multilingual, enfr_multilingual, en_multilingual):
     """
     Basic test of restricting output to subset of languages
     """
     docs = ["Bonjour le monde! #thisisfrench #ilovefrance",
             "Bonjour le monde! https://t.co/U0Zjp3tusD"]
     docs = [Document([], text=text) for text in docs]
-    
-    nlp = Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid")
-    nlp(docs)
+
+    basic_multilingual(docs)
     assert [doc.lang for doc in docs] == ["it", "it"]
-    
-    nlp = Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid", langid_lang_subset=["en","fr"])
-    assert nlp.processors["langid"]._model.lang_subset == ["en", "fr"]
-    nlp(docs)
+
+    assert enfr_multilingual.processors["langid"]._model.lang_subset == ["en", "fr"]
+    enfr_multilingual(docs)
     assert [doc.lang for doc in docs] == ["fr", "fr"]
-    
-    nlp = Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid", langid_lang_subset=["en"])
-    assert nlp.processors["langid"]._model.lang_subset == ["en"]
-    nlp(docs)
+
+    assert en_multilingual.processors["langid"]._model.lang_subset == ["en"]
+    en_multilingual(docs)
     assert [doc.lang for doc in docs] == ["en", "en"]
 
-def test_lang_subset_unlikely_language():
+def test_lang_subset_unlikely_language(en_multilingual):
     """
     Test that the language subset masking chooses a legal language, even if all legal languages are supa unlikely
     """
     sentences = ["你好" * 200]
     docs = [Document([], text=text) for text in sentences]
-    nlp = Pipeline(dir=TEST_MODELS_DIR, lang="multilingual", processors="langid", langid_lang_subset=["en"])
-    nlp(docs)
+    en_multilingual(docs)
     assert [doc.lang for doc in docs] == ["en"]
 
-    processor = nlp.processors['langid']
+    processor = en_multilingual.processors['langid']
     model = processor._model
     text_tensor = processor._text_to_tensor(sentences)
     en_idx = model.tag_to_idx['en']
author	John Bauer <horatio@gmail.com>	2022-09-14 05:36:39 +0300
committer	John Bauer <horatio@gmail.com>	2022-09-14 05:36:39 +0300
commit	93191328e85b9cfe5365b54e8a02e416fb2cf727 (patch)
tree	c3208ba5455feed813e50d2c845891d194d2c441
parent	def8cb86e61bfb178f75de12fa189ee57f0a9cc7 (diff)