1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
|
"""
Currently tests a few configurations of files for creating a charlm vocab
Also has a skeleton test of loading & saving a charlm
"""
from collections import Counter
import glob
import lzma
import os
import tempfile
import pytest
from stanza.models import charlm
from stanza.models.common import char_model
from stanza.tests import TEST_MODELS_DIR
# Mark every test in this module for the travis and pipeline test groups
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
# Tiny snippets of "training" text used to build vocabs and train toy models.
# The tests below rely on the letter "Q" appearing in NEITHER text, so they
# can assert it is absent from any vocab built from these snippets.
fake_text_1 = """
Unban mox opal!
I hate watching Peppa Pig
"""
fake_text_2 = """
This is plastic cheese
"""
class TestCharModel:
    """
    Tests a few configurations of files for building a charlm vocab.

    Also trains a tiny charlm for a couple of epochs and checks that
    saving & loading of the model, vocab, and checkpoint all work.
    """

    @staticmethod
    def _write_file(filename, text):
        """Write text to filename, xz-compressed if the name ends in .xz."""
        if filename.endswith(".xz"):
            with lzma.open(filename, "wt", encoding="utf-8") as fout:
                fout.write(text)
        else:
            with open(filename, "w", encoding="utf-8") as fout:
                fout.write(text)

    @staticmethod
    def _check_vocab(vocab, *texts):
        """Assert every character of each text is in the vocab.

        Also assert "Q" is absent - it occurs in none of the fake texts.
        """
        for text in texts:
            for char in text:
                assert char in vocab
        assert "Q" not in vocab

    def test_single_file_vocab(self):
        """Build a vocab directly from a single plain text file"""
        with tempfile.TemporaryDirectory() as tempdir:
            sample_file = os.path.join(tempdir, "text.txt")
            self._write_file(sample_file, fake_text_1)
            vocab = char_model.build_charlm_vocab(sample_file)
            self._check_vocab(vocab, fake_text_1)

    def test_single_file_xz_vocab(self):
        """Build a vocab from a single xz-compressed file"""
        with tempfile.TemporaryDirectory() as tempdir:
            sample_file = os.path.join(tempdir, "text.txt.xz")
            self._write_file(sample_file, fake_text_1)
            vocab = char_model.build_charlm_vocab(sample_file)
            self._check_vocab(vocab, fake_text_1)

    def test_single_file_dir_vocab(self):
        """Build a vocab from a directory containing one text file"""
        with tempfile.TemporaryDirectory() as tempdir:
            self._write_file(os.path.join(tempdir, "text.txt"), fake_text_1)
            vocab = char_model.build_charlm_vocab(tempdir)
            self._check_vocab(vocab, fake_text_1)

    def test_multiple_files_vocab(self):
        """Build a vocab from a directory mixing plain and xz files"""
        with tempfile.TemporaryDirectory() as tempdir:
            self._write_file(os.path.join(tempdir, "t1.txt"), fake_text_1)
            self._write_file(os.path.join(tempdir, "t2.txt.xz"), fake_text_2)
            vocab = char_model.build_charlm_vocab(tempdir)
            self._check_vocab(vocab, fake_text_1, fake_text_2)

    def test_cutoff_vocab(self):
        """Characters seen fewer than cutoff times must be left out of the vocab"""
        with tempfile.TemporaryDirectory() as tempdir:
            self._write_file(os.path.join(tempdir, "t1.txt"), fake_text_1)
            self._write_file(os.path.join(tempdir, "t2.txt.xz"), fake_text_2)
            vocab = char_model.build_charlm_vocab(tempdir, cutoff=2)
            # compare against the exact character frequencies of both texts
            counts = Counter(fake_text_1) + Counter(fake_text_2)
            for letter, count in counts.most_common():
                if count < 2:
                    assert letter not in vocab
                else:
                    assert letter in vocab

    def test_build_model(self):
        """
        Test the whole thing on a small dataset for an iteration or two
        """
        with tempfile.TemporaryDirectory() as tempdir:
            eval_file = os.path.join(tempdir, "en_test.dev.txt")
            self._write_file(eval_file, fake_text_1)
            train_file = os.path.join(tempdir, "en_test.train.txt")
            # repeat the fake texts many times so training has enough batches
            with open(train_file, "w", encoding="utf-8") as fout:
                for i in range(1000):
                    fout.write(fake_text_1)
                    fout.write("\n")
                    fout.write(fake_text_2)
                    fout.write("\n")
            # was "en_text..." - fixed to match the en_test shorthand used throughout
            save_name = 'en_test.forward.pt'
            vocab_save_name = 'en_test.vocab.pt'
            checkpoint_save_name = 'en_test.checkpoint.pt'
            args = ['--train_file', train_file,
                    '--eval_file', eval_file,
                    '--eval_steps', '0', # eval once per epoch
                    '--epochs', '2',
                    '--cutoff', '1',
                    '--batch_size', '%d' % len(fake_text_1),
                    '--lang', 'en',
                    '--shorthand', 'en_test',
                    '--save_dir', tempdir,
                    '--save_name', save_name,
                    '--vocab_save_name', vocab_save_name,
                    '--checkpoint_save_name', checkpoint_save_name]
            args = charlm.parse_args(args)
            charlm.train(args)
            assert os.path.exists(os.path.join(tempdir, vocab_save_name))
            # test that saving & loading of the model worked
            assert os.path.exists(os.path.join(tempdir, save_name))
            model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, save_name))
            # test that saving & loading of the checkpoint worked
            assert os.path.exists(os.path.join(tempdir, checkpoint_save_name))
            model = char_model.CharacterLanguageModel.load(os.path.join(tempdir, checkpoint_save_name))
            trainer = char_model.CharacterLanguageModelTrainer.load(args, os.path.join(tempdir, checkpoint_save_name))
            assert trainer.global_step > 0
            assert trainer.epoch == 2
            # quick test to verify this method works with a trained model
            charlm.get_current_lr(trainer, args)
            # test loading a vocab built by the training method...
            vocab = charlm.load_char_vocab(os.path.join(tempdir, vocab_save_name))
            trainer = char_model.CharacterLanguageModelTrainer.from_new_model(args, vocab)
            # ... and test the get_current_lr for an untrained model as well
            # this test is super "eager"
            assert charlm.get_current_lr(trainer, args) == args['lr0']

    def _load_charlm(self, direction):
        """Load the first downloaded English charlm for the given direction.

        direction is "forward" or "backward";
        eg, stanza_test/models/en/forward_charlm/1billion.pt
        """
        models_path = os.path.join(TEST_MODELS_DIR, "en", "%s_charlm" % direction, "*")
        models = glob.glob(models_path)
        # we expect at least one English model downloaded for the tests
        assert len(models) >= 1
        return char_model.CharacterLanguageModel.load(models[0])

    @pytest.fixture(scope="class")
    def english_forward(self):
        return self._load_charlm("forward")

    @pytest.fixture(scope="class")
    def english_backward(self):
        return self._load_charlm("backward")

    def test_load_model(self, english_forward, english_backward):
        """
        Check that basic loading functions work
        """
        assert english_forward.is_forward_lm
        assert not english_backward.is_forward_lm

    def test_save_load_model(self, english_forward, english_backward):
        """
        Load, save, and load again
        """
        with tempfile.TemporaryDirectory() as tempdir:
            for model in (english_forward, english_backward):
                # the "resaved" subdirectory does not exist yet;
                # presumably model.save creates it - this exercises that path
                save_file = os.path.join(tempdir, "resaved", "charlm.pt")
                model.save(save_file)
                reloaded = char_model.CharacterLanguageModel.load(save_file)
                assert model.is_forward_lm == reloaded.is_forward_lm
|