blob: de613977ddf8cd9a5cde6d750d474c0c3ef7aa53 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
"""
A utility script to load word embeddings from a text file and save them as a .pt file.
Run it as follows:
python stanza/models/common/convert_pretrain.py <.pt file> <text file> <# vectors>
Note that passing -1 as <# vectors> will keep all the vectors
As a concrete example, you can convert a newly downloaded Faroese WV file as follows:
python3 stanza/models/common/convert_pretrain.py ~/stanza/saved_models/pos/fo_farpahc.pretrain.pt ~/extern_data/wordvec/fasttext/faroese.txt -1
or save part of an Icelandic WV file:
python3 stanza/models/common/convert_pretrain.py ~/stanza/saved_models/pos/is_icepahc.pretrain.pt ~/extern_data/wordvec/fasttext/icelandic.cc.is.300.vec 150000
Note that if the pretrain already exists, nothing will be changed.
"""
import os
import sys
from stanza.models.common import pretrain
def main():
    """Build a pretrain from a text word-vector file using command-line arguments.

    Reads three positional arguments from ``sys.argv``:

    1. the .pt filename for the pretrain,
    2. the text file containing the word vectors,
    3. the maximum number of vectors to keep (per the module docstring,
       -1 keeps all of them).

    It constructs a ``pretrain.Pretrain`` from those values and prints the
    size of its vocab.  Any arguments beyond the third are ignored.

    Raises:
        SystemExit: with a usage message if fewer than three arguments
            are supplied.
        ValueError: if the third argument is not an integer.
    """
    if len(sys.argv) < 4:
        # Report how to call the script instead of failing with a bare
        # IndexError traceback on a missing argument.
        program = sys.argv[0] if sys.argv else "convert_pretrain.py"
        sys.exit("Usage: {} <.pt file> <text file> <# vectors>".format(program))

    filename = sys.argv[1]
    vec_filename = sys.argv[2]
    max_vocab = int(sys.argv[3])

    pt = pretrain.Pretrain(filename, vec_filename, max_vocab)
    # Accessing pt.vocab is the only use made of the object here; the
    # reported number is the length of that vocab.
    print("Pretrain is of size {}".format(len(pt.vocab)))
# Only run the conversion when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|