1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
from collections import Counter
import re
from stanza.models.common.vocab import BaseVocab
from stanza.models.common.vocab import UNK, PAD
# Matches any single whitespace character; normalize_token replaces each match with a plain space.
SPACE_RE = re.compile(r'\s')
class Vocab(BaseVocab):
    """Vocabulary over the minimal units used by the tokenizer.

    Units are counted from ``self.data`` and assigned ids in order of
    decreasing frequency, after the reserved ``PAD`` and ``UNK`` entries.
    """

    def __init__(self, *args, **kwargs):
        """Forward every argument to ``BaseVocab`` and cache the space-removal flag."""
        super().__init__(*args, **kwargs)
        # Languages whose code starts with zh / ja / ko have spaces dropped
        # from their tokens in normalize_token.
        self.lang_replaces_spaces = self.lang.startswith(('zh', 'ja', 'ko'))

    def build_vocab(self):
        """Populate ``_id2unit`` and ``_unit2id`` from the units in ``self.data``.

        ``self.data`` is walked as an iterable of iterables; the first element
        of each inner item (``unit[0]``) is normalized and counted.
        """
        counts = Counter(
            self.normalize_unit(unit[0])
            for paragraph in self.data
            for unit in paragraph
        )
        # Stable sort: units with equal counts keep their first-seen order.
        by_frequency = sorted(counts, key=counts.get, reverse=True)
        self._id2unit = [PAD, UNK, *by_frequency]
        self._unit2id = {unit: index for index, unit in enumerate(self._id2unit)}

    def normalize_unit(self, unit):
        """Return the normalized form of one minimal tokenizer unit.

        For Vietnamese a unit is a syllable, for other languages a character.
        Only Vietnamese units are altered: their leading whitespace is removed.
        """
        if self.lang.startswith('vi'):
            return unit.lstrip()
        return unit

    def normalize_token(self, token):
        """Return ``token`` with leading whitespace stripped and whitespace unified.

        Every remaining whitespace character becomes a single plain space; when
        ``lang_replaces_spaces`` is set, those spaces are then removed entirely.
        """
        cleaned = SPACE_RE.sub(' ', token.lstrip())
        return cleaned.replace(' ', '') if self.lang_replaces_spaces else cleaned
|