# stanza/models/tokenization/vocab.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40

from collections import Counter
import re

from stanza.models.common.vocab import BaseVocab
from stanza.models.common.vocab import UNK, PAD

SPACE_RE = re.compile(r'\s')

class Vocab(BaseVocab):
    """Vocabulary over the minimal units consumed by the tokenizer.

    Units are counted from ``self.data`` and assigned ids in order of
    descending frequency, after the reserved ``PAD`` and ``UNK`` entries.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the base vocab, then cache the space-handling flag.

        All arguments are forwarded unchanged to ``BaseVocab.__init__``.
        NOTE(review): ``self.lang`` is assumed to be set by the base class,
        which is defined outside this file -- confirm there.
        """
        super().__init__(*args, **kwargs)
        # For zh/ja/ko language codes normalize_token() drops spaces entirely.
        # The flag is computed once, from the value of self.lang at this point.
        self.lang_replaces_spaces = self.lang.startswith(('zh', 'ja', 'ko'))

    def build_vocab(self):
        """Build the id <-> unit mappings from ``self.data``.

        ``self.data`` is iterated as a collection of paragraphs, each of which
        is an iterable of units. Only the first element of every unit is
        counted (presumably the unit text -- verify against the caller).
        """
        counter = Counter(
            self.normalize_unit(unit[0])
            for para in self.data
            for unit in para
        )

        # most_common() yields units by descending count; units with equal
        # counts stay in first-seen order.
        self._id2unit = [PAD, UNK] + [unit for unit, _ in counter.most_common()]
        self._unit2id = {w: i for i, w in enumerate(self._id2unit)}

    def normalize_unit(self, unit):
        """Return the normalized form of a single tokenizer unit.

        For Vietnamese a unit is a syllable, for other languages a character.
        Only Vietnamese units are changed: leading whitespace is stripped.
        """
        if self.lang.startswith('vi'):
            return unit.lstrip()

        return unit

    def normalize_token(self, token):
        """Return ``token`` with its whitespace normalized.

        Leading whitespace is stripped and every remaining whitespace
        character becomes a single plain space. When
        ``self.lang_replaces_spaces`` is set, those spaces are then removed.
        """
        token = SPACE_RE.sub(' ', token.lstrip())

        if self.lang_replaces_spaces:
            token = token.replace(' ', '')

        return token