// src/builder.h

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef BUILDER_H_
#define BUILDER_H_

#include <map>
#include <string>
#include <vector>
#include "common.h"
#include "sentencepiece_model.pb.h"
#include "sentencepiece_processor.h"
#include "third_party/absl/strings/string_view.h"

namespace sentencepiece {
namespace normalizer {

// Builder creates a text normalization rule from user-defined string
// to string mappings. The normalization mapping is compiled into
// a single and compact blob index which is stored into the model proto.
// This class also provides pre-defined rules based on Unicode NFKC.
// https://en.wikipedia.org/wiki/Unicode_equivalence#Normalization
class Builder {
 public:
  // Builder is a collection of static functions only; construction and
  // destruction are both deleted so that it can never be instantiated.
  Builder() = delete;
  ~Builder() = delete;

  // Basic Unicode character sequence.
  using Chars = std::vector<char32>;

  // String-to-string mapping, keyed by the source character sequence.
  using CharsMap = std::map<Chars, Chars>;

  // Compiles `chars_map` into `output`.
  // NOTE(review): `output` is presumably the compact binary blob index that
  // DecompileCharsMap() accepts -- confirm against the implementation.
  static util::Status CompileCharsMap(const CharsMap &chars_map,
                                      std::string *output);

  // Decompiles `blob` into `chars_map`.
  static util::Status DecompileCharsMap(absl::string_view blob,
                                        CharsMap *chars_map);

  // Returns a pre-compiled binary index with `name` through `output`.
  static util::Status GetPrecompiledCharsMap(const std::string &name,
                                             std::string *output);

  // Makes a normalization mapping based on NFKC.
  //
  // Note that Normalizer/Builder classes do not support
  // full NFKC normalization, since full NFKC normalization cannot
  // be implemented with a simple longest matching string-to-string
  // replacement. One unsupported normalization is multiple combining
  // marks.
  //
  // Strings with multiple combining marks cannot correctly
  // be normalized, because it needs to sort the combining marks
  // with Canonical_Combining_Class (CCC).
  // http://unicode.org/reports/tr15/#Multiple_Mark_Figure
  //
  // Example:
  //  Original:    U+1E0B U+0323
  //  Decomposed:  U+0064 U+0307 U+0323
  //  NFKD:        U+0064 U+0323 U+0307 (Combining characters are sorted by CCC)
  //  NFKC:        U+1E0D U+0307 (U+0064 U+0323 => U+1E0D)
  //
  // To support the normalization above with a longest matching, we need to
  // enumerate all possible permutations of combining marks in advance,
  // which is not feasible. For example, suppose there are three
  // combining marks X, Y and Z, which are sorted into one canonical order
  // Z, Y, X with NFK(D|C).  In this case, all permutations (XYZ, XZY, YXZ...)
  // are normalized into ZYX. When we implement this normalization with
  // a longest matching, we need to have 3! rules: XYZ=>ZYX, XZY=>ZYX, ...
  // Since Unicode has more than 100 combining characters, it is not possible
  // to expand all permutations.
  //
  // We will not implement the full NFKC in SentencePiece because
  //  1) It is unusual to see decomposed Unicode characters in real text.
  //  2) Providing a flexible, user-customizable, and self-contained
  //     normalizer is the goal of SentencePiece.
  //
  // TODO(taku): Make NFC, NFD, and NFKD mapping if necessary.
  static util::Status BuildNFKCMap(CharsMap *chars_map);

  // Makes an NFKC-based mapping with NMT specific modifications around
  // whitespaces.
  static util::Status BuildNmtNFKCMap(CharsMap *chars_map);

  // Merges Unicode case folding mapping into `chars_map`.
  static util::Status MergeUnicodeCaseFoldMap(CharsMap *chars_map);

  // Makes NFKC with Unicode case folding.
  static util::Status BuildNFKC_CFMap(CharsMap *chars_map);

  // Makes NMT NFKC with Unicode case folding.
  static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);

  // Loads a chars map saved in `filename` into `chars_map`.
  // Format (one rule per line):
  //   src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
  // Each (src|trg)_ucharX must be the hexadecimal value of a Unicode code
  // point.
  static util::Status LoadCharsMap(absl::string_view filename,
                                   CharsMap *chars_map);

  // Saves `chars_map` to `filename` as TSV.
  static util::Status SaveCharsMap(absl::string_view filename,
                                   const CharsMap &chars_map);

 private:
  // Gives the BuilderTest/RemoveRedundantMapTest unit test access to the
  // private helper below. FRIEND_TEST is not defined in this header; it is
  // presumably provided by one of the includes above.
  FRIEND_TEST(BuilderTest, RemoveRedundantMapTest);

  // Removes redundant rules from `chars_map`.
  // When `chars_map` has "aa" => "bb" and "a" => "b", the first
  // rule is not necessary since the second rule can cover the first rule.
  static util::Status RemoveRedundantMap(CharsMap *chars_map);
};
}  // namespace normalizer
}  // namespace sentencepiece
#endif  // BUILDER_H_