diff options
author | Taku Kudo <taku@google.com> | 2018-06-04 14:32:37 +0300 |
---|---|---|
committer | Taku Kudo <taku@google.com> | 2018-06-04 14:32:37 +0300 |
commit | 4e3bcf1373fb7c8ddca151ddf2a6f10914057cfa (patch) | |
tree | 71ea36b440216b25cfa7290be2a09d3ac82b7acb /src/builder.h | |
parent | 4f7af0dfadbf547264296d46924055842c901b60 (diff) |
Updated normalizer
Diffstat (limited to 'src/builder.h')
-rw-r--r-- | src/builder.h | 22 |
1 files changed, 11 insertions, 11 deletions
```diff
diff --git a/src/builder.h b/src/builder.h
index 7a5fdb2..98bd59c 100644
--- a/src/builder.h
+++ b/src/builder.h
@@ -45,14 +45,13 @@ class Builder {
   static util::Status CompileCharsMap(const CharsMap &chars_map,
                                       std::string *output);
 
+  // Decompiles `blob` into `chars_map`.
+  static util::Status DecompileCharsMap(StringPiece blob, CharsMap *chars_map);
+
   // Returns a pre-compiled binary index with `name`.
   static util::Status GetPrecompiledCharsMap(const std::string &name,
                                              std::string *output);
 
-  // Populates necessary fields (precompiled_charmap) from
-  // `name` or `normalization_rule_tsv` fields in `normalizer_spec`.
-  static util::Status PopulateNormalizerSpec(NormalizerSpec *normalizer_spec);
-
   // Makes a normalization mapping based on NFKC.
   //
   // Note that Normalizer/Builder classes do not support
@@ -88,16 +87,17 @@ class Builder {
   // normalizer is the goal of SentencePiece.
   //
   // TODO(taku): Make NFC, NFD, and NFKD mapping if necessary.
-  static CharsMap BuildNFKCMap();
-
-  // Returns identity mapping, which dose not perform any normalization.
-  static CharsMap BuildIdentityMap();
+  static util::Status BuildNFKCMap(CharsMap *chars_map);
 
   // Builds Chars map save in `filename`.
   // Format:
   // src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
-  // (src|trg)_ucharX must be a hex of UCS4.
-  static CharsMap BuildMapFromFile(StringPiece filename);
+  // (src|trg)_ucharX must be a hex of Unicode code point.
+  static util::Status LoadCharsMap(StringPiece filename, CharsMap *chars_map);
+
+  // Saves Chars map to `filename` as TSV.
+  static util::Status SaveCharsMap(StringPiece filename,
+                                   const CharsMap &chars_map);
 
  private:
   FRIEND_TEST(BuilderTest, RemoveRedundantMapTest);
@@ -105,7 +105,7 @@ class Builder {
   // Removes redundant rules from `chars_map`.
   // When char_maps have "aa" => "bb" and "a" => "b", the first
   // rule is not necessary since the second rule can cover the first rule.
-  static CharsMap RemoveRedundantMap(const CharsMap &chars_map);
+  static util::Status RemoveRedundantMap(CharsMap *chars_map);
 };
 }  // namespace normalizer
 }  // namespace sentencepiece
```