diff options
author | Marcin Junczys-Dowmunt <marcinjd@microsoft.com> | 2020-11-10 18:05:43 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <marcinjd@microsoft.com> | 2020-11-10 18:05:43 +0300 |
commit | d2f8e0d811f203e29fff32cc31e087a3eec26409 (patch) | |
tree | bf7b5a46f0deea194dc6700d49d6993048e16a03 | |
parent | 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 (diff) |
byte-level test
-rw-r--r-- | src/normalizer.cc | 5 | ||||
-rw-r--r-- | src/trainer_interface.cc | 3 | ||||
-rw-r--r-- | src/util.cc | 12 | ||||
-rw-r--r-- | src/util.h | 2 |
4 files changed, 19 insertions, 3 deletions
diff --git a/src/normalizer.cc b/src/normalizer.cc index 3fe919b..52754db 100644 --- a/src/normalizer.cc +++ b/src/normalizer.cc @@ -150,7 +150,8 @@ util::Status Normalizer::Normalize(absl::string_view input, norm_to_orig->push_back(consumed); } } else { - *normalized += data[n]; + *normalized += string_util::hexStr(data + n, 1); + norm_to_orig->push_back(consumed); norm_to_orig->push_back(consumed); } } @@ -185,6 +186,8 @@ util::Status Normalizer::Normalize(absl::string_view input, CHECK_EQ_OR_RETURN(norm_to_orig->size(), normalized->size() + 1); + // std::cerr << *normalized << std::endl; + return util::OkStatus(); } diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc index 0ea71d3..a32f78d 100644 --- a/src/trainer_interface.cc +++ b/src/trainer_interface.cc @@ -403,8 +403,7 @@ END: for (size_t i = n; i < sentences_.size(); i += trainer_spec_.num_threads()) { auto *s = &sentences_[i].first; - *s = meta_pieces_matcher.GlobalReplace(normalizer.Normalize(*s), - kUPPBoundaryStr); + *s = normalizer.Normalize(meta_pieces_matcher.GlobalReplace(*s, kUPPBoundaryStr)); // cannot normalize meta pieces, but that's fine. } }); } diff --git a/src/util.cc b/src/util.cc index 58225ae..555c822 100644 --- a/src/util.cc +++ b/src/util.cc @@ -120,6 +120,18 @@ size_t EncodeUTF8(char32 c, char *output) { std::string UnicodeCharToUTF8(const char32 c) { return UnicodeTextToUTF8({c}); } +constexpr char hexmap[] = {'0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; + +std::string hexStr(const char *data, int len) { + std::string s(len * 2, ' '); + for (int i = 0; i < len; ++i) { + s[2 * i] = hexmap[(data[i] & 0xF0) >> 4]; + s[2 * i + 1] = hexmap[(data[i] & 0x0F)]; + } + return s; +} + UnicodeText UTF8ToUnicodeText(absl::string_view utf8) { UnicodeText uc; const char *begin = utf8.data(); @@ -54,6 +54,8 @@ uint32 GetRandomGeneratorSeed(); // String utilities namespace string_util { +std::string hexStr(const char *data, int len); + struct string_view_hash { // DJB hash function. inline size_t operator()(const absl::string_view &sp) const { |