Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
author Marcin Junczys-Dowmunt <marcinjd@microsoft.com> 2020-11-10 18:05:43 +0300
committer Marcin Junczys-Dowmunt <marcinjd@microsoft.com> 2020-11-10 18:05:43 +0300
commit d2f8e0d811f203e29fff32cc31e087a3eec26409 (patch)
tree bf7b5a46f0deea194dc6700d49d6993048e16a03
parent 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 (diff)
byte-level test
-rw-r--r-- src/normalizer.cc 5
-rw-r--r-- src/trainer_interface.cc 3
-rw-r--r-- src/util.cc 12
-rw-r--r-- src/util.h 2
4 files changed, 19 insertions, 3 deletions
diff --git a/src/normalizer.cc b/src/normalizer.cc
index 3fe919b..52754db 100644
--- a/src/normalizer.cc
+++ b/src/normalizer.cc
@@ -150,7 +150,8 @@ util::Status Normalizer::Normalize(absl::string_view input,
norm_to_orig->push_back(consumed);
}
} else {
- *normalized += data[n];
+ *normalized += string_util::hexStr(data + n, 1);
+ norm_to_orig->push_back(consumed);
norm_to_orig->push_back(consumed);
}
}
@@ -185,6 +186,8 @@ util::Status Normalizer::Normalize(absl::string_view input,
CHECK_EQ_OR_RETURN(norm_to_orig->size(), normalized->size() + 1);
+ // std::cerr << *normalized << std::endl;
+
return util::OkStatus();
}
diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc
index 0ea71d3..a32f78d 100644
--- a/src/trainer_interface.cc
+++ b/src/trainer_interface.cc
@@ -403,8 +403,7 @@ END:
for (size_t i = n; i < sentences_.size();
i += trainer_spec_.num_threads()) {
auto *s = &sentences_[i].first;
- *s = meta_pieces_matcher.GlobalReplace(normalizer.Normalize(*s),
- kUPPBoundaryStr);
+ *s = normalizer.Normalize(meta_pieces_matcher.GlobalReplace(*s, kUPPBoundaryStr)); // cannot normalize meta pieces, but that's fine.
}
});
}
diff --git a/src/util.cc b/src/util.cc
index 58225ae..555c822 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -120,6 +120,18 @@ size_t EncodeUTF8(char32 c, char *output) {
std::string UnicodeCharToUTF8(const char32 c) { return UnicodeTextToUTF8({c}); }
+constexpr char hexmap[] = {'0', '1', '2', '3', '4', '5', '6', '7',  // lowercase hex digits, indexed by nibble value (0-15)
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
+// Returns the `len` bytes starting at `data` as 2 * len lowercase hex characters, high nibble first.
+std::string hexStr(const char *data, int len) {
+ std::string s(len * 2, ' ');  // pre-size the output: two hex characters per input byte
+ for (int i = 0; i < len; ++i) {
+ s[2 * i] = hexmap[(data[i] & 0xF0) >> 4];  // high nibble; the mask keeps the index in 0-15 even when char is signed
+ s[2 * i + 1] = hexmap[(data[i] & 0x0F)];  // low nibble
+ }
+ return s;
+}
+
UnicodeText UTF8ToUnicodeText(absl::string_view utf8) {
UnicodeText uc;
const char *begin = utf8.data();
diff --git a/src/util.h b/src/util.h
index 673e8f6..9e1ce21 100644
--- a/src/util.h
+++ b/src/util.h
@@ -54,6 +54,8 @@ uint32 GetRandomGeneratorSeed();
// String utilities
namespace string_util {
+std::string hexStr(const char *data, int len);
+
struct string_view_hash {
// DJB hash function.
inline size_t operator()(const absl::string_view &sp) const {