Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Taku Kudo <taku@google.com>, 2018-05-13 15:31:51 +0300
committer: Taku Kudo <taku@google.com>, 2018-05-13 15:31:51 +0300
commit: 4a200b89749fa3fc426feb7bba915e544e4baa57 (patch)
tree: ce7b1da3601f75aab8fe333c07f53cde3470304d /src/util.cc
parent: 2e01a0890ef87312407da71c925fdfa56bdd4d1d (diff)
Made DecodeUTF8 more strict.
Diffstat (limited to 'src/util.cc')
-rw-r--r-- src/util.cc 139
1 files changed, 67 insertions, 72 deletions
diff --git a/src/util.cc b/src/util.cc
index 915f01f..7983021 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -99,94 +99,89 @@ void StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
}
// mblen stores the number of bytes consumed after decoding.
-// decoder_utf8 is optimized for speed. It doesn't check
-// the following malformed UTF8:
-// 1) Redundant UTF8
-// 2) BOM (returns value is undefined).
-// 3) Trailing byte after leading byte (c & 0xc0 == 0x80)
char32 DecodeUTF8(const char *begin, const char *end, size_t *mblen) {
const size_t len = end - begin;
- if (len >= 3 && (begin[0] & 0xf0) == 0xe0) {
- *mblen = 3;
- return (((begin[0] & 0x0f) << 12) | ((begin[1] & 0x3f) << 6) |
- ((begin[2] & 0x3f)));
- } else if (static_cast<unsigned char>(begin[0]) < 0x80) {
+
+ if (static_cast<unsigned char>(begin[0]) < 0x80) {
*mblen = 1;
return static_cast<unsigned char>(begin[0]);
- } else if (len >= 2 && (begin[0] & 0xe0) == 0xc0) {
- *mblen = 2;
- return (((begin[0] & 0x1f) << 6) | ((begin[1] & 0x3f)));
- } else if (len >= 4 && (begin[0] & 0xf8) == 0xf0) {
- *mblen = 4;
- return (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3f) << 12) |
- ((begin[2] & 0x3f) << 6) | ((begin[3] & 0x3f)));
- } else if (len >= 5 && (begin[0] & 0xfc) == 0xf8) {
- *mblen = 5;
- return (((begin[0] & 0x03) << 24) | ((begin[1] & 0x3f) << 18) |
- ((begin[2] & 0x3f) << 12) | ((begin[3] & 0x3f) << 6) |
- ((begin[4] & 0x3f)));
- } else if (len >= 6 && (begin[0] & 0xfe) == 0xfc) {
- *mblen = 6;
- return (((begin[0] & 0x01) << 30) | ((begin[1] & 0x3f) << 24) |
- ((begin[2] & 0x3f) << 18) | ((begin[3] & 0x3f) << 12) |
- ((begin[4] & 0x3f) << 6) | ((begin[5] & 0x3f)));
+ } else if (len >= 2 && (begin[0] & 0xE0) == 0xC0) {
+ const char32 cp = (((begin[0] & 0x1F) << 6) | ((begin[1] & 0x3F)));
+ if (IsTrailByte(begin[1]) && cp >= 0x0080 && IsValidCodepoint(cp)) {
+ *mblen = 2;
+ return cp;
+ }
+ } else if (len >= 3 && (begin[0] & 0xF0) == 0xE0) {
+ const char32 cp = (((begin[0] & 0x0F) << 12) | ((begin[1] & 0x3F) << 6) |
+ ((begin[2] & 0x3F)));
+ if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) && cp >= 0x0800 &&
+ IsValidCodepoint(cp)) {
+ *mblen = 3;
+ return cp;
+ }
+ } else if (len >= 4 && (begin[0] & 0xf8) == 0xF0) {
+ const char32 cp = (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3F) << 12) |
+ ((begin[2] & 0x3F) << 6) | ((begin[3] & 0x3F)));
+ if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) &&
+ IsTrailByte(begin[3]) && cp >= 0x10000 && IsValidCodepoint(cp)) {
+ *mblen = 4;
+ return cp;
+ }
}
+ // Invalid UTF-8.
*mblen = 1;
- return 0;
+ return kUnicodeError;
}
-size_t EncodeUTF8(char32 c, char *output) {
- if (c == 0) {
- // Do nothing if |c| is NUL. Previous implementation of UCS4ToUTF8Append
- // worked like this.
- output[0] = '\0';
- return 0;
+bool IsStructurallyValid(StringPiece str) {
+ const char *begin = str.data();
+ const char *end = str.data() + str.size();
+ size_t mblen = 0;
+ while (begin < end) {
+ const char32 c = DecodeUTF8(begin, end, &mblen);
+ if (c == kUnicodeError && mblen != 3) return false;
+ if (!IsValidCodepoint(c)) return false;
+ begin += mblen;
}
- if (c < 0x00080) {
- output[0] = static_cast<char>(c & 0xFF);
- output[1] = '\0';
+ return true;
+}
+
+size_t EncodeUTF8(char32 c, char *output) {
+ if (c <= 0x7F) {
+ *output = static_cast<char>(c);
return 1;
}
- if (c < 0x00800) {
- output[0] = static_cast<char>(0xC0 + ((c >> 6) & 0x1F));
- output[1] = static_cast<char>(0x80 + (c & 0x3F));
- output[2] = '\0';
+
+ if (c <= 0x7FF) {
+ output[1] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[0] = 0xC0 | c;
return 2;
}
- if (c < 0x10000) {
- output[0] = static_cast<char>(0xE0 + ((c >> 12) & 0x0F));
- output[1] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
- output[2] = static_cast<char>(0x80 + (c & 0x3F));
- output[3] = '\0';
+
+ // if `c` is out-of-range, convert it to REPLACEMENT CHARACTER (U+FFFD).
+ // This treatment is the same as the original runetochar.
+ if (c > 0x10FFFF) c = kUnicodeError;
+
+ if (c <= 0xFFFF) {
+ output[2] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[1] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[0] = 0xE0 | c;
return 3;
}
- if (c < 0x200000) {
- output[0] = static_cast<char>(0xF0 + ((c >> 18) & 0x07));
- output[1] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
- output[2] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
- output[3] = static_cast<char>(0x80 + (c & 0x3F));
- output[4] = '\0';
- return 4;
- }
- // below is not in UCS4 but in 32bit int.
- if (c < 0x8000000) {
- output[0] = static_cast<char>(0xF8 + ((c >> 24) & 0x03));
- output[1] = static_cast<char>(0x80 + ((c >> 18) & 0x3F));
- output[2] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
- output[3] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
- output[4] = static_cast<char>(0x80 + (c & 0x3F));
- output[5] = '\0';
- return 5;
- }
- output[0] = static_cast<char>(0xFC + ((c >> 30) & 0x01));
- output[1] = static_cast<char>(0x80 + ((c >> 24) & 0x3F));
- output[2] = static_cast<char>(0x80 + ((c >> 18) & 0x3F));
- output[3] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
- output[4] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
- output[5] = static_cast<char>(0x80 + (c & 0x3F));
- output[6] = '\0';
- return 6;
+
+ output[3] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[2] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[1] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[0] = 0xF0 | c;
+
+ return 4;
}
std::string UnicodeCharToUTF8(const char32 c) { return UnicodeTextToUTF8({c}); }