diff options
author | Taku Kudo <taku@google.com> | 2018-05-13 15:31:51 +0300 |
---|---|---|
committer | Taku Kudo <taku@google.com> | 2018-05-13 15:31:51 +0300 |
commit | 4a200b89749fa3fc426feb7bba915e544e4baa57 (patch) | |
tree | ce7b1da3601f75aab8fe333c07f53cde3470304d /src/util.cc | |
parent | 2e01a0890ef87312407da71c925fdfa56bdd4d1d (diff) |
Made DecodeUTF8 more strict.
Diffstat (limited to 'src/util.cc')
-rw-r--r-- | src/util.cc | 139 |
1 files changed, 67 insertions, 72 deletions
diff --git a/src/util.cc b/src/util.cc index 915f01f..7983021 100644 --- a/src/util.cc +++ b/src/util.cc @@ -99,94 +99,89 @@ void StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub, } // mblen sotres the number of bytes consumed after decoding. -// decoder_utf8 is optimized for speed. It doesn't check -// the following malformed UTF8: -// 1) Redundant UTF8 -// 2) BOM (returns value is undefined). -// 3) Trailing byte after leading byte (c & 0xc0 == 0x80) char32 DecodeUTF8(const char *begin, const char *end, size_t *mblen) { const size_t len = end - begin; - if (len >= 3 && (begin[0] & 0xf0) == 0xe0) { - *mblen = 3; - return (((begin[0] & 0x0f) << 12) | ((begin[1] & 0x3f) << 6) | - ((begin[2] & 0x3f))); - } else if (static_cast<unsigned char>(begin[0]) < 0x80) { + + if (static_cast<unsigned char>(begin[0]) < 0x80) { *mblen = 1; return static_cast<unsigned char>(begin[0]); - } else if (len >= 2 && (begin[0] & 0xe0) == 0xc0) { - *mblen = 2; - return (((begin[0] & 0x1f) << 6) | ((begin[1] & 0x3f))); - } else if (len >= 4 && (begin[0] & 0xf8) == 0xf0) { - *mblen = 4; - return (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3f) << 12) | - ((begin[2] & 0x3f) << 6) | ((begin[3] & 0x3f))); - } else if (len >= 5 && (begin[0] & 0xfc) == 0xf8) { - *mblen = 5; - return (((begin[0] & 0x03) << 24) | ((begin[1] & 0x3f) << 18) | - ((begin[2] & 0x3f) << 12) | ((begin[3] & 0x3f) << 6) | - ((begin[4] & 0x3f))); - } else if (len >= 6 && (begin[0] & 0xfe) == 0xfc) { - *mblen = 6; - return (((begin[0] & 0x01) << 30) | ((begin[1] & 0x3f) << 24) | - ((begin[2] & 0x3f) << 18) | ((begin[3] & 0x3f) << 12) | - ((begin[4] & 0x3f) << 6) | ((begin[5] & 0x3f))); + } else if (len >= 2 && (begin[0] & 0xE0) == 0xC0) { + const char32 cp = (((begin[0] & 0x1F) << 6) | ((begin[1] & 0x3F))); + if (IsTrailByte(begin[1]) && cp >= 0x0080 && IsValidCodepoint(cp)) { + *mblen = 2; + return cp; + } + } else if (len >= 3 && (begin[0] & 0xF0) == 0xE0) { + const char32 cp = (((begin[0] & 0x0F) << 12) | ((begin[1] & 0x3F) << 6) | + ((begin[2] & 0x3F))); + if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) && cp >= 0x0800 && + IsValidCodepoint(cp)) { + *mblen = 3; + return cp; + } + } else if (len >= 4 && (begin[0] & 0xf8) == 0xF0) { + const char32 cp = (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3F) << 12) | + ((begin[2] & 0x3F) << 6) | ((begin[3] & 0x3F))); + if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) && + IsTrailByte(begin[3]) && cp >= 0x10000 && IsValidCodepoint(cp)) { + *mblen = 4; + return cp; + } } + // Invalid UTF-8. *mblen = 1; - return 0; + return kUnicodeError; } -size_t EncodeUTF8(char32 c, char *output) { - if (c == 0) { - // Do nothing if |c| is NUL. Previous implementation of UCS4ToUTF8Append - // worked like this. - output[0] = '\0'; - return 0; +bool IsStructurallyValid(StringPiece str) { + const char *begin = str.data(); + const char *end = str.data() + str.size(); + size_t mblen = 0; + while (begin < end) { + const char32 c = DecodeUTF8(begin, end, &mblen); + if (c == kUnicodeError && mblen != 3) return false; + if (!IsValidCodepoint(c)) return false; + begin += mblen; } - if (c < 0x00080) { - output[0] = static_cast<char>(c & 0xFF); - output[1] = '\0'; + return true; +} + +size_t EncodeUTF8(char32 c, char *output) { + if (c <= 0x7F) { + *output = static_cast<char>(c); return 1; } - if (c < 0x00800) { - output[0] = static_cast<char>(0xC0 + ((c >> 6) & 0x1F)); - output[1] = static_cast<char>(0x80 + (c & 0x3F)); - output[2] = '\0'; + + if (c <= 0x7FF) { + output[1] = 0x80 | (c & 0x3F); + c >>= 6; + output[0] = 0xC0 | c; return 2; } - if (c < 0x10000) { - output[0] = static_cast<char>(0xE0 + ((c >> 12) & 0x0F)); - output[1] = static_cast<char>(0x80 + ((c >> 6) & 0x3F)); - output[2] = static_cast<char>(0x80 + (c & 0x3F)); - output[3] = '\0'; + + // if `c` is out-of-range, convert it to REPLACEMENT CHARACTER (U+FFFD). + // This treatment is the same as the original runetochar. + if (c > 0x10FFFF) c = kUnicodeError; + + if (c <= 0xFFFF) { + output[2] = 0x80 | (c & 0x3F); + c >>= 6; + output[1] = 0x80 | (c & 0x3F); + c >>= 6; + output[0] = 0xE0 | c; return 3; } - if (c < 0x200000) { - output[0] = static_cast<char>(0xF0 + ((c >> 18) & 0x07)); - output[1] = static_cast<char>(0x80 + ((c >> 12) & 0x3F)); - output[2] = static_cast<char>(0x80 + ((c >> 6) & 0x3F)); - output[3] = static_cast<char>(0x80 + (c & 0x3F)); - output[4] = '\0'; - return 4; - } - // below is not in UCS4 but in 32bit int. - if (c < 0x8000000) { - output[0] = static_cast<char>(0xF8 + ((c >> 24) & 0x03)); - output[1] = static_cast<char>(0x80 + ((c >> 18) & 0x3F)); - output[2] = static_cast<char>(0x80 + ((c >> 12) & 0x3F)); - output[3] = static_cast<char>(0x80 + ((c >> 6) & 0x3F)); - output[4] = static_cast<char>(0x80 + (c & 0x3F)); - output[5] = '\0'; - return 5; - } - output[0] = static_cast<char>(0xFC + ((c >> 30) & 0x01)); - output[1] = static_cast<char>(0x80 + ((c >> 24) & 0x3F)); - output[2] = static_cast<char>(0x80 + ((c >> 18) & 0x3F)); - output[3] = static_cast<char>(0x80 + ((c >> 12) & 0x3F)); - output[4] = static_cast<char>(0x80 + ((c >> 6) & 0x3F)); - output[5] = static_cast<char>(0x80 + (c & 0x3F)); - output[6] = '\0'; - return 6; + + output[3] = 0x80 | (c & 0x3F); + c >>= 6; + output[2] = 0x80 | (c & 0x3F); + c >>= 6; + output[1] = 0x80 | (c & 0x3F); + c >>= 6; + output[0] = 0xF0 | c; + + return 4; } std::string UnicodeCharToUTF8(const char32 c) { return UnicodeTextToUTF8({c}); } |