Made DecodeUTF8 more strict.

author: Taku Kudo <taku@google.com> 2018-05-13 15:31:51 +0300
committer: Taku Kudo <taku@google.com> 2018-05-13 15:31:51 +0300
commit: 4a200b89749fa3fc426feb7bba915e544e4baa57 (patch)
tree: ce7b1da3601f75aab8fe333c07f53cde3470304d /src/util.cc
parent: 2e01a0890ef87312407da71c925fdfa56bdd4d1d (diff)
1 files changed, 67 insertions, 72 deletions
diff --git a/src/util.cc b/src/util.cc
index 915f01f..7983021 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -99,94 +99,89 @@ void StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
 }
 
 // mblen stores the number of bytes consumed after decoding.
-// decoder_utf8 is optimized for speed. It doesn't check
-// the following malformed UTF8:
-// 1) Redundant UTF8
-// 2) BOM (returns value is undefined).
-// 3) Trailing byte after leading byte (c & 0xc0 == 0x80)
 char32 DecodeUTF8(const char *begin, const char *end, size_t *mblen) {
   const size_t len = end - begin;
-  if (len >= 3 && (begin[0] & 0xf0) == 0xe0) {
-    *mblen = 3;
-    return (((begin[0] & 0x0f) << 12) | ((begin[1] & 0x3f) << 6) |
-            ((begin[2] & 0x3f)));
-  } else if (static_cast<unsigned char>(begin[0]) < 0x80) {
+
+  if (static_cast<unsigned char>(begin[0]) < 0x80) {
     *mblen = 1;
     return static_cast<unsigned char>(begin[0]);
-  } else if (len >= 2 && (begin[0] & 0xe0) == 0xc0) {
-    *mblen = 2;
-    return (((begin[0] & 0x1f) << 6) | ((begin[1] & 0x3f)));
-  } else if (len >= 4 && (begin[0] & 0xf8) == 0xf0) {
-    *mblen = 4;
-    return (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3f) << 12) |
-            ((begin[2] & 0x3f) << 6) | ((begin[3] & 0x3f)));
-  } else if (len >= 5 && (begin[0] & 0xfc) == 0xf8) {
-    *mblen = 5;
-    return (((begin[0] & 0x03) << 24) | ((begin[1] & 0x3f) << 18) |
-            ((begin[2] & 0x3f) << 12) | ((begin[3] & 0x3f) << 6) |
-            ((begin[4] & 0x3f)));
-  } else if (len >= 6 && (begin[0] & 0xfe) == 0xfc) {
-    *mblen = 6;
-    return (((begin[0] & 0x01) << 30) | ((begin[1] & 0x3f) << 24) |
-            ((begin[2] & 0x3f) << 18) | ((begin[3] & 0x3f) << 12) |
-            ((begin[4] & 0x3f) << 6) | ((begin[5] & 0x3f)));
+  } else if (len >= 2 && (begin[0] & 0xE0) == 0xC0) {
+    const char32 cp = (((begin[0] & 0x1F) << 6) | ((begin[1] & 0x3F)));
+    if (IsTrailByte(begin[1]) && cp >= 0x0080 && IsValidCodepoint(cp)) {
+      *mblen = 2;
+      return cp;
+    }
+  } else if (len >= 3 && (begin[0] & 0xF0) == 0xE0) {
+    const char32 cp = (((begin[0] & 0x0F) << 12) | ((begin[1] & 0x3F) << 6) |
+                       ((begin[2] & 0x3F)));
+    if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) && cp >= 0x0800 &&
+        IsValidCodepoint(cp)) {
+      *mblen = 3;
+      return cp;
+    }
+  } else if (len >= 4 && (begin[0] & 0xf8) == 0xF0) {
+    const char32 cp = (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3F) << 12) |
+                       ((begin[2] & 0x3F) << 6) | ((begin[3] & 0x3F)));
+    if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) &&
+        IsTrailByte(begin[3]) && cp >= 0x10000 && IsValidCodepoint(cp)) {
+      *mblen = 4;
+      return cp;
+    }
   }
 
+  // Invalid UTF-8.
   *mblen = 1;
-  return 0;
+  return kUnicodeError;
 }
 
-size_t EncodeUTF8(char32 c, char *output) {
-  if (c == 0) {
-    // Do nothing if |c| is NUL. Previous implementation of UCS4ToUTF8Append
-    // worked like this.
-    output[0] = '\0';
-    return 0;
+bool IsStructurallyValid(StringPiece str) {
+  const char *begin = str.data();
+  const char *end = str.data() + str.size();
+  size_t mblen = 0;
+  while (begin < end) {
+    const char32 c = DecodeUTF8(begin, end, &mblen);
+    if (c == kUnicodeError && mblen != 3) return false;
+    if (!IsValidCodepoint(c)) return false;
+    begin += mblen;
   }
-  if (c < 0x00080) {
-    output[0] = static_cast<char>(c & 0xFF);
-    output[1] = '\0';
+  return true;
+}
+
+size_t EncodeUTF8(char32 c, char *output) {
+  if (c <= 0x7F) {
+    *output = static_cast<char>(c);
     return 1;
   }
-  if (c < 0x00800) {
-    output[0] = static_cast<char>(0xC0 + ((c >> 6) & 0x1F));
-    output[1] = static_cast<char>(0x80 + (c & 0x3F));
-    output[2] = '\0';
+
+  if (c <= 0x7FF) {
+    output[1] = 0x80 | (c & 0x3F);
+    c >>= 6;
+    output[0] = 0xC0 | c;
     return 2;
   }
-  if (c < 0x10000) {
-    output[0] = static_cast<char>(0xE0 + ((c >> 12) & 0x0F));
-    output[1] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
-    output[2] = static_cast<char>(0x80 + (c & 0x3F));
-    output[3] = '\0';
+
+  // If `c` is out-of-range, convert it to REPLACEMENT CHARACTER (U+FFFD).
+  // This treatment is the same as the original runetochar.
+  if (c > 0x10FFFF) c = kUnicodeError;
+
+  if (c <= 0xFFFF) {
+    output[2] = 0x80 | (c & 0x3F);
+    c >>= 6;
+    output[1] = 0x80 | (c & 0x3F);
+    c >>= 6;
+    output[0] = 0xE0 | c;
     return 3;
   }
-  if (c < 0x200000) {
-    output[0] = static_cast<char>(0xF0 + ((c >> 18) & 0x07));
-    output[1] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
-    output[2] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
-    output[3] = static_cast<char>(0x80 + (c & 0x3F));
-    output[4] = '\0';
-    return 4;
-  }
-  // below is not in UCS4 but in 32bit int.
-  if (c < 0x8000000) {
-    output[0] = static_cast<char>(0xF8 + ((c >> 24) & 0x03));
-    output[1] = static_cast<char>(0x80 + ((c >> 18) & 0x3F));
-    output[2] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
-    output[3] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
-    output[4] = static_cast<char>(0x80 + (c & 0x3F));
-    output[5] = '\0';
-    return 5;
-  }
-  output[0] = static_cast<char>(0xFC + ((c >> 30) & 0x01));
-  output[1] = static_cast<char>(0x80 + ((c >> 24) & 0x3F));
-  output[2] = static_cast<char>(0x80 + ((c >> 18) & 0x3F));
-  output[3] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
-  output[4] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
-  output[5] = static_cast<char>(0x80 + (c & 0x3F));
-  output[6] = '\0';
-  return 6;
+
+  output[3] = 0x80 | (c & 0x3F);
+  c >>= 6;
+  output[2] = 0x80 | (c & 0x3F);
+  c >>= 6;
+  output[1] = 0x80 | (c & 0x3F);
+  c >>= 6;
+  output[0] = 0xF0 | c;
+
+  return 4;
 }
 
 std::string UnicodeCharToUTF8(const char32 c) { return UnicodeTextToUTF8({c}); }
author	Taku Kudo <taku@google.com>	2018-05-13 15:31:51 +0300
committer	Taku Kudo <taku@google.com>	2018-05-13 15:31:51 +0300
commit	4a200b89749fa3fc426feb7bba915e544e4baa57 (patch)
tree	ce7b1da3601f75aab8fe333c07f53cde3470304d /src/util.cc
parent	2e01a0890ef87312407da71c925fdfa56bdd4d1d (diff)