Made DecodeUTF8 more strict.

author: Taku Kudo <taku@google.com> 2018-05-13 15:31:51 +0300
committer: Taku Kudo <taku@google.com> 2018-05-13 15:31:51 +0300
commit: 4a200b89749fa3fc426feb7bba915e544e4baa57 (patch)
tree: ce7b1da3601f75aab8fe333c07f53cde3470304d /src
parent: 2e01a0890ef87312407da71c925fdfa56bdd4d1d (diff)
7 files changed, 190 insertions, 137 deletions
diff --git a/src/builder.cc b/src/builder.cc
index 81e2027..6f71b6d 100644
--- a/src/builder.cc
+++ b/src/builder.cc
@@ -158,6 +158,7 @@ util::Status Builder::CompileCharsMap(const CharsMap &chars_map,
   for (auto &p : normalized2pos) {
     p.second = normalized.size();  // stores the pointer (position).
     const std::string utf8_out = string_util::UnicodeTextToUTF8(p.first);
+    CHECK_OR_RETURN(string_util::IsStructurallyValid(utf8_out));
     normalized += utf8_out;
     normalized += '\0';
   }
@@ -166,6 +167,7 @@ util::Status Builder::CompileCharsMap(const CharsMap &chars_map,
   for (const auto &p : chars_map) {
     // The value of Trie stores the pointer to the normalized string.
     const std::string utf8_in = string_util::UnicodeTextToUTF8(p.first);
+    CHECK_OR_RETURN(string_util::IsStructurallyValid(utf8_in));
     kv.emplace_back(utf8_in, port::FindOrDie(normalized2pos, p.second));
   }
 
@@ -258,7 +260,7 @@ Builder::CharsMap Builder::BuildNFKCMap() {
 
   Builder::CharsMap nfkc_map;  // The final NFKC mapping.
 
-  constexpr int kMaxUnicode = 0x110000;
+  constexpr int kMaxUnicode = 0x10FFFF;
   for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
     if (!U_IS_UNICODE_CHAR(cp)) {
       continue;
diff --git a/src/common.h b/src/common.h
index d273fff..a1a3c80 100644
--- a/src/common.h
+++ b/src/common.h
@@ -63,6 +63,8 @@ static constexpr int32 kint32max = ((int32)0x7FFFFFFF);
 static constexpr int64 kint64min = ((int64)(~0x7FFFFFFFFFFFFFFF));
 static constexpr int64 kint64max = ((int64)(0x7FFFFFFFFFFFFFFF));
 
+static constexpr uint32 kUnicodeError = 0xFFFD;
+
 #ifdef OS_WIN
 #define OUTPUT_MODE std::ios::binary | std::ios::out
 #else
diff --git a/src/normalizer.cc b/src/normalizer.cc
index 32e65f2..ddb701a 100644
--- a/src/normalizer.cc
+++ b/src/normalizer.cc
@@ -196,9 +196,19 @@ std::pair<StringPiece, int> Normalizer::NormalizePrefix(
   }
 
   if (longest_length == 0) {
-    result.second = std::min<int>(
-        input.size(), std::max<int>(1, string_util::OneCharLen(input.data())));
-    result.first.set(input.data(), result.second);
+    size_t length = 0;
+    if (!string_util::IsValidDecodeUTF8(input, &length)) {
+      // Found malformed UTF-8.
+      // The output is set to U+FFFD (REPLACEMENT CHARACTER), which is
+      // a valid code point encoded as three bytes in UTF-8,
+      // but only one byte of the input is consumed here.
+      result.second = 1;
+      static const char kReplacementChar[] = "\xEF\xBF\xBD";
+      result.first.set(kReplacementChar, 3);
+    } else {
+      result.second = length;
+      result.first.set(input.data(), result.second);
+    }
   } else {
     result.second = longest_length;
     // No need to pass the size of normalized sentence,
diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc
index 01b5b7d..6c7d9af 100644
--- a/src/trainer_interface.cc
+++ b/src/trainer_interface.cc
@@ -100,6 +100,9 @@ bool TrainerInterface::IsValidSentencePiece(
       LOG(WARNING) << "space must not be included in normalized string.";
       return false;
     }
+    if (!string_util::IsValidCodepoint(*it)) {
+      return false;
+    }
     if (*it == kWSChar) {
       // Only allows whitespace to appear as a prefix of piece.
       // When split_by_whitespace is false, we allow whitespaces to
@@ -203,6 +206,7 @@ END:
   std::unordered_map<char32, int64> chars_count;
   for (const auto &w : sentences_) {
     for (const char32 c : string_util::UTF8ToUnicodeText(w.first)) {
+      if (!string_util::IsValidCodepoint(c)) continue;
       if (c == 0x0020) {
         // UTF8ToUnicodeText returns a white space if the text
         // contains an interchange-invalid character.
@@ -282,8 +286,9 @@ util::Status TrainerInterface::Serialize(ModelProto *model_proto) const {
   // Duplicated sentencepiece is not allowed.
   std::unordered_set<std::string> dup;
 
-#define CHECK_PIECE(piece)         \
-  CHECK_OR_RETURN(!piece.empty()); \
+#define CHECK_PIECE(piece)                                  \
+  CHECK_OR_RETURN(string_util::IsStructurallyValid(piece)); \
+  CHECK_OR_RETURN(!piece.empty());                          \
   CHECK_OR_RETURN(dup.insert(piece).second) << piece << " is already defined";
 
   for (const auto &w : meta_pieces_) {
diff --git a/src/util.cc b/src/util.cc
index 915f01f..7983021 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -99,94 +99,89 @@ void StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
 }
 
 // mblen sotres the number of bytes consumed after decoding.
-// decoder_utf8 is optimized for speed. It doesn't check
-// the following malformed UTF8:
-// 1) Redundant UTF8
-// 2) BOM (returns value is undefined).
-// 3) Trailing byte after leading byte (c & 0xc0 == 0x80)
 char32 DecodeUTF8(const char *begin, const char *end, size_t *mblen) {
   const size_t len = end - begin;
-  if (len >= 3 && (begin[0] & 0xf0) == 0xe0) {
-    *mblen = 3;
-    return (((begin[0] & 0x0f) << 12) | ((begin[1] & 0x3f) << 6) |
-            ((begin[2] & 0x3f)));
-  } else if (static_cast<unsigned char>(begin[0]) < 0x80) {
+
+  if (static_cast<unsigned char>(begin[0]) < 0x80) {
     *mblen = 1;
     return static_cast<unsigned char>(begin[0]);
-  } else if (len >= 2 && (begin[0] & 0xe0) == 0xc0) {
-    *mblen = 2;
-    return (((begin[0] & 0x1f) << 6) | ((begin[1] & 0x3f)));
-  } else if (len >= 4 && (begin[0] & 0xf8) == 0xf0) {
-    *mblen = 4;
-    return (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3f) << 12) |
-            ((begin[2] & 0x3f) << 6) | ((begin[3] & 0x3f)));
-  } else if (len >= 5 && (begin[0] & 0xfc) == 0xf8) {
-    *mblen = 5;
-    return (((begin[0] & 0x03) << 24) | ((begin[1] & 0x3f) << 18) |
-            ((begin[2] & 0x3f) << 12) | ((begin[3] & 0x3f) << 6) |
-            ((begin[4] & 0x3f)));
-  } else if (len >= 6 && (begin[0] & 0xfe) == 0xfc) {
-    *mblen = 6;
-    return (((begin[0] & 0x01) << 30) | ((begin[1] & 0x3f) << 24) |
-            ((begin[2] & 0x3f) << 18) | ((begin[3] & 0x3f) << 12) |
-            ((begin[4] & 0x3f) << 6) | ((begin[5] & 0x3f)));
+  } else if (len >= 2 && (begin[0] & 0xE0) == 0xC0) {
+    const char32 cp = (((begin[0] & 0x1F) << 6) | ((begin[1] & 0x3F)));
+    if (IsTrailByte(begin[1]) && cp >= 0x0080 && IsValidCodepoint(cp)) {
+      *mblen = 2;
+      return cp;
+    }
+  } else if (len >= 3 && (begin[0] & 0xF0) == 0xE0) {
+    const char32 cp = (((begin[0] & 0x0F) << 12) | ((begin[1] & 0x3F) << 6) |
+                       ((begin[2] & 0x3F)));
+    if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) && cp >= 0x0800 &&
+        IsValidCodepoint(cp)) {
+      *mblen = 3;
+      return cp;
+    }
+  } else if (len >= 4 && (begin[0] & 0xf8) == 0xF0) {
+    const char32 cp = (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3F) << 12) |
+                       ((begin[2] & 0x3F) << 6) | ((begin[3] & 0x3F)));
+    if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) &&
+        IsTrailByte(begin[3]) && cp >= 0x10000 && IsValidCodepoint(cp)) {
+      *mblen = 4;
+      return cp;
+    }
   }
 
+  // Invalid UTF-8.
   *mblen = 1;
-  return 0;
+  return kUnicodeError;
 }
 
-size_t EncodeUTF8(char32 c, char *output) {
-  if (c == 0) {
-    // Do nothing if |c| is NUL. Previous implementation of UCS4ToUTF8Append
-    // worked like this.
-    output[0] = '\0';
-    return 0;
+bool IsStructurallyValid(StringPiece str) {
+  const char *begin = str.data();
+  const char *end = str.data() + str.size();
+  size_t mblen = 0;
+  while (begin < end) {
+    const char32 c = DecodeUTF8(begin, end, &mblen);
+    if (c == kUnicodeError && mblen != 3) return false;
+    if (!IsValidCodepoint(c)) return false;
+    begin += mblen;
   }
-  if (c < 0x00080) {
-    output[0] = static_cast<char>(c & 0xFF);
-    output[1] = '\0';
+  return true;
+}
+
+size_t EncodeUTF8(char32 c, char *output) {
+  if (c <= 0x7F) {
+    *output = static_cast<char>(c);
     return 1;
   }
-  if (c < 0x00800) {
-    output[0] = static_cast<char>(0xC0 + ((c >> 6) & 0x1F));
-    output[1] = static_cast<char>(0x80 + (c & 0x3F));
-    output[2] = '\0';
+
+  if (c <= 0x7FF) {
+    output[1] = 0x80 | (c & 0x3F);
+    c >>= 6;
+    output[0] = 0xC0 | c;
     return 2;
   }
-  if (c < 0x10000) {
-    output[0] = static_cast<char>(0xE0 + ((c >> 12) & 0x0F));
-    output[1] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
-    output[2] = static_cast<char>(0x80 + (c & 0x3F));
-    output[3] = '\0';
+
+  // if `c` is out-of-range, convert it to REPLACEMENT CHARACTER (U+FFFD).
+  // This treatment is the same as the original runetochar.
+  if (c > 0x10FFFF) c = kUnicodeError;
+
+  if (c <= 0xFFFF) {
+    output[2] = 0x80 | (c & 0x3F);
+    c >>= 6;
+    output[1] = 0x80 | (c & 0x3F);
+    c >>= 6;
+    output[0] = 0xE0 | c;
     return 3;
   }
-  if (c < 0x200000) {
-    output[0] = static_cast<char>(0xF0 + ((c >> 18) & 0x07));
-    output[1] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
-    output[2] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
-    output[3] = static_cast<char>(0x80 + (c & 0x3F));
-    output[4] = '\0';
-    return 4;
-  }
-  // below is not in UCS4 but in 32bit int.
-  if (c < 0x8000000) {
-    output[0] = static_cast<char>(0xF8 + ((c >> 24) & 0x03));
-    output[1] = static_cast<char>(0x80 + ((c >> 18) & 0x3F));
-    output[2] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
-    output[3] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
-    output[4] = static_cast<char>(0x80 + (c & 0x3F));
-    output[5] = '\0';
-    return 5;
-  }
-  output[0] = static_cast<char>(0xFC + ((c >> 30) & 0x01));
-  output[1] = static_cast<char>(0x80 + ((c >> 24) & 0x3F));
-  output[2] = static_cast<char>(0x80 + ((c >> 18) & 0x3F));
-  output[3] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
-  output[4] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
-  output[5] = static_cast<char>(0x80 + (c & 0x3F));
-  output[6] = '\0';
-  return 6;
+
+  output[3] = 0x80 | (c & 0x3F);
+  c >>= 6;
+  output[2] = 0x80 | (c & 0x3F);
+  c >>= 6;
+  output[1] = 0x80 | (c & 0x3F);
+  c >>= 6;
+  output[0] = 0xF0 | c;
+
+  return 4;
 }
 
 std::string UnicodeCharToUTF8(const char32 c) { return UnicodeTextToUTF8({c}); }
diff --git a/src/util.h b/src/util.h
index f3b8158..828f7dd 100644
--- a/src/util.h
+++ b/src/util.h
@@ -168,25 +168,32 @@ inline size_t Itoa(T val, char *s) {
 
 // Return length of a single UTF-8 source character
 inline size_t OneCharLen(const char *src) {
-  // Table of UTF-8 character lengths, based on first byte
-  constexpr unsigned char kUTF8LenTable[256] = {
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-      2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-      4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1};
-  return kUTF8LenTable[*reinterpret_cast<const unsigned char *>(src)];
+  return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
 }
 
+// Returns true iff x is a UTF-8 trail byte, i.e. (x & 0xC0) == 0x80.
+// Trail bytes lie in [0x80, 0xBF], which is [-128, -65] as signed char.
+inline bool IsTrailByte(char x) { return static_cast<signed char>(x) < -0x40; }
+
+inline bool IsValidCodepoint(char32 c) {
+  return (static_cast<uint32>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
+}
+
+bool IsStructurallyValid(StringPiece str);
+
 using UnicodeText = std::vector<char32>;
 
 char32 DecodeUTF8(const char *begin, const char *end, size_t *mblen);
+
+inline char32 DecodeUTF8(StringPiece input, size_t *mblen) {
+  return DecodeUTF8(input.data(), input.data() + input.size(), mblen);
+}
+
+inline bool IsValidDecodeUTF8(StringPiece input, size_t *mblen) {
+  const char32 c = DecodeUTF8(input, mblen);
+  return c != kUnicodeError || *mblen == 3;
+}
+
 size_t EncodeUTF8(char32 c, char *output);
 
 std::string UnicodeCharToUTF8(const char32 c);
diff --git a/src/util_test.cc b/src/util_test.cc
index e9d3829..6684f29 100644
--- a/src/util_test.cc
+++ b/src/util_test.cc
@@ -1,3 +1,4 @@
+
 // Copyright 2016 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,6 +18,9 @@
 #include "testharness.h"
 
 namespace sentencepiece {
+namespace {
+constexpr int kMaxUnicode = 0x10FFFF;
+}
 
 TEST(UtilTest, LexicalCastTest) {
   bool b = false;
@@ -260,73 +264,57 @@ TEST(UtilTest, DecodeUTF8Test) {
 
   {
     const std::string input = "";
-    EXPECT_EQ(0, string_util::DecodeUTF8(input.data(),
-                                         input.data() + input.size(), &mblen));
+    EXPECT_EQ(0, string_util::DecodeUTF8(input, &mblen));
     EXPECT_EQ(1, mblen);  // mblen always returns >= 1
   }
 
   {
-    const std::string input = "\x01";
-    EXPECT_EQ(1, string_util::DecodeUTF8(input.data(),
-                                         input.data() + input.size(), &mblen));
+    EXPECT_EQ(1, string_util::DecodeUTF8("\x01", &mblen));
     EXPECT_EQ(1, mblen);
   }
 
   {
-    const std::string input = "\x7F";
-    EXPECT_EQ(0x7F, string_util::DecodeUTF8(
-                        input.data(), input.data() + input.size(), &mblen));
+    EXPECT_EQ(0x7F, string_util::DecodeUTF8("\x7F", &mblen));
     EXPECT_EQ(1, mblen);
   }
 
   {
-    const std::string input = "\xC2\x80 ";
-    EXPECT_EQ(0x80, string_util::DecodeUTF8(
-                        input.data(), input.data() + input.size(), &mblen));
+    EXPECT_EQ(0x80, string_util::DecodeUTF8("\xC2\x80 ", &mblen));
     EXPECT_EQ(2, mblen);
   }
 
   {
-    const std::string input = "\xDF\xBF ";
-    EXPECT_EQ(0x7FF, string_util::DecodeUTF8(
-                         input.data(), input.data() + input.size(), &mblen));
+    EXPECT_EQ(0x7FF, string_util::DecodeUTF8("\xDF\xBF ", &mblen));
     EXPECT_EQ(2, mblen);
   }
 
   {
-    const std::string input = "\xE0\xA0\x80 ";
-    EXPECT_EQ(0x800, string_util::DecodeUTF8(
-                         input.data(), input.data() + input.size(), &mblen));
+    EXPECT_EQ(0x800, string_util::DecodeUTF8("\xE0\xA0\x80 ", &mblen));
     EXPECT_EQ(3, mblen);
   }
 
   {
-    const std::string input = "\xF0\x90\x80\x80 ";
-    EXPECT_EQ(0x10000, string_util::DecodeUTF8(
-                           input.data(), input.data() + input.size(), &mblen));
+    EXPECT_EQ(0x10000, string_util::DecodeUTF8("\xF0\x90\x80\x80 ", &mblen));
     EXPECT_EQ(4, mblen);
   }
 
+  // Invalid UTF8
   {
-    const std::string input = "\xF7\xBF\xBF\xBF ";
-    EXPECT_EQ(0x1FFFFF, string_util::DecodeUTF8(
-                            input.data(), input.data() + input.size(), &mblen));
-    EXPECT_EQ(4, mblen);
+    EXPECT_EQ(kUnicodeError,
+              string_util::DecodeUTF8("\xF7\xBF\xBF\xBF ", &mblen));
+    EXPECT_EQ(1, mblen);
   }
 
   {
-    const std::string input = "\xF8\x88\x80\x80\x80 ";
-    EXPECT_EQ(0x200000, string_util::DecodeUTF8(
-                            input.data(), input.data() + input.size(), &mblen));
-    EXPECT_EQ(5, mblen);
+    EXPECT_EQ(kUnicodeError,
+              string_util::DecodeUTF8("\xF8\x88\x80\x80\x80 ", &mblen));
+    EXPECT_EQ(1, mblen);
   }
 
   {
-    const std::string input = "\xFC\x84\x80\x80\x80\x80 ";
-    EXPECT_EQ(0x4000000,
-              string_util::DecodeUTF8(input.data(), input.data() + input.size(),
-                                      &mblen));
-    EXPECT_EQ(6, mblen);
+    EXPECT_EQ(kUnicodeError,
+              string_util::DecodeUTF8("\xFC\x84\x80\x80\x80\x80 ", &mblen));
+    EXPECT_EQ(1, mblen);
   }
 
   {
@@ -340,41 +328,60 @@ TEST(UtilTest, DecodeUTF8Test) {
     for (size_t i = 0; i < 4; ++i) {
       // return values of string_util::DecodeUTF8 is not defined.
       // TODO(taku) implement an workaround.
-      string_util::DecodeUTF8(
-          kInvalidData[i], kInvalidData[i] + strlen(kInvalidData[i]), &mblen);
+      EXPECT_EQ(kUnicodeError,
+                string_util::DecodeUTF8(
+                    kInvalidData[i], kInvalidData[i] + strlen(kInvalidData[i]),
+                    &mblen));
+      EXPECT_FALSE(string_util::IsStructurallyValid(kInvalidData[i]));
       EXPECT_EQ(1, mblen);
     }
   }
+
+  {
+    EXPECT_EQ(kUnicodeError, string_util::DecodeUTF8("\xDF\xDF ", &mblen));
+    EXPECT_EQ(1, mblen);
+  }
+
+  {
+    EXPECT_EQ(kUnicodeError, string_util::DecodeUTF8("\xE0\xE0\xE0 ", &mblen));
+    EXPECT_EQ(1, mblen);
+  }
+
+  {
+    EXPECT_EQ(kUnicodeError,
+              string_util::DecodeUTF8("\xF0\xF0\xF0\xFF ", &mblen));
+    EXPECT_EQ(1, mblen);
+  }
 }
 
 TEST(UtilTest, EncodeUTF8Test) {
-  constexpr int kMaxUnicode = 0x110000;
   char buf[16];
   for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
+    if (!string_util::IsValidCodepoint(cp)) continue;
     const size_t mblen = string_util::EncodeUTF8(cp, buf);
     size_t mblen2;
-    char32 c = string_util::DecodeUTF8(buf, buf + 16, &mblen2);
+    const char32 c = string_util::DecodeUTF8(buf, buf + 16, &mblen2);
     EXPECT_EQ(mblen2, mblen);
     EXPECT_EQ(cp, c);
   }
 
-  EXPECT_EQ(0, string_util::EncodeUTF8(0, buf));
+  EXPECT_EQ(1, string_util::EncodeUTF8(0, buf));
   EXPECT_EQ('\0', buf[0]);
 
   // non UCS4
   size_t mblen;
-  EXPECT_EQ(5, string_util::EncodeUTF8(0x7000000, buf));
-  string_util::DecodeUTF8(buf, buf + 16, &mblen);
-  EXPECT_EQ(5, mblen);
+  EXPECT_EQ(3, string_util::EncodeUTF8(0x7000000, buf));
+  EXPECT_EQ(kUnicodeError, string_util::DecodeUTF8(buf, buf + 16, &mblen));
+  EXPECT_EQ(3, mblen);
 
-  EXPECT_EQ(6, string_util::EncodeUTF8(0x8000001, buf));
-  string_util::DecodeUTF8(buf, buf + 16, &mblen);
-  EXPECT_EQ(6, mblen);
+  EXPECT_EQ(3, string_util::EncodeUTF8(0x8000001, buf));
+  EXPECT_EQ(kUnicodeError, string_util::DecodeUTF8(buf, buf + 16, &mblen));
+  EXPECT_EQ(3, mblen);
 }
 
 TEST(UtilTest, UnicodeCharToUTF8Test) {
-  constexpr int kMaxUnicode = 0x110000;
   for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
+    if (!string_util::IsValidCodepoint(cp)) continue;
     const auto s = string_util::UnicodeCharToUTF8(cp);
     const auto ut = string_util::UTF8ToUnicodeText(s);
     EXPECT_EQ(1, ut.size());
@@ -382,6 +389,31 @@ TEST(UtilTest, UnicodeCharToUTF8Test) {
   }
 }
 
+TEST(UtilTest, IsStructurallyValidTest) {
+  EXPECT_TRUE(string_util::IsStructurallyValid("abcd"));
+  EXPECT_TRUE(
+      string_util::IsStructurallyValid(StringPiece("a\0cd", 4)));     // NUL
+  EXPECT_TRUE(string_util::IsStructurallyValid("ab\xc3\x81"));        // 2-byte
+  EXPECT_TRUE(string_util::IsStructurallyValid("a\xe3\x81\x81"));     // 3-byte
+  EXPECT_TRUE(string_util::IsStructurallyValid("\xf2\x82\x81\x84"));  // 4-byte
+  EXPECT_FALSE(string_util::IsStructurallyValid("abc\x80"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("abc\xc3"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("ab\xe3\x81"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("a\xf3\x81\x81"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("ab\xc0\x82"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("a\xe0\x82\x81"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xf0\x82\x83\x84"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xf4\xbd\xbe\xbf"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xED\xA0\x80"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xED\xBF\xBF"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xc0\x81"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xc1\xbf"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xe0\x81\x82"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xe0\x9f\xbf"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xf0\x80\x81\x82"));
+  EXPECT_FALSE(string_util::IsStructurallyValid("\xf0\x83\xbe\xbd"));
+}
+
 TEST(UtilTest, UnicodeTextToUTF8Test) {
   string_util::UnicodeText ut;
author	Taku Kudo <taku@google.com>	2018-05-13 15:31:51 +0300
committer	Taku Kudo <taku@google.com>	2018-05-13 15:31:51 +0300
commit	4a200b89749fa3fc426feb7bba915e544e4baa57 (patch)
tree	ce7b1da3601f75aab8fe333c07f53cde3470304d /src
parent	2e01a0890ef87312407da71c925fdfa56bdd4d1d (diff)