Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorTaku Kudo <taku@google.com>2018-05-13 15:31:51 +0300
committerTaku Kudo <taku@google.com>2018-05-13 15:31:51 +0300
commit4a200b89749fa3fc426feb7bba915e544e4baa57 (patch)
treece7b1da3601f75aab8fe333c07f53cde3470304d /src
parent2e01a0890ef87312407da71c925fdfa56bdd4d1d (diff)
Made DecodeUTF8 more strict.
Diffstat (limited to 'src')
-rw-r--r--src/builder.cc4
-rw-r--r--src/common.h2
-rw-r--r--src/normalizer.cc16
-rw-r--r--src/trainer_interface.cc9
-rw-r--r--src/util.cc139
-rw-r--r--src/util.h35
-rw-r--r--src/util_test.cc122
7 files changed, 190 insertions, 137 deletions
diff --git a/src/builder.cc b/src/builder.cc
index 81e2027..6f71b6d 100644
--- a/src/builder.cc
+++ b/src/builder.cc
@@ -158,6 +158,7 @@ util::Status Builder::CompileCharsMap(const CharsMap &chars_map,
for (auto &p : normalized2pos) {
p.second = normalized.size(); // stores the pointer (position).
const std::string utf8_out = string_util::UnicodeTextToUTF8(p.first);
+ CHECK_OR_RETURN(string_util::IsStructurallyValid(utf8_out));
normalized += utf8_out;
normalized += '\0';
}
@@ -166,6 +167,7 @@ util::Status Builder::CompileCharsMap(const CharsMap &chars_map,
for (const auto &p : chars_map) {
// The value of Trie stores the pointer to the normalized string.
const std::string utf8_in = string_util::UnicodeTextToUTF8(p.first);
+ CHECK_OR_RETURN(string_util::IsStructurallyValid(utf8_in));
kv.emplace_back(utf8_in, port::FindOrDie(normalized2pos, p.second));
}
@@ -258,7 +260,7 @@ Builder::CharsMap Builder::BuildNFKCMap() {
Builder::CharsMap nfkc_map; // The final NFKC mapping.
- constexpr int kMaxUnicode = 0x110000;
+ constexpr int kMaxUnicode = 0x10FFFF;
for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
if (!U_IS_UNICODE_CHAR(cp)) {
continue;
diff --git a/src/common.h b/src/common.h
index d273fff..a1a3c80 100644
--- a/src/common.h
+++ b/src/common.h
@@ -63,6 +63,8 @@ static constexpr int32 kint32max = ((int32)0x7FFFFFFF);
static constexpr int64 kint64min = ((int64)(~0x7FFFFFFFFFFFFFFF));
static constexpr int64 kint64max = ((int64)(0x7FFFFFFFFFFFFFFF));
+static constexpr uint32 kUnicodeError = 0xFFFD;
+
#ifdef OS_WIN
#define OUTPUT_MODE std::ios::binary | std::ios::out
#else
diff --git a/src/normalizer.cc b/src/normalizer.cc
index 32e65f2..ddb701a 100644
--- a/src/normalizer.cc
+++ b/src/normalizer.cc
@@ -196,9 +196,19 @@ std::pair<StringPiece, int> Normalizer::NormalizePrefix(
}
if (longest_length == 0) {
- result.second = std::min<int>(
- input.size(), std::max<int>(1, string_util::OneCharLen(input.data())));
- result.first.set(input.data(), result.second);
+ size_t length = 0;
+ if (!string_util::IsValidDecodeUTF8(input, &length)) {
+ // Found malformed UTF-8.
+ // The rune is set to 0xFFFD (REPLACEMENT CHARACTER),
+ // which is a valid Unicode code point encoded as three bytes in UTF-8,
+ // but here we only consume one byte.
+ result.second = 1;
+ static const char kReplacementChar[] = "\xEF\xBF\xBD";
+ result.first.set(kReplacementChar, 3);
+ } else {
+ result.second = length;
+ result.first.set(input.data(), result.second);
+ }
} else {
result.second = longest_length;
// No need to pass the size of normalized sentence,
diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc
index 01b5b7d..6c7d9af 100644
--- a/src/trainer_interface.cc
+++ b/src/trainer_interface.cc
@@ -100,6 +100,9 @@ bool TrainerInterface::IsValidSentencePiece(
LOG(WARNING) << "space must not be included in normalized string.";
return false;
}
+ if (!string_util::IsValidCodepoint(*it)) {
+ return false;
+ }
if (*it == kWSChar) {
// Only allows whitespace to appear as a prefix of piece.
// When split_by_whitespace is false, we allow whitespaces to
@@ -203,6 +206,7 @@ END:
std::unordered_map<char32, int64> chars_count;
for (const auto &w : sentences_) {
for (const char32 c : string_util::UTF8ToUnicodeText(w.first)) {
+ if (!string_util::IsValidCodepoint(c)) continue;
if (c == 0x0020) {
// UTF8ToUnicodeText returns a white space if the text
// contains an interchange-invalid character.
@@ -282,8 +286,9 @@ util::Status TrainerInterface::Serialize(ModelProto *model_proto) const {
// Duplicated sentencepiece is not allowed.
std::unordered_set<std::string> dup;
-#define CHECK_PIECE(piece) \
- CHECK_OR_RETURN(!piece.empty()); \
+#define CHECK_PIECE(piece) \
+ CHECK_OR_RETURN(string_util::IsStructurallyValid(piece)); \
+ CHECK_OR_RETURN(!piece.empty()); \
CHECK_OR_RETURN(dup.insert(piece).second) << piece << " is already defined";
for (const auto &w : meta_pieces_) {
diff --git a/src/util.cc b/src/util.cc
index 915f01f..7983021 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -99,94 +99,89 @@ void StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
}
// mblen stores the number of bytes consumed after decoding.
-// decoder_utf8 is optimized for speed. It doesn't check
-// the following malformed UTF8:
-// 1) Redundant UTF8
-// 2) BOM (returns value is undefined).
-// 3) Trailing byte after leading byte (c & 0xc0 == 0x80)
char32 DecodeUTF8(const char *begin, const char *end, size_t *mblen) {
const size_t len = end - begin;
- if (len >= 3 && (begin[0] & 0xf0) == 0xe0) {
- *mblen = 3;
- return (((begin[0] & 0x0f) << 12) | ((begin[1] & 0x3f) << 6) |
- ((begin[2] & 0x3f)));
- } else if (static_cast<unsigned char>(begin[0]) < 0x80) {
+
+ if (static_cast<unsigned char>(begin[0]) < 0x80) {
*mblen = 1;
return static_cast<unsigned char>(begin[0]);
- } else if (len >= 2 && (begin[0] & 0xe0) == 0xc0) {
- *mblen = 2;
- return (((begin[0] & 0x1f) << 6) | ((begin[1] & 0x3f)));
- } else if (len >= 4 && (begin[0] & 0xf8) == 0xf0) {
- *mblen = 4;
- return (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3f) << 12) |
- ((begin[2] & 0x3f) << 6) | ((begin[3] & 0x3f)));
- } else if (len >= 5 && (begin[0] & 0xfc) == 0xf8) {
- *mblen = 5;
- return (((begin[0] & 0x03) << 24) | ((begin[1] & 0x3f) << 18) |
- ((begin[2] & 0x3f) << 12) | ((begin[3] & 0x3f) << 6) |
- ((begin[4] & 0x3f)));
- } else if (len >= 6 && (begin[0] & 0xfe) == 0xfc) {
- *mblen = 6;
- return (((begin[0] & 0x01) << 30) | ((begin[1] & 0x3f) << 24) |
- ((begin[2] & 0x3f) << 18) | ((begin[3] & 0x3f) << 12) |
- ((begin[4] & 0x3f) << 6) | ((begin[5] & 0x3f)));
+ } else if (len >= 2 && (begin[0] & 0xE0) == 0xC0) {
+ const char32 cp = (((begin[0] & 0x1F) << 6) | ((begin[1] & 0x3F)));
+ if (IsTrailByte(begin[1]) && cp >= 0x0080 && IsValidCodepoint(cp)) {
+ *mblen = 2;
+ return cp;
+ }
+ } else if (len >= 3 && (begin[0] & 0xF0) == 0xE0) {
+ const char32 cp = (((begin[0] & 0x0F) << 12) | ((begin[1] & 0x3F) << 6) |
+ ((begin[2] & 0x3F)));
+ if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) && cp >= 0x0800 &&
+ IsValidCodepoint(cp)) {
+ *mblen = 3;
+ return cp;
+ }
+ } else if (len >= 4 && (begin[0] & 0xf8) == 0xF0) {
+ const char32 cp = (((begin[0] & 0x07) << 18) | ((begin[1] & 0x3F) << 12) |
+ ((begin[2] & 0x3F) << 6) | ((begin[3] & 0x3F)));
+ if (IsTrailByte(begin[1]) && IsTrailByte(begin[2]) &&
+ IsTrailByte(begin[3]) && cp >= 0x10000 && IsValidCodepoint(cp)) {
+ *mblen = 4;
+ return cp;
+ }
}
+ // Invalid UTF-8.
*mblen = 1;
- return 0;
+ return kUnicodeError;
}
-size_t EncodeUTF8(char32 c, char *output) {
- if (c == 0) {
- // Do nothing if |c| is NUL. Previous implementation of UCS4ToUTF8Append
- // worked like this.
- output[0] = '\0';
- return 0;
+bool IsStructurallyValid(StringPiece str) {
+ const char *begin = str.data();
+ const char *end = str.data() + str.size();
+ size_t mblen = 0;
+ while (begin < end) {
+ const char32 c = DecodeUTF8(begin, end, &mblen);
+ if (c == kUnicodeError && mblen != 3) return false;
+ if (!IsValidCodepoint(c)) return false;
+ begin += mblen;
}
- if (c < 0x00080) {
- output[0] = static_cast<char>(c & 0xFF);
- output[1] = '\0';
+ return true;
+}
+
+size_t EncodeUTF8(char32 c, char *output) {
+ if (c <= 0x7F) {
+ *output = static_cast<char>(c);
return 1;
}
- if (c < 0x00800) {
- output[0] = static_cast<char>(0xC0 + ((c >> 6) & 0x1F));
- output[1] = static_cast<char>(0x80 + (c & 0x3F));
- output[2] = '\0';
+
+ if (c <= 0x7FF) {
+ output[1] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[0] = 0xC0 | c;
return 2;
}
- if (c < 0x10000) {
- output[0] = static_cast<char>(0xE0 + ((c >> 12) & 0x0F));
- output[1] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
- output[2] = static_cast<char>(0x80 + (c & 0x3F));
- output[3] = '\0';
+
+ // if `c` is out-of-range, convert it to REPLACEMENT CHARACTER (U+FFFD).
+ // This treatment is the same as the original runetochar.
+ if (c > 0x10FFFF) c = kUnicodeError;
+
+ if (c <= 0xFFFF) {
+ output[2] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[1] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[0] = 0xE0 | c;
return 3;
}
- if (c < 0x200000) {
- output[0] = static_cast<char>(0xF0 + ((c >> 18) & 0x07));
- output[1] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
- output[2] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
- output[3] = static_cast<char>(0x80 + (c & 0x3F));
- output[4] = '\0';
- return 4;
- }
- // below is not in UCS4 but in 32bit int.
- if (c < 0x8000000) {
- output[0] = static_cast<char>(0xF8 + ((c >> 24) & 0x03));
- output[1] = static_cast<char>(0x80 + ((c >> 18) & 0x3F));
- output[2] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
- output[3] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
- output[4] = static_cast<char>(0x80 + (c & 0x3F));
- output[5] = '\0';
- return 5;
- }
- output[0] = static_cast<char>(0xFC + ((c >> 30) & 0x01));
- output[1] = static_cast<char>(0x80 + ((c >> 24) & 0x3F));
- output[2] = static_cast<char>(0x80 + ((c >> 18) & 0x3F));
- output[3] = static_cast<char>(0x80 + ((c >> 12) & 0x3F));
- output[4] = static_cast<char>(0x80 + ((c >> 6) & 0x3F));
- output[5] = static_cast<char>(0x80 + (c & 0x3F));
- output[6] = '\0';
- return 6;
+
+ output[3] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[2] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[1] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ output[0] = 0xF0 | c;
+
+ return 4;
}
std::string UnicodeCharToUTF8(const char32 c) { return UnicodeTextToUTF8({c}); }
diff --git a/src/util.h b/src/util.h
index f3b8158..828f7dd 100644
--- a/src/util.h
+++ b/src/util.h
@@ -168,25 +168,32 @@ inline size_t Itoa(T val, char *s) {
// Return length of a single UTF-8 source character
inline size_t OneCharLen(const char *src) {
- // Table of UTF-8 character lengths, based on first byte
- constexpr unsigned char kUTF8LenTable[256] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1};
- return kUTF8LenTable[*reinterpret_cast<const unsigned char *>(src)];
+ return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
}
+// Return (x & 0xC0) == 0x80;
+// Since trail bytes are always in [0x80, 0xBF], we can optimize:
+inline bool IsTrailByte(char x) { return static_cast<signed char>(x) < -0x40; }
+
+inline bool IsValidCodepoint(char32 c) {
+ return (static_cast<uint32>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
+}
+
+bool IsStructurallyValid(StringPiece str);
+
using UnicodeText = std::vector<char32>;
char32 DecodeUTF8(const char *begin, const char *end, size_t *mblen);
+
+inline char32 DecodeUTF8(StringPiece input, size_t *mblen) {
+ return DecodeUTF8(input.data(), input.data() + input.size(), mblen);
+}
+
+inline bool IsValidDecodeUTF8(StringPiece input, size_t *mblen) {
+ const char32 c = DecodeUTF8(input, mblen);
+ return c != kUnicodeError || *mblen == 3;
+}
+
size_t EncodeUTF8(char32 c, char *output);
std::string UnicodeCharToUTF8(const char32 c);
diff --git a/src/util_test.cc b/src/util_test.cc
index e9d3829..6684f29 100644
--- a/src/util_test.cc
+++ b/src/util_test.cc
@@ -1,3 +1,4 @@
+
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,6 +18,9 @@
#include "testharness.h"
namespace sentencepiece {
+namespace {
+constexpr int kMaxUnicode = 0x10FFFF;
+}
TEST(UtilTest, LexicalCastTest) {
bool b = false;
@@ -260,73 +264,57 @@ TEST(UtilTest, DecodeUTF8Test) {
{
const std::string input = "";
- EXPECT_EQ(0, string_util::DecodeUTF8(input.data(),
- input.data() + input.size(), &mblen));
+ EXPECT_EQ(0, string_util::DecodeUTF8(input, &mblen));
EXPECT_EQ(1, mblen); // mblen always returns >= 1
}
{
- const std::string input = "\x01";
- EXPECT_EQ(1, string_util::DecodeUTF8(input.data(),
- input.data() + input.size(), &mblen));
+ EXPECT_EQ(1, string_util::DecodeUTF8("\x01", &mblen));
EXPECT_EQ(1, mblen);
}
{
- const std::string input = "\x7F";
- EXPECT_EQ(0x7F, string_util::DecodeUTF8(
- input.data(), input.data() + input.size(), &mblen));
+ EXPECT_EQ(0x7F, string_util::DecodeUTF8("\x7F", &mblen));
EXPECT_EQ(1, mblen);
}
{
- const std::string input = "\xC2\x80 ";
- EXPECT_EQ(0x80, string_util::DecodeUTF8(
- input.data(), input.data() + input.size(), &mblen));
+ EXPECT_EQ(0x80, string_util::DecodeUTF8("\xC2\x80 ", &mblen));
EXPECT_EQ(2, mblen);
}
{
- const std::string input = "\xDF\xBF ";
- EXPECT_EQ(0x7FF, string_util::DecodeUTF8(
- input.data(), input.data() + input.size(), &mblen));
+ EXPECT_EQ(0x7FF, string_util::DecodeUTF8("\xDF\xBF ", &mblen));
EXPECT_EQ(2, mblen);
}
{
- const std::string input = "\xE0\xA0\x80 ";
- EXPECT_EQ(0x800, string_util::DecodeUTF8(
- input.data(), input.data() + input.size(), &mblen));
+ EXPECT_EQ(0x800, string_util::DecodeUTF8("\xE0\xA0\x80 ", &mblen));
EXPECT_EQ(3, mblen);
}
{
- const std::string input = "\xF0\x90\x80\x80 ";
- EXPECT_EQ(0x10000, string_util::DecodeUTF8(
- input.data(), input.data() + input.size(), &mblen));
+ EXPECT_EQ(0x10000, string_util::DecodeUTF8("\xF0\x90\x80\x80 ", &mblen));
EXPECT_EQ(4, mblen);
}
+ // Invalid UTF8
{
- const std::string input = "\xF7\xBF\xBF\xBF ";
- EXPECT_EQ(0x1FFFFF, string_util::DecodeUTF8(
- input.data(), input.data() + input.size(), &mblen));
- EXPECT_EQ(4, mblen);
+ EXPECT_EQ(kUnicodeError,
+ string_util::DecodeUTF8("\xF7\xBF\xBF\xBF ", &mblen));
+ EXPECT_EQ(1, mblen);
}
{
- const std::string input = "\xF8\x88\x80\x80\x80 ";
- EXPECT_EQ(0x200000, string_util::DecodeUTF8(
- input.data(), input.data() + input.size(), &mblen));
- EXPECT_EQ(5, mblen);
+ EXPECT_EQ(kUnicodeError,
+ string_util::DecodeUTF8("\xF8\x88\x80\x80\x80 ", &mblen));
+ EXPECT_EQ(1, mblen);
}
{
- const std::string input = "\xFC\x84\x80\x80\x80\x80 ";
- EXPECT_EQ(0x4000000,
- string_util::DecodeUTF8(input.data(), input.data() + input.size(),
- &mblen));
- EXPECT_EQ(6, mblen);
+ EXPECT_EQ(kUnicodeError,
+ string_util::DecodeUTF8("\xFC\x84\x80\x80\x80\x80 ", &mblen));
+ EXPECT_EQ(1, mblen);
}
{
@@ -340,41 +328,60 @@ TEST(UtilTest, DecodeUTF8Test) {
for (size_t i = 0; i < 4; ++i) {
// return values of string_util::DecodeUTF8 is not defined.
// TODO(taku) implement a workaround.
- string_util::DecodeUTF8(
- kInvalidData[i], kInvalidData[i] + strlen(kInvalidData[i]), &mblen);
+ EXPECT_EQ(kUnicodeError,
+ string_util::DecodeUTF8(
+ kInvalidData[i], kInvalidData[i] + strlen(kInvalidData[i]),
+ &mblen));
+ EXPECT_FALSE(string_util::IsStructurallyValid(kInvalidData[i]));
EXPECT_EQ(1, mblen);
}
}
+
+ {
+ EXPECT_EQ(kUnicodeError, string_util::DecodeUTF8("\xDF\xDF ", &mblen));
+ EXPECT_EQ(1, mblen);
+ }
+
+ {
+ EXPECT_EQ(kUnicodeError, string_util::DecodeUTF8("\xE0\xE0\xE0 ", &mblen));
+ EXPECT_EQ(1, mblen);
+ }
+
+ {
+ EXPECT_EQ(kUnicodeError,
+ string_util::DecodeUTF8("\xF0\xF0\xF0\xFF ", &mblen));
+ EXPECT_EQ(1, mblen);
+ }
}
TEST(UtilTest, EncodeUTF8Test) {
- constexpr int kMaxUnicode = 0x110000;
char buf[16];
for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
+ if (!string_util::IsValidCodepoint(cp)) continue;
const size_t mblen = string_util::EncodeUTF8(cp, buf);
size_t mblen2;
- char32 c = string_util::DecodeUTF8(buf, buf + 16, &mblen2);
+ const char32 c = string_util::DecodeUTF8(buf, buf + 16, &mblen2);
EXPECT_EQ(mblen2, mblen);
EXPECT_EQ(cp, c);
}
- EXPECT_EQ(0, string_util::EncodeUTF8(0, buf));
+ EXPECT_EQ(1, string_util::EncodeUTF8(0, buf));
EXPECT_EQ('\0', buf[0]);
// non UCS4
size_t mblen;
- EXPECT_EQ(5, string_util::EncodeUTF8(0x7000000, buf));
- string_util::DecodeUTF8(buf, buf + 16, &mblen);
- EXPECT_EQ(5, mblen);
+ EXPECT_EQ(3, string_util::EncodeUTF8(0x7000000, buf));
+ EXPECT_EQ(kUnicodeError, string_util::DecodeUTF8(buf, buf + 16, &mblen));
+ EXPECT_EQ(3, mblen);
- EXPECT_EQ(6, string_util::EncodeUTF8(0x8000001, buf));
- string_util::DecodeUTF8(buf, buf + 16, &mblen);
- EXPECT_EQ(6, mblen);
+ EXPECT_EQ(3, string_util::EncodeUTF8(0x8000001, buf));
+ EXPECT_EQ(kUnicodeError, string_util::DecodeUTF8(buf, buf + 16, &mblen));
+ EXPECT_EQ(3, mblen);
}
TEST(UtilTest, UnicodeCharToUTF8Test) {
- constexpr int kMaxUnicode = 0x110000;
for (char32 cp = 1; cp <= kMaxUnicode; ++cp) {
+ if (!string_util::IsValidCodepoint(cp)) continue;
const auto s = string_util::UnicodeCharToUTF8(cp);
const auto ut = string_util::UTF8ToUnicodeText(s);
EXPECT_EQ(1, ut.size());
@@ -382,6 +389,31 @@ TEST(UtilTest, UnicodeCharToUTF8Test) {
}
}
+TEST(UtilTest, IsStructurallyValidTest) {
+ EXPECT_TRUE(string_util::IsStructurallyValid("abcd"));
+ EXPECT_TRUE(
+ string_util::IsStructurallyValid(StringPiece("a\0cd", 4))); // NUL
+ EXPECT_TRUE(string_util::IsStructurallyValid("ab\xc3\x81")); // 2-byte
+ EXPECT_TRUE(string_util::IsStructurallyValid("a\xe3\x81\x81")); // 3-byte
+ EXPECT_TRUE(string_util::IsStructurallyValid("\xf2\x82\x81\x84")); // 4
+ EXPECT_FALSE(string_util::IsStructurallyValid("abc\x80"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("abc\xc3"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("ab\xe3\x81"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("a\xf3\x81\x81"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("ab\xc0\x82"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("a\xe0\x82\x81"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xf0\x82\x83\x84"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xf4\xbd\xbe\xbf"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xED\xA0\x80"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xED\xBF\xBF"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xc0\x81"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xc1\xbf"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xe0\x81\x82"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xe0\x9f\xbf"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xf0\x80\x81\x82"));
+ EXPECT_FALSE(string_util::IsStructurallyValid("\xf0\x83\xbe\xbd"));
+}
+
TEST(UtilTest, UnicodeTextToUTF8Test) {
string_util::UnicodeText ut;