diff options
author | vng <viktor.govako@gmail.com> | 2015-09-18 18:00:38 +0300 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2015-09-23 04:26:59 +0300 |
commit | d436125ad08d5043c12c792727ac6058fd9a497a (patch) | |
tree | ffe94781bfce41986f2eb58b8a23dfed87b861db | |
parent | 76b9949a81c0ab82d0ff5d1b688967758e46d46f (diff) |
Built-in utf8proc into our normalization routine. (vng-utf8proc)
-rw-r--r-- | base/base_tests/string_utils_test.cpp | 4 |
-rw-r--r-- | base/string_utils.cpp | 37 |
2 files changed, 37 insertions, 4 deletions
diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp index 6c6e6a548e..75ffd18f8d 100644 --- a/base/base_tests/string_utils_test.cpp +++ b/base/base_tests/string_utils_test.cpp @@ -384,8 +384,8 @@ UNIT_TEST(Normalize) UNIT_TEST(Normalize_Special) { { - string const utf8 = "ąĄćłŁÓŻźŃĘęĆ"; - TEST_EQUAL(strings::ToUtf8(strings::Normalize(strings::MakeUniString(utf8))), "aAclLOZzNEeC", ()); + string const utf8 = "ąĄćłŁÓŻźŃĘęĆß"; + TEST_EQUAL(strings::ToUtf8(strings::Normalize(strings::MakeUniString(utf8))), "aAclLOZzNEeCss", ()); } { diff --git a/base/string_utils.cpp b/base/string_utils.cpp index 39d281a50d..c1c5fb123a 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -1,10 +1,13 @@ #include "base/string_utils.hpp" #include "base/assert.hpp" +#include "base/logging.hpp" -#include "std/target_os.hpp" -#include "std/iterator.hpp" #include "std/cmath.hpp" #include "std/iomanip.hpp" +#include "std/iterator.hpp" +#include "std/target_os.hpp" + +#include "utf8proc/utf8proc.h" #include <boost/algorithm/string.hpp> // boost::trim @@ -96,6 +99,36 @@ string MakeLowerCase(string const & s) return r; } +UniString Normalize(UniString const & s) +{ + utf8proc_option_t const opt = static_cast<utf8proc_option_t>( + UTF8PROC_COMPAT | UTF8PROC_DECOMPOSE | UTF8PROC_STRIPMARK | UTF8PROC_STRIPCC | UTF8PROC_IGNORE); + + UniString r(32); + utf8proc_ssize_t sz = utf8proc_decompose_utf32( + reinterpret_cast<utf8proc_int32_t const *>(s.data()), s.size(), + reinterpret_cast<utf8proc_int32_t *>(r.data()), r.size(), opt); + + if (sz < 0) + { + LOG(LERROR, ("Can't normilize string", s, "Reason", utf8proc_errmsg(sz))); + return UniString(); + } + + if (sz > r.size()) + { + r.resize(sz); + sz = utf8proc_decompose_utf32( + reinterpret_cast<utf8proc_int32_t const *>(s.data()), s.size(), + reinterpret_cast<utf8proc_int32_t *>(r.data()), r.size(), opt); + ASSERT_EQUAL(sz, r.size(), ()); + } + else + r.resize(sz); + + return r; +} + namespace { char 
ascii_to_lower(char in) |