diff options
Diffstat (limited to 'base/string_utils.cpp')
-rw-r--r-- | base/string_utils.cpp | 37 |
1 files changed, 35 insertions, 2 deletions
diff --git a/base/string_utils.cpp b/base/string_utils.cpp index 39d281a50d..c1c5fb123a 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -1,10 +1,13 @@ #include "base/string_utils.hpp" #include "base/assert.hpp" +#include "base/logging.hpp" -#include "std/target_os.hpp" -#include "std/iterator.hpp" #include "std/cmath.hpp" #include "std/iomanip.hpp" +#include "std/iterator.hpp" +#include "std/target_os.hpp" + +#include "utf8proc/utf8proc.h" #include <boost/algorithm/string.hpp> // boost::trim @@ -96,6 +99,36 @@ string MakeLowerCase(string const & s) return r; } +UniString Normalize(UniString const & s) +{ + utf8proc_option_t const opt = static_cast<utf8proc_option_t>( + UTF8PROC_COMPAT | UTF8PROC_DECOMPOSE | UTF8PROC_STRIPMARK | UTF8PROC_STRIPCC | UTF8PROC_IGNORE); + + UniString r(32); + utf8proc_ssize_t sz = utf8proc_decompose_utf32( + reinterpret_cast<utf8proc_int32_t const *>(s.data()), s.size(), + reinterpret_cast<utf8proc_int32_t *>(r.data()), r.size(), opt); + + if (sz < 0) + { + LOG(LERROR, ("Can't normilize string", s, "Reason", utf8proc_errmsg(sz))); + return UniString(); + } + + if (sz > r.size()) + { + r.resize(sz); + sz = utf8proc_decompose_utf32( + reinterpret_cast<utf8proc_int32_t const *>(s.data()), s.size(), + reinterpret_cast<utf8proc_int32_t *>(r.data()), r.size(), opt); + ASSERT_EQUAL(sz, r.size(), ()); + } + else + r.resize(sz); + + return r; +} + namespace { char ascii_to_lower(char in) |