Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: vng <viktor.govako@gmail.com> 2015-09-18 18:00:38 +0300
committer: Alex Zolotarev <alex@maps.me> 2015-09-23 04:26:59 +0300
commit: d436125ad08d5043c12c792727ac6058fd9a497a (patch)
tree: ffe94781bfce41986f2eb58b8a23dfed87b861db
parent: 76b9949a81c0ab82d0ff5d1b688967758e46d46f (diff)
Built-in utf8proc into our normalization routine. [ref: vng-utf8proc]
-rw-r--r-- base/base_tests/string_utils_test.cpp 4
-rw-r--r-- base/string_utils.cpp 37
2 files changed, 37 insertions, 4 deletions
diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp
index 6c6e6a548e..75ffd18f8d 100644
--- a/base/base_tests/string_utils_test.cpp
+++ b/base/base_tests/string_utils_test.cpp
@@ -384,8 +384,8 @@ UNIT_TEST(Normalize)
UNIT_TEST(Normalize_Special)
{
{
- string const utf8 = "ąĄćłŁÓŻźŃĘęĆ";
- TEST_EQUAL(strings::ToUtf8(strings::Normalize(strings::MakeUniString(utf8))), "aAclLOZzNEeC", ());
+ string const utf8 = "ąĄćłŁÓŻźŃĘęĆß";
+ TEST_EQUAL(strings::ToUtf8(strings::Normalize(strings::MakeUniString(utf8))), "aAclLOZzNEeCss", ());
}
{
diff --git a/base/string_utils.cpp b/base/string_utils.cpp
index 39d281a50d..c1c5fb123a 100644
--- a/base/string_utils.cpp
+++ b/base/string_utils.cpp
@@ -1,10 +1,13 @@
#include "base/string_utils.hpp"
#include "base/assert.hpp"
+#include "base/logging.hpp"
-#include "std/target_os.hpp"
-#include "std/iterator.hpp"
#include "std/cmath.hpp"
#include "std/iomanip.hpp"
+#include "std/iterator.hpp"
+#include "std/target_os.hpp"
+
+#include "utf8proc/utf8proc.h"
#include <boost/algorithm/string.hpp> // boost::trim
@@ -96,6 +99,36 @@ string MakeLowerCase(string const & s)
return r;
}
+// Runs one utf8proc decomposition pass over |src|, writing into the current storage of |dst|.
+// Returns the raw utf8proc result: a negative error code, or a length that may exceed
+// dst.size() when |dst| is too small.
+// NOTE(review): both buffers are reinterpreted as utf8proc_int32_t arrays, which assumes
+// UniString elements are exactly 32 bits wide - confirm against the UniString definition.
+static utf8proc_ssize_t DecomposeInto(UniString const & src, UniString & dst, utf8proc_option_t opt)
+{
+  return utf8proc_decompose_utf32(
+      reinterpret_cast<utf8proc_int32_t const *>(src.data()), src.size(),
+      reinterpret_cast<utf8proc_int32_t *>(dst.data()), dst.size(), opt);
+}
+
+// Returns a normalized copy of |s|, built by utf8proc decomposition with the
+// COMPAT, DECOMPOSE, STRIPMARK, STRIPCC and IGNORE options.
+// If utf8proc reports an error, it is logged and an empty string is returned.
+UniString Normalize(UniString const & s)
+{
+  utf8proc_option_t const opt = static_cast<utf8proc_option_t>(
+      UTF8PROC_COMPAT | UTF8PROC_DECOMPOSE | UTF8PROC_STRIPMARK | UTF8PROC_STRIPCC | UTF8PROC_IGNORE);
+
+  // Start with a 32-element buffer and grow it only if the first pass asks for more.
+  UniString r(32);
+  utf8proc_ssize_t sz = DecomposeInto(s, r, opt);
+
+  // |sz| is signed and r.size() is unsigned, so rule out error codes before comparing.
+  if (sz >= 0 && static_cast<size_t>(sz) > r.size())
+  {
+    // The buffer was too small: resize to the reported length and decompose again.
+    r.resize(sz);
+    sz = DecomposeInto(s, r, opt);
+    ASSERT_EQUAL(sz, static_cast<utf8proc_ssize_t>(r.size()), ());
+  }
+
+  // Covers both passes, so a failed retry is also reported when assertions are compiled out.
+  if (sz < 0)
+  {
+    LOG(LERROR, ("Can't normalize string", s, "Reason", utf8proc_errmsg(sz)));
+    return UniString();
+  }
+
+  // Trim the buffer down to the number of code points actually produced.
+  r.resize(sz);
+  return r;
+}
+
namespace
{
char ascii_to_lower(char in)