Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/coding
diff options
context:
space:
mode:
author: Yuri Gorshenin <mipt.vi002@gmail.com> 2017-04-05 13:06:42 +0300
committer: GitHub <noreply@github.com> 2017-04-05 13:06:42 +0300
commit: e9b446ff4302980a35b2df6026cf615fe8386589 (patch)
tree: bbaef60259ed58619eba8d52e997ca511c5d0b3a /coding
parent: 330fb14366eb4e64a17567db94a65ee2c16421dd (diff)
parent: 0f1fb3c9f81e3d0844613f145b995f9497f9437d (diff)
Merge pull request #5691 from darina/icu-transliteration
Transliteration using for obtaining best feature name.
Diffstat (limited to 'coding')
-rw-r--r-- coding/CMakeLists.txt 8
-rw-r--r-- coding/coding.pro 5
-rw-r--r-- coding/multilang_utf8_string.cpp 92
-rw-r--r-- coding/multilang_utf8_string.hpp 4
-rw-r--r-- coding/transliteration.cpp 70
-rw-r--r-- coding/transliteration.hpp 27
6 files changed, 189 insertions, 17 deletions
diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt
index 51bd2c744b..908c2d0fb6 100644
--- a/coding/CMakeLists.txt
+++ b/coding/CMakeLists.txt
@@ -1,8 +1,12 @@
project(coding)
+add_definitions(-DU_DISABLE_RENAMING)
+
include_directories(
${OMIM_ROOT}/coding
${OMIM_ROOT}/3party/expat
+ ${OMIM_ROOT}/3party/icu/common
+ ${OMIM_ROOT}/3party/icu/i18n
)
set(
@@ -66,7 +70,9 @@ set(
streams_sink.hpp
succinct_mapper.hpp
traffic.cpp
- traffic.hpp
+ traffic.hpp
+ transliteration.cpp
+ transliteration.hpp
uri.cpp
uri.hpp
url_encode.hpp
diff --git a/coding/coding.pro b/coding/coding.pro
index 4025552cb8..94ead8140d 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -2,6 +2,9 @@
TARGET = coding
TEMPLATE = lib
CONFIG += staticlib warn_on
+INCLUDEPATH += ../3party/icu/common ../3party/icu/i18n
+
+DEFINES *= U_DISABLE_RENAMING
ROOT_DIR = ..
@@ -24,6 +27,7 @@ SOURCES += \
reader_writer_ops.cpp \
simple_dense_coding.cpp \
traffic.cpp \
+ transliteration.cpp \
uri.cpp \
# varint_vector.cpp \
zip_creator.cpp \
@@ -76,6 +80,7 @@ HEADERS += \
streams_sink.hpp \
succinct_mapper.hpp \
traffic.hpp \
+ transliteration.hpp \
uri.hpp \
url_encode.hpp \
value_opt_string.hpp \
diff --git a/coding/multilang_utf8_string.cpp b/coding/multilang_utf8_string.cpp
index 12a5ca5a67..9e3211501a 100644
--- a/coding/multilang_utf8_string.cpp
+++ b/coding/multilang_utf8_string.cpp
@@ -9,22 +9,71 @@ namespace
// Languages below were chosen after sorting name:<lang> tags in 2011.
// Note that it's not feasible to increase the number of languages here due to
// our current encoding (6 bits to store the language code).
-StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country"},
- {"en", "English"}, {"ja", "日本語"}, {"fr", "Français"}, {"ko_rm", "Korean (Romanized)"},
- {"ar", "العربية"}, {"de", "Deutsch"}, {"int_name", "International (Latin)"}, {"ru", "Русский"},
- {"sv", "Svenska"}, {"zh", "中文"}, {"fi", "Suomi"}, {"be", "Беларуская"}, {"ka", "ქართული"},
- {"ko", "한국어"}, {"he", "עברית"}, {"nl", "Nederlands"}, {"ga", "Gaeilge"},
- {"ja_rm", "Japanese (Romanized)"}, {"el", "Ελληνικά"}, {"it", "Italiano"}, {"es", "Español"},
- {"zh_pinyin", "Chinese (Pinyin)"}, {"th", "ไทย"}, {"cy", "Cymraeg"}, {"sr", "Српски"},
- {"uk", "Українська"}, {"ca", "Català"}, {"hu", "Magyar"}, {"hsb", "Hornjoserbšćina"}, {"eu", "Euskara"},
- {"fa", "فارسی"}, {"br", "Breton"}, {"pl", "Polski"}, {"hy", "Հայերէն"}, {"kn", "ಕನ್ನಡ"},
- {"sl", "Slovenščina"}, {"ro", "Română"}, {"sq", "Shqipe"}, {"am", "አማርኛ"}, {"fy", "Frysk"},
- {"cs", "Čeština"}, {"gd", "Gàidhlig"}, {"sk", "Slovenčina"}, {"af", "Afrikaans"},
- {"ja_kana", "日本語(カタカナ)"}, {"lb", "Luxembourgish"}, {"pt", "Português"}, {"hr", "Hrvatski"},
- {"fur", "Friulian"}, {"vi", "Tiếng Việt"}, {"tr", "Türkçe"}, {"bg", "Български"},
- {"eo", "Esperanto"}, {"lt", "Lietuvių"}, {"la", "Latin"}, {"kk", "Қазақ"},
- {"gsw", "Schwiizertüütsch"}, {"et", "Eesti"}, {"ku", "Kurdish"}, {"mn", "Mongolian"},
- {"mk", "Македонски"}, {"lv", "Latviešu"}, {"hi", "हिन्दी"}
+StringUtf8Multilang::Languages const g_languages = {{ // Each entry: {language code, native name, transliterator id}.
+ {"default", "Native for each country", "Any-Latin"},
+ {"en", "English", ""}, // An empty transliterator id: Transliteration::Init() creates no transliterator for it.
+ {"ja", "日本語", ""},
+ {"fr", "Français", ""},
+ {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"},
+ {"ar", "العربية", "Any-Latin"},
+ {"de", "Deutsch", ""},
+ {"int_name", "International (Latin)", "Any-Latin"},
+ {"ru", "Русский", "Russian-Latin/BGN"},
+ {"sv", "Svenska", "Any-Latin"},
+ {"zh", "中文", "Any-Latin"},
+ {"fi", "Suomi", "Any-Latin"},
+ {"be", "Беларуская", "Belarusian-Latin/BGN"},
+ {"ka", "ქართული", "Georgian-Latin"},
+ {"ko", "한국어", "Hangul-Latin/BGN"},
+ {"he", "עברית", "Hebrew-Latin/BGN"},
+ {"nl", "Nederlands", ""},
+ {"ga", "Gaeilge", "Any-Latin"},
+ {"ja_rm", "Japanese (Romanized)", "Any-Latin"},
+ {"el", "Ελληνικά", "Greek-Latin"},
+ {"it", "Italiano", ""},
+ {"es", "Español", ""},
+ {"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"},
+ {"th", "ไทย", "Thai-Latin"},
+ {"cy", "Cymraeg", "Any-Latin"},
+ {"sr", "Српски", "Serbian-Latin/BGN"},
+ {"uk", "Українська", "Ukrainian-Latin/BGN"},
+ {"ca", "Català", "Any-Latin"},
+ {"hu", "Magyar", "Any-Latin"},
+ {"hsb", "Hornjoserbšćina", "Any-Latin"},
+ {"eu", "Euskara", "Any-Latin"},
+ {"fa", "فارسی", "Any-Latin"},
+ {"br", "Breton", "Any-Latin"},
+ {"pl", "Polski", "Any-Latin"},
+ {"hy", "Հայերէն", "Armenian-Latin"},
+ {"kn", "ಕನ್ನಡ", "Kannada-Latin"},
+ {"sl", "Slovenščina", "Any-Latin"},
+ {"ro", "Română", "Any-Latin"},
+ {"sq", "Shqipe", "Any-Latin"},
+ {"am", "አማርኛ", "Amharic-Latin/BGN"},
+ {"fy", "Frysk", "Any-Latin"},
+ {"cs", "Čeština", "Any-Latin"},
+ {"gd", "Gàidhlig", "Any-Latin"},
+ {"sk", "Slovenčina", "Any-Latin"},
+ {"af", "Afrikaans", "Any-Latin"},
+ {"ja_kana", "日本語(カタカナ)", "Katakana-Latin"},
+ {"lb", "Luxembourgish", "Any-Latin"},
+ {"pt", "Português", "Any-Latin"},
+ {"hr", "Hrvatski", "Any-Latin"},
+ {"fur", "Friulian", "Any-Latin"},
+ {"vi", "Tiếng Việt", "Any-Latin"},
+ {"tr", "Türkçe", "Any-Latin"},
+ {"bg", "Български", "Bulgarian-Latin/BGN"},
+ {"eo", "Esperanto", "Any-Latin"},
+ {"lt", "Lietuvių", "Any-Latin"},
+ {"la", "Latin", ""},
+ {"kk", "Қазақ", "Kazakh-Latin/BGN"},
+ {"gsw", "Schwiizertüütsch", "Any-Latin"},
+ {"et", "Eesti", "Any-Latin"},
+ {"ku", "Kurdish", "Any-Latin"},
+ {"mn", "Mongolian", "Mongolian-Latin/BGN"},
+ {"mk", "Македонски", "Macedonian-Latin/BGN"},
+ {"lv", "Latviešu", "Any-Latin"},
+ {"hi", "हिन्दी", "Any-Latin"}
}};
static_assert(g_languages.size() == StringUtf8Multilang::kMaxSupportedLanguages,
@@ -44,6 +93,7 @@ StringUtf8Multilang::Languages const & StringUtf8Multilang::GetSupportedLanguage
ASSERT_EQUAL(g_languages[kInternationalCode].m_code, string("int_name"), ());
return g_languages;
}
+
// static
int8_t StringUtf8Multilang::GetLangIndex(string const & lang)
{
@@ -53,6 +103,7 @@ int8_t StringUtf8Multilang::GetLangIndex(string const & lang)
return kUnsupportedLanguageCode;
}
+
// static
char const * StringUtf8Multilang::GetLangByCode(int8_t langCode)
{
@@ -60,6 +111,7 @@ char const * StringUtf8Multilang::GetLangByCode(int8_t langCode)
return "";
return g_languages[langCode].m_code;
}
+
// static
char const * StringUtf8Multilang::GetLangNameByCode(int8_t langCode)
{
@@ -68,6 +120,14 @@ char const * StringUtf8Multilang::GetLangNameByCode(int8_t langCode)
return g_languages[langCode].m_name;
}
+// static
+// Looks up the transliterator id stored in g_languages for |langCode|.
+// An out-of-range |langCode| yields an empty string.
+char const * StringUtf8Multilang::GetTransliteratorIdByCode(int8_t langCode)
+{
+  bool const isValidCode =
+      langCode >= 0 && langCode < static_cast<int8_t>(g_languages.size());
+  return isValidCode ? g_languages[langCode].m_transliteratorId : "";
+}
+
size_t StringUtf8Multilang::GetNextIndex(size_t i) const
{
++i;
diff --git a/coding/multilang_utf8_string.hpp b/coding/multilang_utf8_string.hpp
index db0ffa0f5d..f54d402a1e 100644
--- a/coding/multilang_utf8_string.hpp
+++ b/coding/multilang_utf8_string.hpp
@@ -50,6 +50,8 @@ public:
char const * m_code;
/// Native language name.
char const * m_name;
+ /// Id of the transliterator used to convert text in this language to Latin script.
+ char const * m_transliteratorId;
};
using Languages = array<Lang, kMaxSupportedLanguages>;
@@ -61,6 +63,8 @@ public:
static char const * GetLangByCode(int8_t langCode);
/// @returns empty string if langCode is invalid.
static char const * GetLangNameByCode(int8_t langCode);
+ /// @returns empty string if langCode is invalid.
+ static char const * GetTransliteratorIdByCode(int8_t langCode);
inline bool operator== (StringUtf8Multilang const & rhs) const
{
diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp
new file mode 100644
index 0000000000..e30a03820a
--- /dev/null
+++ b/coding/transliteration.cpp
@@ -0,0 +1,70 @@
+#include "coding/transliteration.hpp"
+#include "coding/multilang_utf8_string.hpp"
+
+#include "base/logging.hpp"
+
+#include "3party/icu/common/unicode/uclean.h"
+#include "3party/icu/common/unicode/unistr.h"
+#include "3party/icu/common/unicode/utypes.h"
+#include "3party/icu/i18n/unicode/translit.h"
+#include "3party/icu/i18n/unicode/utrans.h"
+
+Transliteration::~Transliteration()
+{
+ // Destroy the owned transliterators first, then ask ICU to release its internal resources.
+ // Calling u_cleanup() just before the application terminates is optional; it should be
+ // called only once for performance reasons. Its main benefit is that heap analysis tools
+ // stop reporting memory or resource leaks that originate in ICU code.
+ // http://www.icu-project.org/apiref/icu4c/uclean_8h.html#a93f27d0ddc7c196a1da864763f2d8920
+ m_transliterators.clear();
+ u_cleanup();
+}
+
+// Gives access to the single Transliteration object, which is a function-local static
+// and is therefore constructed the first time this function is called.
+Transliteration & Transliteration::Instance()
+{
+  static Transliteration s_transliteration;
+  return s_transliteration;
+}
+
+// Sets |icuDataDir| as the ICU data directory and creates one forward transliterator for
+// every distinct non-empty transliterator id found among the supported languages.
+// Ids that ICU cannot instantiate are logged and skipped.
+void Transliteration::Init(std::string const & icuDataDir)
+{
+  u_setDataDirectory(icuDataDir.c_str());
+
+  for (auto const & lang : StringUtf8Multilang::GetSupportedLanguages())
+  {
+    std::string const id = lang.m_transliteratorId;
+
+    // Nothing to do for languages without a transliterator or for ids already loaded.
+    if (id.empty() || m_transliterators.find(id) != m_transliterators.end())
+      continue;
+
+    UErrorCode status = U_ZERO_ERROR;
+    std::unique_ptr<Transliterator> transliterator(
+        Transliterator::createInstance(lang.m_transliteratorId, UTRANS_FORWARD, status));
+
+    if (transliterator == nullptr)
+    {
+      LOG(LWARNING, ("Cannot create transliterator \"", lang.m_transliteratorId, "\", icu error =", status));
+      continue;
+    }
+
+    m_transliterators.emplace(id, std::move(transliterator));
+  }
+}
+
+// Transliterates the UTF-8 string |str| with the transliterator assigned to |langCode|
+// and appends the UTF-8 result to |out|.
+// Returns false, leaving |out| untouched, when |str| is empty, when no transliterator is
+// loaded for |langCode| (a warning is logged), or when the transliterated text is empty.
+bool Transliteration::Transliterate(std::string const & str, int8_t langCode, std::string & out) const
+{
+  if (str.empty())
+    return false;
+
+  auto const transliteratorId = StringUtf8Multilang::GetTransliteratorIdByCode(langCode);
+  auto const it = m_transliterators.find(transliteratorId);
+  if (it == m_transliterators.end())
+  {
+    LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\""));
+    return false;
+  }
+
+  // Decode |str| explicitly as UTF-8 over its full length. The UnicodeString(char const *)
+  // constructor converts with ICU's default codepage, which is not guaranteed to be UTF-8,
+  // and stops at the first NUL byte.
+  UnicodeString ustr =
+      UnicodeString::fromUTF8(StringPiece(str.data(), static_cast<int32_t>(str.size())));
+  it->second->transliterate(ustr);
+
+  if (ustr.isEmpty())
+    return false;
+
+  // toUTF8String() appends to |out| rather than replacing its contents.
+  ustr.toUTF8String(out);
+  return true;
+}
diff --git a/coding/transliteration.hpp b/coding/transliteration.hpp
new file mode 100644
index 0000000000..cc3f97eb4d
--- /dev/null
+++ b/coding/transliteration.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+
+namespace icu
+{
+class Transliterator; // Forward declaration: the type is only named here, so no ICU header is included.
+}
+
+/// Owns a set of transliterators and uses them to transliterate UTF-8 strings.
+/// There is a single instance, reachable through Instance().
+class Transliteration
+{
+public:
+  ~Transliteration();
+
+  // The only instance is the one returned by Instance(), so copying is forbidden.
+  Transliteration(Transliteration const &) = delete;
+  Transliteration & operator=(Transliteration const &) = delete;
+
+  /// @returns the single shared instance.
+  static Transliteration & Instance();
+
+  /// Creates the transliterators for the supported languages.
+  /// @param icuDataDir directory passed to ICU as its data directory.
+  void Init(std::string const & icuDataDir);
+
+  /// Transliterates |str| with the transliterator assigned to |langCode| and appends
+  /// the result to |out|.
+  /// @returns false if |str| is empty, no transliterator is available for |langCode|,
+  /// or the result is empty; |out| is not modified in these cases.
+  bool Transliterate(std::string const & str, int8_t langCode, std::string & out) const;
+
+private:
+  Transliteration() = default;
+
+  /// Created transliterators, keyed by transliterator id.
+  std::map<std::string, std::unique_ptr<icu::Transliterator>> m_transliterators;
+};