Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/coding
diff options
context:
space:
mode:
author Daria Volvenkova <d.volvenkova@corp.mail.ru> 2017-03-24 14:22:54 +0300
committer Daria Volvenkova <d.volvenkova@corp.mail.ru> 2017-03-30 14:53:25 +0300
commit bfdc4ee246347545456116a7ef9e39c160170830 (patch)
tree b92094b7f3a4ccd71658034275dcc79fd8e1a9a1 /coding
parent 955d43ea1e4597e138e8ec213688fab4594f6d7f (diff)
Using a pool of transliterators.
Diffstat (limited to 'coding')
-rw-r--r-- coding/CMakeLists.txt 8
-rw-r--r-- coding/coding.pro 5
-rw-r--r-- coding/multilang_utf8_string.cpp 36
-rw-r--r-- coding/multilang_utf8_string.hpp 6
-rw-r--r-- coding/transliteration.cpp 59
-rw-r--r-- coding/transliteration.hpp 28
6 files changed, 120 insertions, 22 deletions
diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt
index 51bd2c744b..52ae94df7b 100644
--- a/coding/CMakeLists.txt
+++ b/coding/CMakeLists.txt
@@ -1,8 +1,12 @@
project(coding)
+add_definitions(-DU_DISABLE_RENAMING)
+
include_directories(
${OMIM_ROOT}/coding
${OMIM_ROOT}/3party/expat
+ ${OMIM_ROOT}/3party/icu/common
+ ${OMIM_ROOT}/3party/icu/i18n
)
set(
@@ -66,7 +70,9 @@ set(
streams_sink.hpp
succinct_mapper.hpp
traffic.cpp
- traffic.hpp
+ traffic.hpp
+ transliteration.cpp
+ transliteration.hpp
uri.cpp
uri.hpp
url_encode.hpp
diff --git a/coding/coding.pro b/coding/coding.pro
index 4025552cb8..94ead8140d 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -2,6 +2,9 @@
TARGET = coding
TEMPLATE = lib
CONFIG += staticlib warn_on
+INCLUDEPATH += ../3party/icu/common ../3party/icu/i18n
+
+DEFINES *= U_DISABLE_RENAMING
ROOT_DIR = ..
@@ -24,6 +27,7 @@ SOURCES += \
reader_writer_ops.cpp \
simple_dense_coding.cpp \
traffic.cpp \
+ transliteration.cpp \
uri.cpp \
# varint_vector.cpp \
zip_creator.cpp \
@@ -76,6 +80,7 @@ HEADERS += \
streams_sink.hpp \
succinct_mapper.hpp \
traffic.hpp \
+ transliteration.hpp \
uri.hpp \
url_encode.hpp \
value_opt_string.hpp \
diff --git a/coding/multilang_utf8_string.cpp b/coding/multilang_utf8_string.cpp
index 628287cb5c..a3e1723121 100644
--- a/coding/multilang_utf8_string.cpp
+++ b/coding/multilang_utf8_string.cpp
@@ -9,22 +9,22 @@ namespace
// Languages below were chosen after sorting name:<lang> tags in 2011.
// Note that it's not feasible to increase the number of languages here due to
// our current encoding (6 bits to store the language code).
-StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country", "Any"},
- {"en", "English", "English"}, {"ja", "日本語", "Japanese"}, {"fr", "Français", "French"}, {"ko_rm", "Korean (Romanized)", "Korean"},
- {"ar", "العربية", "Arabic"}, {"de", "Deutsch", "German"}, {"int_name", "International (Latin)", "Latin"}, {"ru", "Русский", "Russian"},
- {"sv", "Svenska", "Swedish"}, {"zh", "中文", "Chinese"}, {"fi", "Suomi", "Finnish"}, {"be", "Беларуская", "Belarusian"}, {"ka", "ქართული", "Georgian"},
- {"ko", "한국어", "Korean"}, {"he", "עברית", "Hebrew"}, {"nl", "Nederlands", "Dutch"}, {"ga", "Gaeilge", "Irish"},
- {"ja_rm", "Japanese (Romanized)", "Japanese"}, {"el", "Ελληνικά", "Greek"}, {"it", "Italiano", "Italian"}, {"es", "Español", "Spanish"},
- {"zh_pinyin", "Chinese (Pinyin)", "Chinese"}, {"th", "ไทย", "Thailand"}, {"cy", "Cymraeg", "Welsh"}, {"sr", "Српски", "Serbian"},
- {"uk", "Українська", "Ukrainian"}, {"ca", "Català", "Catalan"}, {"hu", "Magyar", "Hungarian"}, {"hsb", "Hornjoserbšćina", "Upper Sorbian"}, {"eu", "Euskara", "Basque"},
- {"fa", "فارسی", "Farsi"}, {"br", "Breton", "Breton"}, {"pl", "Polski", "Polish"}, {"hy", "Հայերէն", "Armenian"}, {"kn", "ಕನ್ನಡ", "Kannada"},
- {"sl", "Slovenščina", "Slovene"}, {"ro", "Română", "Romanian"}, {"sq", "Shqipe", "Shqipe"}, {"am", "አማርኛ", "Amharic"}, {"fy", "Frysk", "Frisian"},
- {"cs", "Čeština", "Czech"}, {"gd", "Gàidhlig", "Scots Gaelic"}, {"sk", "Slovenčina", "Slovak"}, {"af", "Afrikaans", "Afrikaans"},
- {"ja_kana", "日本語(カタカナ)", "Japanese (Katakana)"}, {"lb", "Luxembourgish", "Luxembourgish"}, {"pt", "Português", "Portuguese"}, {"hr", "Hrvatski", "Croatian"},
- {"fur", "Friulian", "Friulian"}, {"vi", "Tiếng Việt", "Vietnamese"}, {"tr", "Türkçe", "Turkish"}, {"bg", "Български", "Bulgarian"},
- {"eo", "Esperanto", "Esperanto"}, {"lt", "Lietuvių", "Lithuanian"}, {"la", "Latin", "Latin"}, {"kk", "Қазақ", "Kazakh"},
- {"gsw", "Schwiizertüütsch", "Swiss German"}, {"et", "Eesti", "Estonian"}, {"ku", "Kurdish", "Kurdish"}, {"mn", "Mongolian", "Mongolian"},
- {"mk", "Македонски", "Macedonian"}, {"lv", "Latviešu", "Latvian"}, {"hi", "हिन्दी", "Hindi"}
+StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country", "Any-Latin"},
+ {"en", "English", ""}, {"ja", "日本語", "Any-Latin"}, {"fr", "Français", ""}, {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"},
+ {"ar", "العربية", "Any-Latin"}, {"de", "Deutsch", ""}, {"int_name", "International (Latin)", "Any-Latin"}, {"ru", "Русский", "Russian-Latin/BGN"},
+ {"sv", "Svenska", "Any-Latin"}, {"zh", "中文", "Any-Latin"}, {"fi", "Suomi", "Any-Latin"}, {"be", "Беларуская", "Belarusian-Latin/BGN"}, {"ka", "ქართული", "Georgian-Latin"},
+ {"ko", "한국어", "Hangul-Latin"}, {"he", "עברית", "Hebrew-Latin"}, {"nl", "Nederlands", ""}, {"ga", "Gaeilge", "Any-Latin"},
+ {"ja_rm", "Japanese (Romanized)", "Any-Latin"}, {"el", "Ελληνικά", "Greek-Latin"}, {"it", "Italiano", ""}, {"es", "Español", ""},
+ {"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"}, {"th", "ไทย", "Thai-Latin"}, {"cy", "Cymraeg", "Any-Latin"}, {"sr", "Српски", "Serbian-Latin/BGN"},
+ {"uk", "Українська", "Ukrainian-Latin/BGN"}, {"ca", "Català", "Any-Latin"}, {"hu", "Magyar", "Any-Latin"}, {"hsb", "Hornjoserbšćina", "Any-Latin"}, {"eu", "Euskara", "Any-Latin"},
+ {"fa", "فارسی", "Any-Latin"}, {"br", "Breton", "Any-Latin"}, {"pl", "Polski", "Any-Latin"}, {"hy", "Հայերէն", "Armenian-Latin"}, {"kn", "ಕನ್ನಡ", "Kannada-Latin"},
+ {"sl", "Slovenščina", "Any-Latin"}, {"ro", "Română", "Any-Latin"}, {"sq", "Shqipe", "Any-Latin"}, {"am", "አማርኛ", "Amharic-Latin/BGN"}, {"fy", "Frysk", "Any-Latin"},
+ {"cs", "Čeština", "Any-Latin"}, {"gd", "Gàidhlig", "Any-Latin"}, {"sk", "Slovenčina", "Any-Latin"}, {"af", "Afrikaans", "Any-Latin"},
+ {"ja_kana", "日本語(カタカナ)", "Katakana-Latin"}, {"lb", "Luxembourgish", "Any-Latin"}, {"pt", "Português", "Any-Latin"}, {"hr", "Hrvatski", "Any-Latin"},
+ {"fur", "Friulian", "Any-Latin"}, {"vi", "Tiếng Việt", "Any-Latin"}, {"tr", "Türkçe", "Any-Latin"}, {"bg", "Български", "Bulgarian-Latin/BGN"},
+ {"eo", "Esperanto", "Any-Latin"}, {"lt", "Lietuvių", "Any-Latin"}, {"la", "Latin", ""}, {"kk", "Қазақ", "Kazakh-Latin/BGN"},
+ {"gsw", "Schwiizertüütsch", "Any-Latin"}, {"et", "Eesti", "Any-Latin"}, {"ku", "Kurdish", "Any-Latin"}, {"mn", "Mongolian", "Mongolian-Latin/BGN"},
+ {"mk", "Македонски", "Macedonian-Latin/BGN"}, {"lv", "Latviešu", "Any-Latin"}, {"hi", "हिन्दी", "Any-Latin"}
}};
static_assert(g_languages.size() == StringUtf8Multilang::kMaxSupportedLanguages,
@@ -72,11 +72,11 @@ char const * StringUtf8Multilang::GetLangNameByCode(int8_t langCode)
}
// static
-char const * StringUtf8Multilang::GetLangEnNameByCode(int8_t langCode)
+char const * StringUtf8Multilang::GetTransliteratorIdByCode(int8_t langCode)
{
if (langCode < 0 || langCode >= static_cast<int8_t>(g_languages.size()))
return "";
- return g_languages[langCode].m_enName;
+ return g_languages[langCode].m_transliteratorId;
}
size_t StringUtf8Multilang::GetNextIndex(size_t i) const
diff --git a/coding/multilang_utf8_string.hpp b/coding/multilang_utf8_string.hpp
index afd19250b0..f54d402a1e 100644
--- a/coding/multilang_utf8_string.hpp
+++ b/coding/multilang_utf8_string.hpp
@@ -50,8 +50,8 @@ public:
char const * m_code;
/// Native language name.
char const * m_name;
- /// Native language name in English.
- char const * m_enName;
+ /// Transliterator to latin id.
+ char const * m_transliteratorId;
};
using Languages = array<Lang, kMaxSupportedLanguages>;
@@ -64,7 +64,7 @@ public:
/// @returns empty string if langCode is invalid.
static char const * GetLangNameByCode(int8_t langCode);
/// @returns empty string if langCode is invalid.
- static char const * GetLangEnNameByCode(int8_t langCode);
+ static char const * GetTransliteratorIdByCode(int8_t langCode);
inline bool operator== (StringUtf8Multilang const & rhs) const
{
diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp
new file mode 100644
index 0000000000..937dd8106f
--- /dev/null
+++ b/coding/transliteration.cpp
@@ -0,0 +1,59 @@
+#include "coding/transliteration.hpp"
+#include "coding/multilang_utf8_string.hpp"
+
+#include "base/logging.hpp"
+
+#include "3party/icu/common/unicode/unistr.h"
+#include "3party/icu/common/unicode/utypes.h"
+#include "3party/icu/i18n/unicode/translit.h"
+#include "3party/icu/i18n/unicode/utrans.h"
+
+// Out-of-line destructor: m_transliterators stores std::unique_ptr<icu::Transliterator>
+// and icu::Transliterator is only forward-declared in the header, so the destructor is
+// defined here, after the ICU headers have been included.
+// NOTE(review): a u_cleanup() call used to be present here in commented-out form, so no
+// ICU global cleanup is performed on destruction - confirm that this is intentional.
+Transliteration::~Transliteration() = default;
+
+// Returns the single shared Transliteration object.
+Transliteration & Transliteration::GetInstance()
+{
+  // Function-local static: constructed on the first call and reused by every later call.
+  static Transliteration s_transliteration;
+  return s_transliteration;
+}
+
+// Points ICU at |icuDataDir| and creates one forward transliterator for every distinct,
+// non-empty transliterator id returned by StringUtf8Multilang::GetSupportedLanguages().
+// Ids that are already registered are skipped, so each id is created at most once.
+// A failure to create a transliterator is logged and that id is left unregistered.
+void Transliteration::Init(std::string const & icuDataDir)
+{
+  u_setDataDirectory(icuDataDir.c_str());
+
+  for (auto const & lang : StringUtf8Multilang::GetSupportedLanguages())
+  {
+    char const * const id = lang.m_transliteratorId;
+
+    // Nothing to create for an empty id or for an id that has been registered already.
+    if (id[0] == '\0' || m_transliterators.count(id) != 0)
+      continue;
+
+    UErrorCode status = U_ZERO_ERROR;
+    std::unique_ptr<Transliterator> transliterator(
+        Transliterator::createInstance(id, UTRANS_FORWARD, status));
+
+    // ICU reports errors through |status|; a non-null pointer alone is not proof of success.
+    if (U_FAILURE(status) || transliterator == nullptr)
+    {
+      LOG(LWARNING, ("Cannot create transliterator \"", id, "\", icu error =", u_errorName(status)));
+      continue;
+    }
+
+    m_transliterators.emplace(id, std::move(transliterator));
+  }
+}
+
+// Transliterates the UTF-8 string |str| with the transliterator registered for the
+// language |langCode| and returns the result encoded as UTF-8.
+// Returns an empty string (and logs a warning) when no transliterator is registered for
+// the language's transliterator id; this covers invalid language codes and languages
+// whose transliterator id is empty, since Init() never registers an empty id.
+std::string Transliteration::Transliterate(std::string const & str, int8_t langCode) const
+{
+  auto const transliteratorId = StringUtf8Multilang::GetTransliteratorIdByCode(langCode);
+  auto const it = m_transliterators.find(transliteratorId);
+  if (it == m_transliterators.end())
+  {
+    LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\""));
+    return "";
+  }
+
+  // Decode explicitly as UTF-8 over the full length of |str|: the UnicodeString(char const *)
+  // constructor converts from ICU's default codepage and stops at the first NUL byte, while
+  // the result below is always encoded as UTF-8.
+  UnicodeString ustr =
+      UnicodeString::fromUTF8(StringPiece(str.data(), static_cast<int32_t>(str.size())));
+  it->second->transliterate(ustr);
+
+  std::string resultStr;
+  ustr.toUTF8String(resultStr);
+  return resultStr;
+}
diff --git a/coding/transliteration.hpp b/coding/transliteration.hpp
new file mode 100644
index 0000000000..4d1f0eb7e2
--- /dev/null
+++ b/coding/transliteration.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+
+namespace icu
+{
+class Transliterator;
+}
+
+/// Registry of ICU transliterators keyed by transliterator id.
+/// There is a single shared instance, obtained through GetInstance(); no transliterators
+/// are registered until Init() has been called, so Transliterate() returns an empty
+/// string before that.
+class Transliteration
+{
+public:
+  ~Transliteration();
+
+  /// The shared instance owns its transliterators exclusively and must not be copied.
+  Transliteration(Transliteration const &) = delete;
+  Transliteration & operator=(Transliteration const &) = delete;
+
+  /// @returns the single shared instance.
+  static Transliteration & GetInstance();
+
+  /// Sets the ICU data directory to |icuDataDir| and creates the transliterators for the
+  /// supported languages.
+  void Init(std::string const & icuDataDir);
+
+  /// @returns |str| transliterated with the transliterator of the language |langCode|,
+  /// or an empty string if no transliterator is registered for that language.
+  std::string Transliterate(std::string const & str, int8_t langCode) const;
+
+private:
+  /// Construction is restricted to GetInstance().
+  Transliteration() = default;
+
+  /// Transliterator id -> owned ICU transliterator.
+  std::map<std::string, std::unique_ptr<icu::Transliterator>> m_transliterators;
+};