From d13124d48244721d0cca3f59dcd3ed1a7d8fa641 Mon Sep 17 00:00:00 2001 From: Daria Volvenkova Date: Tue, 21 Mar 2017 21:38:56 +0300 Subject: Using transliteration for obtaining best feature name. --- coding/multilang_utf8_string.cpp | 43 +++++++++++++++++++++++++--------------- coding/multilang_utf8_string.hpp | 4 ++++ 2 files changed, 31 insertions(+), 16 deletions(-) (limited to 'coding') diff --git a/coding/multilang_utf8_string.cpp b/coding/multilang_utf8_string.cpp index 12a5ca5a67..628287cb5c 100644 --- a/coding/multilang_utf8_string.cpp +++ b/coding/multilang_utf8_string.cpp @@ -9,22 +9,22 @@ namespace // Languages below were choosen after sorting name: tags in 2011. // Note, that it's not feasible to increase languages number here due to // our current encoding (6 bit to store language code). -StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country"}, - {"en", "English"}, {"ja", "日本語"}, {"fr", "Français"}, {"ko_rm", "Korean (Romanized)"}, - {"ar", "العربية"}, {"de", "Deutsch"}, {"int_name", "International (Latin)"}, {"ru", "Русский"}, - {"sv", "Svenska"}, {"zh", "中文"}, {"fi", "Suomi"}, {"be", "Беларуская"}, {"ka", "ქართული"}, - {"ko", "한국어"}, {"he", "עברית"}, {"nl", "Nederlands"}, {"ga", "Gaeilge"}, - {"ja_rm", "Japanese (Romanized)"}, {"el", "Ελληνικά"}, {"it", "Italiano"}, {"es", "Español"}, - {"zh_pinyin", "Chinese (Pinyin)"}, {"th", "ไทย"}, {"cy", "Cymraeg"}, {"sr", "Српски"}, - {"uk", "Українська"}, {"ca", "Català"}, {"hu", "Magyar"}, {"hsb", "Hornjoserbšćina"}, {"eu", "Euskara"}, - {"fa", "فارسی"}, {"br", "Breton"}, {"pl", "Polski"}, {"hy", "Հայերէն"}, {"kn", "ಕನ್ನಡ"}, - {"sl", "Slovenščina"}, {"ro", "Română"}, {"sq", "Shqipe"}, {"am", "አማርኛ"}, {"fy", "Frysk"}, - {"cs", "Čeština"}, {"gd", "Gàidhlig"}, {"sk", "Slovenčina"}, {"af", "Afrikaans"}, - {"ja_kana", "日本語(カタカナ)"}, {"lb", "Luxembourgish"}, {"pt", "Português"}, {"hr", "Hrvatski"}, - {"fur", "Friulian"}, {"vi", "Tiếng Việt"}, {"tr", "Türkçe"}, {"bg", 
"Български"}, - {"eo", "Esperanto"}, {"lt", "Lietuvių"}, {"la", "Latin"}, {"kk", "Қазақ"}, - {"gsw", "Schwiizertüütsch"}, {"et", "Eesti"}, {"ku", "Kurdish"}, {"mn", "Mongolian"}, - {"mk", "Македонски"}, {"lv", "Latviešu"}, {"hi", "हिन्दी"} +StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country", "Any"}, + {"en", "English", "English"}, {"ja", "日本語", "Japanese"}, {"fr", "Français", "French"}, {"ko_rm", "Korean (Romanized)", "Korean"}, + {"ar", "العربية", "Arabic"}, {"de", "Deutsch", "German"}, {"int_name", "International (Latin)", "Latin"}, {"ru", "Русский", "Russian"}, + {"sv", "Svenska", "Swedish"}, {"zh", "中文", "Chinese"}, {"fi", "Suomi", "Finnish"}, {"be", "Беларуская", "Belarusian"}, {"ka", "ქართული", "Georgian"}, + {"ko", "한국어", "Korean"}, {"he", "עברית", "Hebrew"}, {"nl", "Nederlands", "Dutch"}, {"ga", "Gaeilge", "Irish"}, + {"ja_rm", "Japanese (Romanized)", "Japanese"}, {"el", "Ελληνικά", "Greek"}, {"it", "Italiano", "Italian"}, {"es", "Español", "Spanish"}, + {"zh_pinyin", "Chinese (Pinyin)", "Chinese"}, {"th", "ไทย", "Thailand"}, {"cy", "Cymraeg", "Welsh"}, {"sr", "Српски", "Serbian"}, + {"uk", "Українська", "Ukrainian"}, {"ca", "Català", "Catalan"}, {"hu", "Magyar", "Hungarian"}, {"hsb", "Hornjoserbšćina", "Upper Sorbian"}, {"eu", "Euskara", "Basque"}, + {"fa", "فارسی", "Farsi"}, {"br", "Breton", "Breton"}, {"pl", "Polski", "Polish"}, {"hy", "Հայերէն", "Armenian"}, {"kn", "ಕನ್ನಡ", "Kannada"}, + {"sl", "Slovenščina", "Slovene"}, {"ro", "Română", "Romanian"}, {"sq", "Shqipe", "Shqipe"}, {"am", "አማርኛ", "Amharic"}, {"fy", "Frysk", "Frisian"}, + {"cs", "Čeština", "Czech"}, {"gd", "Gàidhlig", "Scots Gaelic"}, {"sk", "Slovenčina", "Slovak"}, {"af", "Afrikaans", "Afrikaans"}, + {"ja_kana", "日本語(カタカナ)", "Japanese (Katakana)"}, {"lb", "Luxembourgish", "Luxembourgish"}, {"pt", "Português", "Portuguese"}, {"hr", "Hrvatski", "Croatian"}, + {"fur", "Friulian", "Friulian"}, {"vi", "Tiếng Việt", "Vietnamese"}, {"tr", "Türkçe", 
"Turkish"}, {"bg", "Български", "Bulgarian"}, + {"eo", "Esperanto", "Esperanto"}, {"lt", "Lietuvių", "Lithuanian"}, {"la", "Latin", "Latin"}, {"kk", "Қазақ", "Kazakh"}, + {"gsw", "Schwiizertüütsch", "Swiss German"}, {"et", "Eesti", "Estonian"}, {"ku", "Kurdish", "Kurdish"}, {"mn", "Mongolian", "Mongolian"}, + {"mk", "Македонски", "Macedonian"}, {"lv", "Latviešu", "Latvian"}, {"hi", "हिन्दी", "Hindi"} }}; static_assert(g_languages.size() == StringUtf8Multilang::kMaxSupportedLanguages, @@ -44,6 +44,7 @@ StringUtf8Multilang::Languages const & StringUtf8Multilang::GetSupportedLanguage ASSERT_EQUAL(g_languages[kInternationalCode].m_code, string("int_name"), ()); return g_languages; } + // static int8_t StringUtf8Multilang::GetLangIndex(string const & lang) { @@ -53,6 +54,7 @@ int8_t StringUtf8Multilang::GetLangIndex(string const & lang) return kUnsupportedLanguageCode; } + // static char const * StringUtf8Multilang::GetLangByCode(int8_t langCode) { @@ -60,6 +62,7 @@ char const * StringUtf8Multilang::GetLangByCode(int8_t langCode) return ""; return g_languages[langCode].m_code; } + // static char const * StringUtf8Multilang::GetLangNameByCode(int8_t langCode) { @@ -68,6 +71,14 @@ char const * StringUtf8Multilang::GetLangNameByCode(int8_t langCode) return g_languages[langCode].m_name; } +// static +char const * StringUtf8Multilang::GetLangEnNameByCode(int8_t langCode) +{ + if (langCode < 0 || langCode >= static_cast(g_languages.size())) + return ""; + return g_languages[langCode].m_enName; +} + size_t StringUtf8Multilang::GetNextIndex(size_t i) const { ++i; diff --git a/coding/multilang_utf8_string.hpp b/coding/multilang_utf8_string.hpp index db0ffa0f5d..afd19250b0 100644 --- a/coding/multilang_utf8_string.hpp +++ b/coding/multilang_utf8_string.hpp @@ -50,6 +50,8 @@ public: char const * m_code; /// Native language name. char const * m_name; + /// Native language name in English. 
+ char const * m_enName; }; using Languages = array<Lang, kMaxSupportedLanguages>; @@ -61,6 +63,8 @@ public: static char const * GetLangByCode(int8_t langCode); /// @returns empty string if langCode is invalid. static char const * GetLangNameByCode(int8_t langCode); + /// @returns empty string if langCode is invalid. + static char const * GetLangEnNameByCode(int8_t langCode); inline bool operator== (StringUtf8Multilang const & rhs) const { -- cgit v1.2.3 From bfdc4ee246347545456116a7ef9e39c160170830 Mon Sep 17 00:00:00 2001 From: Daria Volvenkova Date: Fri, 24 Mar 2017 14:22:54 +0300 Subject: Using a pool of transliterators. --- coding/CMakeLists.txt | 8 +++++- coding/coding.pro | 5 ++++ coding/multilang_utf8_string.cpp | 36 ++++++++++++------------ coding/multilang_utf8_string.hpp | 6 ++-- coding/transliteration.cpp | 59 ++++++++++++++++++++++++++++++++++++++++ coding/transliteration.hpp | 28 +++++++++++++++++++ 6 files changed, 120 insertions(+), 22 deletions(-) create mode 100644 coding/transliteration.cpp create mode 100644 coding/transliteration.hpp (limited to 'coding') diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt index 51bd2c744b..52ae94df7b 100644 --- a/coding/CMakeLists.txt +++ b/coding/CMakeLists.txt @@ -1,8 +1,12 @@ project(coding) +add_definitions(-DU_DISABLE_RENAMING) + include_directories( ${OMIM_ROOT}/coding ${OMIM_ROOT}/3party/expat + ${OMIM_ROOT}/3party/icu/common + ${OMIM_ROOT}/3party/icu/i18n ) set( @@ -66,7 +70,9 @@ set( streams_sink.hpp succinct_mapper.hpp traffic.cpp - traffic.hpp + traffic.hpp + transliterator.cpp + transliterator.hpp uri.cpp uri.hpp url_encode.hpp diff --git a/coding/coding.pro b/coding/coding.pro index 4025552cb8..94ead8140d 100644 --- a/coding/coding.pro +++ b/coding/coding.pro @@ -2,6 +2,9 @@ TARGET = coding TEMPLATE = lib CONFIG += staticlib warn_on +INCLUDEPATH += ../3party/icu/common ../3party/icu/i18n + +DEFINES *= U_DISABLE_RENAMING ROOT_DIR = .. 
@@ -24,6 +27,7 @@ SOURCES += \ reader_writer_ops.cpp \ simple_dense_coding.cpp \ traffic.cpp \ + transliteration.cpp \ uri.cpp \ # varint_vector.cpp \ zip_creator.cpp \ @@ -76,6 +80,7 @@ HEADERS += \ streams_sink.hpp \ succinct_mapper.hpp \ traffic.hpp \ + transliteration.hpp \ uri.hpp \ url_encode.hpp \ value_opt_string.hpp \ diff --git a/coding/multilang_utf8_string.cpp b/coding/multilang_utf8_string.cpp index 628287cb5c..a3e1723121 100644 --- a/coding/multilang_utf8_string.cpp +++ b/coding/multilang_utf8_string.cpp @@ -9,22 +9,22 @@ namespace // Languages below were choosen after sorting name: tags in 2011. // Note, that it's not feasible to increase languages number here due to // our current encoding (6 bit to store language code). -StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country", "Any"}, - {"en", "English", "English"}, {"ja", "日本語", "Japanese"}, {"fr", "Français", "French"}, {"ko_rm", "Korean (Romanized)", "Korean"}, - {"ar", "العربية", "Arabic"}, {"de", "Deutsch", "German"}, {"int_name", "International (Latin)", "Latin"}, {"ru", "Русский", "Russian"}, - {"sv", "Svenska", "Swedish"}, {"zh", "中文", "Chinese"}, {"fi", "Suomi", "Finnish"}, {"be", "Беларуская", "Belarusian"}, {"ka", "ქართული", "Georgian"}, - {"ko", "한국어", "Korean"}, {"he", "עברית", "Hebrew"}, {"nl", "Nederlands", "Dutch"}, {"ga", "Gaeilge", "Irish"}, - {"ja_rm", "Japanese (Romanized)", "Japanese"}, {"el", "Ελληνικά", "Greek"}, {"it", "Italiano", "Italian"}, {"es", "Español", "Spanish"}, - {"zh_pinyin", "Chinese (Pinyin)", "Chinese"}, {"th", "ไทย", "Thailand"}, {"cy", "Cymraeg", "Welsh"}, {"sr", "Српски", "Serbian"}, - {"uk", "Українська", "Ukrainian"}, {"ca", "Català", "Catalan"}, {"hu", "Magyar", "Hungarian"}, {"hsb", "Hornjoserbšćina", "Upper Sorbian"}, {"eu", "Euskara", "Basque"}, - {"fa", "فارسی", "Farsi"}, {"br", "Breton", "Breton"}, {"pl", "Polski", "Polish"}, {"hy", "Հայերէն", "Armenian"}, {"kn", "ಕನ್ನಡ", "Kannada"}, - {"sl", "Slovenščina", 
"Slovene"}, {"ro", "Română", "Romanian"}, {"sq", "Shqipe", "Shqipe"}, {"am", "አማርኛ", "Amharic"}, {"fy", "Frysk", "Frisian"}, - {"cs", "Čeština", "Czech"}, {"gd", "Gàidhlig", "Scots Gaelic"}, {"sk", "Slovenčina", "Slovak"}, {"af", "Afrikaans", "Afrikaans"}, - {"ja_kana", "日本語(カタカナ)", "Japanese (Katakana)"}, {"lb", "Luxembourgish", "Luxembourgish"}, {"pt", "Português", "Portuguese"}, {"hr", "Hrvatski", "Croatian"}, - {"fur", "Friulian", "Friulian"}, {"vi", "Tiếng Việt", "Vietnamese"}, {"tr", "Türkçe", "Turkish"}, {"bg", "Български", "Bulgarian"}, - {"eo", "Esperanto", "Esperanto"}, {"lt", "Lietuvių", "Lithuanian"}, {"la", "Latin", "Latin"}, {"kk", "Қазақ", "Kazakh"}, - {"gsw", "Schwiizertüütsch", "Swiss German"}, {"et", "Eesti", "Estonian"}, {"ku", "Kurdish", "Kurdish"}, {"mn", "Mongolian", "Mongolian"}, - {"mk", "Македонски", "Macedonian"}, {"lv", "Latviešu", "Latvian"}, {"hi", "हिन्दी", "Hindi"} +StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country", "Any-Latin"}, + {"en", "English", ""}, {"ja", "日本語", "Any-Latin"}, {"fr", "Français", ""}, {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"}, + {"ar", "العربية", "Any-Latin"}, {"de", "Deutsch", ""}, {"int_name", "International (Latin)", "Any-Latin"}, {"ru", "Русский", "Russian-Latin/BGN"}, + {"sv", "Svenska", "Any-Latin"}, {"zh", "中文", "Any-Latin"}, {"fi", "Suomi", "Any-Latin"}, {"be", "Беларуская", "Belarusian-Latin/BGN"}, {"ka", "ქართული", "Georgian-Latin"}, + {"ko", "한국어", "Hangul"}, {"he", "עברית", "Hebrew"}, {"nl", "Nederlands", ""}, {"ga", "Gaeilge", "Any-Latin"}, + {"ja_rm", "Japanese (Romanized)", "Any-Latin"}, {"el", "Ελληνικά", "Greek-Latin"}, {"it", "Italiano", ""}, {"es", "Español", ""}, + {"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"}, {"th", "ไทย", "Thai-Latin"}, {"cy", "Cymraeg", "Any-Latin"}, {"sr", "Српски", "Serbian-Latin/BGN"}, + {"uk", "Українська", "Ukrainian-Latin/BGN"}, {"ca", "Català", "Any-Latin"}, {"hu", "Magyar", "Any-Latin"}, {"hsb", "Hornjoserbšćina", 
"Any-Latin"}, {"eu", "Euskara", "Any-Latin"}, + {"fa", "فارسی", "Any-Latin"}, {"br", "Breton", "Any-Latin"}, {"pl", "Polski", "Any-Latin"}, {"hy", "Հայերէն", "Armenian-Latin"}, {"kn", "ಕನ್ನಡ", "Kannada-Latin"}, + {"sl", "Slovenščina", "Any-Latin"}, {"ro", "Română", "Any-Latin"}, {"sq", "Shqipe", "Any-Latin"}, {"am", "አማርኛ", "Amharic-Latin/BGN"}, {"fy", "Frysk", "Any-Latin"}, + {"cs", "Čeština", "Any-Latin"}, {"gd", "Gàidhlig", "Any-Latin"}, {"sk", "Slovenčina", "Any-Latin"}, {"af", "Afrikaans", "Any-Latin"}, + {"ja_kana", "日本語(カタカナ)", "Katakana-Latin"}, {"lb", "Luxembourgish", "Any-Latin"}, {"pt", "Português", "Any-Latin"}, {"hr", "Hrvatski", "Any-Latin"}, + {"fur", "Friulian", "Any-Latin"}, {"vi", "Tiếng Việt", "Any-Latin"}, {"tr", "Türkçe", "Any-Latin"}, {"bg", "Български", "Bulgarian-Latin/BGN"}, + {"eo", "Esperanto", "Any-Latin"}, {"lt", "Lietuvių", "Any-Latin"}, {"la", "Latin", ""}, {"kk", "Қазақ", "Kazakh-Latin/BGN"}, + {"gsw", "Schwiizertüütsch", "Any-Latin"}, {"et", "Eesti", "Any-Latin"}, {"ku", "Kurdish", "Any-Latin"}, {"mn", "Mongolian", "Mongolian-Latin/BGN"}, + {"mk", "Македонски", "Macedonian-Latin/BGN"}, {"lv", "Latviešu", "Any-Latin"}, {"hi", "हिन्दी", "Any-Latin"} }}; static_assert(g_languages.size() == StringUtf8Multilang::kMaxSupportedLanguages, @@ -72,11 +72,11 @@ char const * StringUtf8Multilang::GetLangNameByCode(int8_t langCode) } // static -char const * StringUtf8Multilang::GetLangEnNameByCode(int8_t langCode) +char const * StringUtf8Multilang::GetTransliteratorIdByCode(int8_t langCode) { if (langCode < 0 || langCode >= static_cast(g_languages.size())) return ""; - return g_languages[langCode].m_enName; + return g_languages[langCode].m_transliteratorId; } size_t StringUtf8Multilang::GetNextIndex(size_t i) const diff --git a/coding/multilang_utf8_string.hpp b/coding/multilang_utf8_string.hpp index afd19250b0..f54d402a1e 100644 --- a/coding/multilang_utf8_string.hpp +++ b/coding/multilang_utf8_string.hpp @@ -50,8 +50,8 @@ public: char const * 
m_code; /// Native language name. char const * m_name; - /// Native language name in English. - char const * m_enName; + /// Transliterator to latin id. + char const * m_transliteratorId; }; using Languages = array<Lang, kMaxSupportedLanguages>; @@ -64,7 +64,7 @@ public: /// @returns empty string if langCode is invalid. static char const * GetLangNameByCode(int8_t langCode); /// @returns empty string if langCode is invalid. - static char const * GetLangEnNameByCode(int8_t langCode); + static char const * GetTransliteratorIdByCode(int8_t langCode); inline bool operator== (StringUtf8Multilang const & rhs) const { diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp new file mode 100644 index 0000000000..937dd8106f --- /dev/null +++ b/coding/transliteration.cpp @@ -0,0 +1,59 @@ +#include "coding/transliteration.hpp" +#include "coding/multilang_utf8_string.hpp" + +#include "base/logging.hpp" + +#include "3party/icu/common/unicode/unistr.h" +#include "3party/icu/common/unicode/utypes.h" +#include "3party/icu/i18n/unicode/translit.h" +#include "3party/icu/i18n/unicode/utrans.h" + +Transliteration::~Transliteration() +{ + //u_cleanup(); +} + +Transliteration & Transliteration::GetInstance() +{ + static Transliteration instance; + return instance; +} + +void Transliteration::Init(std::string const & icuDataDir) +{ + u_setDataDirectory(icuDataDir.c_str()); + + for (auto const & lang : StringUtf8Multilang::GetSupportedLanguages()) + { + if (strlen(lang.m_transliteratorId) == 0 || m_transliterators.count(lang.m_transliteratorId) != 0) + continue; + + UErrorCode status = U_ZERO_ERROR; + std::unique_ptr<Transliterator> transliterator( + Transliterator::createInstance(lang.m_transliteratorId, UTRANS_FORWARD, status)); + + if (transliterator != nullptr) + m_transliterators.emplace(lang.m_transliteratorId, std::move(transliterator)); + else + LOG(LWARNING, ("Cannot create transliterator \"", lang.m_transliteratorId, "\", icu error =", status)); + } +} + +std::string Transliteration::Transliterate(std::string 
const & str, int8_t langCode) const +{ + auto const transliteratorId = StringUtf8Multilang::GetTransliteratorIdByCode(langCode); + auto const & it = m_transliterators.find(transliteratorId); + if (it == m_transliterators.end()) + { + LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\"")); + return ""; + } + + UnicodeString ustr(str.c_str()); + it->second->transliterate(ustr); + + std::string resultStr; + ustr.toUTF8String(resultStr); + + return resultStr; +} diff --git a/coding/transliteration.hpp b/coding/transliteration.hpp new file mode 100644 index 0000000000..4d1f0eb7e2 --- /dev/null +++ b/coding/transliteration.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include <map> +#include <memory> +#include <string> + +namespace icu +{ +class Transliterator; +} + +class Transliteration +{ +public: + ~Transliteration(); + + static Transliteration & GetInstance(); + + void Init(std::string const & icuDataDir); + + std::string Transliterate(std::string const & str, int8_t langCode) const; + +private: + Transliteration() = default; + + struct TransliteratorWrapper; + std::map<std::string, std::unique_ptr<icu::Transliterator>> m_transliterators; +}; -- cgit v1.2.3 From ddadc1d8d44e4d1eea916a1f464b63204d51dad2 Mon Sep 17 00:00:00 2001 From: Daria Volvenkova Date: Mon, 27 Mar 2017 21:07:06 +0300 Subject: Added transliteration test. --- coding/multilang_utf8_string.cpp | 49 ++++++++++++++++++++++++++-------------- coding/transliteration.cpp | 8 ++++++- 2 files changed, 39 insertions(+), 18 deletions(-) (limited to 'coding') diff --git a/coding/multilang_utf8_string.cpp b/coding/multilang_utf8_string.cpp index a3e1723121..107655bca9 100644 --- a/coding/multilang_utf8_string.cpp +++ b/coding/multilang_utf8_string.cpp @@ -9,23 +9,38 @@ namespace // Languages below were choosen after sorting name: tags in 2011. // Note, that it's not feasible to increase languages number here due to // our current encoding (6 bit to store language code). 
-StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country", "Any-Latin"}, - {"en", "English", ""}, {"ja", "日本語", "Any-Latin"}, {"fr", "Français", ""}, {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"}, - {"ar", "العربية", "Any-Latin"}, {"de", "Deutsch", ""}, {"int_name", "International (Latin)", "Any-Latin"}, {"ru", "Русский", "Russian-Latin/BGN"}, - {"sv", "Svenska", "Any-Latin"}, {"zh", "中文", "Any-Latin"}, {"fi", "Suomi", "Any-Latin"}, {"be", "Беларуская", "Belarusian-Latin/BGN"}, {"ka", "ქართული", "Georgian-Latin"}, - {"ko", "한국어", "Hangul"}, {"he", "עברית", "Hebrew"}, {"nl", "Nederlands", ""}, {"ga", "Gaeilge", "Any-Latin"}, - {"ja_rm", "Japanese (Romanized)", "Any-Latin"}, {"el", "Ελληνικά", "Greek-Latin"}, {"it", "Italiano", ""}, {"es", "Español", ""}, - {"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"}, {"th", "ไทย", "Thai-Latin"}, {"cy", "Cymraeg", "Any-Latin"}, {"sr", "Српски", "Serbian-Latin/BGN"}, - {"uk", "Українська", "Ukrainian-Latin/BGN"}, {"ca", "Català", "Any-Latin"}, {"hu", "Magyar", "Any-Latin"}, {"hsb", "Hornjoserbšćina", "Any-Latin"}, {"eu", "Euskara", "Any-Latin"}, - {"fa", "فارسی", "Any-Latin"}, {"br", "Breton", "Any-Latin"}, {"pl", "Polski", "Any-Latin"}, {"hy", "Հայերէն", "Armenian-Latin"}, {"kn", "ಕನ್ನಡ", "Kannada-Latin"}, - {"sl", "Slovenščina", "Any-Latin"}, {"ro", "Română", "Any-Latin"}, {"sq", "Shqipe", "Any-Latin"}, {"am", "አማርኛ", "Amharic-Latin/BGN"}, {"fy", "Frysk", "Any-Latin"}, - {"cs", "Čeština", "Any-Latin"}, {"gd", "Gàidhlig", "Any-Latin"}, {"sk", "Slovenčina", "Any-Latin"}, {"af", "Afrikaans", "Any-Latin"}, - {"ja_kana", "日本語(カタカナ)", "Katakana-Latin"}, {"lb", "Luxembourgish", "Any-Latin"}, {"pt", "Português", "Any-Latin"}, {"hr", "Hrvatski", "Any-Latin"}, - {"fur", "Friulian", "Any-Latin"}, {"vi", "Tiếng Việt", "Any-Latin"}, {"tr", "Türkçe", "Any-Latin"}, {"bg", "Български", "Bulgarian-Latin/BGN"}, - {"eo", "Esperanto", "Any-Latin"}, {"lt", "Lietuvių", "Any-Latin"}, {"la", "Latin", ""}, {"kk", 
"Қазақ", "Kazakh-Latin/BGN"}, - {"gsw", "Schwiizertüütsch", "Any-Latin"}, {"et", "Eesti", "Any-Latin"}, {"ku", "Kurdish", "Any-Latin"}, {"mn", "Mongolian", "Mongolian-Latin/BGN"}, - {"mk", "Македонски", "Macedonian-Latin/BGN"}, {"lv", "Latviešu", "Any-Latin"}, {"hi", "हिन्दी", "Any-Latin"} -}}; +StringUtf8Multilang::Languages const g_languages = {{ + {"default", "Native for each country", "Any-Latin"}, + {"en", "English", ""}, {"ja", "日本語", "Any-Latin"}, {"fr", "Français", ""}, + {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"}, {"ar", "العربية", "Any-Latin"}, + {"de", "Deutsch", ""}, {"int_name", "International (Latin)", "Any-Latin"}, + {"ru", "Русский", "Russian-Latin/BGN"}, {"sv", "Svenska", "Any-Latin"}, + {"zh", "中文", "Any-Latin"}, {"fi", "Suomi", "Any-Latin"}, + {"be", "Беларуская", "Belarusian-Latin/BGN"}, {"ka", "ქართული", "Georgian-Latin"}, + {"ko", "한국어", "Hangul-Latin/BGN"}, {"he", "עברית", "Hebrew-Latin/BGN"}, {"nl", "Nederlands", ""}, + {"ga", "Gaeilge", "Any-Latin"}, {"ja_rm", "Japanese (Romanized)", "Any-Latin"}, + {"el", "Ελληνικά", "Greek-Latin"}, {"it", "Italiano", ""}, {"es", "Español", ""}, + {"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"}, {"th", "ไทย", "Thai-Latin"}, + {"cy", "Cymraeg", "Any-Latin"}, {"sr", "Српски", "Serbian-Latin/BGN"}, + {"uk", "Українська", "Ukrainian-Latin/BGN"}, {"ca", "Català", "Any-Latin"}, + {"hu", "Magyar", "Any-Latin"}, {"hsb", "Hornjoserbšćina", "Any-Latin"}, + {"eu", "Euskara", "Any-Latin"}, {"fa", "فارسی", "Any-Latin"}, {"br", "Breton", "Any-Latin"}, + {"pl", "Polski", "Any-Latin"}, {"hy", "Հայերէն", "Armenian-Latin"}, + {"kn", "ಕನ್ನಡ", "Kannada-Latin"}, {"sl", "Slovenščina", "Any-Latin"}, + {"ro", "Română", "Any-Latin"}, {"sq", "Shqipe", "Any-Latin"}, + {"am", "አማርኛ", "Amharic-Latin/BGN"}, {"fy", "Frysk", "Any-Latin"}, + {"cs", "Čeština", "Any-Latin"}, {"gd", "Gàidhlig", "Any-Latin"}, + {"sk", "Slovenčina", "Any-Latin"}, {"af", "Afrikaans", "Any-Latin"}, + {"ja_kana", "日本語(カタカナ)", "Katakana-Latin"}, {"lb", 
"Luxembourgish", "Any-Latin"}, + {"pt", "Português", "Any-Latin"}, {"hr", "Hrvatski", "Any-Latin"}, + {"fur", "Friulian", "Any-Latin"}, {"vi", "Tiếng Việt", "Any-Latin"}, + {"tr", "Türkçe", "Any-Latin"}, {"bg", "Български", "Bulgarian-Latin/BGN"}, + {"eo", "Esperanto", "Any-Latin"}, {"lt", "Lietuvių", "Any-Latin"}, {"la", "Latin", ""}, + {"kk", "Қазақ", "Kazakh-Latin/BGN"}, {"gsw", "Schwiizertüütsch", "Any-Latin"}, + {"et", "Eesti", "Any-Latin"}, {"ku", "Kurdish", "Any-Latin"}, + {"mn", "Mongolian", "Mongolian-Latin/BGN"}, {"mk", "Македонски", "Macedonian-Latin/BGN"}, + {"lv", "Latviešu", "Any-Latin"}, {"hi", "हिन्दी", "Any-Latin"} + }}; static_assert(g_languages.size() == StringUtf8Multilang::kMaxSupportedLanguages, "With current encoding we are limited to 64 languages max."); diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp index 937dd8106f..c1a9ab1138 100644 --- a/coding/transliteration.cpp +++ b/coding/transliteration.cpp @@ -3,6 +3,7 @@ #include "base/logging.hpp" +#include "3party/icu/common/unicode/uclean.h" #include "3party/icu/common/unicode/unistr.h" #include "3party/icu/common/unicode/utypes.h" #include "3party/icu/i18n/unicode/translit.h" @@ -10,7 +11,12 @@ Transliteration::~Transliteration() { - //u_cleanup(); + // The use of u_cleanup() just before an application terminates is optional, + // but it should be called only once for performance reasons. + // The primary benefit is to eliminate reports of memory or resource leaks originating + // in ICU code from the results generated by heap analysis tools. + m_transliterators.clear(); + u_cleanup(); } Transliteration & Transliteration::GetInstance() -- cgit v1.2.3 From 2d5a2fd236c095bc1053a12fa4f4862e4b3693a6 Mon Sep 17 00:00:00 2001 From: Daria Volvenkova Date: Tue, 28 Mar 2017 15:13:43 +0300 Subject: Don't use transliteration for Japanese language. 
--- coding/multilang_utf8_string.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'coding') diff --git a/coding/multilang_utf8_string.cpp b/coding/multilang_utf8_string.cpp index 107655bca9..369ffdb9a0 100644 --- a/coding/multilang_utf8_string.cpp +++ b/coding/multilang_utf8_string.cpp @@ -11,7 +11,7 @@ namespace // our current encoding (6 bit to store language code). StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country", "Any-Latin"}, - {"en", "English", ""}, {"ja", "日本語", "Any-Latin"}, {"fr", "Français", ""}, + {"en", "English", ""}, {"ja", "日本語", ""}, {"fr", "Français", ""}, {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"}, {"ar", "العربية", "Any-Latin"}, {"de", "Deutsch", ""}, {"int_name", "International (Latin)", "Any-Latin"}, {"ru", "Русский", "Russian-Latin/BGN"}, {"sv", "Svenska", "Any-Latin"}, -- cgit v1.2.3 From 965d71afc32981da4205b7b3189771acb0764e3a Mon Sep 17 00:00:00 2001 From: Daria Volvenkova Date: Tue, 28 Mar 2017 19:27:59 +0300 Subject: Transliteration allowability parameter added. --- coding/transliteration.hpp | 1 - 1 file changed, 1 deletion(-) (limited to 'coding') diff --git a/coding/transliteration.hpp b/coding/transliteration.hpp index 4d1f0eb7e2..04dd75b6c3 100644 --- a/coding/transliteration.hpp +++ b/coding/transliteration.hpp @@ -23,6 +23,5 @@ public: private: Transliteration() = default; - struct TransliteratorWrapper; std::map<std::string, std::unique_ptr<icu::Transliterator>> m_transliterators; }; -- cgit v1.2.3 From 0f1fb3c9f81e3d0844613f145b995f9497f9437d Mon Sep 17 00:00:00 2001 From: Daria Volvenkova Date: Wed, 29 Mar 2017 17:08:01 +0300 Subject: Review fixes. 
--- coding/CMakeLists.txt | 4 +- coding/multilang_utf8_string.cpp | 96 +++++++++++++++++++++++++++------------- coding/transliteration.cpp | 17 ++++--- coding/transliteration.hpp | 4 +- 4 files changed, 80 insertions(+), 41 deletions(-) (limited to 'coding') diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt index 52ae94df7b..908c2d0fb6 100644 --- a/coding/CMakeLists.txt +++ b/coding/CMakeLists.txt @@ -71,8 +71,8 @@ set( succinct_mapper.hpp traffic.cpp traffic.hpp - transliterator.cpp - transliterator.hpp + transliteration.cpp + transliteration.hpp uri.cpp uri.hpp url_encode.hpp diff --git a/coding/multilang_utf8_string.cpp b/coding/multilang_utf8_string.cpp index 369ffdb9a0..9e3211501a 100644 --- a/coding/multilang_utf8_string.cpp +++ b/coding/multilang_utf8_string.cpp @@ -10,37 +10,71 @@ namespace // Note, that it's not feasible to increase languages number here due to // our current encoding (6 bit to store language code). StringUtf8Multilang::Languages const g_languages = {{ - {"default", "Native for each country", "Any-Latin"}, - {"en", "English", ""}, {"ja", "日本語", ""}, {"fr", "Français", ""}, - {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"}, {"ar", "العربية", "Any-Latin"}, - {"de", "Deutsch", ""}, {"int_name", "International (Latin)", "Any-Latin"}, - {"ru", "Русский", "Russian-Latin/BGN"}, {"sv", "Svenska", "Any-Latin"}, - {"zh", "中文", "Any-Latin"}, {"fi", "Suomi", "Any-Latin"}, - {"be", "Беларуская", "Belarusian-Latin/BGN"}, {"ka", "ქართული", "Georgian-Latin"}, - {"ko", "한국어", "Hangul-Latin/BGN"}, {"he", "עברית", "Hebrew-Latin/BGN"}, {"nl", "Nederlands", ""}, - {"ga", "Gaeilge", "Any-Latin"}, {"ja_rm", "Japanese (Romanized)", "Any-Latin"}, - {"el", "Ελληνικά", "Greek-Latin"}, {"it", "Italiano", ""}, {"es", "Español", ""}, - {"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"}, {"th", "ไทย", "Thai-Latin"}, - {"cy", "Cymraeg", "Any-Latin"}, {"sr", "Српски", "Serbian-Latin/BGN"}, - {"uk", "Українська", "Ukrainian-Latin/BGN"}, {"ca", "Català", 
"Any-Latin"}, - {"hu", "Magyar", "Any-Latin"}, {"hsb", "Hornjoserbšćina", "Any-Latin"}, - {"eu", "Euskara", "Any-Latin"}, {"fa", "فارسی", "Any-Latin"}, {"br", "Breton", "Any-Latin"}, - {"pl", "Polski", "Any-Latin"}, {"hy", "Հայերէն", "Armenian-Latin"}, - {"kn", "ಕನ್ನಡ", "Kannada-Latin"}, {"sl", "Slovenščina", "Any-Latin"}, - {"ro", "Română", "Any-Latin"}, {"sq", "Shqipe", "Any-Latin"}, - {"am", "አማርኛ", "Amharic-Latin/BGN"}, {"fy", "Frysk", "Any-Latin"}, - {"cs", "Čeština", "Any-Latin"}, {"gd", "Gàidhlig", "Any-Latin"}, - {"sk", "Slovenčina", "Any-Latin"}, {"af", "Afrikaans", "Any-Latin"}, - {"ja_kana", "日本語(カタカナ)", "Katakana-Latin"}, {"lb", "Luxembourgish", "Any-Latin"}, - {"pt", "Português", "Any-Latin"}, {"hr", "Hrvatski", "Any-Latin"}, - {"fur", "Friulian", "Any-Latin"}, {"vi", "Tiếng Việt", "Any-Latin"}, - {"tr", "Türkçe", "Any-Latin"}, {"bg", "Български", "Bulgarian-Latin/BGN"}, - {"eo", "Esperanto", "Any-Latin"}, {"lt", "Lietuvių", "Any-Latin"}, {"la", "Latin", ""}, - {"kk", "Қазақ", "Kazakh-Latin/BGN"}, {"gsw", "Schwiizertüütsch", "Any-Latin"}, - {"et", "Eesti", "Any-Latin"}, {"ku", "Kurdish", "Any-Latin"}, - {"mn", "Mongolian", "Mongolian-Latin/BGN"}, {"mk", "Македонски", "Macedonian-Latin/BGN"}, - {"lv", "Latviešu", "Any-Latin"}, {"hi", "हिन्दी", "Any-Latin"} - }}; + {"default", "Native for each country", "Any-Latin"}, + {"en", "English", ""}, + {"ja", "日本語", ""}, + {"fr", "Français", ""}, + {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"}, + {"ar", "العربية", "Any-Latin"}, + {"de", "Deutsch", ""}, + {"int_name", "International (Latin)", "Any-Latin"}, + {"ru", "Русский", "Russian-Latin/BGN"}, + {"sv", "Svenska", "Any-Latin"}, + {"zh", "中文", "Any-Latin"}, + {"fi", "Suomi", "Any-Latin"}, + {"be", "Беларуская", "Belarusian-Latin/BGN"}, + {"ka", "ქართული", "Georgian-Latin"}, + {"ko", "한국어", "Hangul-Latin/BGN"}, + {"he", "עברית", "Hebrew-Latin/BGN"}, + {"nl", "Nederlands", ""}, + {"ga", "Gaeilge", "Any-Latin"}, + {"ja_rm", "Japanese (Romanized)", 
"Any-Latin"}, + {"el", "Ελληνικά", "Greek-Latin"}, + {"it", "Italiano", ""}, + {"es", "Español", ""}, + {"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"}, + {"th", "ไทย", "Thai-Latin"}, + {"cy", "Cymraeg", "Any-Latin"}, + {"sr", "Српски", "Serbian-Latin/BGN"}, + {"uk", "Українська", "Ukrainian-Latin/BGN"}, + {"ca", "Català", "Any-Latin"}, + {"hu", "Magyar", "Any-Latin"}, + {"hsb", "Hornjoserbšćina", "Any-Latin"}, + {"eu", "Euskara", "Any-Latin"}, + {"fa", "فارسی", "Any-Latin"}, + {"br", "Breton", "Any-Latin"}, + {"pl", "Polski", "Any-Latin"}, + {"hy", "Հայերէն", "Armenian-Latin"}, + {"kn", "ಕನ್ನಡ", "Kannada-Latin"}, + {"sl", "Slovenščina", "Any-Latin"}, + {"ro", "Română", "Any-Latin"}, + {"sq", "Shqipe", "Any-Latin"}, + {"am", "አማርኛ", "Amharic-Latin/BGN"}, + {"fy", "Frysk", "Any-Latin"}, + {"cs", "Čeština", "Any-Latin"}, + {"gd", "Gàidhlig", "Any-Latin"}, + {"sk", "Slovenčina", "Any-Latin"}, + {"af", "Afrikaans", "Any-Latin"}, + {"ja_kana", "日本語(カタカナ)", "Katakana-Latin"}, + {"lb", "Luxembourgish", "Any-Latin"}, + {"pt", "Português", "Any-Latin"}, + {"hr", "Hrvatski", "Any-Latin"}, + {"fur", "Friulian", "Any-Latin"}, + {"vi", "Tiếng Việt", "Any-Latin"}, + {"tr", "Türkçe", "Any-Latin"}, + {"bg", "Български", "Bulgarian-Latin/BGN"}, + {"eo", "Esperanto", "Any-Latin"}, + {"lt", "Lietuvių", "Any-Latin"}, + {"la", "Latin", ""}, + {"kk", "Қазақ", "Kazakh-Latin/BGN"}, + {"gsw", "Schwiizertüütsch", "Any-Latin"}, + {"et", "Eesti", "Any-Latin"}, + {"ku", "Kurdish", "Any-Latin"}, + {"mn", "Mongolian", "Mongolian-Latin/BGN"}, + {"mk", "Македонски", "Macedonian-Latin/BGN"}, + {"lv", "Latviešu", "Any-Latin"}, + {"hi", "हिन्दी", "Any-Latin"} +}}; static_assert(g_languages.size() == StringUtf8Multilang::kMaxSupportedLanguages, "With current encoding we are limited to 64 languages max."); diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp index c1a9ab1138..e30a03820a 100644 --- a/coding/transliteration.cpp +++ b/coding/transliteration.cpp @@ -15,11 +15,12 @@ 
Transliteration::~Transliteration() // but it should be called only once for performance reasons. // The primary benefit is to eliminate reports of memory or resource leaks originating // in ICU code from the results generated by heap analysis tools. + // http://www.icu-project.org/apiref/icu4c/uclean_8h.html#a93f27d0ddc7c196a1da864763f2d8920 m_transliterators.clear(); u_cleanup(); } -Transliteration & Transliteration::GetInstance() +Transliteration & Transliteration::Instance() { static Transliteration instance; return instance; @@ -45,21 +46,25 @@ void Transliteration::Init(std::string const & icuDataDir) } } -std::string Transliteration::Transliterate(std::string const & str, int8_t langCode) const +bool Transliteration::Transliterate(std::string const & str, int8_t langCode, std::string & out) const { + if (str.empty()) + return false; + auto const transliteratorId = StringUtf8Multilang::GetTransliteratorIdByCode(langCode); auto const & it = m_transliterators.find(transliteratorId); if (it == m_transliterators.end()) { LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\"")); - return ""; + return false; } UnicodeString ustr(str.c_str()); it->second->transliterate(ustr); - std::string resultStr; - ustr.toUTF8String(resultStr); + if (ustr.isEmpty()) + return false; - return resultStr; + ustr.toUTF8String(out); + return true; } diff --git a/coding/transliteration.hpp b/coding/transliteration.hpp index 04dd75b6c3..cc3f97eb4d 100644 --- a/coding/transliteration.hpp +++ b/coding/transliteration.hpp @@ -14,11 +14,11 @@ class Transliteration public: ~Transliteration(); - static Transliteration & GetInstance(); + static Transliteration & Instance(); void Init(std::string const & icuDataDir); - std::string Transliterate(std::string const & str, int8_t langCode) const; + bool Transliterate(std::string const & str, int8_t langCode, std::string & out) const; private: Transliteration() = default; -- cgit v1.2.3