Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/coding
diff options
context:
space:
mode:
author tatiana-yan <tatiana.kondakova@gmail.com> 2020-06-29 14:04:32 +0300
committer Arsentiy Milchakov <milcars@mapswithme.com> 2020-06-30 19:42:08 +0300
commit 66f0e55c943a0b10837566a905d2dcba827f6390 (patch)
tree 93f686fea0136e55b1c95ec7f0c95808bd5c3c53 /coding
parent d7f8d5c40894269d1e4008ef76c56d21cace356b (diff)
[coding] Support multiple transliterators for lang_code. Support hiragana-latin transliteration.
Diffstat (limited to 'coding')
-rw-r--r-- coding/string_utf8_multilang.cpp 139
-rw-r--r-- coding/string_utf8_multilang.hpp 13
-rw-r--r-- coding/transliteration.cpp 71
3 files changed, 118 insertions, 105 deletions
diff --git a/coding/string_utf8_multilang.cpp b/coding/string_utf8_multilang.cpp
index 86fe581068..2579255e0c 100644
--- a/coding/string_utf8_multilang.cpp
+++ b/coding/string_utf8_multilang.cpp
@@ -15,70 +15,70 @@ namespace
// Note that it's not feasible to increase languages number here due to current encoding (6 bit to
// store language code).
array<StringUtf8Multilang::Lang, StringUtf8Multilang::kMaxSupportedLanguages> const kLanguages = {
- {{"default", "Native for each country", "Any-Latin"},
- {"en", "English", ""},
- {"ja", "日本語", ""},
- {"fr", "Français", ""},
- {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"},
- {"ar", "العربية", "Any-Latin"},
- {"de", "Deutsch", ""},
- {"int_name", "International (Latin)", "Any-Latin"},
- {"ru", "Русский", "Russian-Latin/BGN"},
- {"sv", "Svenska", ""},
- {"zh", "中文", "Any-Latin"},
- {"fi", "Suomi", ""},
- {"be", "Беларуская", "Belarusian-Latin/BGN"},
- {"ka", "ქართული", "Georgian-Latin"},
- {"ko", "한국어", "Hangul-Latin/BGN"},
- {"he", "עברית", "Hebrew-Latin"},
- {"nl", "Nederlands", ""},
- {"ga", "Gaeilge", ""},
- {"ja_rm", "Japanese (Romanized)", "Any-Latin"},
- {"el", "Ελληνικά", "Greek-Latin"},
- {"it", "Italiano", ""},
- {"es", "Español", ""},
- {"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"},
- {"th", "ไทย", ""}, // Thai-Latin
- {"cy", "Cymraeg", ""},
- {"sr", "Српски", "Serbian-Latin/BGN"},
- {"uk", "Українська", "Ukrainian-Latin/BGN"},
- {"ca", "Català", ""},
- {"hu", "Magyar", ""},
- {StringUtf8Multilang::kReservedLang /* hsb */, "", ""},
- {"eu", "Euskara", ""},
- {"fa", "فارسی", "Any-Latin"},
- {StringUtf8Multilang::kReservedLang /* br */, "", ""},
- {"pl", "Polski", ""},
- {"hy", "Հայերէն", "Armenian-Latin"},
- {StringUtf8Multilang::kReservedLang /* kn */, "", ""},
- {"sl", "Slovenščina", ""},
- {"ro", "Română", ""},
- {"sq", "Shqip", ""},
- {"am", "አማርኛ", "Amharic-Latin/BGN"},
- {"no", "Norsk", ""}, // Was "fy" before December 2018.
- {"cs", "Čeština", ""},
- {"id", "Bahasa Indonesia", ""}, // Was "gd" before December 2018.
- {"sk", "Slovenčina", ""},
- {"af", "Afrikaans", ""},
- {"ja_kana", "日本語(カタカナ)", "Katakana-Latin"},
- {StringUtf8Multilang::kReservedLang /* lb */, "", ""},
- {"pt", "Português", ""},
- {"hr", "Hrvatski", ""},
- {"da", "Dansk", ""}, // Was "fur" before December 2018.
- {"vi", "Tiếng Việt", ""},
- {"tr", "Türkçe", ""},
- {"bg", "Български", "Bulgarian-Latin/BGN"},
- {"alt_name", "Alternative name", "Any-Latin"}, // Was "eo" before December 2018.
- {"lt", "Lietuvių", ""},
- {"old_name", "Old/Previous name", "Any-Latin"}, // Was "la" before December 2018.
- {"kk", "Қазақ", "Kazakh-Latin/BGN"},
- {StringUtf8Multilang::kReservedLang /* gsw */, "", ""},
- {"et", "Eesti", ""},
- {"ku", "Kurdish", "Any-Latin"},
- {"mn", "Mongolian", "Mongolian-Latin/BGN"},
- {"mk", "Македонски", "Macedonian-Latin/BGN"},
- {"lv", "Latviešu", ""},
- {"hi", "हिन्दी", "Any-Latin"}}};
+ {{"default", "Native for each country", {"Any-Latin"}},
+ {"en", "English", {}},
+ {"ja", "日本語", {}},
+ {"fr", "Français", {}},
+ {"ko_rm", "Korean (Romanized)", {"Korean-Latin/BGN"}},
+ {"ar", "العربية", {"Any-Latin"}},
+ {"de", "Deutsch", {}},
+ {"int_name", "International (Latin)", {"Any-Latin"}},
+ {"ru", "Русский", {"Russian-Latin/BGN"}},
+ {"sv", "Svenska", {}},
+ {"zh", "中文", {"Any-Latin"}},
+ {"fi", "Suomi", {}},
+ {"be", "Беларуская", {"Belarusian-Latin/BGN"}},
+ {"ka", "ქართული", {"Georgian-Latin"}},
+ {"ko", "한국어", {"Hangul-Latin/BGN"}},
+ {"he", "עברית", {"Hebrew-Latin"}},
+ {"nl", "Nederlands", {}},
+ {"ga", "Gaeilge", {}},
+ {"ja_rm", "Japanese (Romanized)", {"Any-Latin"}},
+ {"el", "Ελληνικά", {"Greek-Latin"}},
+ {"it", "Italiano", {}},
+ {"es", "Español", {}},
+ {"zh_pinyin", "Chinese (Pinyin)", {"Any-Latin"}},
+ {"th", "ไทย", {}}, // Thai-Latin
+ {"cy", "Cymraeg", {}},
+ {"sr", "Српски", {"Serbian-Latin/BGN"}},
+ {"uk", "Українська", {"Ukrainian-Latin/BGN"}},
+ {"ca", "Català", {}},
+ {"hu", "Magyar", {}},
+ {StringUtf8Multilang::kReservedLang /* hsb */, "", {}},
+ {"eu", "Euskara", {}},
+ {"fa", "فارسی", {"Any-Latin"}},
+ {StringUtf8Multilang::kReservedLang /* br */, "", {}},
+ {"pl", "Polski", {}},
+ {"hy", "Հայերէն", {"Armenian-Latin"}},
+ {StringUtf8Multilang::kReservedLang /* kn */, "", {}},
+ {"sl", "Slovenščina", {}},
+ {"ro", "Română", {}},
+ {"sq", "Shqip", {}},
+ {"am", "አማርኛ", {"Amharic-Latin/BGN"}},
+ {"no", "Norsk", {}}, // Was "fy" before December 2018.
+ {"cs", "Čeština", {}},
+ {"id", "Bahasa Indonesia", {}}, // Was "gd" before December 2018.
+ {"sk", "Slovenčina", {}},
+ {"af", "Afrikaans", {}},
+ {"ja_kana", "日本語(カタカナ)", {"Katakana-Latin", "Hiragana-Latin"}},
+ {StringUtf8Multilang::kReservedLang /* lb */, "", {}},
+ {"pt", "Português", {}},
+ {"hr", "Hrvatski", {}},
+ {"da", "Dansk", {}}, // Was "fur" before December 2018.
+ {"vi", "Tiếng Việt", {}},
+ {"tr", "Türkçe", {}},
+ {"bg", "Български", {"Bulgarian-Latin/BGN"}},
+ {"alt_name", "Alternative name", {"Any-Latin"}}, // Was "eo" before December 2018.
+ {"lt", "Lietuvių", {}},
+ {"old_name", "Old/Previous name", {"Any-Latin"}}, // Was "la" before December 2018.
+ {"kk", "Қазақ", {"Kazakh-Latin/BGN"}},
+ {StringUtf8Multilang::kReservedLang /* gsw */, "", {}},
+ {"et", "Eesti", {}},
+ {"ku", "Kurdish", {"Any-Latin"}},
+ {"mn", "Mongolian", {"Mongolian-Latin/BGN"}},
+ {"mk", "Македонски", {"Macedonian-Latin/BGN"}},
+ {"lv", "Latviešu", {}},
+ {"hi", "हिन्दी", {"Any-Latin"}}}};
static_assert(
kLanguages.size() == StringUtf8Multilang::kMaxSupportedLanguages,
@@ -137,7 +137,7 @@ char const * StringUtf8Multilang::GetLangByCode(int8_t langCode)
if (!IsSupportedLangCode(langCode))
return "";
- return kLanguages[langCode].m_code;
+ return kLanguages[langCode].m_code.c_str();
}
// static
@@ -146,16 +146,17 @@ char const * StringUtf8Multilang::GetLangNameByCode(int8_t langCode)
if (!IsSupportedLangCode(langCode))
return "";
- return kLanguages[langCode].m_name;
+ return kLanguages[langCode].m_name.c_str();
}
// static
-char const * StringUtf8Multilang::GetTransliteratorIdByCode(int8_t langCode)
+vector<string> const & StringUtf8Multilang::GetTransliteratorsIdsByCode(int8_t langCode)
{
+ static const vector<string> empty;
if (!IsSupportedLangCode(langCode))
- return "";
+ return empty;
- return kLanguages[langCode].m_transliteratorId;
+ return kLanguages[langCode].m_transliteratorsIds;
}
size_t StringUtf8Multilang::GetNextIndex(size_t i) const
diff --git a/coding/string_utf8_multilang.hpp b/coding/string_utf8_multilang.hpp
index 9e663af9e0..108257a8b3 100644
--- a/coding/string_utf8_multilang.hpp
+++ b/coding/string_utf8_multilang.hpp
@@ -13,6 +13,7 @@
#include <functional>
#include <string>
#include <utility>
+#include <vector>
namespace utils
{
@@ -67,11 +68,11 @@ public:
struct Lang
{
/// OSM language code (e.g. for name:en it's "en" part).
- char const * m_code;
+ std::string m_code;
/// Native language name.
- char const * m_name;
- /// Transliterator to latin id.
- char const * m_transliteratorId;
+ std::string m_name;
+ /// Transliterators to latin ids.
+ std::vector<std::string> m_transliteratorsIds;
};
struct Position
@@ -107,8 +108,8 @@ public:
static char const * GetLangByCode(int8_t langCode);
/// @returns empty string if langCode is invalid.
static char const * GetLangNameByCode(int8_t langCode);
- /// @returns empty string if langCode is invalid.
- static char const * GetTransliteratorIdByCode(int8_t langCode);
+ /// @returns empty vector if langCode is invalid.
+ static std::vector<std::string> const & GetTransliteratorsIdsByCode(int8_t langCode);
inline bool operator==(StringUtf8Multilang const & rhs) const { return m_s == rhs.m_s; }
inline bool operator!=(StringUtf8Multilang const & rhs) const { return !(*this == rhs); }
diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp
index 22f21d2bcf..f0ccf2c405 100644
--- a/coding/transliteration.cpp
+++ b/coding/transliteration.cpp
@@ -54,10 +54,11 @@ void Transliteration::Init(std::string const & icuDataDir)
for (auto const & lang : StringUtf8Multilang::GetSupportedLanguages())
{
- if (strlen(lang.m_transliteratorId) == 0 || m_transliterators.count(lang.m_transliteratorId) != 0)
- continue;
-
- m_transliterators.emplace(lang.m_transliteratorId, std::make_unique<TransliteratorInfo>());
+ for (auto const & t : lang.m_transliteratorsIds)
+ {
+ if (m_transliterators.count(t) == 0)
+ m_transliterators.emplace(t, std::make_unique<TransliteratorInfo>());
+ }
}
}
@@ -74,47 +75,57 @@ bool Transliteration::Transliterate(std::string const & str, int8_t langCode, st
if (str.empty() || strings::IsASCIIString(str))
return false;
- std::string transliteratorId(StringUtf8Multilang::GetTransliteratorIdByCode(langCode));
-
- if (transliteratorId.empty())
+ auto const & transliteratorsIds = StringUtf8Multilang::GetTransliteratorsIdsByCode(langCode);
+ if (transliteratorsIds.empty())
return false;
- auto it = m_transliterators.find(transliteratorId);
- if (it == m_transliterators.end())
+ UnicodeString ustr(str.c_str());
+ for (auto transliteratorId : transliteratorsIds)
{
- LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\""));
- return false;
- }
+ if (transliteratorId.empty())
+ return false;
+
+ auto it = m_transliterators.find(transliteratorId);
+ if (it == m_transliterators.end())
+ {
+ LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\""));
+ return false;
+ }
- if (!it->second->m_initialized)
- {
- std::lock_guard<std::mutex> lock(it->second->m_mutex);
if (!it->second->m_initialized)
{
- UErrorCode status = U_ZERO_ERROR;
+ std::lock_guard<std::mutex> lock(it->second->m_mutex);
+ if (!it->second->m_initialized)
+ {
+ UErrorCode status = U_ZERO_ERROR;
- std::string const removeDiacriticRule = ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC";
- transliteratorId.append(removeDiacriticRule);
+ std::string const removeDiacriticRule =
+ ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC";
+ transliteratorId.append(removeDiacriticRule);
- UnicodeString translitId(transliteratorId.c_str());
+ UnicodeString translitId(transliteratorId.c_str());
- it->second->m_transliterator.reset(Transliterator::createInstance(translitId, UTRANS_FORWARD, status));
+ it->second->m_transliterator.reset(
+ Transliterator::createInstance(translitId, UTRANS_FORWARD, status));
- if (it->second->m_transliterator == nullptr)
- LOG(LWARNING, ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status));
+ if (it->second->m_transliterator == nullptr)
+ {
+ LOG(LWARNING,
+ ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status));
+ }
- it->second->m_initialized = true;
+ it->second->m_initialized = true;
+ }
}
- }
- if (it->second->m_transliterator == nullptr)
- return false;
+ if (it->second->m_transliterator == nullptr)
+ return false;
- UnicodeString ustr(str.c_str());
- it->second->m_transliterator->transliterate(ustr);
+ it->second->m_transliterator->transliterate(ustr);
- if (ustr.isEmpty())
- return false;
+ if (ustr.isEmpty())
+ return false;
+ }
ustr.toUTF8String(out);
return true;