diff options
author | tatiana-yan <tatiana.kondakova@gmail.com> | 2020-07-01 15:42:20 +0300 |
---|---|---|
committer | mpimenov <mpimenov@users.noreply.github.com> | 2020-07-10 13:48:36 +0300 |
commit | 8f5a93d1e7093e055b6f39dc72719e61e110baf1 (patch) | |
tree | 7d537aac18e9c0d5c29114cae9deaa1f8e038790 /coding | |
parent | 8f77809313da8a70d1de84332784bd0a28a77217 (diff) |
[coding] Transliteration: support Hiragana-Katakana transliteration; threadsafe Init().
Diffstat (limited to 'coding')
-rw-r--r-- | coding/transliteration.cpp | 123 | ||||
-rw-r--r-- | coding/transliteration.hpp | 16 |
2 files changed, 93 insertions, 46 deletions
diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp index 09d71bb189..fc3939d524 100644 --- a/coding/transliteration.cpp +++ b/coding/transliteration.cpp @@ -47,9 +47,16 @@ Transliteration & Transliteration::Instance() void Transliteration::Init(std::string const & icuDataDir) { - // This function should be called at most once in a process, - // before the first ICU operation that will require the loading of an ICU data file. - // This function is not thread-safe. Use it before calling ICU APIs from multiple threads. + // Fast atomic check before mutex lock. + if (m_inited) + return; + + std::lock_guard<std::mutex> lock(m_initializationMutex); + if (m_inited) + return; + + // This function should be called before the first ICU operation that will require the loading of + // an ICU data file. u_setDataDirectory(icuDataDir.c_str()); for (auto const & lang : StringUtf8Multilang::GetSupportedLanguages()) @@ -60,6 +67,11 @@ void Transliteration::Init(std::string const & icuDataDir) m_transliterators.emplace(t, std::make_unique<TransliteratorInfo>()); } } + + // We need "Hiragana-Katakana" for strings normalization, not for latin transliteration. + // That's why it is not mentioned in StringUtf8Multilang transliterators list. + m_transliterators.emplace("Hiragana-Katakana", std::make_unique<TransliteratorInfo>()); + m_inited = true; } void Transliteration::SetMode(Transliteration::Mode mode) @@ -67,65 +79,88 @@ void Transliteration::SetMode(Transliteration::Mode mode) m_mode = mode; } -bool Transliteration::Transliterate(std::string const & str, int8_t langCode, std::string & out) const +bool Transliteration::Transliterate(std::string transliteratorId, UnicodeString & ustr) const { - if (m_mode != Mode::Enabled) - return false; - - if (str.empty() || strings::IsASCIIString(str)) - return false; + CHECK(!transliteratorId.empty(), (transliteratorId)); - auto const & transliteratorsIds = StringUtf8Multilang::GetTransliteratorsIdsByCode(langCode); - if (transliteratorsIds.empty()) + auto it = m_transliterators.find(transliteratorId); + if (it == m_transliterators.end()) + { + LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\"")); return false; + } - UnicodeString ustr(str.c_str()); - for (auto transliteratorId : transliteratorsIds) + if (!it->second->m_initialized) { - CHECK(!transliteratorId.empty(), (transliteratorId)); - - auto it = m_transliterators.find(transliteratorId); - if (it == m_transliterators.end()) - { - LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\"")); - continue; - } - + std::lock_guard<std::mutex> lock(it->second->m_mutex); if (!it->second->m_initialized) { - std::lock_guard<std::mutex> lock(it->second->m_mutex); - if (!it->second->m_initialized) - { - UErrorCode status = U_ZERO_ERROR; - - std::string const removeDiacriticRule = - ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC"; - transliteratorId.append(removeDiacriticRule); + UErrorCode status = U_ZERO_ERROR; - UnicodeString translitId(transliteratorId.c_str()); + std::string const removeDiacriticRule = + ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC"; + transliteratorId.append(removeDiacriticRule); - it->second->m_transliterator.reset( - Transliterator::createInstance(translitId, UTRANS_FORWARD, status)); + UnicodeString translitId(transliteratorId.c_str()); - if (it->second->m_transliterator == nullptr) - { - LOG(LWARNING, - ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status)); - } + it->second->m_transliterator.reset( + Transliterator::createInstance(translitId, UTRANS_FORWARD, status)); - it->second->m_initialized = true; + if (it->second->m_transliterator == nullptr) + { + LOG(LWARNING, + ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status)); } + + it->second->m_initialized = true; } + } - if (it->second->m_transliterator == nullptr) - continue; + if (it->second->m_transliterator == nullptr) + return false; - it->second->m_transliterator->transliterate(ustr); + it->second->m_transliterator->transliterate(ustr); + + if (ustr.isEmpty()) + return false; + + return true; +} + +bool Transliteration::Transliterate(std::string const & str, std::string transliteratorId, + std::string & out) const +{ + UnicodeString ustr(str.c_str()); + auto const res = Transliterate(transliteratorId, ustr); + if (res) + ustr.toUTF8String(out); + return res; +} + +bool Transliteration::Transliterate(std::string const & str, int8_t langCode, + std::string & out) const +{ + if (m_mode != Mode::Enabled) + return false; + + if (str.empty() || strings::IsASCIIString(str)) + return false; + + auto const & transliteratorsIds = StringUtf8Multilang::GetTransliteratorsIdsByCode(langCode); + if (transliteratorsIds.empty()) + return false; + + UnicodeString ustr(str.c_str()); + for (auto transliteratorId : transliteratorsIds) + { + if (!Transliterate(transliteratorId, ustr)) + continue; - if (ustr.isEmpty()) - return false; } + if (ustr.isEmpty()) + return false; + ustr.toUTF8String(out); return true; } diff --git a/coding/transliteration.hpp b/coding/transliteration.hpp index cefe519d82..e2c1e73fef 100644 --- a/coding/transliteration.hpp +++ b/coding/transliteration.hpp @@ -4,8 +4,14 @@ #include <cstdint> #include <map> #include <memory> +#include <mutex> #include <string> +namespace icu +{ +class UnicodeString; +} // namespace icu + class Transliteration { public: @@ -22,13 +28,19 @@ public: void Init(std::string const & icuDataDir); void SetMode(Mode mode); + bool Transliterate(std::string const & str, std::string transliteratorId, + std::string & out) const; bool Transliterate(std::string const & str, int8_t langCode, std::string & out) const; private: + struct TransliteratorInfo; + Transliteration(); - std::atomic<Mode> m_mode; + bool Transliterate(std::string transliteratorId, icu::UnicodeString & ustr) const; - struct TransliteratorInfo; + std::mutex m_initializationMutex; + std::atomic<bool> m_inited; + std::atomic<Mode> m_mode; std::map<std::string, std::unique_ptr<TransliteratorInfo>> m_transliterators; }; |