Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/coding
diff options
context:
space:
mode:
author: tatiana-yan <tatiana.kondakova@gmail.com> 2020-07-01 15:42:20 +0300
committer: mpimenov <mpimenov@users.noreply.github.com> 2020-07-10 13:48:36 +0300
commit: 8f5a93d1e7093e055b6f39dc72719e61e110baf1 (patch)
tree: 7d537aac18e9c0d5c29114cae9deaa1f8e038790 /coding
parent: 8f77809313da8a70d1de84332784bd0a28a77217 (diff)
[coding] Transliteration: support Hiragana-Katakana transliteration; threadsafe Init().
Diffstat (limited to 'coding')
-rw-r--r-- coding/transliteration.cpp 123
-rw-r--r-- coding/transliteration.hpp 16
2 files changed, 93 insertions, 46 deletions
diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp
index 09d71bb189..fc3939d524 100644
--- a/coding/transliteration.cpp
+++ b/coding/transliteration.cpp
@@ -47,9 +47,16 @@ Transliteration & Transliteration::Instance()
void Transliteration::Init(std::string const & icuDataDir)
{
- // This function should be called at most once in a process,
- // before the first ICU operation that will require the loading of an ICU data file.
- // This function is not thread-safe. Use it before calling ICU APIs from multiple threads.
+ // Fast atomic check before mutex lock.
+ if (m_inited)
+ return;
+
+ std::lock_guard<std::mutex> lock(m_initializationMutex);
+ if (m_inited)
+ return;
+
+ // This function should be called before the first ICU operation that will require the loading of
+ // an ICU data file.
u_setDataDirectory(icuDataDir.c_str());
for (auto const & lang : StringUtf8Multilang::GetSupportedLanguages())
@@ -60,6 +67,11 @@ void Transliteration::Init(std::string const & icuDataDir)
m_transliterators.emplace(t, std::make_unique<TransliteratorInfo>());
}
}
+
+ // We need "Hiragana-Katakana" for strings normalization, not for latin transliteration.
+ // That's why it is not mentioned in StringUtf8Multilang transliterators list.
+ m_transliterators.emplace("Hiragana-Katakana", std::make_unique<TransliteratorInfo>());
+ m_inited = true;
}
void Transliteration::SetMode(Transliteration::Mode mode)
@@ -67,65 +79,88 @@ void Transliteration::SetMode(Transliteration::Mode mode)
m_mode = mode;
}
-bool Transliteration::Transliterate(std::string const & str, int8_t langCode, std::string & out) const
+bool Transliteration::Transliterate(std::string transliteratorId, UnicodeString & ustr) const
{
- if (m_mode != Mode::Enabled)
- return false;
-
- if (str.empty() || strings::IsASCIIString(str))
- return false;
+ CHECK(!transliteratorId.empty(), (transliteratorId));
- auto const & transliteratorsIds = StringUtf8Multilang::GetTransliteratorsIdsByCode(langCode);
- if (transliteratorsIds.empty())
+ auto it = m_transliterators.find(transliteratorId);
+ if (it == m_transliterators.end())
+ {
+ LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\""));
return false;
+ }
- UnicodeString ustr(str.c_str());
- for (auto transliteratorId : transliteratorsIds)
+ if (!it->second->m_initialized)
{
- CHECK(!transliteratorId.empty(), (transliteratorId));
-
- auto it = m_transliterators.find(transliteratorId);
- if (it == m_transliterators.end())
- {
- LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\""));
- continue;
- }
-
+ std::lock_guard<std::mutex> lock(it->second->m_mutex);
if (!it->second->m_initialized)
{
- std::lock_guard<std::mutex> lock(it->second->m_mutex);
- if (!it->second->m_initialized)
- {
- UErrorCode status = U_ZERO_ERROR;
-
- std::string const removeDiacriticRule =
- ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC";
- transliteratorId.append(removeDiacriticRule);
+ UErrorCode status = U_ZERO_ERROR;
- UnicodeString translitId(transliteratorId.c_str());
+ std::string const removeDiacriticRule =
+ ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC";
+ transliteratorId.append(removeDiacriticRule);
- it->second->m_transliterator.reset(
- Transliterator::createInstance(translitId, UTRANS_FORWARD, status));
+ UnicodeString translitId(transliteratorId.c_str());
- if (it->second->m_transliterator == nullptr)
- {
- LOG(LWARNING,
- ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status));
- }
+ it->second->m_transliterator.reset(
+ Transliterator::createInstance(translitId, UTRANS_FORWARD, status));
- it->second->m_initialized = true;
+ if (it->second->m_transliterator == nullptr)
+ {
+ LOG(LWARNING,
+ ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status));
}
+
+ it->second->m_initialized = true;
}
+ }
- if (it->second->m_transliterator == nullptr)
- continue;
+ if (it->second->m_transliterator == nullptr)
+ return false;
- it->second->m_transliterator->transliterate(ustr);
+ it->second->m_transliterator->transliterate(ustr);
+
+ if (ustr.isEmpty())
+ return false;
+
+ return true;
+}
+
+bool Transliteration::Transliterate(std::string const & str, std::string transliteratorId,
+ std::string & out) const
+{
+ UnicodeString ustr(str.c_str());
+ auto const res = Transliterate(transliteratorId, ustr);
+ if (res)
+ ustr.toUTF8String(out);
+ return res;
+}
+
+bool Transliteration::Transliterate(std::string const & str, int8_t langCode,
+ std::string & out) const
+{
+ if (m_mode != Mode::Enabled)
+ return false;
+
+ if (str.empty() || strings::IsASCIIString(str))
+ return false;
+
+ auto const & transliteratorsIds = StringUtf8Multilang::GetTransliteratorsIdsByCode(langCode);
+ if (transliteratorsIds.empty())
+ return false;
+
+ UnicodeString ustr(str.c_str());
+ for (auto transliteratorId : transliteratorsIds)
+ {
+ if (!Transliterate(transliteratorId, ustr))
+ continue;
- if (ustr.isEmpty())
- return false;
}
+ if (ustr.isEmpty())
+ return false;
+
ustr.toUTF8String(out);
return true;
}
diff --git a/coding/transliteration.hpp b/coding/transliteration.hpp
index cefe519d82..e2c1e73fef 100644
--- a/coding/transliteration.hpp
+++ b/coding/transliteration.hpp
@@ -4,8 +4,14 @@
#include <cstdint>
#include <map>
#include <memory>
+#include <mutex>
#include <string>
+namespace icu
+{
+class UnicodeString;
+} // namespace icu
+
class Transliteration
{
public:
@@ -22,13 +28,19 @@ public:
void Init(std::string const & icuDataDir);
void SetMode(Mode mode);
+ bool Transliterate(std::string const & str, std::string transliteratorId,
+ std::string & out) const;
bool Transliterate(std::string const & str, int8_t langCode, std::string & out) const;
private:
+ struct TransliteratorInfo;
+
Transliteration();
- std::atomic<Mode> m_mode;
+ bool Transliterate(std::string transliteratorId, icu::UnicodeString & ustr) const;
- struct TransliteratorInfo;
+ std::mutex m_initializationMutex;
+ std::atomic<bool> m_inited;
+ std::atomic<Mode> m_mode;
std::map<std::string, std::unique_ptr<TransliteratorInfo>> m_transliterators;
};