diff options
author | Maxim Pimenov <m@maps.me> | 2018-12-12 20:17:50 +0300 |
---|---|---|
committer | Tatiana Yan <tatiana.kondakova@gmail.com> | 2018-12-13 13:19:18 +0300 |
commit | 5bd46a0144339fb35e9c1cba55050dcdadc7a67f (patch) | |
tree | d4673f294b7a47a1fed0c56a3d17c6193ed11b3a /coding | |
parent | f07de9288b85b7d7952801383b7fa3c20756de5b (diff) |
[coding] Renamed the StringUtf8Multilang files.
Also added some documentation on how StringUtf8Multilang is stored.
Diffstat (limited to 'coding')
-rw-r--r-- | coding/CMakeLists.txt | 8 | ||||
-rw-r--r-- | coding/coding_tests/CMakeLists.txt | 2 | ||||
-rw-r--r-- | coding/coding_tests/string_utf8_multilang_tests.cpp (renamed from coding/coding_tests/multilang_utf8_string_test.cpp) | 13 | ||||
-rw-r--r-- | coding/string_utf8_multilang.cpp (renamed from coding/multilang_utf8_string.cpp) | 6 | ||||
-rw-r--r-- | coding/string_utf8_multilang.hpp (renamed from coding/multilang_utf8_string.hpp) | 18 | ||||
-rw-r--r-- | coding/transliteration.cpp | 7 | ||||
-rw-r--r-- | coding/transliteration.hpp | 1 |
7 files changed, 37 insertions, 18 deletions
diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt index d07ad421f2..c7fbe99724 100644 --- a/coding/CMakeLists.txt +++ b/coding/CMakeLists.txt @@ -54,8 +54,6 @@ set( memory_region.hpp mmap_reader.cpp mmap_reader.hpp - multilang_utf8_string.cpp - multilang_utf8_string.hpp parse_xml.hpp point_coding.cpp point_coding.hpp @@ -71,13 +69,15 @@ set( reader_writer_ops.hpp serdes_binary_header.hpp serdes_json.hpp - simple_dense_coding.cpp - simple_dense_coding.hpp sha1.cpp sha1.hpp + simple_dense_coding.cpp + simple_dense_coding.hpp streams.hpp streams_common.hpp streams_sink.hpp + string_utf8_multilang.cpp + string_utf8_multilang.hpp succinct_mapper.hpp tesselator_decl.hpp text_storage.hpp diff --git a/coding/coding_tests/CMakeLists.txt b/coding/coding_tests/CMakeLists.txt index f8ee6ec16c..86d6ab7cbb 100644 --- a/coding/coding_tests/CMakeLists.txt +++ b/coding/coding_tests/CMakeLists.txt @@ -23,7 +23,6 @@ set( huffman_test.cpp mem_file_reader_test.cpp mem_file_writer_test.cpp - multilang_utf8_string_test.cpp png_decoder_test.cpp point_coding_tests.cpp reader_cache_test.cpp @@ -31,6 +30,7 @@ set( reader_test.hpp reader_writer_ops_test.cpp simple_dense_coding_test.cpp + string_utf8_multilang_tests.cpp succinct_mapper_test.cpp test_polylines.cpp test_polylines.hpp diff --git a/coding/coding_tests/multilang_utf8_string_test.cpp b/coding/coding_tests/string_utf8_multilang_tests.cpp index d9c89ee1db..b79a512a1b 100644 --- a/coding/coding_tests/multilang_utf8_string_test.cpp +++ b/coding/coding_tests/string_utf8_multilang_tests.cpp @@ -1,6 +1,6 @@ #include "testing/testing.hpp" -#include "coding/multilang_utf8_string.hpp" +#include "coding/string_utf8_multilang.hpp" #include "base/control_flow.hpp" @@ -121,11 +121,12 @@ UNIT_TEST(MultilangString_Unique) UNIT_TEST(MultilangString_LangNames) { // It is important to compare the contents of the strings, and not just pointers - TEST_EQUAL(string("Беларуская"), StringUtf8Multilang::GetLangNameByCode(StringUtf8Multilang::GetLangIndex("be")), ()); + TEST_EQUAL(string("Беларуская"), + StringUtf8Multilang::GetLangNameByCode(StringUtf8Multilang::GetLangIndex("be")), ()); auto const & langs = StringUtf8Multilang::GetSupportedLanguages(); - // Using size_t workaround, because our logging/testing macroses do not support passing POD types by value, - // only by reference. And our constant is a constexpr. + // Using size_t workaround, because our logging/testing macroses do not support passing POD types + // by value, only by reference. And our constant is a constexpr. TEST_EQUAL(langs.size(), size_t(StringUtf8Multilang::kMaxSupportedLanguages), ()); auto const international = StringUtf8Multilang::GetLangIndex("int_name"); TEST_EQUAL(langs[international].m_code, string("int_name"), ()); @@ -137,11 +138,11 @@ UNIT_TEST(MultilangString_HasString) s.AddString(0, "xxx"); s.AddString(18, "yyy"); s.AddString(63, "zzz"); - + TEST(s.HasString(0), ()); TEST(s.HasString(18), ()); TEST(s.HasString(63), ()); - + TEST(!s.HasString(1), ()); TEST(!s.HasString(32), ()); } diff --git a/coding/multilang_utf8_string.cpp b/coding/string_utf8_multilang.cpp index 51c5b58357..fb84f41c61 100644 --- a/coding/multilang_utf8_string.cpp +++ b/coding/string_utf8_multilang.cpp @@ -1,4 +1,4 @@ -#include "coding/multilang_utf8_string.hpp" +#include "coding/string_utf8_multilang.hpp" #include "defines.hpp" @@ -202,12 +202,12 @@ bool StringUtf8Multilang::GetString(int8_t lang, string & utf8s) const bool StringUtf8Multilang::HasString(int8_t lang) const { - for(size_t i = 0; i < m_s.size(); i = GetNextIndex(i)) + for (size_t i = 0; i < m_s.size(); i = GetNextIndex(i)) { if ((m_s[i] & 0x3F) == lang) return true; } - + return false; } diff --git a/coding/multilang_utf8_string.hpp b/coding/string_utf8_multilang.hpp index f336dd0f84..ca54fbac82 100644 --- a/coding/multilang_utf8_string.hpp +++ b/coding/string_utf8_multilang.hpp @@ -42,6 +42,24 @@ void ReadString(TSource & src, std::string & s) } } // namespace utils +// A class to store strings in multiple languages. +// May be used e.g. to store several translations of a feature's name. +// +// The coding scheme is as follows: +// * Pairs of the form (|lang|, |s|) are stored. |s| is a string in the UTF-8 +// encoding and |lang| is one of the 64 supported languages (see the list in the cpp file). +// +// * Each pair is represented by a byte encoding the lang followed by the +// UTF-8 bytes of the string. Then, all such representations are concatenated +// into a single std::string. +// The language code is encoded with 6 bits that are prepended with "10", i.e. +// 10xx xxxx. In the UTF-8 encoding that would be a continuation byte, so +// if you start reading the string and such a byte appears out of nowhere in +// a place where a continuation byte is not expected you may be sure +// that the string for the current language has ended and you've reached the +// string for the next language. Note that this breaks the self-synchronization property. +// +// * The order of the stored strings is not specified. Any language may come first. class StringUtf8Multilang { public: diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp index e8e73aa8f3..22f21d2bcf 100644 --- a/coding/transliteration.cpp +++ b/coding/transliteration.cpp @@ -1,5 +1,6 @@ #include "coding/transliteration.hpp" -#include "coding/multilang_utf8_string.hpp" + +#include "coding/string_utf8_multilang.hpp" #include "base/logging.hpp" #include "base/string_utils.hpp" @@ -10,8 +11,6 @@ #include "3party/icu/i18n/unicode/translit.h" #include "3party/icu/i18n/unicode/utrans.h" -#include "std/unique_ptr.hpp" - #include <cstring> #include <mutex> @@ -58,7 +57,7 @@ void Transliteration::Init(std::string const & icuDataDir) if (strlen(lang.m_transliteratorId) == 0 || m_transliterators.count(lang.m_transliteratorId) != 0) continue; - m_transliterators.emplace(lang.m_transliteratorId, make_unique<TransliteratorInfo>()); + m_transliterators.emplace(lang.m_transliteratorId, std::make_unique<TransliteratorInfo>()); } } diff --git a/coding/transliteration.hpp b/coding/transliteration.hpp index 00fbaa0497..cefe519d82 100644 --- a/coding/transliteration.hpp +++ b/coding/transliteration.hpp @@ -1,6 +1,7 @@ #pragma once #include <atomic> +#include <cstdint> #include <map> #include <memory> #include <string> |