diff options
author | Daria Volvenkova <d.volvenkova@corp.mail.ru> | 2018-11-02 04:36:07 +0300 |
---|---|---|
committer | mpimenov <mpimenov@users.noreply.github.com> | 2018-11-23 13:49:43 +0300 |
commit | 7cd27d26171073cc20482ae0f5fbf695b985fdb4 (patch) | |
tree | 29dd01309ff151325b40da0bd69f91432572517c /descriptions | |
parent | e4d44c3bfa8258dc9b25de46eeeb5928b83246a0 (diff) |
[descriptions] Description section serializer and deserializer added.
Diffstat (limited to 'descriptions')
-rw-r--r-- | descriptions/CMakeLists.txt | 13 | ||||
-rw-r--r-- | descriptions/descriptions_tests/CMakeLists.txt | 22 | ||||
-rw-r--r-- | descriptions/descriptions_tests/descriptions_tests.cpp | 62 | ||||
-rw-r--r-- | descriptions/header.hpp | 45 | ||||
-rw-r--r-- | descriptions/loader.cpp | 46 | ||||
-rw-r--r-- | descriptions/loader.hpp | 44 | ||||
-rw-r--r-- | descriptions/serdes.cpp | 47 | ||||
-rw-r--r-- | descriptions/serdes.hpp | 309 |
8 files changed, 588 insertions, 0 deletions
diff --git a/descriptions/CMakeLists.txt b/descriptions/CMakeLists.txt
new file mode 100644
index 0000000000..d8952eee7a
--- /dev/null
+++ b/descriptions/CMakeLists.txt
@@ -0,0 +1,13 @@
+project(descriptions)
+
+set(
+  SRC
+  header.hpp
+  loader.cpp
+  loader.hpp
+  serdes.cpp
+  serdes.hpp
+)
+
+omim_add_library(${PROJECT_NAME} ${SRC})
+omim_add_test_subdirectory(descriptions_tests)
diff --git a/descriptions/descriptions_tests/CMakeLists.txt b/descriptions/descriptions_tests/CMakeLists.txt
new file mode 100644
index 0000000000..52cfc02c4b
--- /dev/null
+++ b/descriptions/descriptions_tests/CMakeLists.txt
@@ -0,0 +1,22 @@
+project(descriptions_tests)
+
+set(
+  SRC
+  descriptions_tests.cpp
+)
+
+omim_add_test(${PROJECT_NAME} ${SRC})
+
+omim_link_libraries(
+  ${PROJECT_NAME}
+  descriptions
+  indexer
+  platform
+  coding
+  base
+  jansson
+  stats_client
+  ${LIBZ}
+)
+
+link_qt5_core(${PROJECT_NAME})
diff --git a/descriptions/descriptions_tests/descriptions_tests.cpp b/descriptions/descriptions_tests/descriptions_tests.cpp
new file mode 100644
index 0000000000..e37592b6cc
--- /dev/null
+++ b/descriptions/descriptions_tests/descriptions_tests.cpp
@@ -0,0 +1,62 @@
+#include "testing/testing.hpp"
+
+#include "descriptions/serdes.hpp"
+
+#include "coding/reader.hpp"
+#include "coding/writer.hpp"
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace descriptions;
+
+UNIT_TEST(Descriptions_SerDes)
+{
+  std::map<FeatureIndex, std::map<LangCode, std::string>> data =
+    { {100, {{10, "Description of feature 100, language 10."},
+             {11, "Описание фичи 100, язык 11."}}},
+      {101, {{11, "Описание фичи 101, язык 11."}}},
+      {102, {{11, "Описание фичи 102, язык 11."},
+             {10, "Description of feature 102, language 10."}}}
+    };
+
+  DescriptionsCollection descriptionsCollection;
+  for (auto const & featureDesc : data)
+  {
+    StringUtf8Multilang str;
+    for (auto const & translation : featureDesc.second)
+      str.AddString(translation.first, translation.second);
+    descriptionsCollection.emplace_back(featureDesc.first, std::move(str));
+  }
+
+  std::vector<uint8_t> buffer;
+  {
+    Serializer ser(std::move(descriptionsCollection));
+    MemWriter<decltype(buffer)> writer(buffer);
+    ser.Serialize(writer);
+  }
+
+  std::string description1;
+  std::string description2;
+  std::string description3;
+  std::string description4;
+  std::string description5;
+  {
+    Deserializer des;
+    MemReader reader(buffer.data(), buffer.size());
+    TEST(des.Deserialize(reader, 102, {11, 10}, description1), ());
+    TEST(des.Deserialize(reader, 100, {12, 10}, description2), ());
+    TEST(!des.Deserialize(reader, 101, {12}, description3), ());
+    TEST(!des.Deserialize(reader, 0, {10, 11}, description4), ());
+    TEST(des.Deserialize(reader, 102, {10}, description5), ());
+  }
+
+  TEST_EQUAL(description1, "Описание фичи 102, язык 11.", ());
+  TEST_EQUAL(description2, "Description of feature 100, language 10.", ());
+  TEST_EQUAL(description3, "", ());
+  TEST_EQUAL(description4, "", ());
+  TEST_EQUAL(description5, "Description of feature 102, language 10.", ());
+}
diff --git a/descriptions/header.hpp b/descriptions/header.hpp
new file mode 100644
index 0000000000..647b70559d
--- /dev/null
+++ b/descriptions/header.hpp
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "coding/reader.hpp"
+#include "coding/serdes_binary_header.hpp"
+#include "coding/write_to_sink.hpp"
+
+#include <cstdint>
+
+namespace descriptions
+{
+// Header of the descriptions section, format version 0.
+// All offsets are relative to the position of the header itself.
+struct HeaderV0
+{
+  template <typename Visitor>
+  void Visit(Visitor & visitor)
+  {
+    visitor(m_featuresOffset, "featuresOffset");
+    visitor(m_langMetaOffset, "langMetaOffset");
+    visitor(m_indexOffset, "indexOffset");
+    visitor(m_stringsOffset, "stringsOffset");
+    visitor(m_eosOffset, "eosOffset");
+  }
+
+  template <typename Sink>
+  void Serialize(Sink & sink)
+  {
+    coding::binary::HeaderSerVisitor<Sink> visitor(sink);
+    visitor(*this);
+  }
+
+  template <typename Source>
+  void Deserialize(Source & source)
+  {
+    coding::binary::HeaderDesVisitor<Source> visitor(source);
+    visitor(*this);
+  }
+
+  uint64_t m_featuresOffset = 0; // Sorted feature ids.
+  uint64_t m_langMetaOffset = 0; // (Language code, string index) pairs of every feature.
+  uint64_t m_indexOffset = 0; // Offsets of per-feature records inside the lang meta table.
+  uint64_t m_stringsOffset = 0; // Compressed strings.
+  uint64_t m_eosOffset = 0; // End of section.
+};
+} // namespace descriptions
diff --git a/descriptions/loader.cpp b/descriptions/loader.cpp
new file mode 100644
index 0000000000..d55ae79adf
--- /dev/null
+++ b/descriptions/loader.cpp
@@ -0,0 +1,46 @@
+#include "descriptions/loader.hpp"
+
+#include "indexer/data_source.hpp"
+
+#include "base/assert.hpp"
+
+#include "defines.hpp"
+
+namespace descriptions
+{
+bool Loader::GetDescription(FeatureID const & featureId, std::vector<int8_t> const & langPriority,
+                            std::string & description)
+{
+  auto const handle = m_dataSource.GetMwmHandleById(featureId.m_mwmId);
+
+  if (!handle.IsAlive())
+    return false;
+
+  auto const & value = *handle.GetValue<MwmValue>();
+
+  if (!value.m_cont.IsExist(DESCRIPTIONS_FILE_TAG))
+    return false;
+
+  // Find or create the deserializer entry of the mwm; |m_mutex| guards only the map.
+  EntryPtr entry;
+  {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    auto it = m_deserializers.find(featureId.m_mwmId);
+
+    if (it == m_deserializers.end())
+    {
+      auto const result = m_deserializers.emplace(featureId.m_mwmId, std::make_shared<Entry>());
+      it = result.first;
+    }
+    entry = it->second;
+  }
+
+  ASSERT(entry, ());
+
+  auto readerPtr = value.m_cont.GetReader(DESCRIPTIONS_FILE_TAG);
+
+  // The deserializer has mutable state of its own, so calls for one mwm are serialized.
+  std::lock_guard<std::mutex> lock(entry->m_mutex);
+  return entry->m_deserializer.Deserialize(*readerPtr.GetPtr(), featureId.m_index, langPriority, description);
+}
+} // namespace descriptions
diff --git a/descriptions/loader.hpp b/descriptions/loader.hpp
new file mode 100644
index 0000000000..6e225ca068
--- /dev/null
+++ b/descriptions/loader.hpp
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "descriptions/serdes.hpp"
+
+#include "indexer/feature_decl.hpp"
+#include "indexer/mwm_set.hpp"
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+class DataSource;
+
+namespace descriptions
+{
+// *NOTE* This class IS thread-safe.
+class Loader
+{
+public:
+  explicit Loader(DataSource const & dataSource) : m_dataSource(dataSource) {}
+
+  // Stores in |description| the description of |featureId| in the first language from
+  // |langPriority| that has a translation. Returns false if the mwm is not alive, has no
+  // descriptions section or nothing suitable is found.
+  bool GetDescription(FeatureID const & featureId, std::vector<int8_t> const & langPriority,
+                      std::string & description);
+
+private:
+  struct Entry
+  {
+    std::mutex m_mutex;
+    Deserializer m_deserializer;
+  };
+
+  using EntryPtr = std::shared_ptr<Entry>;
+
+  DataSource const & m_dataSource;
+  std::map<MwmSet::MwmId, EntryPtr> m_deserializers;
+  std::mutex m_mutex; // Guards |m_deserializers|.
+};
+} // namespace descriptions
diff --git a/descriptions/serdes.cpp b/descriptions/serdes.cpp
new file mode 100644
index 0000000000..75da40dc8b
--- /dev/null
+++ b/descriptions/serdes.cpp
@@ -0,0 +1,47 @@
+#include "descriptions/serdes.hpp"
+
+#include <utility>
+
+namespace descriptions
+{
+Serializer::Serializer(DescriptionsCollection && descriptions)
+  : m_descriptions(std::move(descriptions))
+{
+  std::sort(m_descriptions.begin(), m_descriptions.end(), base::LessBy(&FeatureDescription::m_featureIndex));
+
+  m_langMetaCollection.reserve(m_descriptions.size());
+
+  // Group descriptions by language. For now every translation is given its position inside
+  // its own language group.
+  for (size_t i = 0; i < m_descriptions.size(); ++i)
+  {
+    auto & index = m_descriptions[i];
+
+    LangMeta langMeta;
+    index.m_description.ForEach([this, &langMeta, i](LangCode lang, std::string const & /* str */)
+                                {
+                                  auto & group = m_groupedByLang[lang];
+                                  langMeta.insert(std::make_pair(lang, static_cast<StringIndex>(group.size())));
+                                  group.push_back(i);
+                                });
+    m_langMetaCollection.push_back(langMeta);
+  }
+
+  // Strings are stored language by language, so the first string of a group gets an index
+  // equal to the total size of all preceding groups.
+  std::map<LangCode, uint32_t> indicesOffsets;
+  uint32_t currentOffset = 0;
+  for (auto & langIndices : m_groupedByLang)
+  {
+    indicesOffsets.insert(std::make_pair(langIndices.first, currentOffset));
+    currentOffset += static_cast<uint32_t>(langIndices.second.size());
+  }
+
+  // Turn the in-group positions into indices in the whole strings storage.
+  for (auto & langMeta : m_langMetaCollection)
+  {
+    for (auto & translation : langMeta)
+      translation.second += indicesOffsets[translation.first];
+  }
+}
+} // namespace descriptions
diff --git a/descriptions/serdes.hpp b/descriptions/serdes.hpp
new file mode 100644
index 0000000000..8c072a09e4
--- /dev/null
+++ b/descriptions/serdes.hpp
@@ -0,0 +1,309 @@
+#pragma once
+
+#include "descriptions/header.hpp"
+
+#include "indexer/feature_decl.hpp"
+
+#include "coding/dd_vector.hpp"
+#include "coding/multilang_utf8_string.hpp"
+#include "coding/text_storage.hpp"
+
+#include "base/assert.hpp"
+#include "base/stl_helpers.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <utility>
+
+namespace descriptions
+{
+using FeatureIndex = uint32_t;
+using StringIndex = uint32_t;
+using LangCode = int8_t;
+using LangMeta = std::unordered_map<LangCode, StringIndex>;
+using LangMetaOffset = uint32_t;
+
+enum class Version : uint8_t
+{
+  V0 = 0,
+  Latest = V0
+};
+
+struct FeatureDescription
+{
+  FeatureDescription() = default;
+  FeatureDescription(FeatureIndex index, StringUtf8Multilang && description)
+    : m_featureIndex(index)
+    , m_description(std::move(description))
+  {}
+
+  FeatureIndex m_featureIndex = 0;
+  StringUtf8Multilang m_description;
+};
+using DescriptionsCollection = std::vector<FeatureDescription>;
+
+/// \brief
+/// Section name: "descriptions".
+/// Description: keeping text descriptions of features in different languages.
+/// Section tables:
+/// * header
+/// * sorted feature ids vector
+/// * vector of unordered maps with language codes and string indices of corresponding translations of a description
+/// * vector of maps offsets for each feature id (and one additional dummy offset in the end)
+/// * BWT-compressed strings grouped by language.
+class Serializer
+{
+public:
+  /// \param descriptions unsorted collection of feature descriptions.
+  explicit Serializer(DescriptionsCollection && descriptions);
+
+  // Writes the version byte followed by the section. The header is first written as a
+  // placeholder and then rewritten once all table offsets are known, so |sink| must
+  // support Pos() and Seek().
+  template <typename Sink>
+  void Serialize(Sink & sink)
+  {
+    WriteToSink(sink, static_cast<uint8_t>(Version::Latest));
+
+    auto const startPos = sink.Pos();
+
+    HeaderV0 header;
+    header.Serialize(sink);
+
+    header.m_featuresOffset = sink.Pos() - startPos;
+    SerializeFeaturesIndices(sink);
+
+    std::vector<LangMetaOffset> offsets;
+    header.m_langMetaOffset = sink.Pos() - startPos;
+    SerializeLangMetaCollection(sink, offsets);
+
+    header.m_indexOffset = sink.Pos() - startPos;
+    SerializeLangMetaIndex(sink, offsets);
+
+    header.m_stringsOffset = sink.Pos() - startPos;
+    SerializeStrings(sink);
+
+    header.m_eosOffset = sink.Pos() - startPos;
+    sink.Seek(startPos);
+    header.Serialize(sink);
+    sink.Seek(startPos + header.m_eosOffset);
+  }
+
+  // Serializes a vector of 32-bit sorted feature ids.
+  template <typename Sink>
+  void SerializeFeaturesIndices(Sink & sink)
+  {
+    CHECK(std::is_sorted(m_descriptions.begin(), m_descriptions.end(),
+                         base::LessBy(&FeatureDescription::m_featureIndex)), ());
+
+    for (auto const & index : m_descriptions)
+      WriteToSink(sink, index.m_featureIndex);
+  }
+
+  // Serializes (language code, string index) pairs of every feature and collects in |offsets|
+  // the start of each feature's records plus one final end offset.
+  template <typename Sink>
+  void SerializeLangMetaCollection(Sink & sink, std::vector<LangMetaOffset> & offsets)
+  {
+    auto const startPos = sink.Pos();
+    for (auto const & meta : m_langMetaCollection)
+    {
+      offsets.push_back(static_cast<LangMetaOffset>(sink.Pos() - startPos));
+      for (auto const & pair : meta)
+      {
+        WriteToSink(sink, pair.first);
+        WriteVarUint(sink, pair.second);
+      }
+    }
+    offsets.push_back(static_cast<LangMetaOffset>(sink.Pos() - startPos));
+  }
+
+  // Serializes offsets collected by SerializeLangMetaCollection().
+  template <typename Sink>
+  void SerializeLangMetaIndex(Sink & sink, std::vector<LangMetaOffset> const & offsets)
+  {
+    for (auto const & offset : offsets)
+      WriteToSink(sink, offset);
+  }
+
+  // Serializes strings in a compressed storage with block access.
+  template <typename Sink>
+  void SerializeStrings(Sink & sink)
+  {
+    coding::BlockedTextStorageWriter<Sink> writer(sink, 200000 /* blockSize */);
+    std::string str;
+    for (auto const & langIndices : m_groupedByLang)
+    {
+      for (auto const & descIndex : langIndices.second)
+      {
+        auto const found = m_descriptions[descIndex].m_description.GetString(langIndices.first, str);
+        CHECK(found, ());
+        writer.Append(str);
+      }
+    }
+  }
+
+private:
+  DescriptionsCollection m_descriptions;
+  std::vector<LangMeta> m_langMetaCollection;
+  std::map<LangCode, std::vector<size_t>> m_groupedByLang;
+};
+
+// Reads descriptions from a serialized "descriptions" section.
+// The header is read on the first call and cached, so one instance must always be used
+// with the same section. The class has no synchronization of its own.
+class Deserializer
+{
+public:
+  // Stores in |description| the translation for |featureIndex| in the first language of
+  // |langPriority| that has one. Returns false (leaving |description| untouched) when the
+  // feature or a suitable translation is absent.
+  template <typename Reader>
+  bool Deserialize(Reader & reader, FeatureIndex featureIndex, std::vector<LangCode> const & langPriority,
+                   std::string & description)
+  {
+    NonOwningReaderSource source(reader);
+    auto const version = static_cast<Version>(ReadPrimitiveFromSource<uint8_t>(source));
+
+    auto subReader = reader.CreateSubReader(source.Pos(), source.Size());
+    CHECK(subReader, ());
+
+    switch (version)
+    {
+    case Version::V0: return DeserializeV0(*subReader, featureIndex, langPriority, description);
+    }
+    CHECK_SWITCH();
+
+    return false;
+  }
+
+  // Version 0 reader; |reader| must start at the header (right after the version byte).
+  template <typename Reader>
+  bool DeserializeV0(Reader & reader, FeatureIndex featureIndex, std::vector<LangCode> const & langPriority,
+                     std::string & description)
+  {
+    InitializeIfNeeded(reader);
+
+    // Find the feature and the [startOffset, endOffset) range of its lang meta records.
+    LangMetaOffset startOffset = 0;
+    LangMetaOffset endOffset = 0;
+    {
+      ReaderPtr<Reader> idsSubReader(CreateFeatureIndicesSubReader(reader));
+      DDVector<FeatureIndex, ReaderPtr<Reader>> ids(idsSubReader);
+      auto const it = std::lower_bound(ids.begin(), ids.end(), featureIndex);
+      if (it == ids.end() || *it != featureIndex)
+        return false;
+
+      auto const d = static_cast<uint32_t>(std::distance(ids.begin(), it));
+
+      ReaderPtr<Reader> ofsSubReader(CreateLangMetaOffsetsSubReader(reader));
+      DDVector<LangMetaOffset, ReaderPtr<Reader>> ofs(ofsSubReader);
+      CHECK_LESS(d, ofs.size(), ());
+      CHECK_LESS(d + 1, ofs.size(), ());
+
+      startOffset = ofs[d];
+      endOffset = ofs[d + 1];
+    }
+
+    // Decode (language code, string index) pairs of the feature.
+    LangMeta langMeta;
+    {
+      auto langMetaSubReader = CreateLangMetaSubReader(reader, startOffset, endOffset);
+      NonOwningReaderSource source(*langMetaSubReader);
+
+      while (source.Size() > 0)
+      {
+        auto const lang = ReadPrimitiveFromSource<LangCode>(source);
+        auto const stringIndex = ReadVarUint<StringIndex>(source);
+        langMeta.insert(std::make_pair(lang, stringIndex));
+      }
+    }
+
+    // The first language from |langPriority| having a translation wins.
+    auto stringsSubReader = CreateStringsSubReader(reader);
+    for (auto const lang : langPriority)
+    {
+      auto const it = langMeta.find(lang);
+      if (it != langMeta.end())
+      {
+        description = m_stringsReader.ExtractString(*stringsSubReader, it->second);
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  template <typename Reader>
+  std::unique_ptr<Reader> CreateFeatureIndicesSubReader(Reader & reader)
+  {
+    CHECK(m_initialized, ());
+
+    auto const pos = m_header.m_featuresOffset;
+    CHECK_GREATER_OR_EQUAL(m_header.m_langMetaOffset, pos, ());
+    auto const size = m_header.m_langMetaOffset - pos;
+    return reader.CreateSubReader(pos, size);
+  }
+
+  template <typename Reader>
+  std::unique_ptr<Reader> CreateLangMetaOffsetsSubReader(Reader & reader)
+  {
+    CHECK(m_initialized, ());
+
+    auto const pos = m_header.m_indexOffset;
+    CHECK_GREATER_OR_EQUAL(m_header.m_stringsOffset, pos, ());
+    auto const size = m_header.m_stringsOffset - pos;
+    return reader.CreateSubReader(pos, size);
+  }
+
+  template <typename Reader>
+  std::unique_ptr<Reader> CreateLangMetaSubReader(Reader & reader, LangMetaOffset startOffset, LangMetaOffset endOffset)
+  {
+    CHECK(m_initialized, ());
+
+    auto const pos = m_header.m_langMetaOffset + startOffset;
+    CHECK_GREATER_OR_EQUAL(m_header.m_indexOffset, pos, ());
+    CHECK_GREATER_OR_EQUAL(endOffset, startOffset, ());
+    auto const size = endOffset - startOffset;
+    CHECK_GREATER_OR_EQUAL(m_header.m_indexOffset, pos + size, ());
+    return reader.CreateSubReader(pos, size);
+  }
+
+  template <typename Reader>
+  std::unique_ptr<Reader> CreateStringsSubReader(Reader & reader)
+  {
+    CHECK(m_initialized, ());
+
+    auto const pos = m_header.m_stringsOffset;
+    CHECK_GREATER_OR_EQUAL(m_header.m_eosOffset, pos, ());
+    auto const size = m_header.m_eosOffset - pos;
+    return reader.CreateSubReader(pos, size);
+  }
+
+private:
+  // Reads the header from the beginning of |reader| on the first call; later calls are no-ops.
+  template <typename Reader>
+  void InitializeIfNeeded(Reader & reader)
+  {
+    if (m_initialized)
+      return;
+
+    {
+      NonOwningReaderSource source(reader);
+      m_header.Deserialize(source);
+    }
+
+    m_initialized = true;
+  }
+
+  bool m_initialized = false;
+  HeaderV0 m_header;
+  coding::BlockedTextStorageReader m_stringsReader;
+};
+} // namespace descriptions