diff options
author | Yuri Gorshenin <mipt.vi002@gmail.com> | 2017-07-13 18:04:37 +0300 |
---|---|---|
committer | Arsentiy Milchakov <milcars@mapswithme.com> | 2017-07-13 18:04:37 +0300 |
commit | 92358135885b3f71767fa8b6fc53af160ae58d9d (patch) | |
tree | 1a37c4f0935ebbe5668180a93e9841b50d963cee /ugc/binary | |
parent | 498e98f8d0baf92b75ecac31cf96dbee2f3ec041 (diff) |
[ugc] Binary SerDes. (#6577)
* [ugc] Binary SerDes.
* Review fixes.
Diffstat (limited to 'ugc/binary')
-rw-r--r-- | ugc/binary/header_v0.hpp | 104 | ||||
-rw-r--r-- | ugc/binary/index_ugc.hpp | 27 | ||||
-rw-r--r-- | ugc/binary/serdes.cpp | 103 | ||||
-rw-r--r-- | ugc/binary/serdes.hpp | 298 | ||||
-rw-r--r-- | ugc/binary/ugc_holder.hpp | 25 | ||||
-rw-r--r-- | ugc/binary/visitors.hpp | 210 |
6 files changed, 767 insertions, 0 deletions
diff --git a/ugc/binary/header_v0.hpp b/ugc/binary/header_v0.hpp new file mode 100644 index 0000000000..be5846d2a3 --- /dev/null +++ b/ugc/binary/header_v0.hpp @@ -0,0 +1,104 @@ +#pragma once + +#include "coding/reader.hpp" +#include "coding/write_to_sink.hpp" + +#include <cstdint> + +namespace ugc +{ +namespace binary +{ +namespace impl +{ +struct HeaderSizeOfVisitor +{ + void operator()(uint64_t v, char const * /* name */ = nullptr) { m_size += sizeof(v); } + + template <typename R> + void operator()(R & r, char const * /* name */ = nullptr) + { + r.Visit(*this); + } + + uint64_t m_size = 0; +}; + +template <typename Sink> +struct HeaderSerVisitor +{ + HeaderSerVisitor(Sink & sink) : m_sink(sink) {} + + void operator()(uint64_t v, char const * /* name */ = nullptr) { WriteToSink(m_sink, v); } + + template <typename R> + void operator()(R & r, char const * /* name */ = nullptr) + { + r.Visit(*this); + } + + Sink & m_sink; +}; + +template <typename Source> +struct HeaderDesVisitor +{ + HeaderDesVisitor(Source & source): m_source(source) {} + + void operator()(uint64_t & v, char const * /* name */ = nullptr) + { + v = ReadPrimitiveFromSource<uint64_t>(m_source); + } + + template <typename R> + void operator()(R & r, char const * /* name */ = nullptr) + { + r.Visit(*this); + } + + Source & m_source; +}; +} // namespace impl + +struct HeaderV0 +{ + template <typename Visitor> + void Visit(Visitor & visitor) + { + visitor(m_keysOffset, "keysOffset"); + visitor(m_ugcsOffset, "ugcsOffset"); + visitor(m_indexOffset, "indexOffset"); + visitor(m_textsOffset, "textsOffset"); + visitor(m_eosOffset, "eosOffset"); + } + + template <typename Sink> + void Serialize(Sink & sink) + { + impl::HeaderSerVisitor<Sink> visitor(sink); + visitor(*this); + } + + template <typename Source> + void Deserialize(Source & source) + { + impl::HeaderDesVisitor<Source> visitor(source); + visitor(*this); + } + + // Calculates the size of serialized header in bytes. + uint64_t Size() + { + impl::HeaderSizeOfVisitor visitor; + visitor(*this); + return visitor.m_size; + } + + uint64_t m_keysOffset = 0; + uint64_t m_ugcsOffset = 0; + uint64_t m_indexOffset = 0; + uint64_t m_textsOffset = 0; + uint64_t m_eosOffset = 0; +}; +} // namespace binary +} // namespace ugc diff --git a/ugc/binary/index_ugc.hpp b/ugc/binary/index_ugc.hpp new file mode 100644 index 0000000000..bad35a25b8 --- /dev/null +++ b/ugc/binary/index_ugc.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "ugc/types.hpp" + +#include <cstdint> +#include <utility> + +namespace ugc +{ +namespace binary +{ +struct IndexUGC +{ + using Index = uint32_t; + + IndexUGC() = default; + + template <typename U> + IndexUGC(Index index, U && ugc) : m_index(index), m_ugc(std::forward<U>(ugc)) + { + } + + Index m_index = 0; + UGC m_ugc = {}; +}; +} // namespace binary +} // namespace ugc diff --git a/ugc/binary/serdes.cpp b/ugc/binary/serdes.cpp new file mode 100644 index 0000000000..fdecacaff8 --- /dev/null +++ b/ugc/binary/serdes.cpp @@ -0,0 +1,103 @@ +#include "ugc/binary/serdes.hpp" + +#include <set> + +using namespace std; + +namespace ugc +{ +namespace binary +{ +namespace +{ +class BaseCollector +{ +public: + virtual ~BaseCollector() = default; + + virtual void VisitRating(float const f, char const * /* name */ = nullptr) {} + virtual void operator()(string const & /* s */, char const * /* name */ = nullptr) {} + virtual void operator()(Sentiment const /* sentiment */, char const * /* name */ = nullptr) {} + virtual void operator()(Time const /* time */, char const * /* name */ = nullptr) {} + virtual void operator()(TranslationKey const & tk, char const * /* name */ = nullptr) {} + virtual void operator()(Text const & text, char const * /* name */ = nullptr) {} + + template <typename T> + void operator()(vector<T> const & vs, char const * /* name */ = nullptr) + { + for (auto const & v : vs) + (*this)(v); + } + + template <typename R> + typename enable_if<is_fundamental<R>::value>::type operator()(R const & r, + char const * /* name */ = nullptr) + { + } + + template <typename R> + typename enable_if<!is_fundamental<R>::value>::type operator()(R const & r, + char const * /* name */ = nullptr) + { + r.Visit(*this); + } +}; + +// Collects all translation keys from UGC. +class TranslationKeyCollector : public BaseCollector +{ +public: + using BaseCollector::operator(); + + explicit TranslationKeyCollector(set<TranslationKey> & keys) : m_keys(keys) {} + + void operator()(TranslationKey const & tk, char const * /* name */ = nullptr) override + { + m_keys.insert(tk.m_key); + } + +private: + set<TranslationKey> & m_keys; +}; + +// Collects all texts from UGC. +class TextCollector : public BaseCollector +{ +public: + using BaseCollector::operator(); + + TextCollector(vector<Text> & texts) : m_texts(texts) {} + + void operator()(Text const & text, char const * /* name */ = nullptr) override + { + m_texts.push_back(text); + } + +private: + vector<Text> & m_texts; +}; +} // namespace + +void UGCSeriaizer::CollectTranslationKeys() +{ + ASSERT(m_keys.empty(), ()); + + set<TranslationKey> keys; + TranslationKeyCollector collector(keys); + for (auto const & p : m_ugcs) + collector(p.m_ugc); + m_keys.assign(keys.begin(), keys.end()); +} + +void UGCSeriaizer::CollectTexts() +{ + ASSERT(m_texts.empty(), ()); + for (auto const & p : m_ugcs) + { + m_texts.emplace_back(); + TextCollector collector(m_texts.back()); + collector(p.m_ugc); + } +} +} // namespace binary +} // namespace ugc diff --git a/ugc/binary/serdes.hpp b/ugc/binary/serdes.hpp new file mode 100644 index 0000000000..10826a7064 --- /dev/null +++ b/ugc/binary/serdes.hpp @@ -0,0 +1,298 @@ +#pragma once + +#include "ugc/binary/header_v0.hpp" +#include "ugc/binary/index_ugc.hpp" +#include "ugc/binary/visitors.hpp" +#include "ugc/types.hpp" + +#include "coding/bwt_coder.hpp" +#include "coding/dd_vector.hpp" +#include "coding/read_write_utils.hpp" +#include "coding/reader.hpp" +#include "coding/text_storage.hpp" +#include "coding/write_to_sink.hpp" + +#include "base/assert.hpp" + +#include <algorithm> +#include <cstdint> +#include <string> +#include <utility> +#include <vector> + +namespace ugc +{ +namespace binary +{ +using FeatureIndex = uint32_t; +using UGCOffset = uint64_t; + +class UGCSeriaizer +{ +public: + template <typename IndexUGCList> + UGCSeriaizer(IndexUGCList && ugcs) : m_ugcs(std::forward<IndexUGCList>(ugcs)) + { + std::sort(m_ugcs.begin(), m_ugcs.end(), [&](IndexUGC const & lhs, IndexUGC const & rhs) { + return lhs.m_index < rhs.m_index; + }); + ASSERT(std::is_sorted(m_ugcs.begin(), m_ugcs.end(), + [&](IndexUGC const & lhs, IndexUGC const & rhs) { + return lhs.m_index < rhs.m_index; + }), + ()); + + CollectTranslationKeys(); + CollectTexts(); + } + + template <typename Sink> + void Serialize(Sink & sink) + { + auto const startPos = sink.Pos(); + + HeaderV0 header; + WriteZeroesToSink(sink, header.Size()); + + header.m_keysOffset = sink.Pos() - startPos; + SerializeTranslationKeys(sink); + + std::vector<UGCOffset> ugcOffsets; + + header.m_ugcsOffset = sink.Pos() - startPos; + SerializeUGC(sink, ugcOffsets); + + header.m_indexOffset = sink.Pos() - startPos; + SerializeIndex(sink, ugcOffsets); + + header.m_textsOffset = sink.Pos() - startPos; + SerializeTexts(sink); + + header.m_eosOffset = sink.Pos() - startPos; + sink.Seek(startPos); + header.Serialize(sink); + sink.Seek(startPos + header.m_eosOffset); + } + + // Concatenates all translation keys prefixed with their length as + // varuint, then compresses them via BWT. + template <typename Sink> + void SerializeTranslationKeys(Sink & sink) + { + std::string allKeys; + { + MemWriter<std::string> writer(allKeys); + for (auto const & key : m_keys) + rw::Write(writer, key.m_key); + } + coding::BWTCoder::EncodeAndWriteBlock(sink, allKeys); + } + + // Performs a binary serialization of all UGCS, writes all relative + // offsets of serialized blobs to |offsets|. + template <typename Sink> + void SerializeUGC(Sink & sink, std::vector<UGCOffset> & offsets) + { + auto const startPos = sink.Pos(); + ASSERT_EQUAL(m_ugcs.size(), m_texts.size(), ()); + + uint64_t textFrom = 0; + for (size_t i = 0; i < m_ugcs.size(); ++i) + { + auto const currPos = sink.Pos(); + offsets.push_back(currPos - startPos); + + SerializerVisitor<Sink> ser(sink, m_keys, m_texts[i], textFrom); + ser(m_ugcs[i].m_ugc); + + textFrom += m_texts[i].size(); + } + } + + // Serializes feature ids and offsets of UGC blobs as two fixed-bits + // vectors. Length of vectors is the number of UGCs. The first + // vector is 32-bit sorted feature ids of UGC objects. The second + // vector is 64-bit offsets of corresponding UGC blobs in the ugc + // section. + template <typename Sink> + void SerializeIndex(Sink & sink, std::vector<UGCOffset> const & offsets) + { + ASSERT_EQUAL(m_ugcs.size(), offsets.size(), ()); + for (auto const & p : m_ugcs) + WriteToSink(sink, p.m_index); + for (auto const & offset : offsets) + WriteToSink(sink, offset); + } + + // Serializes texts in a compressed storage with block access. + template <typename Sink> + void SerializeTexts(Sink & sink) + { + coding::BlockedTextStorageWriter<Sink> writer(sink, 200000 /* blockSize */); + for (auto const & collection : m_texts) + { + for (auto const & text : collection) + writer.Append(text.m_text); + } + } + + std::vector<TranslationKey> const & GetTranslationKeys() const { return m_keys; } + std::vector<std::vector<Text>> const & GetTexts() const { return m_texts; } + +private: + void CollectTranslationKeys(); + void CollectTexts(); + + std::vector<IndexUGC> m_ugcs; + std::vector<TranslationKey> m_keys; + std::vector<vector<Text>> m_texts; +}; + +// Deserializer for UGC. May be used for random-access, but it is more +// efficient to keep it alive between accesses. The instances of +// |reader| for Deserialize() may differ between calls, but all +// instances must be set to the beginning of the UGC section +class UGCDeserializerV0 +{ +public: + template <typename R> + bool Deserialize(R & reader, FeatureIndex index, UGC & ugc) + { + InitializeIfNeeded(reader); + + UGCOffset offset = 0; + { + ReaderPtr<Reader> idsSubReader(CreateFeatureIndexesSubReader(reader)); + DDVector<FeatureIndex, ReaderPtr<Reader>> ids(idsSubReader); + auto const it = std::lower_bound(ids.begin(), ids.end(), index); + if (it == ids.end() || *it != index) + return false; + + auto const d = static_cast<uint32_t>(distance(ids.begin(), it)); + + ReaderPtr<Reader> ofsSubReader(CreateUGCOffsetsSubReader(reader)); + DDVector<UGCOffset, ReaderPtr<Reader>> ofs(ofsSubReader); + offset = ofs[d]; + } + + { + auto ugcSubReader = CreateUGCSubReader(reader); + NonOwningReaderSource source(*ugcSubReader); + source.Skip(offset); + + auto textsSubReader = CreateTextsSubReader(reader); + DeserializerVisitorV0<NonOwningReaderSource> des(source, m_keys, *textsSubReader, m_texts); + des(ugc); + } + + return true; + } + + template <typename Reader> + void InitializeIfNeeded(Reader & reader) + { + if (m_initialized) + return; + + { + NonOwningReaderSource source(reader); + m_header.Deserialize(source); + } + + { + ASSERT_GREATER_OR_EQUAL(m_header.m_ugcsOffset, m_header.m_keysOffset, ()); + + auto const pos = m_header.m_keysOffset; + auto const size = m_header.m_ugcsOffset - pos; + + auto subReader = reader.CreateSubReader(pos, size); + NonOwningReaderSource source(*subReader); + DeserializeTranslationKeys(source); + } + + m_initialized = true; + } + + template <typename Source> + void DeserializeTranslationKeys(Source & source) + { + ASSERT(m_keys.empty(), ()); + + std::vector<uint8_t> block; + coding::BWTCoder::ReadAndDecodeBlock(source, std::back_inserter(block)); + + MemReader blockReader(block.data(), block.size()); + NonOwningReaderSource blockSource(blockReader); + while (blockSource.Size() != 0) + { + std::string key; + rw::Read(blockSource, key); + m_keys.emplace_back(move(key)); + } + } + + std::vector<TranslationKey> const & GetTranslationKeys() const { return m_keys; } + +private: + uint64_t GetNumUGCs() + { + ASSERT(m_initialized, ()); + ASSERT_GREATER_OR_EQUAL(m_header.m_textsOffset, m_header.m_indexOffset, ()); + auto const totalSize = m_header.m_textsOffset - m_header.m_indexOffset; + + size_t constexpr kIndexOffset = sizeof(FeatureIndex) + sizeof(UGCOffset); + ASSERT(totalSize % kIndexOffset == 0, (totalSize)); + + return totalSize / kIndexOffset; + } + + template <typename R> + std::unique_ptr<Reader> CreateUGCSubReader(R & reader) + { + ASSERT(m_initialized, ()); + + auto const pos = m_header.m_ugcsOffset; + ASSERT_GREATER_OR_EQUAL(m_header.m_indexOffset, pos, ()); + auto const size = m_header.m_indexOffset - pos; + return reader.CreateSubReader(pos, size); + } + + template <typename R> + std::unique_ptr<Reader> CreateFeatureIndexesSubReader(R & reader) + { + ASSERT(m_initialized, ()); + + auto const pos = m_header.m_indexOffset; + auto const n = GetNumUGCs(); + return reader.CreateSubReader(pos, n * sizeof(FeatureIndex)); + } + + template <typename R> + std::unique_ptr<Reader> CreateUGCOffsetsSubReader(R & reader) + { + ASSERT(m_initialized, ()); + + auto const pos = m_header.m_indexOffset; + auto const n = GetNumUGCs(); + return reader.CreateSubReader(pos + n * sizeof(FeatureIndex), n * sizeof(UGCOffset)); + } + + template <typename R> + std::unique_ptr<Reader> CreateTextsSubReader(R & reader) + { + ASSERT(m_initialized, ()); + + auto const pos = m_header.m_textsOffset; + ASSERT_GREATER_OR_EQUAL(m_header.m_eosOffset, pos, ()); + auto const size = m_header.m_eosOffset - pos; + return reader.CreateSubReader(pos, size); + } + + HeaderV0 m_header; + std::vector<TranslationKey> m_keys; + coding::BlockedTextStorageReader m_texts; + + bool m_initialized = false; +}; +} // namespace binary +} // namespace ugc diff --git a/ugc/binary/ugc_holder.hpp b/ugc/binary/ugc_holder.hpp new file mode 100644 index 0000000000..29425b5ddf --- /dev/null +++ b/ugc/binary/ugc_holder.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include "ugc/binary/index_ugc.hpp" + +#include <cstdint> +#include <utility> +#include <vector> + +namespace ugc +{ +namespace binary +{ +// Wrapper used to collect pairs (feature id, ugcs). +struct UGCHolder +{ + template <typename U> + void Add(uint32_t index, U && ugc) + { + m_ugcs.emplace_back(index, std::forward<U>(ugc)); + } + + std::vector<IndexUGC> m_ugcs; +}; +} // namespace binary +} // namespace ugc diff --git a/ugc/binary/visitors.hpp b/ugc/binary/visitors.hpp new file mode 100644 index 0000000000..d70edc1ad1 --- /dev/null +++ b/ugc/binary/visitors.hpp @@ -0,0 +1,210 @@ +#pragma once + +#include "ugc/types.hpp" + +#include "coding/read_write_utils.hpp" +#include "coding/text_storage.hpp" +#include "coding/varint.hpp" + +#include "base/assert.hpp" + +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <iterator> +#include <string> +#include <type_traits> +#include <vector> + +class Reader; + +namespace ugc +{ +namespace binary +{ +// This class is very similar to ugc::Serializer, with a few differences: +// * it writes indices of TranslationKeys instead of writing them directly +// * it writes indices of Texts instead of writing them directly +template <typename Sink> +class SerializerVisitor +{ +public: + // We assume that all texts from the UGC span the contiguous + // subsequence in the sequence of all texts, and this subsequence + // starts at |textsFrom|. + SerializerVisitor(Sink & sink, std::vector<TranslationKey> const & keys, + std::vector<Text> const & texts, uint64_t textsFrom) + : m_sink(sink), m_keys(keys), m_texts(texts), m_textsFrom(textsFrom) + { + VisitVarUint(m_textsFrom, "textsFrom"); + } + + void operator()(std::string const & s, char const * /* name */ = nullptr) + { + rw::Write(m_sink, s); + } + + void VisitRating(float const f, char const * name = nullptr) + { + CHECK_GREATER_OR_EQUAL(f, 0.0, ()); + auto const d = static_cast<uint32_t>(round(f * 10)); + VisitVarUint(d, name); + } + + template <typename T> + void VisitVarUint(T const & t, char const * /* name */ = nullptr) + { + WriteVarUint(m_sink, t); + } + + void operator()(TranslationKey const & key, char const * name = nullptr) + { + auto const it = std::lower_bound(m_keys.begin(), m_keys.end(), key); + CHECK(it != m_keys.end(), ()); + auto const offset = static_cast<uint64_t>(std::distance(m_keys.begin(), it)); + VisitVarUint(offset, name); + } + + void operator()(Text const & text, char const * name = nullptr) { (*this)(text.m_lang, "lang"); } + + void operator()(Time const & t, char const * name = nullptr) + { + VisitVarUint(ToDaysSinceEpoch(t), name); + } + + void operator()(Sentiment sentiment, char const * /* name */ = nullptr) + { + switch (sentiment) + { + case Sentiment::Negative: return (*this)(static_cast<uint8_t>(0)); + case Sentiment::Positive: return (*this)(static_cast<uint8_t>(1)); + } + } + + template <typename T> + void operator()(vector<T> const & vs, char const * /* name */ = nullptr) + { + VisitVarUint(static_cast<uint32_t>(vs.size())); + for (auto const & v : vs) + (*this)(v); + } + + template <typename D> + typename std::enable_if<std::is_integral<D>::value>::type operator()( + D d, char const * /* name */ = nullptr) + { + WriteToSink(m_sink, d); + } + + template <typename R> + typename std::enable_if<!std::is_integral<R>::value>::type operator()( + R const & r, char const * /* name */ = nullptr) + { + r.Visit(*this); + } + +private: + Sink & m_sink; + std::vector<TranslationKey> const & m_keys; + vector<Text> const & m_texts; + uint64_t m_textsFrom = 0; +}; + +template <typename Source> +class DeserializerVisitorV0 +{ +public: + // |source| must be set to the beginning of the UGC blob. + // |textsReader| must be set to the blocked text storage section. + DeserializerVisitorV0(Source & source, std::vector<TranslationKey> const & keys, + Reader & textsReader, coding::BlockedTextStorageReader & texts) + : m_source(source), m_keys(keys), m_textsReader(textsReader), m_texts(texts) + { + m_currText = DesVarUint<uint64_t>(); + } + + void operator()(std::string & s, char const * /* name */ = nullptr) { rw::Read(m_source, s); } + + void VisitRating(float & f, char const * /* name */ = nullptr) + { + auto const d = DesVarUint<uint32_t>(); + f = static_cast<float>(d) / 10; + } + + template <typename T> + void VisitVarUint(T & t, char const * /* name */ = nullptr) + { + t = ReadVarUint<T, Source>(m_source); + } + + template <typename T> + T DesVarUint() + { + return ReadVarUint<T, Source>(m_source); + } + + void operator()(TranslationKey & key, char const * /* name */ = nullptr) + { + auto const index = DesVarUint<uint64_t>(); + CHECK_LESS(index, m_keys.size(), ()); + key = m_keys[static_cast<size_t>(index)]; + } + + void operator()(Text & text, char const * /* name */ = nullptr) + { + (*this)(text.m_lang, "lang"); + text.m_text = m_texts.ExtractString(m_textsReader, m_currText); + ++m_currText; + } + + void operator()(Time & t, char const * /* name */ = nullptr) + { + t = FromDaysSinceEpoch(DesVarUint<uint32_t>()); + } + + void operator()(Sentiment & sentiment, char const * /* name */ = nullptr) + { + uint8_t s = 0; + (*this)(s); + switch (s) + { + case 0: sentiment = Sentiment::Negative; break; + case 1: sentiment = Sentiment::Positive; break; + default: CHECK(false, ("Can't parse sentiment from:", static_cast<int>(s))); break; + } + } + + template <typename T> + void operator()(std::vector<T> & vs, char const * /* name */ = nullptr) + { + auto const size = DesVarUint<uint32_t>(); + vs.resize(size); + for (auto & v : vs) + (*this)(v); + } + + template <typename D> + typename std::enable_if<std::is_integral<D>::value>::type operator()( + D & d, char const * /* name */ = nullptr) + { + ReadPrimitiveFromSource(m_source, d); + } + + template <typename R> + typename std::enable_if<!std::is_integral<R>::value>::type operator()( + R & r, char const * /* name */ = nullptr) + { + r.Visit(*this); + } + +private: + Source & m_source; + std::vector<TranslationKey> const & m_keys; + + Reader & m_textsReader; + coding::BlockedTextStorageReader & m_texts; + + uint64_t m_currText = 0; +}; +} // namespace binary +} // namespace ugc |