Welcome to mirror list, hosted at ThFree Co, Russian Federation.

text_storage.hpp « coding - github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 5994c9c0bf9a62a745c550102618fac5a0945dce (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#pragma once

#include "coding/bwt_coder.hpp"
#include "coding/reader.hpp"
#include "coding/varint.hpp"
#include "coding/write_to_sink.hpp"

#include "base/assert.hpp"

#include <algorithm>
#include <cstdint>
#include <sstream>
#include <string>

namespace coding
{
// Writes a set of strings in a format that allows to efficiently
// access blocks of strings. This means that access of individual
// strings may be inefficient, but access to a block of strings can be
// performed in O(length of all strings in the block + log(number of
// blocks)). The size of each block roughly equals to the |blockSize|,
// because the whole number of strings is packed into a single block.
//
// Format description:
// * first 8 bytes - little endian-encoded offset of the index section
// * data section - represents a catenated sequence of BWT-compressed blocks with
//   a sequence of individual string lengths in the block
// * index section - represents a delta-encoded sequence of
//   BWT-compressed blocks offsets intermixed with the number of
//   strings inside each block.
//
// All numbers except the first offset are varints.
template <typename Writer>
class BlockedTextStorageWriter
{
public:
  BlockedTextStorageWriter(Writer & writer, uint64_t blockSize)
    : m_writer(writer), m_blockSize(blockSize), m_startOffset(writer.Pos()), m_blocks(1)
  {
    CHECK(m_blockSize != 0, ());
    WriteToSink(m_writer, static_cast<uint64_t>(0));
    m_dataOffset = m_writer.Pos();
  }

  ~BlockedTextStorageWriter()
  {
    if (!m_lengths.empty())
      FlushPool(m_lengths, m_pool);

    if (m_blocks.back().IsEmpty())
      m_blocks.pop_back();

    {
      auto const currentOffset = m_writer.Pos();
      ASSERT_GREATER_OR_EQUAL(currentOffset, m_startOffset, ());
      m_writer.Seek(m_startOffset);
      WriteToSink(m_writer, static_cast<uint64_t>(currentOffset - m_startOffset));
      m_writer.Seek(currentOffset);
    }

    WriteVarUint(m_writer, m_blocks.size());

    uint64_t prevOffset = 0;
    for (auto const & block : m_blocks)
    {
      ASSERT_GREATER_OR_EQUAL(block.m_offset, prevOffset, ());
      WriteVarUint(m_writer, block.m_offset - prevOffset);

      ASSERT(!block.IsEmpty(), ());
      WriteVarUint(m_writer, block.m_subs);

      prevOffset = block.m_offset;
    }
  }

  void Append(std::string const & s)
  {
    ASSERT(!m_blocks.empty(), ());

    ASSERT_LESS(m_pool.size(), m_blockSize, ());

    ++m_blocks.back().m_subs;
    m_pool.append(s);
    m_lengths.push_back(s.size());

    if (m_pool.size() >= m_blockSize)
    {
      FlushPool(m_lengths, m_pool);
      m_pool.clear();
      m_lengths.clear();
      m_blocks.emplace_back(m_writer.Pos() - m_dataOffset /* offset */, 0 /* subs */);
    }
  }

private:
  struct Block
  {
    Block() = default;
    Block(uint64_t offset, uint64_t subs) : m_offset(offset), m_subs(subs) {}

    bool IsEmpty() const { return m_subs == 0; }

    uint64_t m_offset = 0;  // offset of the block inside the sequence of compressed blocks
    uint64_t m_subs = 0;    // number of strings inside the block
  };

  void FlushPool(vector<uint64_t> const & lengths, string const & pool)
  {
    for (auto const & length : lengths)
      WriteVarUint(m_writer, length);
    BWTCoder::EncodeAndWriteBlock(m_writer, pool.size(),
                                  reinterpret_cast<uint8_t const *>(pool.c_str()));
  }

  Writer & m_writer;
  uint64_t const m_blockSize;
  uint64_t m_startOffset = 0;
  uint64_t m_dataOffset = 0;

  vector<Block> m_blocks;

  std::string m_pool;          // concatenated strings
  vector<uint64_t> m_lengths;  // lengths of strings inside the |m_pool|
};

class BlockedTextStorageIndex
{
public:
  struct BlockInfo
  {
    // Returns the index of the first string belonging to the block.
    uint64_t From() const { return m_from; }

    // Returns the index of the first string from the next block.
    uint64_t To() const { return m_from + m_subs; }

    uint64_t m_offset = 0;  // offset of the block from the beginning of the section
    uint64_t m_from = 0;    // index of the first string in the block
    uint64_t m_subs = 0;    // number of strings in the block
  };

  size_t GetNumBlockInfos() const { return m_blocks.size(); }
  size_t GetNumStrings() const { return m_blocks.empty() ? 0 : m_blocks.back().To(); }

  BlockInfo const & GetBlockInfo(size_t blockIx) const
  {
    ASSERT_LESS(blockIx, GetNumBlockInfos(), ());
    return m_blocks[blockIx];
  }

  // Returns the index of the block the |stringIx| belongs to.
  // Returns the number of blocks if there're no such block.
  size_t GetBlockIx(size_t stringIx) const
  {
    if (m_blocks.empty() || stringIx >= m_blocks.back().To())
      return GetNumBlockInfos();
    if (stringIx >= m_blocks.back().From())
      return GetNumBlockInfos() - 1;

    size_t lo = 0, hi = GetNumBlockInfos() - 1;
    while (lo + 1 != hi)
    {
      ASSERT_GREATER_OR_EQUAL(stringIx, m_blocks[lo].From(), ());
      ASSERT_LESS(stringIx, m_blocks[hi].From(), ());

      auto const mi = lo + (hi - lo) / 2;
      if (stringIx >= m_blocks[mi].From())
        lo = mi;
      else
        hi = mi;
    }

    ASSERT_GREATER_OR_EQUAL(stringIx, m_blocks[lo].From(), ());
    ASSERT_LESS(stringIx, m_blocks[hi].From(), ());
    return lo;
  }

  template <typename Reader>
  void Read(Reader & reader)
  {
    auto const indexOffset = ReadPrimitiveFromPos<uint64_t, Reader>(reader, 0);

    NonOwningReaderSource source(reader);
    source.Skip(indexOffset);

    auto const numBlocks = ReadVarUint<uint64_t, NonOwningReaderSource>(source);
    m_blocks.assign(numBlocks, {});

    uint64_t prevOffset = 8;  // 8 bytes for the offset of the data section
    for (uint64_t i = 0; i < numBlocks; ++i)
    {
      auto const delta = ReadVarUint<uint64_t, NonOwningReaderSource>(source);
      CHECK_GREATER_OR_EQUAL(prevOffset + delta, prevOffset, ());
      prevOffset += delta;

      auto & block = m_blocks[i];
      block.m_offset = prevOffset;
      block.m_from = i == 0 ? 0 : m_blocks[i - 1].To();
      block.m_subs = ReadVarUint<uint64_t, NonOwningReaderSource>(source);
      CHECK_GREATER_OR_EQUAL(block.m_from + block.m_subs, block.m_from, ());
    }
  }

private:
  std::vector<BlockInfo> m_blocks;
};

class BlockedTextStorageReader
{
public:
  template <typename Reader>
  void InitializeIfNeeded(Reader & reader)
  {
    if (m_initialized)
      return;
    m_index.Read(reader);
    m_initialized = true;
  }

  size_t GetNumStrings() const
  {
    CHECK(m_initialized, ());
    return m_index.GetNumStrings();
  }

  template <typename Reader>
  std::string ExtractString(Reader & reader, size_t stringIx)
  {
    InitializeIfNeeded(reader);

    auto const blockIx = m_index.GetBlockIx(stringIx);
    CHECK_LESS(blockIx, m_index.GetNumBlockInfos(), ());

    if (blockIx >= m_cache.size())
      m_cache.resize(blockIx + 1);
    ASSERT_LESS(blockIx, m_cache.size(), ());

    auto const & bi = m_index.GetBlockInfo(blockIx);

    auto & entry = m_cache[blockIx];
    if (!entry.m_valid)
    {
      NonOwningReaderSource source(reader);
      source.Skip(bi.m_offset);

      entry.m_value.clear();
      entry.m_subs.resize(bi.m_subs);

      uint64_t offset = 0;
      for (size_t i = 0; i < entry.m_subs.size(); ++i)
      {
        auto & sub = entry.m_subs[i];
        sub.m_offset = offset;
        sub.m_length = ReadVarUint<uint64_t>(source);
        CHECK_GREATER_OR_EQUAL(sub.m_offset + sub.m_length, sub.m_offset, ());
        offset += sub.m_length;
      }
      BWTCoder::ReadAndDecodeBlock(source, std::back_inserter(entry.m_value));
      entry.m_valid = true;
    }
    ASSERT(entry.m_valid, ());

    ASSERT_GREATER_OR_EQUAL(stringIx, bi.From(), ());
    ASSERT_LESS(stringIx, bi.To(), ());

    stringIx -= bi.From();
    ASSERT_LESS(stringIx, entry.m_subs.size(), ());

    auto const & si = entry.m_subs[stringIx];
    auto const & value = entry.m_value;
    ASSERT_LESS_OR_EQUAL(si.m_offset + si.m_length, value.size(), ());
    return value.substr(si.m_offset, si.m_length);
  }

  void ClearCache() { m_cache.clear(); }

private:
  struct StringInfo
  {
    StringInfo() = default;
    StringInfo(uint64_t offset, uint64_t length): m_offset(offset), m_length(length) {}

    uint64_t m_offset = 0;  // offset of the string inside the decompressed block
    uint64_t m_length = 0;  // length of the string
  };

  struct CacheEntry
  {
    std::string m_value;             // concatenation of the strings
    std::vector<StringInfo> m_subs;  // indices of individual strings
    bool m_valid = false;
  };

  BlockedTextStorageIndex m_index;
  std::vector<CacheEntry> m_cache;
  bool m_initialized = false;
};

template <typename Reader>
class BlockedTextStorage
{
public:
  explicit BlockedTextStorage(Reader & reader) : m_reader(reader)
  {
    m_storage.InitializeIfNeeded(m_reader);
  }

  size_t GetNumStrings() const { return m_storage.GetNumStrings(); }
  std::string ExtractString(size_t stringIx) { return m_storage.ExtractString(m_reader, stringIx); }
  void ClearCache() { m_storage.ClearCache(); }

private:
  BlockedTextStorageReader m_storage;
  Reader & m_reader;
};
}  // namespace coding