Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYury Melnichek <melnichek@gmail.com>2011-04-26 20:46:37 +0400
committerAlex Zolotarev <alex@maps.me>2015-09-23 01:17:28 +0300
commit45bb508bad5df2c7e8effe06cd7b72915e0a0a3c (patch)
tree1dd5aa4945fd23c6eb75d5a5651ecf1aa49b2f35 /indexer/interval_index_builder.hpp
parent9179f04bfec92ad07c4c70bd232f7dc657718a3e (diff)
New smaller index format.
Diffstat (limited to 'indexer/interval_index_builder.hpp')
-rw-r--r--indexer/interval_index_builder.hpp212
1 files changed, 130 insertions, 82 deletions
diff --git a/indexer/interval_index_builder.hpp b/indexer/interval_index_builder.hpp
index 08dc1e870f..1802cbff35 100644
--- a/indexer/interval_index_builder.hpp
+++ b/indexer/interval_index_builder.hpp
@@ -1,19 +1,57 @@
#pragma once
#include "interval_index.hpp"
+#include "../coding/byte_stream.hpp"
#include "../coding/endianness.hpp"
+#include "../coding/varint.hpp"
#include "../coding/write_to_sink.hpp"
#include "../base/assert.hpp"
#include "../base/base.hpp"
+#include "../base/bits.hpp"
#include "../base/logging.hpp"
#include "../std/vector.hpp"
#include "../std/memcpy.hpp"
-// TODO: BuildIntervalIndex() shouldn't rely on indexing cellid-feature pairs.
-template <int kCellIdBytes, typename CellIdValueIterT, class SinkT>
-void BuildIntervalIndex(CellIdValueIterT const & beg, CellIdValueIterT const & end,
- SinkT & writer, int const leafBytes = 1)
+namespace impl
+{
+
+template <class WriterT>
+void WriteIntervalIndexNode(WriterT & writer, uint64_t offset, uint64_t bitMask)
+{
+ // At the moment, uint32_t is used as a bitMask, but this can change in the future.
+ CHECK_EQUAL(static_cast<uint32_t>(bitMask), bitMask, (offset));
+ CHECK_GREATER_OR_EQUAL(offset, writer.Pos() + 8, ());
+ WriteToSink(writer, static_cast<uint32_t>(offset - writer.Pos() - 8));
+ WriteToSink(writer, static_cast<uint32_t>(bitMask));
+}
+
+template <class SinkT> void WriteIntervalIndexLeaf(SinkT & sink, int bitsPerLevel,
+ uint64_t prevKey, uint64_t prevValue,
+ uint64_t key, uint64_t value)
+{
+ uint64_t const lastBitsZeroMask = (1ULL << bitsPerLevel) - 1;
+ if ((key & ~lastBitsZeroMask) != (prevKey & ~lastBitsZeroMask))
+ prevValue = 0;
+
+ int64_t const delta = static_cast<int64_t>(value) - static_cast<int64_t>(prevValue);
+ uint64_t const encodedDelta = bits::ZigZagEncode(delta);
+ uint64_t const code = (encodedDelta << 1) + (key == prevKey ? 0 : 1);
+ WriteVarUint(sink, code);
+}
+
+inline uint32_t IntervalIndexLeafSize(int bitsPerLevel,
+ uint64_t prevKey, uint64_t prevValue,
+ uint64_t key, uint64_t value)
+{
+ CountingSink sink;
+ WriteIntervalIndexLeaf(sink, bitsPerLevel, prevKey, prevValue, key, value);
+ return sink.GetCount();
+}
+
+template <typename CellIdValueIterT>
+bool CheckIntervalIndexInputSequence(CellIdValueIterT const & beg,
+ CellIdValueIterT const & end,
+ uint32_t keyBits)
{
-#ifdef DEBUG
// Check that [beg, end) is sorted and log most populous cell.
if (beg != end)
{
@@ -24,8 +62,8 @@ void BuildIntervalIndex(CellIdValueIterT const & beg, CellIdValueIterT const & e
uint64_t prev = it->GetCell();
for (++it; it != end; ++it)
{
- ASSERT_GREATER(it->GetCell(), 0, ());
- ASSERT_LESS_OR_EQUAL(prev, it->GetCell(), ());
+ CHECK_GREATER(it->GetCell(), 0, ());
+ CHECK_LESS_OR_EQUAL(prev, it->GetCell(), ());
count = (prev == it->GetCell() ? count + 1 : 0);
if (count > maxCount)
{
@@ -41,98 +79,108 @@ void BuildIntervalIndex(CellIdValueIterT const & beg, CellIdValueIterT const & e
}
}
for (CellIdValueIterT it = beg; it != end; ++it)
- ASSERT_LESS(it->GetCell(), 1ULL << 8 * kCellIdBytes, ());
-#endif
+ CHECK_LESS(it->GetCell(), 1ULL << keyBits, ());
+ return true;
+}
+
+}
+
+// TODO: BuildIntervalIndex() shouldn't rely on indexing cellid-feature pairs.
+template <typename CellIdValueIterT, class SinkT>
+void BuildIntervalIndex(CellIdValueIterT const & beg, CellIdValueIterT const & end,
+ SinkT & writer, uint8_t const keyBits)
+{
+ CHECK_LESS(keyBits, 63, ());
+ CHECK(impl::CheckIntervalIndexInputSequence(beg, end, keyBits), ());
+
+ uint32_t const bitsPerLevel = 5;
+ uint32_t const lastBitsMask = (1 << bitsPerLevel) - 1;
+ int const levelCount = (keyBits + bitsPerLevel - 1) / bitsPerLevel;
+
// Write header.
{
IntervalIndexBase::Header header;
- header.m_CellIdLeafBytes = leafBytes;
+ header.m_BitsPerLevel = bitsPerLevel;
+ header.m_Levels = levelCount;
writer.Write(&header, sizeof(header));
}
-#ifdef DEBUG
- vector<uint32_t> childOffsets;
- childOffsets.push_back(static_cast<uint32_t>(writer.Pos()));
-#endif
- uint32_t childOffset = static_cast<uint32_t>(writer.Pos()) + sizeof(IntervalIndexBase::Index);
- uint32_t thisLevelCount = 1;
- for (int level = kCellIdBytes - 1; level >= leafBytes; --level)
+ if (beg == end)
+ {
+ // Write empty index.
+ CHECK_GREATER(levelCount, 1, ());
+ impl::WriteIntervalIndexNode(writer, writer.Pos() + sizeof(IntervalIndexBase::Index), 0);
+ LOG(LWARNING, ("Written empty index."));
+ return;
+ }
+
+ // Write internal nodes.
+ uint64_t childOffset = writer.Pos() + sizeof(IntervalIndexBase::Index);
+ uint64_t nextChildOffset = childOffset;
+ for (int level = levelCount - 1; level >= 0; --level)
{
- LOG(LINFO, ("Building interval index, level", level));
-#ifdef DEBUG
- ASSERT_EQUAL(childOffsets.back(), writer.Pos(), ());
- childOffsets.push_back(childOffset);
-#endif
- uint64_t const initialWriterPos = writer.Pos();
- uint32_t childCount = 0, totalChildCount = 0;
- IntervalIndexBase::Index index;
- memset(&index, 0, sizeof(index));
- uint64_t prevParentBytes = 0;
- uint8_t prevByte = 0;
+ // LOG(LINFO, ("Building interval index, level", level));
+ uint64_t const initialLevelWriterPos = writer.Pos();
+
+ uint64_t bitMask = 0;
+ uint64_t prevKey = 0;
+ uint64_t prevValue = 0;
for (CellIdValueIterT it = beg; it != end; ++it)
{
- uint64_t id = it->GetCell();
- uint64_t const thisParentBytes = id >> 8 * (level + 1);
- uint8_t const thisByte = static_cast<uint8_t>(0xFF & (id >> 8 * level));
- if (it != beg && prevParentBytes != thisParentBytes)
- {
- // Writing index for previous parent.
- index.SetBaseOffset(childOffset);
- for (uint32_t i = 0; i < 256; ++i)
- index.m_Count[i] = SwapIfBigEndian(index.m_Count[i]);
- writer.Write(&index, sizeof(index));
- memset(&index, 0, sizeof(index));
- if (level != leafBytes)
- childOffset += childCount * sizeof(index);
- else
- childOffset += (sizeof(beg->GetFeature()) + leafBytes) * childCount;
- childCount = 0;
- --thisLevelCount;
- }
+ uint64_t const key = it->GetCell() >> (level * bitsPerLevel);
+ uint32_t const value = it->GetFeature();
- if (level == leafBytes || it == beg ||
- prevByte != thisByte || prevParentBytes != thisParentBytes)
+ if (it != beg && (prevKey & ~lastBitsMask) != (key & ~lastBitsMask))
{
- ++childCount;
- ++totalChildCount;
- ++index.m_Count[thisByte];
- CHECK_LESS(index.m_Count[thisByte], 65535, (level, leafBytes, prevByte, thisByte, \
- prevParentBytes, thisParentBytes, \
- it->GetCell()));
+ // Write node for the previous parent.
+ impl::WriteIntervalIndexNode(writer, childOffset, bitMask);
+ childOffset = nextChildOffset;
+ bitMask = 0;
}
- prevParentBytes = thisParentBytes;
- prevByte = thisByte;
+ bitMask |= (1ULL << (key & lastBitsMask));
+
+ if (level == 0)
+ nextChildOffset += impl::IntervalIndexLeafSize(bitsPerLevel,
+ prevKey, prevValue, key, value);
+ else if (it == beg || prevKey != key)
+ nextChildOffset += sizeof(IntervalIndexBase::Index);
+
+ prevKey = key;
+ prevValue = value;
}
- index.SetBaseOffset(childOffset);
- for (uint32_t i = 0; i < 256; ++i)
- index.m_Count[i] = SwapIfBigEndian(index.m_Count[i]);
- writer.Write(&index, sizeof(index));
- memset(&index, 0, sizeof(index));
- // if level == leafBytes, this is wrong, but childOffset is not needed any more.
- childOffset += childCount * sizeof(IntervalIndexBase::Index);
- --thisLevelCount;
- CHECK_EQUAL(thisLevelCount, 0, (kCellIdBytes, leafBytes));
- thisLevelCount = totalChildCount;
-
- LOG(LINFO, ("Level size:", writer.Pos() - initialWriterPos));
-
- if (beg == end)
- break;
+
+ // Write the last node.
+ impl::WriteIntervalIndexNode(writer, childOffset, bitMask);
+
+ if (level == 1)
+ nextChildOffset += sizeof(IntervalIndexBase::Index);
+
+ childOffset = nextChildOffset;
+
+ LOG(LINFO, ("Level:", level, "size:", writer.Pos() - initialLevelWriterPos));
}
- // Writing values.
- ASSERT_EQUAL(childOffsets.back(), writer.Pos(), ());
- LOG(LINFO, ("Building interval, leaves."));
- uint64_t const initialWriterPos = writer.Pos();
- uint32_t const mask = (1ULL << 8 * leafBytes) - 1;
- for (CellIdValueIterT it = beg; it != end; ++it)
+ // Write the dummy one-after-last node.
+ impl::WriteIntervalIndexNode(writer, nextChildOffset, 0);
+
+ // Write leaves.
{
- WriteToSink(writer, it->GetFeature());
- uint32_t cellId = static_cast<uint32_t>(it->GetCell() & mask);
- cellId = SwapIfBigEndian(cellId);
- writer.Write(&cellId, leafBytes);
+ uint64_t const initialLevelWriterPos = writer.Pos();
+
+ uint64_t prevKey = -1;
+ uint32_t prevValue = 0;
+ for (CellIdValueIterT it = beg; it != end; ++it)
+ {
+ uint64_t const key = it->GetCell();
+ uint32_t const value = it->GetFeature();
+ impl::WriteIntervalIndexLeaf(writer, bitsPerLevel, prevKey, prevValue, key, value);
+ prevKey = key;
+ prevValue = value;
+ }
+
+ LOG(LINFO, ("Leaves size:", writer.Pos() - initialLevelWriterPos));
}
- LOG(LINFO, ("Level size:", writer.Pos() - initialWriterPos));
+
LOG(LINFO, ("Interval index building done."));
}