github.com/mapsme/omim.git
author    Yuri Gorshenin <y@maps.me>     2015-03-02 16:50:57 +0300
committer Alex Zolotarev <alex@maps.me>  2015-09-23 02:38:13 +0300
commit    bbbd269cd3337ee71b52de71a50bfc4053db9924 (patch)
tree      7345b29caeccf09cb4f733ccadcc9b52de90fcca
parent    dc10c7fffaf5645290ca1bc7f3c52347e27177d7 (diff)
Implemented compressed search-index building.
-rw-r--r--  coding/coding_tests/trie_test.cpp     57
-rw-r--r--  coding/trie_builder.hpp               87
-rw-r--r--  defines.hpp                            1
-rw-r--r--  indexer/indexer.pro                    2
-rw-r--r--  indexer/search_index_builder.cpp     205
-rw-r--r--  indexer/search_index_builder.hpp      13
-rw-r--r--  indexer/string_file.cpp              128
-rw-r--r--  indexer/string_file.hpp              156
-rw-r--r--  indexer/string_file_values.hpp       150
9 files changed, 567 insertions(+), 232 deletions(-)
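
In short: instead of passing a raw byte buffer plus a value count into trie::builder::WriteNode, the trie builder is now parameterized on a value-list type. trie::Build appends each key's value with Append(), WriteNode asks the list for size() when packing the node header, and Dump() serializes the accumulated values into the output sink. Below is a minimal, self-contained sketch of that contract, modeled on the Uint32ValueList helper added to trie_test.cpp; the VectorSink stand-in for PushBackByteSink is purely illustrative.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Illustrative stand-in for PushBackByteSink: anything exposing
// Write(void const *, size_t) works as a sink for Dump().
struct VectorSink
{
  explicit VectorSink(std::vector<uint8_t> & out) : m_out(out) {}
  void Write(void const * data, size_t size)
  {
    uint8_t const * p = static_cast<uint8_t const *>(data);
    m_out.insert(m_out.end(), p, p + size);
  }
  std::vector<uint8_t> & m_out;
};

// Mirrors the Uint32ValueList helper added to trie_test.cpp:
// Append() accumulates the raw bytes of each uint32, size() reports
// how many values were appended, Dump() writes the bytes to a sink.
class Uint32ValueList
{
public:
  void Append(uint32_t value)
  {
    size_t const oldSize = m_data.size();
    m_data.resize(oldSize + sizeof(value));
    std::memcpy(m_data.data() + oldSize, &value, sizeof(value));
    ++m_count;
  }

  uint32_t size() const { return m_count; }

  template <typename SinkT>
  void Dump(SinkT & sink) const
  {
    sink.Write(m_data.data(), m_data.size());
  }

private:
  std::vector<uint8_t> m_data;
  uint32_t m_count = 0;
};

int main()
{
  Uint32ValueList values;
  values.Append(1);
  values.Append(7);

  std::vector<uint8_t> bytes;
  VectorSink sink(bytes);
  values.Dump(sink);  // 8 bytes: the raw bytes of two uint32 values
  std::cout << values.size() << " values, " << bytes.size() << " bytes\n";
  return 0;
}
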
diff --git a/coding/coding_tests/trie_test.cpp b/coding/coding_tests/trie_test.cpp
index 4bed389d6e..f57ba2dfe5 100644
--- a/coding/coding_tests/trie_test.cpp
+++ b/coding/coding_tests/trie_test.cpp
@@ -54,11 +54,12 @@ struct KeyValuePair
uint32_t GetKeySize() const { return m_key.size(); }
trie::TrieChar const * GetKeyData() const { return m_key.data(); }
+ uint32_t GetValue() const { return m_value; }
template <class TCont> void SerializeValue(TCont & cont) const
{
- cont.resize(4);
- memcpy(cont.data(), &m_value, 4);
+ cont.resize(sizeof(m_value));
+ memcpy(cont.data(), &m_value, sizeof(m_value));
}
bool operator == (KeyValuePair const & p) const
@@ -113,6 +114,49 @@ struct MaxValueCalc
}
};
+class CharValueList
+{
+public:
+ CharValueList(const string & s) : m_string(s) {}
+
+ size_t size() const { return m_string.size(); }
+
+ template <typename SinkT>
+ void Dump(SinkT & sink) const
+ {
+ sink.Write(m_string.data(), m_string.size());
+ }
+
+private:
+ string m_string;
+};
+
+class Uint32ValueList
+{
+public:
+ Uint32ValueList() : m_size(0) {}
+
+ void Append(uint32_t value)
+ {
+ size_t const originalSize = m_value.size();
+ m_value.resize(originalSize + sizeof(value));
+ memcpy(m_value.data() + originalSize, &value, sizeof(value));
+ ++m_size;
+ }
+
+ uint32_t size() const { return m_size; }
+
+ template <typename SinkT>
+ void Dump(SinkT & sink) const
+ {
+ sink.Write(&m_value[0], m_value.size());
+ }
+
+private:
+ vector<uint8_t> m_value;
+ uint32_t m_size;
+};
+
} // unnamed namespace
#define ZENC bits::ZigZagEncode
@@ -132,8 +176,9 @@ UNIT_TEST(TrieBuilder_WriteNode_Smoke)
"abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij", "i4"),
ChildNodeInfo(true, 5, "a", "5z")
};
- trie::builder::WriteNode(sink, 0, 3, "123", 3,
- &children[0], &children[0] + ARRAY_SIZE(children));
+
+ CharValueList valueList("123");
+ trie::builder::WriteNode(sink, 0, valueList, &children[0], &children[0] + ARRAY_SIZE(children));
uint8_t const expected [] =
{
BOOST_BINARY(11000101), // Header: [0b11] [0b000101]
@@ -202,7 +247,9 @@ UNIT_TEST(TrieBuilder_Build)
vector<uint8_t> serial;
PushBackByteSink<vector<uint8_t> > sink(serial);
- trie::Build(sink, v.begin(), v.end(), trie::builder::MaxValueEdgeBuilder<MaxValueCalc>());
+ trie::Build<PushBackByteSink<vector<uint8_t> >, typename vector<KeyValuePair>::iterator,
+ trie::builder::MaxValueEdgeBuilder<MaxValueCalc>, Uint32ValueList>(
+ sink, v.begin(), v.end(), trie::builder::MaxValueEdgeBuilder<MaxValueCalc>());
reverse(serial.begin(), serial.end());
// LOG(LINFO, (serial.size(), vs));
diff --git a/coding/trie_builder.hpp b/coding/trie_builder.hpp
index f14461284c..7f222de854 100644
--- a/coding/trie_builder.hpp
+++ b/coding/trie_builder.hpp
@@ -1,7 +1,10 @@
#pragma once
-#include "../coding/byte_stream.hpp"
-#include "../coding/varint.hpp"
+
+#include "byte_stream.hpp"
+#include "varint.hpp"
+
#include "../base/buffer_vector.hpp"
+
#include "../std/algorithm.hpp"
// Trie format:
@@ -39,27 +42,25 @@ namespace trie
{
namespace builder
{
-
-template <typename SinkT, typename ChildIterT>
-void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
- void const * const valuesDataSize, uint32_t const valuesSize,
- ChildIterT const begChild, ChildIterT const endChild,
- bool isRoot = false)
+template <typename SinkT, typename ChildIterT, typename ValueListT>
+void WriteNode(SinkT & sink, TrieChar baseChar, ValueListT & valueList,
+ ChildIterT const begChild, ChildIterT const endChild, bool isRoot = false)
{
if (begChild == endChild && !isRoot)
{
// Leaf node.
- sink.Write(valuesDataSize, valuesSize);
+ valueList.Dump(sink);
return;
}
uint32_t const childCount = endChild - begChild;
+ uint32_t const valueCount = valueList.size();
uint8_t const header = static_cast<uint32_t>((min(valueCount, 3U) << 6) + min(childCount, 63U));
sink.Write(&header, 1);
if (valueCount >= 3)
WriteVarUint(sink, valueCount);
if (childCount >= 63)
WriteVarUint(sink, childCount);
- sink.Write(valuesDataSize, valuesSize);
+ valueList.Dump(sink);
for (ChildIterT it = begChild; it != endChild; /*++it*/)
{
uint8_t header = (it->IsLeaf() ? 128 : 0);
@@ -70,9 +71,9 @@ void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
uint32_t const diff0 = bits::ZigZagEncode(int32_t(edge[0] - baseChar));
if (edgeSize == 1 && (diff0 & ~63U) == 0)
{
- header |= 64;
- header |= diff0;
- WriteToSink(sink, header);
+ header |= 64;
+ header |= diff0;
+ WriteToSink(sink, header);
}
else
{
@@ -102,20 +103,6 @@ void WriteNode(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
}
}
-template <typename SinkT, typename ChildIterT>
-void WriteNodeReverse(SinkT & sink, TrieChar baseChar, uint32_t const valueCount,
- void const * const valuesDataSize, uint32_t const valuesSize,
- ChildIterT const begChild, ChildIterT const endChild,
- bool isRoot = false)
-{
- typedef buffer_vector<uint8_t, 64> OutStorageType;
- OutStorageType out;
- PushBackByteSink<OutStorageType> outSink(out);
- WriteNode(outSink, baseChar, valueCount, valuesDataSize, valuesSize, begChild, endChild, isRoot);
- reverse(out.begin(), out.end());
- sink.Write(out.data(), out.size());
-}
-
struct ChildInfo
{
bool m_isLeaf;
@@ -136,21 +123,35 @@ struct ChildInfo
uint32_t GetEdgeValueSize() const { return m_edgeValue.size(); }
};
-template <class EdgeBuilderT>
+template <class EdgeBuilderT, class ValueListT>
struct NodeInfo
{
uint64_t m_begPos;
TrieChar m_char;
vector<ChildInfo> m_children;
- buffer_vector<uint8_t, 32> m_values;
- uint32_t m_valueCount;
+ ValueListT m_valueList;
EdgeBuilderT m_edgeBuilder;
- NodeInfo() : m_valueCount(0) {}
+ NodeInfo() : m_begPos(0), m_char(0) {}
NodeInfo(uint64_t pos, TrieChar trieChar, EdgeBuilderT const & edgeBuilder)
- : m_begPos(pos), m_char(trieChar), m_valueCount(0), m_edgeBuilder(edgeBuilder) {}
+ : m_begPos(pos), m_char(trieChar), m_edgeBuilder(edgeBuilder)
+ {
+ }
};
+template <typename SinkT, typename EdgeBuilderT, typename ValueListT>
+void WriteNodeReverse(SinkT & sink, TrieChar baseChar, NodeInfo<EdgeBuilderT, ValueListT> & node,
+ bool isRoot = false)
+{
+ typedef buffer_vector<uint8_t, 64> OutStorageType;
+ OutStorageType out;
+ PushBackByteSink<OutStorageType> outSink(out);
+ WriteNode(outSink, baseChar, node.m_valueList, node.m_children.rbegin(), node.m_children.rend(),
+ isRoot);
+ reverse(out.begin(), out.end());
+ sink.Write(out.data(), out.size());
+}
+
template <typename SinkT, class NodesT>
void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop)
{
@@ -161,9 +162,8 @@ void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop)
NodeInfoType & node = nodes.back();
NodeInfoType & prevNode = nodes[nodes.size() - 2];
- if (node.m_valueCount == 0 && node.m_children.size() <= 1)
+ if (node.m_valueList.size() == 0 && node.m_children.size() <= 1)
{
- ASSERT(node.m_values.empty(), ());
ASSERT_EQUAL(node.m_children.size(), 1, ());
ChildInfo & child = node.m_children[0];
prevNode.m_children.push_back(ChildInfo(child.m_isLeaf, child.m_size, node.m_char));
@@ -171,9 +171,7 @@ void PopNodes(SinkT & sink, NodesT & nodes, int nodesToPop)
}
else
{
- WriteNodeReverse(sink, node.m_char, node.m_valueCount,
- node.m_values.data(), node.m_values.size(),
- node.m_children.rbegin(), node.m_children.rend());
+ WriteNodeReverse(sink, node.m_char, node);
prevNode.m_children.push_back(ChildInfo(node.m_children.empty(),
static_cast<uint32_t>(sink.Pos() - node.m_begPos),
node.m_char));
@@ -231,12 +229,12 @@ struct MaxValueEdgeBuilder
} // namespace builder
-template <typename SinkT, typename IterT, class EdgeBuilderT>
+template <typename SinkT, typename IterT, typename EdgeBuilderT, typename ValueListT>
void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const & edgeBuilder)
{
typedef buffer_vector<TrieChar, 32> TrieString;
typedef buffer_vector<uint8_t, 32> TrieValue;
- typedef builder::NodeInfo<EdgeBuilderT> NodeInfoT;
+ typedef builder::NodeInfo<EdgeBuilderT, ValueListT> NodeInfoT;
buffer_vector<NodeInfoT, 32> nodes;
nodes.push_back(NodeInfoT(sink.Pos(), DEFAULT_CHAR, edgeBuilder));
@@ -264,12 +262,10 @@ void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const &
uint64_t const pos = sink.Pos();
for (size_t i = nCommon; i < key.size(); ++i)
- nodes.push_back(builder::NodeInfo<EdgeBuilderT>(pos, key[i], edgeBuilder));
+ nodes.push_back(NodeInfoT(pos, key[i], edgeBuilder));
+ nodes.back().m_valueList.Append(e.GetValue());
e.SerializeValue(value);
- nodes.back().m_values.insert(nodes.back().m_values.end(),
- value.begin(), value.end());
- nodes.back().m_valueCount += 1;
nodes.back().m_edgeBuilder.AddValue(value.data(), value.size());
prevKey.swap(key);
@@ -280,10 +276,7 @@ void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const &
builder::PopNodes(sink, nodes, nodes.size() - 1);
// Write the root.
- WriteNodeReverse(sink, DEFAULT_CHAR, nodes.back().m_valueCount,
- nodes.back().m_values.data(), nodes.back().m_values.size(),
- nodes.back().m_children.rbegin(), nodes.back().m_children.rend(),
- true);
+ WriteNodeReverse(sink, DEFAULT_CHAR /* baseChar */, nodes.back(), true /* isRoot */);
}
} // namespace trie
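
One detail worth calling out from the hunk above: the node header byte packs min(valueCount, 3) into the top two bits and min(childCount, 63) into the low six bits, and the exact count follows as a varuint only when a field saturates. A standalone sketch of that packing, with the thresholds copied from the code above (the function name is illustrative):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Header byte layout used by trie::builder::WriteNode:
//   [2 bits: min(valueCount, 3)] [6 bits: min(childCount, 63)]
// When a field is saturated, the exact count follows as a varuint.
uint8_t MakeNodeHeader(uint32_t valueCount, uint32_t childCount,
                       bool & valueCountFollows, bool & childCountFollows)
{
  valueCountFollows = valueCount >= 3;
  childCountFollows = childCount >= 63;
  return static_cast<uint8_t>((std::min(valueCount, 3U) << 6) +
                              std::min(childCount, 63U));
}

int main()
{
  bool vFollows = false, cFollows = false;
  // 1 value, 5 children -> header 0b01000101, no overflow records.
  uint8_t const h = MakeNodeHeader(1, 5, vFollows, cFollows);
  std::cout << std::hex << int(h) << " " << vFollows << cFollows << "\n";
  return 0;
}
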
diff --git a/defines.hpp b/defines.hpp
index 0e949c8e3c..8fec4e0fdb 100644
--- a/defines.hpp
+++ b/defines.hpp
@@ -15,6 +15,7 @@
#define METADATA_FILE_TAG "meta"
#define METADATA_INDEX_FILE_TAG "metaidx"
#define FEATURES_OFFSETS_TABLE_FILE_TAG "offsets"
+#define COMPRESSED_SEARCH_INDEX_FILE_TAG "csdx"
#define ROUTING_MATRIX_FILE_TAG "mercedes"
#define ROUTING_EDGEDATA_FILE_TAG "daewoo"
diff --git a/indexer/indexer.pro b/indexer/indexer.pro
index c45197d6b5..b227e74d31 100644
--- a/indexer/indexer.pro
+++ b/indexer/indexer.pro
@@ -44,7 +44,6 @@ SOURCES += \
search_delimiters.cpp \
search_index_builder.cpp \
search_string_utils.cpp \
- string_file.cpp \
types_mapping.cpp \
HEADERS += \
@@ -94,6 +93,7 @@ HEADERS += \
search_string_utils.hpp \
search_trie.hpp \
string_file.hpp \
+ string_file_values.hpp \
tesselator_decl.hpp \
tree_structure.hpp \
types_mapping.hpp \
diff --git a/indexer/search_index_builder.cpp b/indexer/search_index_builder.cpp
index 12ca7b4e68..fb1c5cdcaf 100644
--- a/indexer/search_index_builder.cpp
+++ b/indexer/search_index_builder.cpp
@@ -5,6 +5,7 @@
#include "search_trie.hpp"
#include "search_string_utils.hpp"
#include "string_file.hpp"
+#include "string_file_values.hpp"
#include "classificator.hpp"
#include "feature_visibility.hpp"
#include "categories_holder.hpp"
@@ -20,15 +21,19 @@
#include "../coding/writer.hpp"
#include "../coding/reader_writer_ops.hpp"
+#include "../base/assert.hpp"
+#include "../base/timer.hpp"
+#include "../base/scope_guard.hpp"
#include "../base/string_utils.hpp"
#include "../base/logging.hpp"
#include "../base/stl_add.hpp"
#include "../std/algorithm.hpp"
-#include "../std/vector.hpp"
-#include "../std/unordered_map.hpp"
#include "../std/fstream.hpp"
#include "../std/initializer_list.hpp"
+#include "../std/limits.hpp"
+#include "../std/unordered_map.hpp"
+#include "../std/vector.hpp"
#define SYNONYMS_FILE "synonyms.txt"
@@ -84,20 +89,21 @@ public:
}
};
+template<typename StringsFileT>
struct FeatureNameInserter
{
SynonymsHolder * m_synonyms;
- StringsFile & m_names;
- StringsFile::ValueT m_val;
+ StringsFileT & m_names;
+ typename StringsFileT::ValueT m_val;
- FeatureNameInserter(SynonymsHolder * synonyms, StringsFile & names)
+ FeatureNameInserter(SynonymsHolder * synonyms, StringsFileT & names)
: m_synonyms(synonyms), m_names(names)
{
}
void AddToken(signed char lang, strings::UniString const & s) const
{
- m_names.AddString(StringsFile::StringT(s, lang, m_val));
+ m_names.AddString(typename StringsFileT::StringT(s, lang, m_val));
}
private:
@@ -141,22 +147,19 @@ public:
}
};
-class FeatureInserter
-{
- SynonymsHolder * m_synonyms;
- StringsFile & m_names;
-
- CategoriesHolder const & m_categories;
+template <typename ValueT>
+struct ValueBuilder;
- typedef StringsFile::ValueT ValueT;
+template <>
+struct ValueBuilder<SerializedFeatureInfoValue>
+{
typedef search::trie::ValueReader SaverT;
SaverT m_valueSaver;
- pair<int, int> m_scales;
-
+ ValueBuilder(serial::CodingParams const & cp) : m_valueSaver(cp) {}
- void MakeValue(FeatureType const & f, feature::TypesHolder const & types,
- uint64_t pos, ValueT & value) const
+ void MakeValue(FeatureType const & f, feature::TypesHolder const & types, uint64_t pos,
+ SerializedFeatureInfoValue & value) const
{
SaverT::ValueType v;
v.m_featureId = static_cast<uint32_t>(pos);
@@ -166,9 +169,36 @@ class FeatureInserter
v.m_rank = feature::GetSearchRank(types, v.m_pt, f.GetPopulation());
// write to buffer
- PushBackByteSink<ValueT> sink(value);
+ PushBackByteSink<SerializedFeatureInfoValue::ValueT> sink(value.m_value);
m_valueSaver.Save(sink, v);
}
+};
+
+template <>
+struct ValueBuilder<FeatureIndexValue>
+{
+ void MakeValue(FeatureType const & /* f */, feature::TypesHolder const & /* types */,
+ uint64_t pos, FeatureIndexValue & value) const
+ {
+ ASSERT_LESS(pos, numeric_limits<uint32_t>::max(), ());
+ value.m_value = static_cast<uint32_t>(pos);
+ }
+};
+
+template <typename StringsFileT>
+class FeatureInserter
+{
+ SynonymsHolder * m_synonyms;
+ StringsFileT & m_names;
+
+ CategoriesHolder const & m_categories;
+
+ typedef typename StringsFileT::ValueT ValueT;
+ typedef search::trie::ValueReader SaverT;
+
+ pair<int, int> m_scales;
+
+ ValueBuilder<ValueT> const & m_valueBuilder;
/// There are 3 different ways of search index skipping:
/// - skip features in any case (m_skipFeatures)
@@ -276,17 +306,16 @@ class FeatureInserter
};
public:
- FeatureInserter(SynonymsHolder * synonyms, StringsFile & names,
- CategoriesHolder const & catHolder,
- serial::CodingParams const & cp,
- pair<int, int> const & scales)
+ FeatureInserter(SynonymsHolder * synonyms, StringsFileT & names,
+ CategoriesHolder const & catHolder, pair<int, int> const & scales,
+ ValueBuilder<ValueT> const & valueBuilder)
: m_synonyms(synonyms), m_names(names),
- m_categories(catHolder), m_valueSaver(cp), m_scales(scales)
+ m_categories(catHolder), m_scales(scales), m_valueBuilder(valueBuilder)
{
}
void operator() (FeatureType const & f, uint64_t pos) const
- {
+ {
feature::TypesHolder types(f);
static SkipIndexing skipIndex;
@@ -297,8 +326,9 @@ public:
// Init inserter with serialized value.
// Insert synonyms only for countries and states (maybe will add cities in future).
- FeatureNameInserter inserter(skipIndex.IsCountryOrState(types) ? m_synonyms : 0, m_names);
- MakeValue(f, types, pos, inserter.m_val);
+ FeatureNameInserter<StringsFileT> inserter(skipIndex.IsCountryOrState(types) ? m_synonyms : 0,
+ m_names);
+ m_valueBuilder.MakeValue(f, types, pos, inserter.m_val);
// Skip types for features without names.
if (!f.ForEachNameRef(inserter))
@@ -332,6 +362,41 @@ public:
}
};
+template <typename FeatureInserterT>
+struct FeatureInserterAdapter
+{
+ FeatureInserterAdapter(FeatureInserterT & inserter) : m_inserter(inserter), m_index(0) {}
+
+ void operator()(FeatureType const & f, uint64_t pos)
+ {
+ m_inserter(f, m_index++);
+ }
+
+ FeatureInserterT & m_inserter;
+ uint64_t m_index;
+};
+
+void AddFeatureNameIndexPairs(FilesContainerR const & container,
+ CategoriesHolder & categoriesHolder,
+ StringsFile<FeatureIndexValue> & stringsFile)
+{
+ feature::DataHeader header;
+ header.Load(container.GetReader(HEADER_FILE_TAG));
+ FeaturesVector features(container, header);
+
+ ValueBuilder<FeatureIndexValue> valueBuilder;
+
+ unique_ptr<SynonymsHolder> synonyms;
+ if (header.GetType() == feature::DataHeader::world)
+ synonyms.reset(new SynonymsHolder(GetPlatform().WritablePathForFile(SYNONYMS_FILE)));
+
+ FeatureInserter<StringsFile<FeatureIndexValue>> inserter(
+ synonyms.get(), stringsFile, categoriesHolder, header.GetScaleRange(), valueBuilder);
+
+ FeatureInserterAdapter<FeatureInserter<StringsFile<FeatureIndexValue>>> adapter(inserter);
+ features.ForEachOffset(adapter);
+}
+
void BuildSearchIndex(FilesContainerR const & cont, CategoriesHolder const & catHolder,
Writer & writer, string const & tmpFilePath)
{
@@ -341,30 +406,33 @@ void BuildSearchIndex(FilesContainerR const & cont, CategoriesHolder const & cat
FeaturesVector featuresV(cont, header);
serial::CodingParams cp(search::GetCPForTrie(header.GetDefCodingParams()));
+ ValueBuilder<SerializedFeatureInfoValue> valueBuilder(cp);
unique_ptr<SynonymsHolder> synonyms;
if (header.GetType() == feature::DataHeader::world)
synonyms.reset(new SynonymsHolder(GetPlatform().WritablePathForFile(SYNONYMS_FILE)));
- StringsFile names(tmpFilePath);
+ StringsFile<SerializedFeatureInfoValue> names(tmpFilePath);
- featuresV.ForEachOffset(FeatureInserter(synonyms.get(), names,
- catHolder, cp, header.GetScaleRange()));
+ featuresV.ForEachOffset(FeatureInserter<StringsFile<SerializedFeatureInfoValue>>(
+ synonyms.get(), names, catHolder, header.GetScaleRange(), valueBuilder));
names.EndAdding();
names.OpenForRead();
- trie::Build(writer, names.Begin(), names.End(), trie::builder::EmptyEdgeBuilder());
+ trie::Build<Writer, typename StringsFile<SerializedFeatureInfoValue>::IteratorT,
+ trie::builder::EmptyEdgeBuilder, ValueList<SerializedFeatureInfoValue>>(
+ writer, names.Begin(), names.End(), trie::builder::EmptyEdgeBuilder());
// at this point all readers of StringsFile should be dead
}
FileWriter::DeleteFileX(tmpFilePath);
}
+} // namespace
-}
-
-bool indexer::BuildSearchIndexFromDatFile(string const & datFile, bool forceRebuild)
+namespace indexer {
+bool BuildSearchIndexFromDatFile(string const & datFile, bool forceRebuild)
{
LOG(LINFO, ("Start building search index. Bits = ", search::POINT_CODING_BITS));
@@ -412,3 +480,74 @@ bool indexer::BuildSearchIndexFromDatFile(string const & datFile, bool forceRebu
LOG(LINFO, ("End building search index."));
return true;
}
+
+bool AddCompresedSearchIndexSection(string const & fName, bool forceRebuild)
+{
+ Platform & platform = GetPlatform();
+
+ FilesContainerR readContainer(platform.GetReader(fName));
+ if (readContainer.IsExist(COMPRESSED_SEARCH_INDEX_FILE_TAG) && !forceRebuild)
+ return true;
+
+ string const indexFile = platform.WritablePathForFile("compressed-search-index.tmp");
+ MY_SCOPE_GUARD(indexFileGuard, bind(&FileWriter::DeleteFileX, indexFile));
+
+ try
+ {
+ {
+ FileWriter indexWriter(indexFile);
+ BuildCompressedSearchIndex(readContainer, indexWriter);
+ }
+ {
+ FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING);
+ FileWriter writer = writeContainer.GetWriter(COMPRESSED_SEARCH_INDEX_FILE_TAG);
+ rw_ops::Reverse(FileReader(indexFile), writer);
+ }
+ }
+ catch (Reader::Exception const & e)
+ {
+ LOG(LERROR, ("Error while reading file: ", e.Msg()));
+ return false;
+ }
+ catch (Writer::Exception const & e)
+ {
+ LOG(LERROR, ("Error writing index file: ", e.Msg()));
+ return false;
+ }
+
+ return true;
+}
+
+void BuildCompressedSearchIndex(FilesContainerR & container, Writer & indexWriter)
+{
+ Platform & platform = GetPlatform();
+
+ LOG(LINFO, ("Start building compressed search index for", container.GetFileName()));
+ my::Timer timer;
+
+ string stringsFilePath = platform.WritablePathForFile("strings.tmp");
+ StringsFile<FeatureIndexValue> stringsFile(stringsFilePath);
+ MY_SCOPE_GUARD(stringsFileGuard, bind(&FileWriter::DeleteFileX, stringsFilePath));
+
+ CategoriesHolder categoriesHolder(platform.GetReader(SEARCH_CATEGORIES_FILE_NAME));
+
+ AddFeatureNameIndexPairs(container, categoriesHolder, stringsFile);
+
+ stringsFile.EndAdding();
+
+ LOG(LINFO, ("End sorting strings:", timer.ElapsedSeconds()));
+
+ stringsFile.OpenForRead();
+ trie::Build<Writer, typename StringsFile<FeatureIndexValue>::IteratorT,
+ trie::builder::EmptyEdgeBuilder, ValueList<FeatureIndexValue>>(
+ indexWriter, stringsFile.Begin(), stringsFile.End(), trie::builder::EmptyEdgeBuilder());
+
+ LOG(LINFO, ("End building compressed search index, elapsed seconds:", timer.ElapsedSeconds()));
+}
+
+void BuildCompressedSearchIndex(string const & fName, Writer & indexWriter)
+{
+ FilesContainerR container(GetPlatform().GetReader(fName));
+ BuildCompressedSearchIndex(container, indexWriter);
+}
+} // namespace indexer
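
AddCompresedSearchIndexSection builds the index into a temporary file and then copies it, reversed, into the new csdx section of the mwm container (the trie writer emits its stream back-to-front, and rw_ops::Reverse restores forward order). A hypothetical call site, assuming a generator-side driver that links against indexer; the surrounding function and the mwm file name are invented for illustration:

// Hypothetical driver; error handling is reduced to the boolean that
// AddCompresedSearchIndexSection already returns.
#include "../indexer/search_index_builder.hpp"

#include "../base/logging.hpp"

#include "../std/string.hpp"

int BuildIndexForFile(string const & mwmFile)
{
  bool const ok = indexer::AddCompresedSearchIndexSection(mwmFile, false /* forceRebuild */);
  if (!ok)
    LOG(LERROR, ("Can't build compressed search index for", mwmFile));
  return ok ? 0 : 1;
}
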
diff --git a/indexer/search_index_builder.hpp b/indexer/search_index_builder.hpp
index 41ea88b8cb..c661a6ebb4 100644
--- a/indexer/search_index_builder.hpp
+++ b/indexer/search_index_builder.hpp
@@ -1,8 +1,17 @@
#pragma once
+
#include "../std/string.hpp"
+class FilesContainerR;
+class Writer;
namespace indexer
{
- bool BuildSearchIndexFromDatFile(string const & fName, bool forceRebuild = false);
-}
+bool BuildSearchIndexFromDatFile(string const & fName, bool forceRebuild = false);
+
+bool AddCompresedSearchIndexSection(string const & fName, bool forceRebuild);
+
+void BuildCompressedSearchIndex(FilesContainerR & container, Writer & indexWriter);
+
+void BuildCompressedSearchIndex(string const & fName, Writer & indexWriter);
+} // namespace indexer
diff --git a/indexer/string_file.cpp b/indexer/string_file.cpp
deleted file mode 100644
index 6bfb1ee8c7..0000000000
--- a/indexer/string_file.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-#include "string_file.hpp"
-
-#include "../std/algorithm.hpp"
-#include "../std/bind.hpp"
-
-
-template <class TWriter>
-StringsFile::IdT StringsFile::StringT::Write(TWriter & writer) const
-{
- IdT const pos = static_cast<IdT>(writer.Pos());
- CHECK_EQUAL(static_cast<uint64_t>(pos), writer.Pos(), ());
-
- rw::Write(writer, m_name);
- rw::WriteVectorOfPOD(writer, m_val);
-
- return pos;
-}
-
-template <class TReader>
-void StringsFile::StringT::Read(TReader & src)
-{
- rw::Read(src, m_name);
- rw::ReadVectorOfPOD(src, m_val);
-}
-
-bool StringsFile::StringT::operator < (StringT const & name) const
-{
- if (m_name != name.m_name)
- return (m_name < name.m_name);
-
- return (m_val < name.m_val);
-}
-
-bool StringsFile::StringT::operator == (StringT const & name) const
-{
- return (m_name == name.m_name && m_val == name.m_val);
-}
-
-void StringsFile::AddString(StringT const & s)
-{
-#ifdef OMIM_OS_DESKTOP
- size_t const maxSize = 1000000;
-#else
- size_t const maxSize = 30000;
-#endif
-
- if (m_strings.size() >= maxSize)
- Flush();
-
- m_strings.push_back(s);
-}
-
-bool StringsFile::IteratorT::IsEnd() const
-{
- return m_file.m_queue.empty();
-}
-
-StringsFile::StringT StringsFile::IteratorT::dereference() const
-{
- ASSERT ( IsValid(), () );
- return m_file.m_queue.top().m_string;
-}
-
-void StringsFile::IteratorT::increment()
-{
- ASSERT ( IsValid(), () );
- int const index = m_file.m_queue.top().m_index;
-
- m_file.m_queue.pop();
-
- if (!m_file.PushNextValue(index))
- m_end = IsEnd();
-}
-
-StringsFile::StringsFile(string const & fPath)
- : m_workerThread(1 /* maxTasks */)
-{
- m_writer.reset(new FileWriter(fPath));
-}
-
-void StringsFile::Flush()
-{
- shared_ptr<SortAndDumpStringsTask> task(
- new SortAndDumpStringsTask(*m_writer, m_offsets, m_strings));
- m_workerThread.Push(task);
-}
-
-bool StringsFile::PushNextValue(size_t i)
-{
- // reach the end of the portion file
- if (m_offsets[i].first >= m_offsets[i].second)
- return false;
-
- // init source to needed offset
- ReaderSource<FileReader> src(*m_reader);
- src.Skip(m_offsets[i].first);
-
- // read string
- StringT s;
- s.Read(src);
-
- // update offset
- m_offsets[i].first = src.Pos();
-
- // push value to queue
- m_queue.push(QValue(s, i));
- return true;
-}
-
-void StringsFile::EndAdding()
-{
- Flush();
-
- m_workerThread.RunUntilIdleAndStop();
-
- m_writer->Flush();
-}
-
-void StringsFile::OpenForRead()
-{
- string const fPath = m_writer->GetName();
- m_writer.reset();
-
- m_reader.reset(new FileReader(fPath));
-
- for (size_t i = 0; i < m_offsets.size(); ++i)
- PushNextValue(i);
-}
diff --git a/indexer/string_file.hpp b/indexer/string_file.hpp
index d6f4b2ad7a..2651394cd5 100644
--- a/indexer/string_file.hpp
+++ b/indexer/string_file.hpp
@@ -14,12 +14,12 @@
#include "../std/functional.hpp"
#include "../std/unique_ptr.hpp"
-
+template <typename TValue>
class StringsFile
{
public:
- typedef buffer_vector<uint8_t, 32> ValueT;
- typedef uint32_t IdT;
+ using ValueT = TValue;
+ using IdT = uint32_t;
class StringT
{
@@ -28,14 +28,17 @@ public:
public:
StringT() {}
- StringT(strings::UniString const & name, signed char lang, ValueT const & val)
- : m_val(val)
+ StringT(strings::UniString const & name, signed char lang, ValueT const & val) : m_val(val)
{
m_name.reserve(name.size() + 1);
m_name.push_back(static_cast<uint8_t>(lang));
m_name.append(name.begin(), name.end());
}
+ StringT(strings::UniString const & langName, ValueT const & val) : m_name(langName), m_val(val)
+ {
+ }
+
uint32_t GetKeySize() const { return m_name.size(); }
uint32_t const * GetKeyData() const { return m_name.data(); }
@@ -43,16 +46,42 @@ public:
ValueT const & GetValue() const { return m_val; }
- template <class TCont> void SerializeValue(TCont & cont) const
+ bool operator<(StringT const & name) const
+ {
+ if (m_name != name.m_name)
+ return m_name < name.m_name;
+ return m_val < name.m_val;
+ }
+
+ bool operator==(StringT const & name) const
{
- cont.assign(m_val.begin(), m_val.end());
+ return m_name == name.m_name && m_val == name.m_val;
}
- bool operator < (StringT const & name) const;
- bool operator == (StringT const & name) const;
+ template <class TWriter>
+ IdT Write(TWriter & writer) const
+ {
+ IdT const pos = static_cast<IdT>(writer.Pos());
+ CHECK_EQUAL(static_cast<uint64_t>(pos), writer.Pos(), ());
+
+ rw::Write(writer, m_name);
+ m_val.Write(writer);
- template <class TWriter> IdT Write(TWriter & writer) const;
- template <class TReader> void Read(TReader & src);
+ return pos;
+ }
+
+ template <class TReader>
+ void Read(TReader & src)
+ {
+ rw::Read(src, m_name);
+ m_val.Read(src);
+ }
+
+ template <class TCont>
+ void SerializeValue(TCont & cont) const
+ {
+ m_val.Serialize(cont);
+ }
void Swap(StringT & r)
{
@@ -98,7 +127,7 @@ public:
trie.ForEach([&memWriter](const strings::UniString & s, const ValueT & v)
{
rw::Write(memWriter, s);
- rw::WriteVectorOfPOD(memWriter, v);
+ v.Write(memWriter);
});
}
@@ -126,8 +155,7 @@ public:
inline bool IsValid() const { return (!m_end && !IsEnd()); }
public:
- IteratorT(StringsFile & file, bool isEnd)
- : m_file(file), m_end(isEnd)
+ IteratorT(StringsFile & file, bool isEnd) : m_file(file), m_end(isEnd)
{
// Additional check in case for empty sequence.
if (!m_end)
@@ -173,8 +201,104 @@ private:
QValue(StringT const & s, size_t i) : m_string(s), m_index(i) {}
- inline bool operator > (QValue const & rhs) const { return !(m_string < rhs.m_string); }
+ inline bool operator>(QValue const & rhs) const { return !(m_string < rhs.m_string); }
};
- priority_queue<QValue, vector<QValue>, greater<QValue> > m_queue;
+ priority_queue<QValue, vector<QValue>, greater<QValue>> m_queue;
};
+
+template <typename ValueT>
+void StringsFile<ValueT>::AddString(StringT const & s)
+{
+ size_t const maxSize = 1000000;
+
+ if (m_strings.size() >= maxSize)
+ Flush();
+
+ m_strings.push_back(s);
+}
+
+template <typename ValueT>
+bool StringsFile<ValueT>::IteratorT::IsEnd() const
+{
+ return m_file.m_queue.empty();
+}
+
+template <typename ValueT>
+typename StringsFile<ValueT>::StringT StringsFile<ValueT>::IteratorT::dereference() const
+{
+ ASSERT(IsValid(), ());
+ return m_file.m_queue.top().m_string;
+}
+
+template <typename ValueT>
+void StringsFile<ValueT>::IteratorT::increment()
+{
+ ASSERT(IsValid(), ());
+ int const index = m_file.m_queue.top().m_index;
+
+ m_file.m_queue.pop();
+
+ if (!m_file.PushNextValue(index))
+ m_end = IsEnd();
+}
+
+template <typename ValueT>
+StringsFile<ValueT>::StringsFile(string const & fPath)
+ : m_workerThread(1 /* maxTasks */)
+{
+ m_writer.reset(new FileWriter(fPath));
+}
+
+template <typename ValueT>
+void StringsFile<ValueT>::Flush()
+{
+ shared_ptr<SortAndDumpStringsTask> task(
+ new SortAndDumpStringsTask(*m_writer, m_offsets, m_strings));
+ m_workerThread.Push(task);
+}
+
+template <typename ValueT>
+bool StringsFile<ValueT>::PushNextValue(size_t i)
+{
+ // reach the end of the portion file
+ if (m_offsets[i].first >= m_offsets[i].second)
+ return false;
+
+ // init source to needed offset
+ ReaderSource<FileReader> src(*m_reader);
+ src.Skip(m_offsets[i].first);
+
+ // read string
+ StringT s;
+ s.Read(src);
+
+ // update offset
+ m_offsets[i].first = src.Pos();
+
+ // push value to queue
+ m_queue.push(QValue(s, i));
+ return true;
+}
+
+template <typename ValueT>
+void StringsFile<ValueT>::EndAdding()
+{
+ Flush();
+
+ m_workerThread.RunUntilIdleAndStop();
+
+ m_writer->Flush();
+}
+
+template <typename ValueT>
+void StringsFile<ValueT>::OpenForRead()
+{
+ string const fPath = m_writer->GetName();
+ m_writer.reset();
+
+ m_reader.reset(new FileReader(fPath));
+
+ for (size_t i = 0; i < m_offsets.size(); ++i)
+ PushNextValue(i);
+}
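
The StringsFile iterator above performs a k-way merge over the sorted chunks that Flush() wrote to disk: the priority queue holds the smallest unread string of each chunk, and PushNextValue refills it as entries are consumed. The following self-contained sketch shows the same merge pattern over in-memory sorted runs (runs and element type are illustrative):

#include <cstddef>
#include <functional>
#include <iostream>
#include <queue>
#include <string>
#include <vector>

// Merge several individually sorted runs into one sorted stream,
// mirroring StringsFile::IteratorT: the heap holds one candidate per
// run, and popping a candidate pulls the next element from its run.
std::vector<std::string> MergeSortedRuns(std::vector<std::vector<std::string>> const & runs)
{
  struct QValue
  {
    std::string m_value;
    size_t m_run;
    size_t m_pos;
    bool operator>(QValue const & rhs) const { return m_value > rhs.m_value; }
  };

  std::priority_queue<QValue, std::vector<QValue>, std::greater<QValue>> queue;
  for (size_t i = 0; i < runs.size(); ++i)
    if (!runs[i].empty())
      queue.push({runs[i][0], i, 0});

  std::vector<std::string> result;
  while (!queue.empty())
  {
    QValue const top = queue.top();
    queue.pop();
    result.push_back(top.m_value);
    if (top.m_pos + 1 < runs[top.m_run].size())
      queue.push({runs[top.m_run][top.m_pos + 1], top.m_run, top.m_pos + 1});
  }
  return result;
}

int main()
{
  auto const merged = MergeSortedRuns({{"alpha", "delta"}, {"beta"}, {"charlie", "echo"}});
  for (auto const & s : merged)
    std::cout << s << "\n";  // alpha beta charlie delta echo
  return 0;
}
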
diff --git a/indexer/string_file_values.hpp b/indexer/string_file_values.hpp
new file mode 100644
index 0000000000..a0da9deecc
--- /dev/null
+++ b/indexer/string_file_values.hpp
@@ -0,0 +1,150 @@
+#pragma once
+
+#include "../coding/compressed_bit_vector.hpp"
+#include "../coding/read_write_utils.hpp"
+#include "../coding/write_to_sink.hpp"
+
+#include "../base/assert.hpp"
+
+#include "../std/algorithm.hpp"
+
+/// The following classes are meant to be used with StringsFile. They
+/// can be written, read, compared, and serialized to an in-memory
+/// buffer. The reason to use them instead of buffer_vector<uint8_t, N>
+/// is that in some cases, such as compressed search index
+/// construction, they avoid redundant serialization/deserialization
+/// and sorting.
+
+/// A wrapper around a feature index.
+struct FeatureIndexValue
+{
+ FeatureIndexValue() : m_value(0) {}
+
+ template <typename TWriter>
+ void Write(TWriter & writer) const
+ {
+ WriteToSink(writer, m_value);
+ }
+
+ template <typename TReader>
+ void Read(TReader & reader)
+ {
+ m_value = ReadPrimitiveFromSource<uint32_t>(reader);
+ }
+
+ template <typename TCont>
+ void Serialize(TCont & cont) const
+ {
+ cont.resize(sizeof(m_value));
+ memcpy(cont.data(), &m_value, sizeof(m_value));
+ }
+
+ bool operator<(FeatureIndexValue const & value) const { return m_value < value.m_value; }
+
+ bool operator==(FeatureIndexValue const & value) const { return m_value == value.m_value; }
+
+ void swap(FeatureIndexValue & value) { ::swap(m_value, value.m_value); }
+
+ uint32_t m_value;
+};
+
+/// A wrapper around serialized SaverT::ValueType.
+struct SerializedFeatureInfoValue
+{
+ using ValueT = buffer_vector<uint8_t, 32>;
+
+ template <typename TWriter>
+ void Write(TWriter & writer) const
+ {
+ rw::WriteVectorOfPOD(writer, m_value);
+ }
+
+ template <typename TReader>
+ void Read(TReader & reader)
+ {
+ rw::ReadVectorOfPOD(reader, m_value);
+ }
+
+ template <typename TCont>
+ void Serialize(TCont & cont) const
+ {
+ cont.assign(m_value.begin(), m_value.end());
+ }
+
+ bool operator<(SerializedFeatureInfoValue const & value) const { return m_value < value.m_value; }
+
+ bool operator==(SerializedFeatureInfoValue const & value) const
+ {
+ return m_value == value.m_value;
+ }
+
+ void swap(SerializedFeatureInfoValue & value) { m_value.swap(value.m_value); }
+
+ ValueT m_value;
+};
+
+/// This template is used to accumulate and serialize a group of
+/// values of the same type.
+template <typename Value>
+class ValueList;
+
+/// ValueList<FeatureIndexValue> serializes a group of feature
+/// indices as a compressed bit vector, which saves disk space
+/// compared to storing the indices verbatim.
+template <>
+class ValueList<FeatureIndexValue>
+{
+public:
+ void Append(FeatureIndexValue const & value)
+ {
+ // The external-memory trie adds <string, value> pairs in sorted
+ // order, so values arrive here already in sorted order and we
+ // can avoid sorting them again before constructing the
+ // CompressedBitVector.
+ ASSERT(m_offsets.empty() || m_offsets.back() <= value.m_value, ());
+ if (!m_offsets.empty() && m_offsets.back() == value.m_value)
+ return;
+ m_offsets.push_back(value.m_value);
+ }
+
+ size_t size() const { return m_offsets.empty() ? 0 : 1; }
+
+ template <typename SinkT>
+ void Dump(SinkT & sink) const
+ {
+ vector<uint8_t> buffer;
+ MemWriter<vector<uint8_t>> writer(buffer);
+ BuildCompressedBitVector(writer, m_offsets);
+ sink.Write(buffer.data(), buffer.size());
+ }
+
+private:
+ vector<uint32_t> m_offsets;
+};
+
+/// ValueList<SerializedFeatureInfoValue> sequentially serializes
+/// encoded feature infos.
+template <>
+class ValueList<SerializedFeatureInfoValue>
+{
+public:
+ ValueList() : m_size(0) {}
+
+ void Append(SerializedFeatureInfoValue const & value)
+ {
+ m_value.insert(m_value.end(), value.m_value.begin(), value.m_value.end());
+ ++m_size;
+ }
+
+ size_t size() const { return m_size; }
+
+ template <typename SinkT>
+ void Dump(SinkT & sink) const
+ {
+ sink.Write(m_value.data(), m_value.size());
+ }
+
+private:
+ buffer_vector<uint8_t, 32> m_value;
+ uint32_t m_size;
+};
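
Note that ValueList<FeatureIndexValue>::size() reports at most 1: no matter how many feature indices were appended, the node stores a single compressed bit vector, and that is what the node header's value count reflects. The CompressedBitVector format itself lives in coding/compressed_bit_vector.hpp and is not part of this patch; as a rough illustration of why a sorted, deduplicated index list compresses well, here is a simple delta-plus-varint encoding (an illustrative scheme, not the repository's actual coder):

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative only: delta + varint encoding of a sorted id list.
// Small gaps between consecutive feature indices become 1-byte deltas.
std::vector<uint8_t> EncodeSortedIds(std::vector<uint32_t> const & ids)
{
  std::vector<uint8_t> out;
  uint32_t prev = 0;
  for (uint32_t id : ids)
  {
    uint32_t delta = id - prev;  // ids are sorted, so delta >= 0
    prev = id;
    while (delta >= 0x80)
    {
      out.push_back(static_cast<uint8_t>(delta) | 0x80);
      delta >>= 7;
    }
    out.push_back(static_cast<uint8_t>(delta));
  }
  return out;
}

int main()
{
  std::vector<uint32_t> const ids = {3, 4, 10, 1000, 1001};
  auto const bytes = EncodeSortedIds(ids);
  std::cout << ids.size() * sizeof(uint32_t) << " raw bytes -> "
            << bytes.size() << " encoded bytes\n";
  return 0;
}
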