github.com/mapsme/omim.git
author    Yury Melnichek <melnichek@gmail.com>  2011-03-06 01:12:21 +0300
committer Alex Zolotarev <alex@maps.me>         2015-09-23 01:12:57 +0300
commit    5d9239a1edede8693c69119e282017d965d2c326 (patch)
tree      2ac33f5cf71ee8f3814de8f2dfe3688e8c3820e1 /publisher
parent    a5ad66614265259763a1ed849db7ae8e918e8a3c (diff)
Add sloynik to omim!! Yea!
Diffstat (limited to 'publisher')
-rw-r--r--  publisher/aard_dictionary.cpp                      225
-rw-r--r--  publisher/aard_dictionary.hpp                       28
-rw-r--r--  publisher/main.cpp                                  57
-rw-r--r--  publisher/publisher.pro                             18
-rw-r--r--  publisher/publisher_tests/publisher_tests.pro       14
-rw-r--r--  publisher/publisher_tests/slof_indexer_test.cpp     86
-rw-r--r--  publisher/slof_indexer.cpp                         188
-rw-r--r--  publisher/slof_indexer.hpp                          49
8 files changed, 665 insertions(+), 0 deletions(-)
diff --git a/publisher/aard_dictionary.cpp b/publisher/aard_dictionary.cpp
new file mode 100644
index 0000000000..e39bafa8c9
--- /dev/null
+++ b/publisher/aard_dictionary.cpp
@@ -0,0 +1,225 @@
+#include "aard_dictionary.hpp"
+#include "../coding_sloynik/bzip2_compressor.hpp"
+#include "../coding_sloynik/gzip_compressor.hpp"
+#include "../coding/endianness.hpp"
+#include "../coding/reader.hpp"
+#include "../base/logging.hpp"
+#include "../3party_sloynik/jansson/myjansson.hpp"
+#include "../std/exception.hpp"
+
+namespace
+{
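+  // Aard stores multi-byte fields big-endian (see the ">..." format strings
+  // checked in the header below); on a little-endian host they must be
+  // byte-swapped into host order.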
+ template <typename T> inline T SwapIfLittleEndian(T t)
+ {
+ #ifdef ENDIAN_IS_BIG
+ return t;
+ #else
+ return ReverseByteOrder(t);
+ #endif
+ }
+ template <typename PrimitiveT, class TReader>
+ PrimitiveT ReadLittleEndianPrimitiveFromPos(TReader & reader, uint64_t pos)
+ {
+ PrimitiveT primitive;
+ ReadFromPos(reader, pos, &primitive, sizeof(primitive));
+ return SwapIfLittleEndian(primitive);
+ }
+
+}
+
+namespace sl
+{
+
+enum
+{
+ AARD_DICTIONARY_MAX_ARTICLE_SIZE = 1 << 24
+};
+
+#pragma pack(push, 1)
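+// On-disk layout of one index1 entry: big-endian key and article offsets,
+// relative to the key data and article data sections respectively.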
+struct AardDictionaryIndex1Item
+{
+ uint32_t m_KeyPos;
+ uint32_t m_ArticlePos;
+};
+
+struct AardDictionary::AardDictionaryHeader
+{
+ char m_Signature[4];
+ uint8_t m_Sha1[40];
+ uint16_t m_Version;
+ uint8_t m_Uuid[16];
+ uint16_t m_Volume;
+ uint16_t m_TotalVolumes;
+ uint32_t m_MetaLength;
+ uint32_t m_IndexCount;
+ uint32_t m_ArticleOffset;
+ char m_Index1ItemFormat[4];
+ char m_KeyLengthFormat[2];
+ char m_ArticleLengthFormat[2];
+
+ // Offset of the index1 items.
+ uint32_t Index1Offset() const
+ {
+ return sizeof(AardDictionaryHeader) + m_MetaLength;
+ }
+ // Offset of the keys data.
+ uint32_t KeyDataOffset() const
+ {
+ return Index1Offset() + sizeof(AardDictionaryIndex1Item) * m_IndexCount;
+ }
+ // Offset of the article data.
+ uint32_t ArticleOffset() const
+ {
+ return m_ArticleOffset;
+ }
+};
+#pragma pack(pop)
+
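+// Article data may be gzip-compressed, bzip2-compressed or stored raw;
+// try each decompressor in turn and fall back to a plain copy.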
+void AardDecompress(char const * pSrc, size_t srcSize, string & dst)
+{
+ bool decompressed = false;
+ try
+ {
+ DecompressGZip(pSrc, srcSize, dst);
+ decompressed = true;
+ }
+ catch (StringCodingException &) {}
+ if (decompressed)
+ return;
+ try
+ {
+ DecompressBZip2(pSrc, srcSize, dst);
+ decompressed = true;
+ }
+ catch (StringCodingException &) {}
+ if (decompressed)
+ return;
+ dst.assign(pSrc, pSrc + srcSize);
+}
+
+AardDictionary::AardDictionary(Reader const & reader)
+ : m_Reader(reader), m_pHeader(new AardDictionaryHeader)
+{
+ m_Reader.Read(0, m_pHeader.get(), sizeof(AardDictionaryHeader));
+ m_pHeader->m_Volume = SwapIfLittleEndian(m_pHeader->m_Volume);
+ m_pHeader->m_TotalVolumes = SwapIfLittleEndian(m_pHeader->m_TotalVolumes);
+ m_pHeader->m_MetaLength = SwapIfLittleEndian(m_pHeader->m_MetaLength);
+ m_pHeader->m_IndexCount = SwapIfLittleEndian(m_pHeader->m_IndexCount);
+ m_pHeader->m_ArticleOffset = SwapIfLittleEndian(m_pHeader->m_ArticleOffset);
+ if (memcmp(m_pHeader->m_Signature, "aard", 4) != 0)
+ MYTHROW(Dictionary::OpenDictionaryException, ("Invalid signature."));
+ if (memcmp(m_pHeader->m_Index1ItemFormat, ">LL\0", 4) != 0)
+ MYTHROW(Dictionary::OpenDictionaryException, ("Invalid index1 item format."));
+ if (memcmp(m_pHeader->m_KeyLengthFormat, ">H", 2) != 0)
+ MYTHROW(Dictionary::OpenDictionaryException, ("Invalid key length format."));
+ if (memcmp(m_pHeader->m_ArticleLengthFormat, ">L", 2) != 0)
+ MYTHROW(Dictionary::OpenDictionaryException, ("Invalid article length format."));
+ LOG(LINFO, ("Loaded aard dictionary, volume:", m_pHeader->m_Volume,
+ "of", m_pHeader->m_TotalVolumes,
+ "meta length:", m_pHeader->m_MetaLength,
+ "words:", m_pHeader->m_IndexCount));
+
+ // TODO: What to do with duplicate keys?
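+  // Build an in-memory key -> id map; ArticleById() uses it to resolve redirects.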
+ for (Id i = 0; i < KeyCount(); ++i)
+ {
+ string key;
+ KeyById(i, key);
+ m_KeyToIdMap[key] = i;
+ }
+}
+
+AardDictionary::~AardDictionary()
+{
+}
+
+sl::Dictionary::Id AardDictionary::KeyCount() const
+{
+ return m_pHeader->m_IndexCount;
+}
+
+void AardDictionary::KeyById(Id id, string & key) const
+{
+ AardDictionaryIndex1Item item;
+ m_Reader.Read(m_pHeader->Index1Offset() + id * sizeof(item), &item, sizeof(item));
+ uint64_t const keyPos = m_pHeader->KeyDataOffset() + SwapIfLittleEndian(item.m_KeyPos);
+ uint16_t const keyLength = ReadLittleEndianPrimitiveFromPos<uint16_t>(m_Reader, keyPos);
+ if (keyLength == 0)
+ MYTHROW(Dictionary::BrokenDictionaryException, (keyLength));
+ key.resize(keyLength);
+ m_Reader.Read(keyPos + 2, &key[0], keyLength);
+}
+
+void AardDictionary::ArticleById(Id id, string & article) const
+{
+ string key;
+ KeyById(id, key);
+
+ AardDictionaryIndex1Item item;
+ m_Reader.Read(m_pHeader->Index1Offset() + id * sizeof(item), &item, sizeof(item));
+ uint64_t const articlePos = m_pHeader->ArticleOffset() + SwapIfLittleEndian(item.m_ArticlePos);
+ uint32_t articleLength = ReadLittleEndianPrimitiveFromPos<uint32_t>(m_Reader, articlePos);
+ if (articleLength > AARD_DICTIONARY_MAX_ARTICLE_SIZE)
+ MYTHROW(BrokenDictionaryException, (articleLength));
+ CHECK_NOT_EQUAL(articleLength, 0, ());
+ try
+ {
+ string compressedArticle(articleLength, '.');
+ m_Reader.Read(articlePos + 4, &compressedArticle[0], articleLength);
+ string articleJSON;
+ AardDecompress(&compressedArticle[0], compressedArticle.size(), articleJSON);
+
+ my::Json root(articleJSON.c_str());
+ CHECK_EQUAL(json_typeof(root), JSON_ARRAY, (id, key));
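+    // An aard article is a JSON array whose element 0 is the article text.
+    // Empty text marks a redirect; the target key is in element 2 under "r".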
+ json_t * pJsonElement0(json_array_get(root, 0));
+ CHECK(pJsonElement0, (id, key));
+ CHECK_EQUAL(json_typeof(pJsonElement0), JSON_STRING, (id, key));
+ string s0 = json_string_value(pJsonElement0);
+ if (s0.size() > 0)
+ {
+ // Normal article.
+ for (unsigned int i = 1; i < json_array_size(root); ++i)
+ {
+ json_t * pJsonElementI = json_array_get(root, i);
+ CHECK(pJsonElementI, (id, key));
+ switch (json_type jsonElementIType = json_typeof(pJsonElementI))
+ {
+ case JSON_ARRAY:
+ CHECK_EQUAL(json_array_size(pJsonElementI), 0, (id, key, articleJSON));
+ break;
+ case JSON_OBJECT:
+ CHECK_EQUAL(json_object_size(pJsonElementI), 0, (id, key, articleJSON));
+ break;
+ default:
+ CHECK(false, (id, key, jsonElementIType, articleJSON));
+ }
+ }
+ article.swap(s0);
+ }
+ else
+ {
+ // Redirect
+ CHECK_EQUAL(json_array_size(root), 3, (id, key));
+ json_t * pJsonElement2(json_array_get(root, 2));
+ CHECK_EQUAL(json_typeof(pJsonElement2), JSON_OBJECT, (id, key));
+ CHECK_EQUAL(json_object_size(pJsonElement2), 1, (id, key));
+ json_t * pJsonRedirect = json_object_get(pJsonElement2, "r");
+ CHECK(pJsonRedirect, (id, key));
+ CHECK_EQUAL(json_typeof(pJsonRedirect), JSON_STRING, (id, key));
+ string redirectStr(json_string_value(pJsonRedirect));
+ CHECK_GREATER(redirectStr.size(), 0, (id, key));
+ map<string, Id>::const_iterator it = m_KeyToIdMap.find(redirectStr);
+ if (it == m_KeyToIdMap.end())
+ {
+ LOG(LWARNING, ("Incorrect redirect", id, key, redirectStr));
+ return;
+ }
+ ArticleById(it->second, article);
+ }
+ }
+ catch (exception & e)
+ {
+ CHECK(false, (id, key, e.what()));
+ }
+}
+
+}
diff --git a/publisher/aard_dictionary.hpp b/publisher/aard_dictionary.hpp
new file mode 100644
index 0000000000..3173ad0a7a
--- /dev/null
+++ b/publisher/aard_dictionary.hpp
@@ -0,0 +1,28 @@
+#pragma once
+#include "../words/dictionary.hpp"
+#include "../base/base.hpp"
+#include "../std/map.hpp"
+#include "../std/scoped_ptr.hpp"
+#include "../std/string.hpp"
+
+class Reader;
+
+namespace sl
+{
+
+class AardDictionary : public Dictionary
+{
+public:
+ explicit AardDictionary(Reader const & reader);
+ ~AardDictionary();
+ Id KeyCount() const;
+ void KeyById(Id id, string & key) const;
+ void ArticleById(Id id, string & article) const;
+private:
+ Reader const & m_Reader;
+ struct AardDictionaryHeader;
+ scoped_ptr<AardDictionaryHeader> m_pHeader;
+ map<string, Id> m_KeyToIdMap;
+};
+
+}
diff --git a/publisher/main.cpp b/publisher/main.cpp
new file mode 100644
index 0000000000..52fdbf64f6
--- /dev/null
+++ b/publisher/main.cpp
@@ -0,0 +1,57 @@
+#include "aard_dictionary.hpp"
+#include "slof_indexer.hpp"
+#include "../coding_sloynik/bzip2_compressor.hpp"
+#include "../coding/file_reader.hpp"
+#include "../coding/file_writer.hpp"
+#include "../base/base.hpp"
+#include "../base/assert.hpp"
+#include "../base/logging.hpp"
+#include "../std/bind.hpp"
+#include "../3party/gflags/src/gflags/gflags.h"
+
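+// Converts an aard dictionary into slof format: every key/article from the
+// input is fed to SlofIndexer, which writes the output dictionary.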
+DEFINE_int32(max_uncompressed_article_chunk_size, 899950,
+ "Max size of chunk of articles, uncompressed.");
+DEFINE_int32(compression_level, 9, "BZip2 compression level.");
+DEFINE_string(input, "", "Input file.");
+DEFINE_string(output, "", "Output dictionary file.");
+
+int main(int argc, char ** argv)
+{
+ google::ParseCommandLineFlags(&argc, &argv, true);
+ CHECK(!FLAGS_input.empty(), ());
+ CHECK(!FLAGS_output.empty(), ());
+ FileReader inputReader(FLAGS_input.c_str());
+ FileWriter outputWriter(FLAGS_output.c_str());
+ {
+ sl::AardDictionary inputDictionary(inputReader);
+ sl::SlofIndexer indexer(outputWriter,
+ FLAGS_max_uncompressed_article_chunk_size,
+ bind(&CompressBZip2, FLAGS_compression_level, _1, _2, _3));
+ LOG(LINFO, ("Starting indexing, keys:", inputDictionary.KeyCount()));
+ for (uint32_t id = 0; id < inputDictionary.KeyCount(); ++id)
+ {
+ if ((id % 5000) == 0)
+ LOG(LINFO, (id, "done."));
+ // TODO: Handle redirects.
+ // TODO: Handle several keys for article?
+ string key, article;
+ inputDictionary.KeyById(id, key);
+ inputDictionary.ArticleById(id, article);
+ if (article.empty())
+ {
+ LOG(LWARNING, ("Skipping empty article for:", key));
+ }
+ else
+ {
+ uint64_t const articleId = indexer.AddArticle(article);
+ indexer.AddKey(key, articleId);
+ }
+ }
+ LOG(LINFO, ("Logging stats."));
+ indexer.LogStats();
+ LOG(LINFO, ("Finishing indexing."));
+ }
+ LOG(LINFO, ("Indexing done."));
+ LOG(LINFO, ("Input size:", inputReader.Size()));
+ LOG(LINFO, ("Output size:", outputWriter.Pos()));
+}
diff --git a/publisher/publisher.pro b/publisher/publisher.pro
new file mode 100644
index 0000000000..1acc92085e
--- /dev/null
+++ b/publisher/publisher.pro
@@ -0,0 +1,18 @@
+TARGET = publisher
+TEMPLATE = app
+CONFIG += console
+CONFIG -= app_bundle
+
+SLOYNIK_DIR = ..
+DEPENDENCIES = gflags bzip2 zlib jansson base coding coding_sloynik words
+
+include($$SLOYNIK_DIR/sloynik_common.pri)
+
+HEADERS += \
+ aard_dictionary.hpp \
+ slof_indexer.hpp
+
+SOURCES += \
+ aard_dictionary.cpp \
+ main.cpp \
+ slof_indexer.cpp
diff --git a/publisher/publisher_tests/publisher_tests.pro b/publisher/publisher_tests/publisher_tests.pro
new file mode 100644
index 0000000000..5b46a73c1b
--- /dev/null
+++ b/publisher/publisher_tests/publisher_tests.pro
@@ -0,0 +1,14 @@
+TARGET = publisher_tests
+TEMPLATE = app
+CONFIG += console
+CONFIG -= app_bundle
+
+SLOYNIK_DIR = ../..
+DEPENDENCIES = gflags bzip2 zlib base coding coding_sloynik words
+
+include($$SLOYNIK_DIR/sloynik_common.pri)
+
+SOURCES += $$SLOYNIK_DIR/testing/testingmain.cpp \
+ slof_indexer_test.cpp ../slof_indexer.cpp
+
+HEADERS +=
diff --git a/publisher/publisher_tests/slof_indexer_test.cpp b/publisher/publisher_tests/slof_indexer_test.cpp
new file mode 100644
index 0000000000..29ef3a87c4
--- /dev/null
+++ b/publisher/publisher_tests/slof_indexer_test.cpp
@@ -0,0 +1,86 @@
+#include "../../testing/testing.hpp"
+#include "../slof_indexer.hpp"
+#include "../../words/slof_dictionary.hpp"
+#include "../../words/sloynik_engine.hpp"
+#include "../../coding/reader.hpp"
+#include "../../coding/writer.hpp"
+#include "../../base/logging.hpp"
+#include "../../base/macros.hpp"
+#include "../../std/string.hpp"
+#include "../../std/vector.hpp"
+
+namespace
+{
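+  // Trivial test codec: "compression" just wraps the data in <...>, so the
+  // indexer/dictionary framing can be verified without a real compressor.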
+ void TestCompressor(char const * pSrc, size_t srcSize, string & res)
+ {
+ res = "<";
+ res.insert(res.end(), pSrc, pSrc + srcSize);
+ res.insert(res.end(), '>');
+ }
+
+ void TestDecompressor(char const * pSrc, size_t srcSize, char * pDst, size_t dstSize)
+ {
+ TEST_GREATER_OR_EQUAL(srcSize, 2, ());
+ TEST_EQUAL(srcSize - 2, dstSize, ());
+ TEST_EQUAL(pSrc[0], '<', ());
+ TEST_EQUAL(pSrc[srcSize-1], '>', ());
+ memcpy(pDst, pSrc + 1, srcSize - 2);
+ }
+
+ string Key(sl::SlofDictionary const & dic, sl::Dictionary::Id id)
+ {
+ string res;
+ dic.KeyById(id, res);
+ return res;
+ }
+
+ string Article(sl::SlofDictionary const & dic, sl::Dictionary::Id id)
+ {
+ string res;
+ dic.ArticleById(id, res);
+ return res;
+ }
+}
+
+UNIT_TEST(SlofIndexerEmptyTest)
+{
+ string serializedDictionary;
+ {
+ MemWriter<string> writer(serializedDictionary);
+ sl::SlofIndexer indexer(writer, 20, &TestCompressor);
+ }
+ sl::SlofDictionary dic(new MemReader(&serializedDictionary[0], serializedDictionary.size()),
+ &TestDecompressor);
+ TEST_EQUAL(dic.KeyCount(), 0, ());
+}
+
+UNIT_TEST(SlofIndexerSmokeTest)
+{
+ string serializedDictionary;
+ {
+ MemWriter<string> writer(serializedDictionary);
+ sl::SlofIndexer indexer(writer, 25, &TestCompressor);
+ uint64_t articleM = indexer.AddArticle("ArticleM");
+ indexer.AddKey("M", articleM);
+ uint64_t articleHello = indexer.AddArticle("ArticleHello");
+ indexer.AddKey("He", articleHello);
+ uint64_t articleOk = indexer.AddArticle("ArticleOK");
+ indexer.AddKey("OK", articleOk);
+ indexer.AddKey("Hello", articleHello);
+ }
+ {
+ sl::SlofDictionary dic(new MemReader(&serializedDictionary[0], serializedDictionary.size()),
+ &TestDecompressor);
+ TEST_EQUAL(dic.KeyCount(), 4, ());
+ TEST_EQUAL(Key(dic, 0), "He", ());
+ TEST_EQUAL(Key(dic, 1), "Hello", ());
+ TEST_EQUAL(Key(dic, 2), "M", ());
+ TEST_EQUAL(Key(dic, 3), "OK", ());
+ TEST_EQUAL(Article(dic, 0), "ArticleHello", ());
+ TEST_EQUAL(Article(dic, 1), "ArticleHello", ());
+ TEST_EQUAL(Article(dic, 2), "ArticleM", ());
+ TEST_EQUAL(Article(dic, 3), "ArticleOK", ());
+ }
+}
+
+// TODO: Write end-to-end test (publisher-to-engine).
diff --git a/publisher/slof_indexer.cpp b/publisher/slof_indexer.cpp
new file mode 100644
index 0000000000..a2e752b60b
--- /dev/null
+++ b/publisher/slof_indexer.cpp
@@ -0,0 +1,188 @@
+#include "slof_indexer.hpp"
+#include "../words/slof.hpp"
+#include "../coding/byte_stream.hpp"
+#include "../coding/endianness.hpp"
+#include "../coding/varint.hpp"
+#include "../coding/writer.hpp"
+#include "../coding/write_to_sink.hpp"
+#include "../base/assert.hpp"
+#include "../base/base.hpp"
+#include "../base/logging.hpp"
+#include "../std/algorithm.hpp"
+#include "../std/set.hpp"
+#include "../std/string.hpp"
+
+namespace
+{
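+  // Number of bytes needed to encode x as a base-128 varint.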
+ template <typename T> uint8_t VarUintSize(T x)
+ {
+ uint8_t res = 0;
+ while (x > 127)
+ {
+ ++res;
+ x >>= 7;
+ }
+ return res + 1;
+ }
+}
+
+sl::SlofIndexer::SlofIndexer(Writer & writer,
+ size_t maxUncompressedArticleChunkSize,
+ function<void (char const *, size_t, string &)> const & compressor) :
+m_Writer(writer),
+m_MaxUncompressedArticleChunkSize(maxUncompressedArticleChunkSize),
+m_Compressor(compressor),
+m_ArticleOffset(m_Writer.Pos() + sizeof(sl::SlofHeader)),
+m_ArticleCount(0),
+m_ArticleChunkCount(0),
+m_MaxArticleSize(0)
+{
+ CHECK_LESS(maxUncompressedArticleChunkSize, 1 << 24, ());
+ m_Writer.Seek(sizeof(sl::SlofHeader));
+ CHECK_EQUAL(m_ArticleOffset, m_Writer.Pos(), ());
+}
+
+void sl::SlofIndexer::AddKey(string const & word, uint64_t articleId)
+{
+ CHECK(!word.empty(), ());
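+  // m_Words is ordered by (key, id); lower_bound on (word, 0) detects an
+  // existing entry with the same key.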
+ WordsContainerType::const_iterator it = m_Words.lower_bound(make_pair(word, 0ULL));
+ if (it != m_Words.end() && it->first == word)
+ {
+ LOG(LINFO, ("Duplicate key:", word, it->second, articleId));
+ }
+ CHECK(m_Words.insert(make_pair(word, articleId)).second, (word, articleId));
+}
+
+uint64_t sl::SlofIndexer::AddArticle(string const & article, bool forceChunkFlush)
+{
+ // if (article.size() > m_MaxUncompressedArticleChunkSize)
+ // LOG(LWARNING, ("Article bigger than chunk:", article.size(), article.substr(0, 64)));
+
+ if (m_CurrentArticleChunk.size() + article.size() > m_MaxUncompressedArticleChunkSize ||
+ forceChunkFlush)
+ FlushArticleChunk();
+
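+  // An article id packs the chunk's byte offset within the article section
+  // (high bits) and the article's index inside the chunk (low 24 bits).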
+ uint64_t const articleId =
+ ((m_Writer.Pos() - m_ArticleOffset) << 24) + m_ArticleSizesInChunk.size();
+ m_CurrentArticleChunk += article;
+ m_ArticleSizesInChunk.push_back(article.size());
+
+ ++m_ArticleCount;
+ m_TotalArticleSizeUncompressed += article.size();
+ m_MaxArticleSize = max(m_MaxArticleSize, static_cast<uint32_t>(article.size()));
+
+ return articleId;
+}
+
+void sl::SlofIndexer::FlushArticleChunk()
+{
+ if (m_ArticleSizesInChunk.empty())
+ return;
+
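+  // On-disk chunk layout: uint32 size of (header + compressed data), then the
+  // varint header (uncompressed size + per-article sizes), then compressed data.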
+ vector<char> chunkHeader;
+ { // Write chunk header.
+ {
+ PushBackByteSink<vector<char> > sink(chunkHeader);
+      // Write the total uncompressed size of the articles in this chunk.
+ WriteVarUint(sink, m_CurrentArticleChunk.size());
+ // Write individual article sizes.
+ for (size_t i = 0; i < m_ArticleSizesInChunk.size(); ++i)
+ WriteVarUint(sink, m_ArticleSizesInChunk[i]);
+ }
+ { // Write size of the header at the beginning of the header.
+ vector<char> chunkHeaderSize;
+ PushBackByteSink<vector<char> > sink(chunkHeaderSize);
+ WriteVarUint(sink, chunkHeader.size());
+ chunkHeader.insert(chunkHeader.begin(), chunkHeaderSize.begin(), chunkHeaderSize.end());
+ }
+ }
+
+ // Compress the article chunk.
+ string compressedArticleChunk;
+ m_Compressor(&m_CurrentArticleChunk[0], m_CurrentArticleChunk.size(), compressedArticleChunk);
+
+ // Write everything.
+ WriteToSink(m_Writer, static_cast<uint32_t>(chunkHeader.size() + compressedArticleChunk.size()));
+ m_Writer.Write(&chunkHeader[0], chunkHeader.size());
+ m_Writer.Write(&compressedArticleChunk[0], compressedArticleChunk.size());
+
+ // Reset everything.
+ m_CurrentArticleChunk.clear();
+ m_ArticleSizesInChunk.clear();
+ ++m_ArticleChunkCount;
+}
+
+void sl::SlofIndexer::LogStats() const
+{
+ LOG(LINFO, ("Dictionary stats"));
+ set<uint64_t> articleIds;
+ uint32_t maxKeyLength = 0, totalWordLength = 0, dupKeysCount = 0;
+ for (WordsContainerType::const_iterator it = m_Words.begin(); it != m_Words.end(); ++it)
+ {
+ WordsContainerType::const_iterator next = it;
+ ++next;
+ if (next != m_Words.end() && next->first == it->first)
+ ++dupKeysCount;
+ maxKeyLength = max(maxKeyLength, static_cast<uint32_t>(it->first.size()));
+ totalWordLength += it->first.size();
+ articleIds.insert(it->second);
+ }
+
+ CHECK_EQUAL(m_ArticleCount, articleIds.size(), ());
+
+ LOG(LINFO, ("Keys:", m_Words.size()));
+ LOG(LINFO, ("Unique keys:", m_Words.size() - dupKeysCount));
+ LOG(LINFO, ("Duplicate keys:", dupKeysCount));
+ LOG(LINFO, ("Duplicate keys %:", 100.0 * dupKeysCount / m_Words.size()));
+ LOG(LINFO, ("Max key length:", maxKeyLength));
+ LOG(LINFO, ("Average key length:", totalWordLength * 1.0 / m_Words.size()));
+ LOG(LINFO, ("Articles:", m_ArticleCount));
+ LOG(LINFO, ("Keys per article:", m_Words.size() * 1.0 / m_ArticleCount));
+ LOG(LINFO, ("Article chunks:", m_ArticleChunkCount));
+ LOG(LINFO, ("Articles per chunk:", m_ArticleCount * 1.0 / m_ArticleChunkCount));
+ LOG(LINFO, ("Average article size:", m_TotalArticleSizeUncompressed * 1.0 / m_ArticleCount));
+ LOG(LINFO, ("Max article size:", m_MaxArticleSize));
+}
+
+sl::SlofIndexer::~SlofIndexer()
+{
+ FlushArticleChunk();
+
+ // Filling in header information.
+ sl::SlofHeader header;
+ memcpy(&header.m_Signature, "slof", 4);
+ header.m_MajorVersion = SwapIfBigEndian(uint16_t(1));
+ header.m_MinorVersion = SwapIfBigEndian(uint16_t(1));
+ header.m_KeyCount = SwapIfBigEndian(static_cast<uint32_t>(m_Words.size()));
+ header.m_ArticleCount = SwapIfBigEndian(m_ArticleCount);
+ header.m_ArticleOffset = SwapIfBigEndian(static_cast<uint64_t>(sizeof(header)));
+
+ // Writing key index.
+ header.m_KeyIndexOffset = SwapIfBigEndian(m_Writer.Pos());
+ {
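+    // The index stores each key record's cumulative end offset (with a leading
+    // zero), so a reader gets record i's bounds from entries i and i+1.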
+ WriteToSink(m_Writer, static_cast<uint32_t>(0));
+ uint32_t cumSize = 0;
+ for (WordsContainerType::const_iterator it = m_Words.begin(); it != m_Words.end(); ++it)
+ {
+ cumSize += it->first.size();
+ cumSize += VarUintSize(it->second >> 24);
+      cumSize += VarUintSize(it->second & 0xFFFFFF);
+ WriteToSink(m_Writer, cumSize);
+ }
+ }
+
+ // Writing key data.
+ header.m_KeyDataOffset = SwapIfBigEndian(m_Writer.Pos());
+ for (WordsContainerType::const_iterator it = m_Words.begin(); it != m_Words.end(); ++it)
+ {
+ WriteVarUint(m_Writer, it->second >> 24);
+ WriteVarUint(m_Writer, it->second & 0xFFFFFF);
+ m_Writer.Write(&it->first[0], it->first.size());
+ }
+
+ // Writing header.
+ uint64_t const lastPos = m_Writer.Pos();
+ m_Writer.Seek(0);
+ m_Writer.Write(&header, sizeof(header));
+ m_Writer.Seek(lastPos);
+}
diff --git a/publisher/slof_indexer.hpp b/publisher/slof_indexer.hpp
new file mode 100644
index 0000000000..156761b711
--- /dev/null
+++ b/publisher/slof_indexer.hpp
@@ -0,0 +1,49 @@
+#pragma once
+#include "../base/base.hpp"
+#include "../std/function.hpp"
+#include "../std/set.hpp"
+#include "../std/string.hpp"
+#include "../std/utility.hpp"
+#include "../std/vector.hpp"
+
+class Writer;
+
+namespace sl
+{
+
+class SlofIndexer
+{
+public:
+ SlofIndexer(Writer & writer,
+ size_t maxUncompressedArticleChunkSize,
+ function<void (char const *, size_t, string &)> const & compressor);
+ ~SlofIndexer();
+
+ // Add article and return its id.
+ uint64_t AddArticle(string const & article, bool forceChunkFlush = false);
+
+  // Add a key with the given article id. Keys may be passed in arbitrary order.
+ void AddKey(string const & word, uint64_t articleId);
+
+ void LogStats() const;
+
+private:
+ void FlushArticleChunk();
+
+ Writer & m_Writer;
+ size_t const m_MaxUncompressedArticleChunkSize;
+ function<void (char const *, size_t, string &)> m_Compressor;
+ typedef set<pair<string, uint64_t> > WordsContainerType;
+ WordsContainerType m_Words;
+ uint64_t const m_ArticleOffset;
+ string m_CurrentArticleChunk;
+ vector<uint32_t> m_ArticleSizesInChunk;
+ uint32_t m_ArticleCount;
+
+ // Just for stats.
+ uint32_t m_ArticleChunkCount;
+ uint64_t m_TotalArticleSizeUncompressed;
+ uint32_t m_MaxArticleSize;
+};
+
+}