diff options
author | Yury Melnichek <melnichek@gmail.com> | 2011-03-06 01:12:21 +0300 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2015-09-23 01:12:57 +0300 |
commit | 5d9239a1edede8693c69119e282017d965d2c326 (patch) | |
tree | 2ac33f5cf71ee8f3814de8f2dfe3688e8c3820e1 /publisher | |
parent | a5ad66614265259763a1ed849db7ae8e918e8a3c (diff) |
Add sloynik to omim!! Yea!
Diffstat (limited to 'publisher')
-rw-r--r-- | publisher/aard_dictionary.cpp | 225 | ||||
-rw-r--r-- | publisher/aard_dictionary.hpp | 28 | ||||
-rw-r--r-- | publisher/main.cpp | 57 | ||||
-rw-r--r-- | publisher/publisher.pro | 18 | ||||
-rw-r--r-- | publisher/publisher_tests/publisher_tests.pro | 14 | ||||
-rw-r--r-- | publisher/publisher_tests/slof_indexer_test.cpp | 86 | ||||
-rw-r--r-- | publisher/slof_indexer.cpp | 188 | ||||
-rw-r--r-- | publisher/slof_indexer.hpp | 49 |
8 files changed, 665 insertions(+), 0 deletions(-)
diff --git a/publisher/aard_dictionary.cpp b/publisher/aard_dictionary.cpp new file mode 100644 index 0000000000..e39bafa8c9 --- /dev/null +++ b/publisher/aard_dictionary.cpp @@ -0,0 +1,225 @@ +#include "aard_dictionary.hpp" +#include "../coding_sloynik/bzip2_compressor.hpp" +#include "../coding_sloynik/gzip_compressor.hpp" +#include "../coding/endianness.hpp" +#include "../coding/reader.hpp" +#include "../base/logging.hpp" +#include "../3party_sloynik/jansson/myjansson.hpp" +#include "../std/exception.hpp" + +namespace +{ + template <typename T> inline T SwapIfLittleEndian(T t) + { + #ifdef ENDIAN_IS_BIG + return t; + #else + return ReverseByteOrder(t); + #endif + } + template <typename PrimitiveT, class TReader> + PrimitiveT ReadLittleEndianPrimitiveFromPos(TReader & reader, uint64_t pos) + { + PrimitiveT primitive; + ReadFromPos(reader, pos, &primitive, sizeof(primitive)); + return SwapIfLittleEndian(primitive); + } + +} + +namespace sl +{ + +enum +{ + AARD_DICTIONARY_MAX_ARTICLE_SIZE = 1 << 24 +}; + +#pragma pack(push, 1) +struct AardDictionaryIndex1Item +{ + uint32_t m_KeyPos; + uint32_t m_ArticlePos; +}; + +struct AardDictionary::AardDictionaryHeader +{ + char m_Signature[4]; + uint8_t m_Sha1[40]; + uint16_t m_Version; + uint8_t m_Uuid[16]; + uint16_t m_Volume; + uint16_t m_TotalVolumes; + uint32_t m_MetaLength; + uint32_t m_IndexCount; + uint32_t m_ArticleOffset; + char m_Index1ItemFormat[4]; + char m_KeyLengthFormat[2]; + char m_ArticleLengthFormat[2]; + + // Offset of the index1 items. + uint32_t Index1Offset() const + { + return sizeof(AardDictionaryHeader) + m_MetaLength; + } + // Offset of the keys data. + uint32_t KeyDataOffset() const + { + return Index1Offset() + sizeof(AardDictionaryIndex1Item) * m_IndexCount; + } + // Offset of the article data. 
+ uint32_t ArticleOffset() const + { + return m_ArticleOffset; + } +}; +#pragma pack(pop) + +void AardDecompress(char const * pSrc, size_t srcSize, string & dst) +{ + bool decompressed = false; + try + { + DecompressGZip(pSrc, srcSize, dst); + decompressed = true; + } + catch (StringCodingException &) {} + if (decompressed) + return; + try + { + DecompressBZip2(pSrc, srcSize, dst); + decompressed = true; + } + catch (StringCodingException &) {} + if (decompressed) + return; + dst.assign(pSrc, pSrc + srcSize); +} + +AardDictionary::AardDictionary(Reader const & reader) + : m_Reader(reader), m_pHeader(new AardDictionaryHeader) +{ + m_Reader.Read(0, m_pHeader.get(), sizeof(AardDictionaryHeader)); + m_pHeader->m_Volume = SwapIfLittleEndian(m_pHeader->m_Volume); + m_pHeader->m_TotalVolumes = SwapIfLittleEndian(m_pHeader->m_TotalVolumes); + m_pHeader->m_MetaLength = SwapIfLittleEndian(m_pHeader->m_MetaLength); + m_pHeader->m_IndexCount = SwapIfLittleEndian(m_pHeader->m_IndexCount); + m_pHeader->m_ArticleOffset = SwapIfLittleEndian(m_pHeader->m_ArticleOffset); + if (memcmp(m_pHeader->m_Signature, "aard", 4) != 0) + MYTHROW(Dictionary::OpenDictionaryException, ("Invalid signature.")); + if (memcmp(m_pHeader->m_Index1ItemFormat, ">LL\0", 4) != 0) + MYTHROW(Dictionary::OpenDictionaryException, ("Invalid index1 item format.")); + if (memcmp(m_pHeader->m_KeyLengthFormat, ">H", 2) != 0) + MYTHROW(Dictionary::OpenDictionaryException, ("Invalid key length format.")); + if (memcmp(m_pHeader->m_ArticleLengthFormat, ">L", 2) != 0) + MYTHROW(Dictionary::OpenDictionaryException, ("Invalid article length format.")); + LOG(LINFO, ("Loaded aard dictionary, volume:", m_pHeader->m_Volume, + "of", m_pHeader->m_TotalVolumes, + "meta length:", m_pHeader->m_MetaLength, + "words:", m_pHeader->m_IndexCount)); + + // TODO: What to do with duplicate keys? 
+ for (Id i = 0; i < KeyCount(); ++i) + { + string key; + KeyById(i, key); + m_KeyToIdMap[key] = i; + } +} + +AardDictionary::~AardDictionary() +{ +} + +sl::Dictionary::Id AardDictionary::KeyCount() const +{ + return m_pHeader->m_IndexCount; +} + +void AardDictionary::KeyById(Id id, string & key) const +{ + AardDictionaryIndex1Item item; + m_Reader.Read(m_pHeader->Index1Offset() + id * sizeof(item), &item, sizeof(item)); + uint64_t const keyPos = m_pHeader->KeyDataOffset() + SwapIfLittleEndian(item.m_KeyPos); + uint16_t const keyLength = ReadLittleEndianPrimitiveFromPos<uint16_t>(m_Reader, keyPos); + if (keyLength == 0) + MYTHROW(Dictionary::BrokenDictionaryException, (keyLength)); + key.resize(keyLength); + m_Reader.Read(keyPos + 2, &key[0], keyLength); +} + +void AardDictionary::ArticleById(Id id, string & article) const +{ + string key; + KeyById(id, key); + + AardDictionaryIndex1Item item; + m_Reader.Read(m_pHeader->Index1Offset() + id * sizeof(item), &item, sizeof(item)); + uint64_t const articlePos = m_pHeader->ArticleOffset() + SwapIfLittleEndian(item.m_ArticlePos); + uint32_t articleLength = ReadLittleEndianPrimitiveFromPos<uint32_t>(m_Reader, articlePos); + if (articleLength > AARD_DICTIONARY_MAX_ARTICLE_SIZE) + MYTHROW(BrokenDictionaryException, (articleLength)); + CHECK_NOT_EQUAL(articleLength, 0, ()); + try + { + string compressedArticle(articleLength, '.'); + m_Reader.Read(articlePos + 4, &compressedArticle[0], articleLength); + string articleJSON; + AardDecompress(&compressedArticle[0], compressedArticle.size(), articleJSON); + + my::Json root(articleJSON.c_str()); + CHECK_EQUAL(json_typeof(root), JSON_ARRAY, (id, key)); + json_t * pJsonElement0(json_array_get(root, 0)); + CHECK(pJsonElement0, (id, key)); + CHECK_EQUAL(json_typeof(pJsonElement0), JSON_STRING, (id, key)); + string s0 = json_string_value(pJsonElement0); + if (s0.size() > 0) + { + // Normal article. 
+ for (unsigned int i = 1; i < json_array_size(root); ++i) + { + json_t * pJsonElementI = json_array_get(root, i); + CHECK(pJsonElementI, (id, key)); + switch (json_type jsonElementIType = json_typeof(pJsonElementI)) + { + case JSON_ARRAY: + CHECK_EQUAL(json_array_size(pJsonElementI), 0, (id, key, articleJSON)); + break; + case JSON_OBJECT: + CHECK_EQUAL(json_object_size(pJsonElementI), 0, (id, key, articleJSON)); + break; + default: + CHECK(false, (id, key, jsonElementIType, articleJSON)); + } + } + article.swap(s0); + } + else + { + // Redirect + CHECK_EQUAL(json_array_size(root), 3, (id, key)); + json_t * pJsonElement2(json_array_get(root, 2)); + CHECK_EQUAL(json_typeof(pJsonElement2), JSON_OBJECT, (id, key)); + CHECK_EQUAL(json_object_size(pJsonElement2), 1, (id, key)); + json_t * pJsonRedirect = json_object_get(pJsonElement2, "r"); + CHECK(pJsonRedirect, (id, key)); + CHECK_EQUAL(json_typeof(pJsonRedirect), JSON_STRING, (id, key)); + string redirectStr(json_string_value(pJsonRedirect)); + CHECK_GREATER(redirectStr.size(), 0, (id, key)); + map<string, Id>::const_iterator it = m_KeyToIdMap.find(redirectStr); + if (it == m_KeyToIdMap.end()) + { + LOG(LWARNING, ("Incorrect redirect", id, key, redirectStr)); + return; + } + ArticleById(it->second, article); + } + } + catch (exception & e) + { + CHECK(false, (id, key, e.what())); + } +} + +} diff --git a/publisher/aard_dictionary.hpp b/publisher/aard_dictionary.hpp new file mode 100644 index 0000000000..3173ad0a7a --- /dev/null +++ b/publisher/aard_dictionary.hpp @@ -0,0 +1,28 @@ +#pragma once +#include "../words/dictionary.hpp" +#include "../base/base.hpp" +#include "../std/map.hpp" +#include "../std/scoped_ptr.hpp" +#include "../std/string.hpp" + +class Reader; + +namespace sl +{ + +class AardDictionary : public Dictionary +{ +public: + explicit AardDictionary(Reader const & reader); + ~AardDictionary(); + Id KeyCount() const; + void KeyById(Id id, string & key) const; + void ArticleById(Id id, string & article) 
const; +private: + Reader const & m_Reader; + struct AardDictionaryHeader; + scoped_ptr<AardDictionaryHeader> m_pHeader; + map<string, Id> m_KeyToIdMap; +}; + +} diff --git a/publisher/main.cpp b/publisher/main.cpp new file mode 100644 index 0000000000..52fdbf64f6 --- /dev/null +++ b/publisher/main.cpp @@ -0,0 +1,57 @@ +#include "aard_dictionary.hpp" +#include "slof_indexer.hpp" +#include "../coding_sloynik/bzip2_compressor.hpp" +#include "../coding/file_reader.hpp" +#include "../coding/file_writer.hpp" +#include "../base/base.hpp" +#include "../base/assert.hpp" +#include "../base/logging.hpp" +#include "../std/bind.hpp" +#include "../3party/gflags/src/gflags/gflags.h" + +DEFINE_int32(max_uncompressed_article_chunk_size, 899950, + "Max size of chunk of articles, uncompressed."); +DEFINE_int32(compression_level, 9, "BZip2 compression level."); +DEFINE_string(input, "", "Input file."); +DEFINE_string(output, "", "Output dictionary file."); + +int main(int argc, char ** argv) +{ + google::ParseCommandLineFlags(&argc, &argv, true); + CHECK(!FLAGS_input.empty(), ()); + CHECK(!FLAGS_output.empty(), ()); + FileReader inputReader(FLAGS_input.c_str()); + FileWriter outputWriter(FLAGS_output.c_str()); + { + sl::AardDictionary inputDictionary(inputReader); + sl::SlofIndexer indexer(outputWriter, + FLAGS_max_uncompressed_article_chunk_size, + bind(&CompressBZip2, FLAGS_compression_level, _1, _2, _3)); + LOG(LINFO, ("Starting indexing, keys:", inputDictionary.KeyCount())); + for (uint32_t id = 0; id < inputDictionary.KeyCount(); ++id) + { + if ((id % 5000) == 0) + LOG(LINFO, (id, "done.")); + // TODO: Handle redirects. + // TODO: Handle several keys for article? 
+ string key, article; + inputDictionary.KeyById(id, key); + inputDictionary.ArticleById(id, article); + if (article.empty()) + { + LOG(LWARNING, ("Skipping empty article for:", key)); + } + else + { + uint64_t const articleId = indexer.AddArticle(article); + indexer.AddKey(key, articleId); + } + } + LOG(LINFO, ("Logging stats.")); + indexer.LogStats(); + LOG(LINFO, ("Finishing indexing.")); + } + LOG(LINFO, ("Indexing done.")); + LOG(LINFO, ("Input size:", inputReader.Size())); + LOG(LINFO, ("Output size:", outputWriter.Pos())); +} diff --git a/publisher/publisher.pro b/publisher/publisher.pro new file mode 100644 index 0000000000..1acc92085e --- /dev/null +++ b/publisher/publisher.pro @@ -0,0 +1,18 @@ +TARGET = publisher +TEMPLATE = app +CONFIG += console +CONFIG -= app_bundle + +SLOYNIK_DIR = .. +DEPENDENCIES = gflags bzip2 zlib jansson base coding coding_sloynik words + +include($$SLOYNIK_DIR/sloynik_common.pri) + +HEADERS += \ + aard_dictionary.hpp \ + slof_indexer.hpp + +SOURCES += \ + aard_dictionary.cpp \ + main.cpp \ + slof_indexer.cpp diff --git a/publisher/publisher_tests/publisher_tests.pro b/publisher/publisher_tests/publisher_tests.pro new file mode 100644 index 0000000000..5b46a73c1b --- /dev/null +++ b/publisher/publisher_tests/publisher_tests.pro @@ -0,0 +1,14 @@ +TARGET = publisher_tests +TEMPLATE = app +CONFIG += console +CONFIG -= app_bundle + +SLOYNIK_DIR = ../.. 
+DEPENDENCIES = gflags bzip2 zlib base coding coding_sloynik words + +include($$SLOYNIK_DIR/sloynik_common.pri) + +SOURCES += $$SLOYNIK_DIR/testing/testingmain.cpp \ + slof_indexer_test.cpp ../slof_indexer.cpp + +HEADERS += diff --git a/publisher/publisher_tests/slof_indexer_test.cpp b/publisher/publisher_tests/slof_indexer_test.cpp new file mode 100644 index 0000000000..29ef3a87c4 --- /dev/null +++ b/publisher/publisher_tests/slof_indexer_test.cpp @@ -0,0 +1,86 @@ +#include "../../testing/testing.hpp" +#include "../slof_indexer.hpp" +#include "../../words/slof_dictionary.hpp" +#include "../../words/sloynik_engine.hpp" +#include "../../coding/reader.hpp" +#include "../../coding/writer.hpp" +#include "../../base/logging.hpp" +#include "../../base/macros.hpp" +#include "../../std/string.hpp" +#include "../../std/vector.hpp" + +namespace +{ + void TestCompressor(char const * pSrc, size_t srcSize, string & res) + { + res = "<"; + res.insert(res.end(), pSrc, pSrc + srcSize); + res.insert(res.end(), '>'); + } + + void TestDecompressor(char const * pSrc, size_t srcSize, char * pDst, size_t dstSize) + { + TEST_GREATER_OR_EQUAL(srcSize, 2, ()); + TEST_EQUAL(srcSize - 2, dstSize, ()); + TEST_EQUAL(pSrc[0], '<', ()); + TEST_EQUAL(pSrc[srcSize-1], '>', ()); + memcpy(pDst, pSrc + 1, srcSize - 2); + } + + string Key(sl::SlofDictionary const & dic, sl::Dictionary::Id id) + { + string res; + dic.KeyById(id, res); + return res; + } + + string Article(sl::SlofDictionary const & dic, sl::Dictionary::Id id) + { + string res; + dic.ArticleById(id, res); + return res; + } +} + +UNIT_TEST(SlofIndexerEmptyTest) +{ + string serializedDictionary; + { + MemWriter<string> writer(serializedDictionary); + sl::SlofIndexer indexer(writer, 20, &TestCompressor); + } + sl::SlofDictionary dic(new MemReader(&serializedDictionary[0], serializedDictionary.size()), + &TestDecompressor); + TEST_EQUAL(dic.KeyCount(), 0, ()); +} + +UNIT_TEST(SlofIndexerSmokeTest) +{ + string serializedDictionary; + { + 
MemWriter<string> writer(serializedDictionary); + sl::SlofIndexer indexer(writer, 25, &TestCompressor); + uint64_t articleM = indexer.AddArticle("ArticleM"); + indexer.AddKey("M", articleM); + uint64_t articleHello = indexer.AddArticle("ArticleHello"); + indexer.AddKey("He", articleHello); + uint64_t articleOk = indexer.AddArticle("ArticleOK"); + indexer.AddKey("OK", articleOk); + indexer.AddKey("Hello", articleHello); + } + { + sl::SlofDictionary dic(new MemReader(&serializedDictionary[0], serializedDictionary.size()), + &TestDecompressor); + TEST_EQUAL(dic.KeyCount(), 4, ()); + TEST_EQUAL(Key(dic, 0), "He", ()); + TEST_EQUAL(Key(dic, 1), "Hello", ()); + TEST_EQUAL(Key(dic, 2), "M", ()); + TEST_EQUAL(Key(dic, 3), "OK", ()); + TEST_EQUAL(Article(dic, 0), "ArticleHello", ()); + TEST_EQUAL(Article(dic, 1), "ArticleHello", ()); + TEST_EQUAL(Article(dic, 2), "ArticleM", ()); + TEST_EQUAL(Article(dic, 3), "ArticleOK", ()); + } +} + +// TODO: Write end-to-end test (publisher-to-engine). diff --git a/publisher/slof_indexer.cpp b/publisher/slof_indexer.cpp new file mode 100644 index 0000000000..a2e752b60b --- /dev/null +++ b/publisher/slof_indexer.cpp @@ -0,0 +1,188 @@ +#include "slof_indexer.hpp" +#include "../words/slof.hpp" +#include "../coding/byte_stream.hpp" +#include "../coding/endianness.hpp" +#include "../coding/varint.hpp" +#include "../coding/writer.hpp" +#include "../coding/write_to_sink.hpp" +#include "../base/assert.hpp" +#include "../base/base.hpp" +#include "../base/logging.hpp" +#include "../std/algorithm.hpp" +#include "../std/set.hpp" +#include "../std/string.hpp" + +namespace +{ + template <typename T> uint8_t VarUintSize(T x) + { + uint8_t res = 0; + while (x > 127) + { + ++res; + x >>= 7; + } + return res + 1; + } +} + +sl::SlofIndexer::SlofIndexer(Writer & writer, + size_t maxUncompressedArticleChunkSize, + function<void (char const *, size_t, string &)> const & compressor) : +m_Writer(writer), 
+m_MaxUncompressedArticleChunkSize(maxUncompressedArticleChunkSize), +m_Compressor(compressor), +m_ArticleOffset(m_Writer.Pos() + sizeof(sl::SlofHeader)), +m_ArticleCount(0), +m_ArticleChunkCount(0), +m_MaxArticleSize(0) +{ + CHECK_LESS(maxUncompressedArticleChunkSize, 1 << 24, ()); + m_Writer.Seek(sizeof(sl::SlofHeader)); + CHECK_EQUAL(m_ArticleOffset, m_Writer.Pos(), ()); +} + +void sl::SlofIndexer::AddKey(string const & word, uint64_t articleId) +{ + CHECK(!word.empty(), ()); + WordsContainerType::const_iterator it = m_Words.lower_bound(make_pair(word, 0ULL)); + if (it != m_Words.end() && it->first == word) + { + LOG(LINFO, ("Duplicate key:", word, it->second, articleId)); + } + CHECK(m_Words.insert(make_pair(word, articleId)).second, (word, articleId)); +} + +uint64_t sl::SlofIndexer::AddArticle(string const & article, bool forceChunkFlush) +{ + // if (article.size() > m_MaxUncompressedArticleChunkSize) + // LOG(LWARNING, ("Article bigger than chunk:", article.size(), article.substr(0, 64))); + + if (m_CurrentArticleChunk.size() + article.size() > m_MaxUncompressedArticleChunkSize || + forceChunkFlush) + FlushArticleChunk(); + + uint64_t const articleId = + ((m_Writer.Pos() - m_ArticleOffset) << 24) + m_ArticleSizesInChunk.size(); + m_CurrentArticleChunk += article; + m_ArticleSizesInChunk.push_back(article.size()); + + ++m_ArticleCount; + m_TotalArticleSizeUncompressed += article.size(); + m_MaxArticleSize = max(m_MaxArticleSize, static_cast<uint32_t>(article.size())); + + return articleId; +} + +void sl::SlofIndexer::FlushArticleChunk() +{ + if (m_ArticleSizesInChunk.empty()) + return; + + vector<char> chunkHeader; + { // Write chunk header. + { + PushBackByteSink<vector<char> > sink(chunkHeader); + // Write decompressed size of all articles. + WriteVarUint(sink, m_CurrentArticleChunk.size()); + // Write individual article sizes. 
+ for (size_t i = 0; i < m_ArticleSizesInChunk.size(); ++i) + WriteVarUint(sink, m_ArticleSizesInChunk[i]); + } + { // Write size of the header at the beginning of the header. + vector<char> chunkHeaderSize; + PushBackByteSink<vector<char> > sink(chunkHeaderSize); + WriteVarUint(sink, chunkHeader.size()); + chunkHeader.insert(chunkHeader.begin(), chunkHeaderSize.begin(), chunkHeaderSize.end()); + } + } + + // Compress the article chunk. + string compressedArticleChunk; + m_Compressor(&m_CurrentArticleChunk[0], m_CurrentArticleChunk.size(), compressedArticleChunk); + + // Write everything. + WriteToSink(m_Writer, static_cast<uint32_t>(chunkHeader.size() + compressedArticleChunk.size())); + m_Writer.Write(&chunkHeader[0], chunkHeader.size()); + m_Writer.Write(&compressedArticleChunk[0], compressedArticleChunk.size()); + + // Reset everything. + m_CurrentArticleChunk.clear(); + m_ArticleSizesInChunk.clear(); + ++m_ArticleChunkCount; +} + +void sl::SlofIndexer::LogStats() const +{ + LOG(LINFO, ("Dictionary stats")); + set<uint64_t> articleIds; + uint32_t maxKeyLength = 0, totalWordLength = 0, dupKeysCount = 0; + for (WordsContainerType::const_iterator it = m_Words.begin(); it != m_Words.end(); ++it) + { + WordsContainerType::const_iterator next = it; + ++next; + if (next != m_Words.end() && next->first == it->first) + ++dupKeysCount; + maxKeyLength = max(maxKeyLength, static_cast<uint32_t>(it->first.size())); + totalWordLength += it->first.size(); + articleIds.insert(it->second); + } + + CHECK_EQUAL(m_ArticleCount, articleIds.size(), ()); + + LOG(LINFO, ("Keys:", m_Words.size())); + LOG(LINFO, ("Unique keys:", m_Words.size() - dupKeysCount)); + LOG(LINFO, ("Duplicate keys:", dupKeysCount)); + LOG(LINFO, ("Duplicate keys %:", 100.0 * dupKeysCount / m_Words.size())); + LOG(LINFO, ("Max key length:", maxKeyLength)); + LOG(LINFO, ("Average key length:", totalWordLength * 1.0 / m_Words.size())); + LOG(LINFO, ("Articles:", m_ArticleCount)); + LOG(LINFO, ("Keys per article:", 
m_Words.size() * 1.0 / m_ArticleCount)); + LOG(LINFO, ("Article chunks:", m_ArticleChunkCount)); + LOG(LINFO, ("Articles per chunk:", m_ArticleCount * 1.0 / m_ArticleChunkCount)); + LOG(LINFO, ("Average article size:", m_TotalArticleSizeUncompressed * 1.0 / m_ArticleCount)); + LOG(LINFO, ("Max article size:", m_MaxArticleSize)); +} + +sl::SlofIndexer::~SlofIndexer() +{ + FlushArticleChunk(); + + // Filling in header information. + sl::SlofHeader header; + memcpy(&header.m_Signature, "slof", 4); + header.m_MajorVersion = SwapIfBigEndian(uint16_t(1)); + header.m_MinorVersion = SwapIfBigEndian(uint16_t(1)); + header.m_KeyCount = SwapIfBigEndian(static_cast<uint32_t>(m_Words.size())); + header.m_ArticleCount = SwapIfBigEndian(m_ArticleCount); + header.m_ArticleOffset = SwapIfBigEndian(static_cast<uint64_t>(sizeof(header))); + + // Writing key index. + header.m_KeyIndexOffset = SwapIfBigEndian(m_Writer.Pos()); + { + WriteToSink(m_Writer, static_cast<uint32_t>(0)); + uint32_t cumSize = 0; + for (WordsContainerType::const_iterator it = m_Words.begin(); it != m_Words.end(); ++it) + { + cumSize += it->first.size(); + cumSize += VarUintSize(it->second >> 24); + cumSize += VarUintSize(it->second & 0xFFFFF); + WriteToSink(m_Writer, cumSize); + } + } + + // Writing key data. + header.m_KeyDataOffset = SwapIfBigEndian(m_Writer.Pos()); + for (WordsContainerType::const_iterator it = m_Words.begin(); it != m_Words.end(); ++it) + { + WriteVarUint(m_Writer, it->second >> 24); + WriteVarUint(m_Writer, it->second & 0xFFFFFF); + m_Writer.Write(&it->first[0], it->first.size()); + } + + // Writing header. 
+ uint64_t const lastPos = m_Writer.Pos(); + m_Writer.Seek(0); + m_Writer.Write(&header, sizeof(header)); + m_Writer.Seek(lastPos); +} diff --git a/publisher/slof_indexer.hpp b/publisher/slof_indexer.hpp new file mode 100644 index 0000000000..156761b711 --- /dev/null +++ b/publisher/slof_indexer.hpp @@ -0,0 +1,49 @@ +#pragma once +#include "../base/base.hpp" +#include "../std/function.hpp" +#include "../std/set.hpp" +#include "../std/string.hpp" +#include "../std/utility.hpp" +#include "../std/vector.hpp" + +class Writer; + +namespace sl +{ + +class SlofIndexer +{ +public: + SlofIndexer(Writer & writer, + size_t maxUncompressedArticleChunkSize, + function<void (char const *, size_t, string &)> const & compressor); + ~SlofIndexer(); + + // Add article and return its id. + uint64_t AddArticle(string const & article, bool forceChunkFlush = false); + + // Add key with given article id. Keys may be passed in arbitry order. + void AddKey(string const & word, uint64_t articleId); + + void LogStats() const; + +private: + void FlushArticleChunk(); + + Writer & m_Writer; + size_t const m_MaxUncompressedArticleChunkSize; + function<void (char const *, size_t, string &)> m_Compressor; + typedef set<pair<string, uint64_t> > WordsContainerType; + WordsContainerType m_Words; + uint64_t const m_ArticleOffset; + string m_CurrentArticleChunk; + vector<uint32_t> m_ArticleSizesInChunk; + uint32_t m_ArticleCount; + + // Just for stats. + uint32_t m_ArticleChunkCount; + uint64_t m_TotalArticleSizeUncompressed; + uint32_t m_MaxArticleSize; +}; + +} |