diff options
author | Yury Melnichek <melnichek@gmail.com> | 2011-03-06 01:12:21 +0300 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2015-09-23 01:12:57 +0300 |
commit | 5d9239a1edede8693c69119e282017d965d2c326 (patch) | |
tree | 2ac33f5cf71ee8f3814de8f2dfe3688e8c3820e1 /publisher | |
parent | a5ad66614265259763a1ed849db7ae8e918e8a3c (diff) |
Add sloynik to omim!! Yea!
Diffstat (limited to 'publisher')
-rw-r--r-- | publisher/aard_dictionary.cpp | 225 | ||||
-rw-r--r-- | publisher/aard_dictionary.hpp | 28 | ||||
-rw-r--r-- | publisher/main.cpp | 57 | ||||
-rw-r--r-- | publisher/publisher.pro | 18 | ||||
-rw-r--r-- | publisher/publisher_tests/publisher_tests.pro | 14 | ||||
-rw-r--r-- | publisher/publisher_tests/slof_indexer_test.cpp | 86 | ||||
-rw-r--r-- | publisher/slof_indexer.cpp | 188 | ||||
-rw-r--r-- | publisher/slof_indexer.hpp | 49 |
8 files changed, 665 insertions(+), 0 deletions(-)
diff --git a/publisher/aard_dictionary.cpp b/publisher/aard_dictionary.cpp new file mode 100644 index 0000000000..e39bafa8c9 --- /dev/null +++ b/publisher/aard_dictionary.cpp @@ -0,0 +1,225 @@ +#include "aard_dictionary.hpp" +#include "../coding_sloynik/bzip2_compressor.hpp" +#include "../coding_sloynik/gzip_compressor.hpp" +#include "../coding/endianness.hpp" +#include "../coding/reader.hpp" +#include "../base/logging.hpp" +#include "../3party_sloynik/jansson/myjansson.hpp" +#include "../std/exception.hpp" + +namespace +{ + template <typename T> inline T SwapIfLittleEndian(T t) + { + #ifdef ENDIAN_IS_BIG + return t; + #else + return ReverseByteOrder(t); + #endif + } + template <typename PrimitiveT, class TReader> + PrimitiveT ReadLittleEndianPrimitiveFromPos(TReader & reader, uint64_t pos) + { + PrimitiveT primitive; + ReadFromPos(reader, pos, &primitive, sizeof(primitive)); + return SwapIfLittleEndian(primitive); + } + +} + +namespace sl +{ + +enum +{ + AARD_DICTIONARY_MAX_ARTICLE_SIZE = 1 << 24 +}; + +#pragma pack(push, 1) +struct AardDictionaryIndex1Item +{ + uint32_t m_KeyPos; + uint32_t m_ArticlePos; +}; + +struct AardDictionary::AardDictionaryHeader +{ + char m_Signature[4]; + uint8_t m_Sha1[40]; + uint16_t m_Version; + uint8_t m_Uuid[16]; + uint16_t m_Volume; + uint16_t m_TotalVolumes; + uint32_t m_MetaLength; + uint32_t m_IndexCount; + uint32_t m_ArticleOffset; + char m_Index1ItemFormat[4]; + char m_KeyLengthFormat[2]; + char m_ArticleLengthFormat[2]; + + // Offset of the index1 items. + uint32_t Index1Offset() const + { + return sizeof(AardDictionaryHeader) + m_MetaLength; + } + // Offset of the keys data. + uint32_t KeyDataOffset() const + { + return Index1Offset() + sizeof(AardDictionaryIndex1Item) * m_IndexCount; + } + // Offset of the article data. 
+ uint32_t ArticleOffset() const + { + return m_ArticleOffset; + } +}; +#pragma pack(pop) + +void AardDecompress(char const * pSrc, size_t srcSize, string & dst) +{ + bool decompressed = false; + try + { + DecompressGZip(pSrc, srcSize, dst); + decompressed = true; + } + catch (StringCodingException &) {} + if (decompressed) + return; + try + { + DecompressBZip2(pSrc, srcSize, dst); + decompressed = true; + } + catch (StringCodingException &) {} + if (decompressed) + return; + dst.assign(pSrc, pSrc + srcSize); +} + +AardDictionary::AardDictionary(Reader const & reader) + : m_Reader(reader), m_pHeader(new AardDictionaryHeader) +{ + m_Reader.Read(0, m_pHeader.get(), sizeof(AardDictionaryHeader)); + m_pHeader->m_Volume = SwapIfLittleEndian(m_pHeader->m_Volume); + m_pHeader->m_TotalVolumes = SwapIfLittleEndian(m_pHeader->m_TotalVolumes); + m_pHeader->m_MetaLength = SwapIfLittleEndian(m_pHeader->m_MetaLength); + m_pHeader->m_IndexCount = SwapIfLittleEndian(m_pHeader->m_IndexCount); + m_pHeader->m_ArticleOffset = SwapIfLittleEndian(m_pHeader->m_ArticleOffset); + if (memcmp(m_pHeader->m_Signature, "aard", 4) != 0) + MYTHROW(Dictionary::OpenDictionaryException, ("Invalid signature.")); + if (memcmp(m_pHeader->m_Index1ItemFormat, ">LL\0", 4) != 0) + MYTHROW(Dictionary::OpenDictionaryException, ("Invalid index1 item format.")); + if (memcmp(m_pHeader->m_KeyLengthFormat, ">H", 2) != 0) + MYTHROW(Dictionary::OpenDictionaryException, ("Invalid key length format.")); + if (memcmp(m_pHeader->m_ArticleLengthFormat, ">L", 2) != 0) + MYTHROW(Dictionary::OpenDictionaryException, ("Invalid article length format.")); + LOG(LINFO, ("Loaded aard dictionary, volume:", m_pHeader->m_Volume, + "of", m_pHeader->m_TotalVolumes, + "meta length:", m_pHeader->m_MetaLength, + "words:", m_pHeader->m_IndexCount)); + + // TODO: What to do with duplicate keys? 
+ for (Id i = 0; i < KeyCount(); ++i) + { + string key; + KeyById(i, key); + m_KeyToIdMap[key] = i; + } +} + +AardDictionary::~AardDictionary() +{ +} + +sl::Dictionary::Id AardDictionary::KeyCount() const +{ + return m_pHeader->m_IndexCount; +} + +void AardDictionary::KeyById(Id id, string & key) const +{ + AardDictionaryIndex1Item item; + m_Reader.Read(m_pHeader->Index1Offset() + id * sizeof(item), &item, sizeof(item)); + uint64_t const keyPos = m_pHeader->KeyDataOffset() + SwapIfLittleEndian(item.m_KeyPos); + uint16_t const keyLength = ReadLittleEndianPrimitiveFromPos<uint16_t>(m_Reader, keyPos); + if (keyLength == 0) + MYTHROW(Dictionary::BrokenDictionaryException, (keyLength)); + key.resize(keyLength); + m_Reader.Read(keyPos + 2, &key[0], keyLength); +} + +void AardDictionary::ArticleById(Id id, string & article) const +{ + string key; + KeyById(id, key); + + AardDictionaryIndex1Item item; + m_Reader.Read(m_pHeader->Index1Offset() + id * sizeof(item), &item, sizeof(item)); + uint64_t const articlePos = m_pHeader->ArticleOffset() + SwapIfLittleEndian(item.m_ArticlePos); + uint32_t articleLength = ReadLittleEndianPrimitiveFromPos<uint32_t>(m_Reader, articlePos); + if (articleLength > AARD_DICTIONARY_MAX_ARTICLE_SIZE) + MYTHROW(BrokenDictionaryException, (articleLength)); + CHECK_NOT_EQUAL(articleLength, 0, ()); + try + { + string compressedArticle(articleLength, '.'); + m_Reader.Read(articlePos + 4, &compressedArticle[0], articleLength); + string articleJSON; + AardDecompress(&compressedArticle[0], compressedArticle.size(), articleJSON); + + my::Json root(articleJSON.c_str()); + CHECK_EQUAL(json_typeof(root), JSON_ARRAY, (id, key)); + json_t * pJsonElement0(json_array_get(root, 0)); + CHECK(pJsonElement0, (id, key)); + CHECK_EQUAL(json_typeof(pJsonElement0), JSON_STRING, (id, key)); + string s0 = json_string_value(pJsonElement0); + if (s0.size() > 0) + { + // Normal article. 
+ for (unsigned int i = 1; i < json_array_size(root); ++i) + { + json_t * pJsonElementI = json_array_get(root, i); + CHECK(pJsonElementI, (id, key)); + switch (json_type jsonElementIType = json_typeof(pJsonElementI)) + { + case JSON_ARRAY: + CHECK_EQUAL(json_array_size(pJsonElementI), 0, (id, key, articleJSON)); + break; + case JSON_OBJECT: + CHECK_EQUAL(json_object_size(pJsonElementI), 0, (id, key, articleJSON)); + break; + default: + CHECK(false, (id, key, jsonElementIType, articleJSON)); + } + } + article.swap(s0); + } + else + { + // Redirect + CHECK_EQUAL(json_array_size(root), 3, (id, key)); + json_t * pJsonElement2(json_array_get(root, 2)); + CHECK_EQUAL(json_typeof(pJsonElement2), JSON_OBJECT, (id, key)); + CHECK_EQUAL(json_object_size(pJsonElement2), 1, (id, key)); + json_t * pJsonRedirect = json_object_get(pJsonElement2, "r"); + CHECK(pJsonRedirect, (id, key)); + CHECK_EQUAL(json_typeof(pJsonRedirect), JSON_STRING, (id, key)); + string redirectStr(json_string_value(pJsonRedirect)); + CHECK_GREATER(redirectStr.size(), 0, (id, key)); + map<string, Id>::const_iterator it = m_KeyToIdMap.find(redirectStr); + if (it == m_KeyToIdMap.end()) + { + LOG(LWARNING, ("Incorrect redirect", id, key, redirectStr)); + return; + } + ArticleById(it->second, article); + } + } + catch (exception & e) + { + CHECK(false, (id, key, e.what())); + } +} + +} diff --git a/publisher/aard_dictionary.hpp b/publisher/aard_dictionary.hpp new file mode 100644 index 0000000000..3173ad0a7a --- /dev/null +++ b/publisher/aard_dictionary.hpp @@ -0,0 +1,28 @@ +#pragma once +#include "../words/dictionary.hpp" +#include "../base/base.hpp" +#include "../std/map.hpp" +#include "../std/scoped_ptr.hpp" +#include "../std/string.hpp" + +class Reader; + +namespace sl +{ + +class AardDictionary : public Dictionary +{ +public: + explicit AardDictionary(Reader const & reader); + ~AardDictionary(); + Id KeyCount() const; + void KeyById(Id id, string & key) const; + void ArticleById(Id id, string & article) 
const; +private: + Reader const & m_Reader; + struct AardDictionaryHeader; + scoped_ptr<AardDictionaryHeader> m_pHeader; + map<string, Id> m_KeyToIdMap; +}; + +} diff --git a/publisher/main.cpp b/publisher/main.cpp new file mode 100644 index 0000000000..52fdbf64f6 --- /dev/null +++ b/publisher/main.cpp @@ -0,0 +1,57 @@ +#include "aard_dictionary.hpp" +#include "slof_indexer.hpp" +#include "../coding_sloynik/bzip2_compressor.hpp" +#include "../coding/file_reader.hpp" +#include "../coding/file_writer.hpp" +#include "../base/base.hpp" +#include "../base/assert.hpp" +#include "../base/logging.hpp" +#include "../std/bind.hpp" +#include "../3party/gflags/src/gflags/gflags.h" + +DEFINE_int32(max_uncompressed_article_chunk_size, 899950, + "Max size of chunk of articles, uncompressed."); +DEFINE_int32(compression_level, 9, "BZip2 compression level."); +DEFINE_string(input, "", "Input file."); +DEFINE_string(output, "", "Output dictionary file."); + +int main(int argc, char ** argv) +{ + google::ParseCommandLineFlags(&argc, &argv, true); + CHECK(!FLAGS_input.empty(), ()); + CHECK(!FLAGS_output.empty(), ()); + FileReader inputReader(FLAGS_input.c_str()); + FileWriter outputWriter(FLAGS_output.c_str()); + { + sl::AardDictionary inputDictionary(inputReader); + sl::SlofIndexer indexer(outputWriter, + FLAGS_max_uncompressed_article_chunk_size, + bind(&CompressBZip2, FLAGS_compression_level, _1, _2, _3)); + LOG(LINFO, ("Starting indexing, keys:", inputDictionary.KeyCount())); + for (uint32_t id = 0; id < inputDictionary.KeyCount(); ++id) + { + if ((id % 5000) == 0) + LOG(LINFO, (id, "done.")); + // TODO: Handle redirects. + // TODO: Handle several keys for article? 
+ string key, article; + inputDictionary.KeyById(id, key); + inputDictionary.ArticleById(id, article); + if (article.empty()) + { + LOG(LWARNING, ("Skipping empty article for:", key)); + } + else + { + uint64_t const articleId = indexer.AddArticle(article); + indexer.AddKey(key, articleId); + } + } + LOG(LINFO, ("Logging stats.")); + indexer.LogStats(); + LOG(LINFO, ("Finishing indexing.")); + } + LOG(LINFO, ("Indexing done.")); + LOG(LINFO, ("Input size:", inputReader.Size())); + LOG(LINFO, ("Output size:", outputWriter.Pos())); +} diff --git a/publisher/publisher.pro b/publisher/publisher.pro new file mode 100644 index 0000000000..1acc92085e --- /dev/null +++ b/publisher/publisher.pro @@ -0,0 +1,18 @@ +TARGET = publisher +TEMPLATE = app +CONFIG += console +CONFIG -= app_bundle + +SLOYNIK_DIR = .. +DEPENDENCIES = gflags bzip2 zlib jansson base coding coding_sloynik words + +include($$SLOYNIK_DIR/sloynik_common.pri) + +HEADERS += \ + aard_dictionary.hpp \ + slof_indexer.hpp + +SOURCES += \ + aard_dictionary.cpp \ + main.cpp \ + slof_indexer.cpp diff --git a/publisher/publisher_tests/publisher_tests.pro b/publisher/publisher_tests/publisher_tests.pro new file mode 100644 index 0000000000..5b46a73c1b --- /dev/null +++ b/publisher/publisher_tests/publisher_tests.pro @@ -0,0 +1,14 @@ +TARGET = publisher_tests +TEMPLATE = app +CONFIG += console +CONFIG -= app_bundle + +SLOYNIK_DIR = ../.. 
+DEPENDENCIES = gflags bzip2 zlib base coding coding_sloynik words + +include($$SLOYNIK_DIR/sloynik_common.pri) + +SOURCES += $$SLOYNIK_DIR/testing/testingmain.cpp \ + slof_indexer_test.cpp ../slof_indexer.cpp + +HEADERS += diff --git a/publisher/publisher_tests/slof_indexer_test.cpp b/publisher/publisher_tests/slof_indexer_test.cpp new file mode 100644 index 0000000000..29ef3a87c4 --- /dev/null +++ b/publisher/publisher_tests/slof_indexer_test.cpp @@ -0,0 +1,86 @@ +#include "../../testing/testing.hpp" +#include "../slof_indexer.hpp" +#include "../../words/slof_dictionary.hpp" +#include "../../words/sloynik_engine.hpp" +#include "../../coding/reader.hpp" +#include "../../coding/writer.hpp" +#include "../../base/logging.hpp" +#include "../../base/macros.hpp" +#include "../../std/string.hpp" +#include "../../std/vector.hpp" + +namespace +{ + void TestCompressor(char const * pSrc, size_t srcSize, string & res) + { + res = "<"; + res.insert(res.end(), pSrc, pSrc + srcSize); + res.insert(res.end(), '>'); + } + + void TestDecompressor(char const * pSrc, size_t srcSize, char * pDst, size_t dstSize) + { + TEST_GREATER_OR_EQUAL(srcSize, 2, ()); + TEST_EQUAL(srcSize - 2, dstSize, ()); + TEST_EQUAL(pSrc[0], '<', ()); + TEST_EQUAL(pSrc[srcSize-1], '>', ()); + memcpy(pDst, pSrc + 1, srcSize - 2); + } + + string Key(sl::SlofDictionary const & dic, sl::Dictionary::Id id) + { + string res; + dic.KeyById(id, res); + return res; + } + + string Article(sl::SlofDictionary const & dic, sl::Dictionary::Id id) + { + string res; + dic.ArticleById(id, res); + return res; + } +} + +UNIT_TEST(SlofIndexerEmptyTest) +{ + string serializedDictionary; + { + MemWriter<string> writer(serializedDictionary); + sl::SlofIndexer indexer(writer, 20, &TestCompressor); + } + sl::SlofDictionary dic(new MemReader(&serializedDictionary[0], serializedDictionary.size()), + &TestDecompressor); + TEST_EQUAL(dic.KeyCount(), 0, ()); +} + +UNIT_TEST(SlofIndexerSmokeTest) +{ + string serializedDictionary; + { + 
MemWriter<string> writer(serializedDictionary); + sl::SlofIndexer indexer(writer, 25, &TestCompressor); + uint64_t articleM = indexer.AddArticle("ArticleM"); + indexer.AddKey("M", articleM); + uint64_t articleHello = indexer.AddArticle("ArticleHello"); + indexer.AddKey("He", articleHello); + uint64_t articleOk = indexer.AddArticle("ArticleOK"); + indexer.AddKey("OK", articleOk); + indexer.AddKey("Hello", articleHello); + } + { + sl::SlofDictionary dic(new MemReader(&serializedDictionary[0], serializedDictionary.size()), + &TestDecompressor); + TEST_EQUAL(dic.KeyCount(), 4, ()); + TEST_EQUAL(Key(dic, 0), "He", ()); + TEST_EQUAL(Key(dic, 1), "Hello", ()); + TEST_EQUAL(Key(dic, 2), "M", ()); + TEST_EQUAL(Key(dic, 3), "OK", ()); + TEST_EQUAL(Article(dic, 0), "ArticleHello", ()); + TEST_EQUAL(Article(dic, 1), "ArticleHello", ()); + TEST_EQUAL(Article(dic, 2), "ArticleM", ()); + TEST_EQUAL(Article(dic, 3), "ArticleOK", ()); + } +} + +// TODO: Write end-to-end test (publisher-to-engine). diff --git a/publisher/slof_indexer.cpp b/publisher/slof_indexer.cpp new file mode 100644 index 0000000000..a2e752b60b --- /dev/null +++ b/publisher/slof_indexer.cpp @@ -0,0 +1,188 @@ +#include "slof_indexer.hpp" +#include "../words/slof.hpp" +#include "../coding/byte_stream.hpp" +#include "../coding/endianness.hpp" +#include "../coding/varint.hpp" +#include "../coding/writer.hpp" +#include "../coding/write_to_sink.hpp" +#include "../base/assert.hpp" +#include "../base/base.hpp" +#include "../base/logging.hpp" +#include "../std/algorithm.hpp" +#include "../std/set.hpp" +#include "../std/string.hpp" + +namespace +{ + template <typename T> uint8_t VarUintSize(T x) + { + uint8_t res = 0; + while (x > 127) + { + ++res; + x >>= 7; + } + return res + 1; + } +} + +sl::SlofIndexer::SlofIndexer(Writer & writer, + size_t maxUncompressedArticleChunkSize, + function<void (char const *, size_t, string &)> const & compressor) : +m_Writer(writer), 
+m_MaxUncompressedArticleChunkSize(maxUncompressedArticleChunkSize), +m_Compressor(compressor), +m_ArticleOffset(m_Writer.Pos() + sizeof(sl::SlofHeader)), +m_ArticleCount(0), +m_ArticleChunkCount(0), +m_MaxArticleSize(0) +{ + CHECK_LESS(maxUncompressedArticleChunkSize, 1 << 24, ()); + m_Writer.Seek(sizeof(sl::SlofHeader)); + CHECK_EQUAL(m_ArticleOffset, m_Writer.Pos(), ()); +} + +void sl::SlofIndexer::AddKey(string const & word, uint64_t articleId) +{ + CHECK(!word.empty(), ()); + WordsContainerType::const_iterator it = m_Words.lower_bound(make_pair(word, 0ULL)); + if (it != m_Words.end() && it->first == word) + { + LOG(LINFO, ("Duplicate key:", word, it->second, articleId)); + } + CHECK(m_Words.insert(make_pair(word, articleId)).second, (word, articleId)); +} + +uint64_t sl::SlofIndexer::AddArticle(string const & article, bool forceChunkFlush) +{ + // if (article.size() > m_MaxUncompressedArticleChunkSize) + // LOG(LWARNING, ("Article bigger than chunk:", article.size(), article.substr(0, 64))); + + if (m_CurrentArticleChunk.size() + article.size() > m_MaxUncompressedArticleChunkSize || + forceChunkFlush) + FlushArticleChunk(); + + uint64_t const articleId = + ((m_Writer.Pos() - m_ArticleOffset) << 24) + m_ArticleSizesInChunk.size(); + m_CurrentArticleChunk += article; + m_ArticleSizesInChunk.push_back(article.size()); + + ++m_ArticleCount; + m_TotalArticleSizeUncompressed += article.size(); + m_MaxArticleSize = max(m_MaxArticleSize, static_cast<uint32_t>(article.size())); + + return articleId; +} + +void sl::SlofIndexer::FlushArticleChunk() +{ + if (m_ArticleSizesInChunk.empty()) + return; + + vector<char> chunkHeader; + { // Write chunk header. + { + PushBackByteSink<vector<char> > sink(chunkHeader); + // Write decompressed size of all articles. + WriteVarUint(sink, m_CurrentArticleChunk.size()); + // Write individual article sizes. 
+ for (size_t i = 0; i < m_ArticleSizesInChunk.size(); ++i) + WriteVarUint(sink, m_ArticleSizesInChunk[i]); + } + { // Write size of the header at the beginning of the header. + vector<char> chunkHeaderSize; + PushBackByteSink<vector<char> > sink(chunkHeaderSize); + WriteVarUint(sink, chunkHeader.size()); + chunkHeader.insert(chunkHeader.begin(), chunkHeaderSize.begin(), chunkHeaderSize.end()); + } + } + + // Compress the article chunk. + string compressedArticleChunk; + m_Compressor(&m_CurrentArticleChunk[0], m_CurrentArticleChunk.size(), compressedArticleChunk); + + // Write everything. + WriteToSink(m_Writer, static_cast<uint32_t>(chunkHeader.size() + compressedArticleChunk.size())); + m_Writer.Write(&chunkHeader[0], chunkHeader.size()); + m_Writer.Write(&compressedArticleChunk[0], compressedArticleChunk.size()); + + // Reset everything. + m_CurrentArticleChunk.clear(); + m_ArticleSizesInChunk.clear(); + ++m_ArticleChunkCount; +} + +void sl::SlofIndexer::LogStats() const +{ + LOG(LINFO, ("Dictionary stats")); + set<uint64_t> articleIds; + uint32_t maxKeyLength = 0, totalWordLength = 0, dupKeysCount = 0; + for (WordsContainerType::const_iterator it = m_Words.begin(); it != m_Words.end(); ++it) + { + WordsContainerType::const_iterator next = it; + ++next; + if (next != m_Words.end() && next->first == it->first) + ++dupKeysCount; + maxKeyLength = max(maxKeyLength, static_cast<uint32_t>(it->first.size())); + totalWordLength += it->first.size(); + articleIds.insert(it->second); + } + + CHECK_EQUAL(m_ArticleCount, articleIds.size(), ()); + + LOG(LINFO, ("Keys:", m_Words.size())); + LOG(LINFO, ("Unique keys:", m_Words.size() - dupKeysCount)); + LOG(LINFO, ("Duplicate keys:", dupKeysCount)); + LOG(LINFO, ("Duplicate keys %:", 100.0 * dupKeysCount / m_Words.size())); + LOG(LINFO, ("Max key length:", maxKeyLength)); + LOG(LINFO, ("Average key length:", totalWordLength * 1.0 / m_Words.size())); + LOG(LINFO, ("Articles:", m_ArticleCount)); + LOG(LINFO, ("Keys per article:", 
m_Words.size() * 1.0 / m_ArticleCount)); + LOG(LINFO, ("Article chunks:", m_ArticleChunkCount)); + LOG(LINFO, ("Articles per chunk:", m_ArticleCount * 1.0 / m_ArticleChunkCount)); + LOG(LINFO, ("Average article size:", m_TotalArticleSizeUncompressed * 1.0 / m_ArticleCount)); + LOG(LINFO, ("Max article size:", m_MaxArticleSize)); +} + +sl::SlofIndexer::~SlofIndexer() +{ + FlushArticleChunk(); + + // Filling in header information. + sl::SlofHeader header; + memcpy(&header.m_Signature, "slof", 4); + header.m_MajorVersion = SwapIfBigEndian(uint16_t(1)); + header.m_MinorVersion = SwapIfBigEndian(uint16_t(1)); + header.m_KeyCount = SwapIfBigEndian(static_cast<uint32_t>(m_Words.size())); + header.m_ArticleCount = SwapIfBigEndian(m_ArticleCount); + header.m_ArticleOffset = SwapIfBigEndian(static_cast<uint64_t>(sizeof(header))); + + // Writing key index. + header.m_KeyIndexOffset = SwapIfBigEndian(m_Writer.Pos()); + { + WriteToSink(m_Writer, static_cast<uint32_t>(0)); + uint32_t cumSize = 0; + for (WordsContainerType::const_iterator it = m_Words.begin(); it != m_Words.end(); ++it) + { + cumSize += it->first.size(); + cumSize += VarUintSize(it->second >> 24); + cumSize += VarUintSize(it->second & 0xFFFFF); + WriteToSink(m_Writer, cumSize); + } + } + + // Writing key data. + header.m_KeyDataOffset = SwapIfBigEndian(m_Writer.Pos()); + for (WordsContainerType::const_iterator it = m_Words.begin(); it != m_Words.end(); ++it) + { + WriteVarUint(m_Writer, it->second >> 24); + WriteVarUint(m_Writer, it->second & 0xFFFFFF); + m_Writer.Write(&it->first[0], it->first.size()); + } + + // Writing header. 
+ uint64_t const lastPos = m_Writer.Pos(); + m_Writer.Seek(0); + m_Writer.Write(&header, sizeof(header)); + m_Writer.Seek(lastPos); +} diff --git a/publisher/slof_indexer.hpp b/publisher/slof_indexer.hpp new file mode 100644 index 0000000000..156761b711 --- /dev/null +++ b/publisher/slof_indexer.hpp @@ -0,0 +1,49 @@ +#pragma once +#include "../base/base.hpp" +#include "../std/function.hpp" +#include "../std/set.hpp" +#include "../std/string.hpp" +#include "../std/utility.hpp" +#include "../std/vector.hpp" + +class Writer; + +namespace sl +{ + +class SlofIndexer +{ +public: + SlofIndexer(Writer & writer, + size_t maxUncompressedArticleChunkSize, + function<void (char const *, size_t, string &)> const & compressor); + ~SlofIndexer(); + + // Add article and return its id. + uint64_t AddArticle(string const & article, bool forceChunkFlush = false); + + // Add key with given article id. Keys may be passed in arbitry order. + void AddKey(string const & word, uint64_t articleId); + + void LogStats() const; + +private: + void FlushArticleChunk(); + + Writer & m_Writer; + size_t const m_MaxUncompressedArticleChunkSize; + function<void (char const *, size_t, string &)> m_Compressor; + typedef set<pair<string, uint64_t> > WordsContainerType; + WordsContainerType m_Words; + uint64_t const m_ArticleOffset; + string m_CurrentArticleChunk; + vector<uint32_t> m_ArticleSizesInChunk; + uint32_t m_ArticleCount; + + // Just for stats. + uint32_t m_ArticleChunkCount; + uint64_t m_TotalArticleSizeUncompressed; + uint32_t m_MaxArticleSize; +}; + +} |