Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/words
diff options
context:
space:
mode:
authorYury Melnichek <melnichek@gmail.com>2011-03-06 01:12:21 +0300
committerAlex Zolotarev <alex@maps.me>2015-09-23 01:12:57 +0300
commit5d9239a1edede8693c69119e282017d965d2c326 (patch)
tree2ac33f5cf71ee8f3814de8f2dfe3688e8c3820e1 /words
parenta5ad66614265259763a1ed849db7ae8e918e8a3c (diff)
Add sloynik to omim!! Yea!
Diffstat (limited to 'words')
-rw-r--r--words/common.hpp21
-rw-r--r--words/dictionary.hpp34
-rw-r--r--words/slof.hpp43
-rw-r--r--words/slof_dictionary.cpp99
-rw-r--r--words/slof_dictionary.hpp32
-rw-r--r--words/sloynik_engine.cpp72
-rw-r--r--words/sloynik_engine.hpp62
-rw-r--r--words/sloynik_index.cpp220
-rw-r--r--words/sloynik_index.hpp75
-rw-r--r--words/words.pro26
-rw-r--r--words/words_tests/dictionary_mock.hpp27
-rw-r--r--words/words_tests/sorted_index_test.cpp125
-rw-r--r--words/words_tests/words_tests.pro15
13 files changed, 851 insertions, 0 deletions
diff --git a/words/common.hpp b/words/common.hpp
new file mode 100644
index 0000000000..142379beeb
--- /dev/null
+++ b/words/common.hpp
@@ -0,0 +1,21 @@
+#pragma once
+#include "../base/base.hpp"
+
+namespace sl
+{
+
+struct StrFn
+{
+ struct Str; // Never defined.
+
+ Str const * (* Create)(char const * utf8Data, uint32_t size);
+ void (* Destroy)(Str const * s);
+ int (* PrimaryCompare)(void * pData, Str const * a, Str const * b);
+ int (* SecondaryCompare)(void * pData, Str const * a, Str const * b);
+
+ void * m_pData;
+ uint64_t m_PrimaryCompareId;
+ uint64_t m_SecondaryCompareId;
+};
+
+}
diff --git a/words/dictionary.hpp b/words/dictionary.hpp
new file mode 100644
index 0000000000..25ffe978f0
--- /dev/null
+++ b/words/dictionary.hpp
@@ -0,0 +1,34 @@
+#pragma once
+#include "../base/base.hpp"
+#include "../std/string.hpp"
+#include "../base/base.hpp"
+#include "../base/exception.hpp"
+#include "../std/utility.hpp"
+
+namespace sl
+{
+
+class Dictionary
+{
+public:
+ DECLARE_EXCEPTION(Exception, RootException);
+ DECLARE_EXCEPTION(OpenDictionaryException, Exception);
+ DECLARE_EXCEPTION(OpenDictionaryNewerVersionException, Exception);
+ DECLARE_EXCEPTION(BrokenDictionaryException, Exception);
+
+ // Key id, which is 0-based index of key in sorted order.
+ typedef uint32_t Id;
+
+ virtual ~Dictionary() {}
+
+ // Number of keys in the dictionary.
+ virtual Id KeyCount() const = 0;
+
+ // Get key by id.
+ virtual void KeyById(Id id, string & key) const = 0;
+
+ // Get article by id.
+ virtual void ArticleById(Id id, string & article) const = 0;
+};
+
+}
diff --git a/words/slof.hpp b/words/slof.hpp
new file mode 100644
index 0000000000..b2edc4e809
--- /dev/null
+++ b/words/slof.hpp
@@ -0,0 +1,43 @@
+#pragma once
+// slof = sloynik format.
+//
+// VU = VarUint.
+//
+// Offset: Format:
+//
+// 0 [Header]
+// m_ArticleOffset [VU ChunkSize] [Compressed article chunk 0]
+// ..
+// [VU ChunkSize] [Compressed article chunk n-1]
+// m_KeyDataOffset [VU KeySize] [Key 0]
+// ..
+// [VU KeySize] [Key m_KeyCount-1]
+// m_KeyArticleIdOffset [VU article 0 chunk offset] [VU article 0 number in chunk]
+// ..
+// [VU article m_ArticleCount-1 chunk offset] [VU article * number in chunk]
+//
+// ArticleId = ([Offset of article chunk] << 24) + [Offset of article in uncompressed chunk]
+
+#include "../coding/endianness.hpp"
+#include "../base/base.hpp"
+
+namespace sl
+{
+
+#pragma pack(push, 1)
+struct SlofHeader
+{
+ uint32_t m_Signature; // = "slof"
+ uint16_t m_MajorVersion; // Major version. Changing it breaks compatibility.
+ uint16_t m_MinorVersion; // Minor version. Changing it doesn't break compatibility.
+
+ uint32_t m_KeyCount; // Number of keys.
+ uint32_t m_ArticleCount; // Number of articles.
+
+ uint64_t m_KeyIndexOffset; // Offset of key cummulative lengthes.
+ uint64_t m_KeyDataOffset; // Offset of key data.
+ uint64_t m_ArticleOffset; // Offset of article data.
+};
+#pragma pack(pop)
+
+}
diff --git a/words/slof_dictionary.cpp b/words/slof_dictionary.cpp
new file mode 100644
index 0000000000..bf1ff8543e
--- /dev/null
+++ b/words/slof_dictionary.cpp
@@ -0,0 +1,99 @@
+#include "slof_dictionary.hpp"
+#include "../coding_sloynik/bzip2_compressor.hpp"
+#include "../coding/byte_stream.hpp"
+#include "../coding/endianness.hpp"
+#include "../coding/reader.hpp"
+#include "../coding/varint.hpp"
+#include "../base/logging.hpp"
+#include "../base/base.hpp"
+#include "../std/utility.hpp"
+
+namespace
+{
+ char const * SkipVarUint(char const * p)
+ {
+ while ((*p) & 128)
+ ++p;
+ return ++p;
+ }
+}
+
+sl::SlofDictionary::SlofDictionary(Reader const * pReader)
+ : m_pReader(pReader), m_Decompressor(&DecompressBZip2IntoFixedSize)
+{
+ Init();
+}
+
+sl::SlofDictionary::SlofDictionary(
+ Reader const * pReader, function<void (const char *, size_t, char *, size_t)> decompressor)
+ : m_pReader(pReader), m_Decompressor(decompressor)
+{
+ Init();
+}
+
+void sl::SlofDictionary::Init()
+{
+ m_pReader->Read(0, &m_Header, sizeof(m_Header));
+ if (m_Header.m_MajorVersion != 1)
+ MYTHROW(OpenDictionaryNewerVersionException, (m_Header.m_MajorVersion));
+}
+
+sl::SlofDictionary::~SlofDictionary()
+{
+}
+
+sl::Dictionary::Id sl::SlofDictionary::KeyCount() const
+{
+ return m_Header.m_KeyCount;
+}
+
+void sl::SlofDictionary::ReadKeyData(sl::Dictionary::Id id, string & data) const
+{
+ pair<uint32_t, uint32_t> offsets;
+ m_pReader->Read(m_Header.m_KeyIndexOffset + id * 4, &offsets, 8);
+ offsets.first = SwapIfBigEndian(offsets.first);
+ offsets.second = SwapIfBigEndian(offsets.second);
+ // Add 2 trailing zeroes, to be sure that reading varint doesn't hang if file is corrupted.
+ data.resize(offsets.second - offsets.first + 2);
+ m_pReader->Read(m_Header.m_KeyDataOffset + offsets.first,
+ &data[0], offsets.second - offsets.first);
+}
+
+void sl::SlofDictionary::KeyById(sl::Dictionary::Id id, string & key) const
+{
+ string keyData;
+ ReadKeyData(id, keyData);
+ char const * pBeg = SkipVarUint(SkipVarUint(&keyData[0]));
+ char const * pLast = &keyData[keyData.size() - 1];
+ // ReadKeyData adds trailing zeroes, so that reading VarUint doesn't hang up in case of
+ // corrupted data. Strip them.
+ while (pLast >= pBeg && *pLast == 0)
+ --pLast;
+ key.assign(pBeg, pLast + 1);
+}
+
+void sl::SlofDictionary::ArticleById(sl::Dictionary::Id id, string & article) const
+{
+ string keyData;
+ ReadKeyData(id, keyData);
+ ArrayByteSource keyDataSource(&keyData[0]);
+ uint64_t const articleChunkPos = ReadVarUint<uint64_t>(keyDataSource);
+ uint32_t const articleNumInChunk = ReadVarUint<uint32_t>(keyDataSource);
+
+ uint32_t const chunkSize =
+ ReadPrimitiveFromPos<uint32_t>(*m_pReader, m_Header.m_ArticleOffset + articleChunkPos);
+ string chunk(chunkSize, 0);
+ m_pReader->Read(m_Header.m_ArticleOffset + articleChunkPos + 4, &chunk[0], chunkSize);
+ ArrayByteSource chunkSource(&chunk[0]);
+ uint32_t chunkHeaderSize = ReadVarUint<uint32_t>(chunkSource);
+ chunkHeaderSize += static_cast<char const *>(chunkSource.Ptr()) - &chunk[0];
+ uint32_t const decompressedChunkSize = ReadVarUint<uint32_t>(chunkSource);
+ uint32_t articleBegInChunk = 0;
+ for (uint32_t i = 0; i < articleNumInChunk; ++i)
+ articleBegInChunk += ReadVarUint<uint32_t>(chunkSource);
+ uint32_t const articleSizeInChunk = ReadVarUint<uint32_t>(chunkSource);
+ vector<char> decompressedChunk(decompressedChunkSize);
+ m_Decompressor(&chunk[chunkHeaderSize], chunkSize - chunkHeaderSize,
+ &decompressedChunk[0], decompressedChunkSize);
+ article.assign(&decompressedChunk[articleBegInChunk], articleSizeInChunk);
+}
diff --git a/words/slof_dictionary.hpp b/words/slof_dictionary.hpp
new file mode 100644
index 0000000000..9c9fc00087
--- /dev/null
+++ b/words/slof_dictionary.hpp
@@ -0,0 +1,32 @@
+#pragma once
+#include "dictionary.hpp"
+#include "slof.hpp"
+#include "../std/function.hpp"
+#include "../std/scoped_ptr.hpp"
+
+class Reader;
+
+namespace sl
+{
+
+class SlofDictionary : public Dictionary
+{
+public:
+ // Takes ownership of pReader and deletes it, even if exception is thrown.
+ explicit SlofDictionary(Reader const * pReader);
+ // Takes ownership of pReader and deletes it, even if exception is thrown.
+ SlofDictionary(Reader const * pReader,
+ function<void (char const *, size_t, char *, size_t)> decompressor);
+ ~SlofDictionary();
+ Id KeyCount() const;
+ void KeyById(Id id, string & key) const;
+ void ArticleById(Id id, string & article) const;
+private:
+ void Init();
+ void ReadKeyData(Id id, string & data) const;
+ scoped_ptr<Reader const> m_pReader;
+ function<void (char const *, size_t, char *, size_t)> m_Decompressor;
+ SlofHeader m_Header;
+};
+
+}
diff --git a/words/sloynik_engine.cpp b/words/sloynik_engine.cpp
new file mode 100644
index 0000000000..f7e19811e9
--- /dev/null
+++ b/words/sloynik_engine.cpp
@@ -0,0 +1,72 @@
+#include "sloynik_engine.hpp"
+#include "slof_dictionary.hpp"
+#include "sloynik_index.hpp"
+#include "../coding/file_reader.hpp"
+#include "../coding/file_writer.hpp"
+#include "../base/logging.hpp"
+#include "../base/macros.hpp"
+
+sl::SloynikEngine::SloynikEngine(string const & dictionaryPath,
+ string const & indexPath,
+ StrFn const & strFn)
+{
+ m_pDictionary.reset(new sl::SlofDictionary(new FileReader(dictionaryPath)));
+ vector<uint64_t> stamp;
+ stamp.push_back(strFn.m_PrimaryCompareId);
+ stamp.push_back(strFn.m_SecondaryCompareId);
+ string const stampPath = indexPath + ".stamp";
+ bool needIndexBuild = false;
+ try
+ {
+ FileReader stampReader(stampPath);
+ if (stampReader.Size() != stamp.size() * sizeof(stamp[0]))
+ needIndexBuild = true;
+ else
+ {
+ vector<uint64_t> currentStamp(stamp.size());
+ stampReader.Read(0, &currentStamp[0], stamp.size() * sizeof(stamp[0]));
+ if (currentStamp != stamp)
+ needIndexBuild = true;
+ }
+ }
+ catch (RootException &)
+ {
+ needIndexBuild = true;
+ }
+ LOG(LINFO, ("Started sloynik engine. Words in the dictionary:", m_pDictionary->KeyCount()));
+ // Uncomment to always rebuild the index: needIndexBuild = true;
+ if (needIndexBuild)
+ {
+ FileWriter::DeleteFile(stampPath);
+ sl::SortedIndex::Build(*m_pDictionary, strFn, indexPath);
+
+ FileWriter stampWriter(stampPath);
+ stampWriter.Write(&stamp[0], stamp.size() * sizeof(stamp[0]));
+ }
+ m_pIndexReader.reset(new FileReader(indexPath + ".idx"));
+ m_pSortedIndex.reset(new sl::SortedIndex(*m_pDictionary, m_pIndexReader.get(), strFn));
+}
+
+sl::SloynikEngine::~SloynikEngine()
+{
+}
+
+void sl::SloynikEngine::Search(string const & prefix, sl::SloynikEngine::SearchResult & res) const
+{
+ res.m_FirstMatched = m_pSortedIndex->PrefixSearch(prefix);
+}
+
+void sl::SloynikEngine::GetWordInfo(WordId word, sl::SloynikEngine::WordInfo & res) const
+{
+ m_pDictionary->KeyById(m_pSortedIndex->KeyIdByPos(word), res.m_Word);
+}
+
+uint32_t sl::SloynikEngine::WordCount() const
+{
+ return m_pDictionary->KeyCount();
+}
+
+void sl::SloynikEngine::GetArticleData(WordId wordId, ArticleData & data) const
+{
+ m_pDictionary->ArticleById(m_pSortedIndex->KeyIdByPos(wordId), data.m_HTML);
+}
diff --git a/words/sloynik_engine.hpp b/words/sloynik_engine.hpp
new file mode 100644
index 0000000000..9e83686db8
--- /dev/null
+++ b/words/sloynik_engine.hpp
@@ -0,0 +1,62 @@
+#pragma once
+#include "common.hpp"
+#include "../base/base.hpp"
+#include "../std/function.hpp"
+#include "../std/scoped_ptr.hpp"
+#include "../std/string.hpp"
+#include "../std/utility.hpp"
+
+class FileReader;
+
+namespace sl
+{
+
+class Dictionary;
+class SortedIndex;
+
+class SloynikEngine
+{
+public:
+ SloynikEngine(string const & dictionaryPath,
+ string const & indexPath,
+ sl::StrFn const & strFn);
+
+ ~SloynikEngine();
+
+ typedef uint32_t WordId;
+
+ struct WordInfo
+ {
+ string m_Word;
+ };
+
+ void GetWordInfo(WordId word, WordInfo & res) const;
+
+ struct SearchResult
+ {
+ WordId m_FirstMatched;
+ };
+
+ void Search(string const & prefix, SearchResult & res) const;
+
+ uint32_t WordCount() const;
+
+ struct ArticleData
+ {
+ string m_HTML;
+
+ void swap(ArticleData & o)
+ {
+ m_HTML.swap(o.m_HTML);
+ }
+ };
+
+ void GetArticleData(WordId word, ArticleData & data) const;
+
+private:
+ scoped_ptr<sl::Dictionary> m_pDictionary;
+ scoped_ptr<FileReader> m_pIndexReader;
+ scoped_ptr<sl::SortedIndex> m_pSortedIndex;
+};
+
+}
diff --git a/words/sloynik_index.cpp b/words/sloynik_index.cpp
new file mode 100644
index 0000000000..ff81a7a84d
--- /dev/null
+++ b/words/sloynik_index.cpp
@@ -0,0 +1,220 @@
+#include "sloynik_index.hpp"
+#include "dictionary.hpp"
+#include "../coding/file_writer.hpp"
+#include "../base/assert.hpp"
+#include "../base/logging.hpp"
+#include "../base/cache.hpp"
+#include "../std/bind.hpp"
+
+extern "C" {
+ #include "../coding_sloynik/timsort/timsort.h"
+}
+
+
+#define FILE_SORTER_LOG_BATCH_SIZE 11
+
+sl::SortedIndex::SortedIndex(Dictionary const & dictionary,
+ Reader const * pIndexReader,
+ StrFn const & strFn)
+ : m_Dictionary(dictionary), m_StrFn(strFn),
+ m_SortedVector(PolymorphReader(pIndexReader), pIndexReader->Size() / sizeof(DicId))
+{
+ STATIC_ASSERT(sizeof(sl::SortedIndex::DicId) == 3);
+}
+
+sl::Dictionary const & sl::SortedIndex::GetDictionary() const
+{
+ return m_Dictionary;
+}
+
+sl::Dictionary::Id sl::SortedIndex::KeyIdByPos(Pos pos) const
+{
+ return m_SortedVector[pos].Value();
+}
+
+namespace sl
+{
+namespace impl
+{
+
+ #define DIC_ID_MAGIC_EMPTY_VALUE 0xFFFFFF
+
+ // Compare 2 values of Dictionary::Id using comparator provided.
+ // A special value of Dictionary::Id(DIC_ID_MAGIC_EMPTY_VALUE) is used
+ // to compare with a string provided in constructor.
+ template <bool bFullCompare>
+ class DictCollIdLess
+ {
+ private:
+ DictCollIdLess(DictCollIdLess<bFullCompare> const &);
+ DictCollIdLess<bFullCompare> & operator = (DictCollIdLess<bFullCompare> const &);
+ public:
+ DictCollIdLess(sl::Dictionary const & dic, sl::StrFn const & strFn)
+ : m_Dic(dic), m_StrFn(strFn), m_Special(NULL), m_Cache(FILE_SORTER_LOG_BATCH_SIZE + 1)
+ {
+ }
+
+ DictCollIdLess(sl::Dictionary const & dic, sl::StrFn const & strFn, string const & special)
+ : m_Dic(dic), m_StrFn(strFn), m_Special(strFn.Create(&special[0], special.size())), m_Cache(3)
+ {
+ }
+
+ ~DictCollIdLess()
+ {
+ if (m_Special)
+ m_StrFn.Destroy(m_Special);
+ m_Cache.ForEachValue(m_StrFn.Destroy);
+ }
+
+ int Compare(uint32_t const id1, uint32_t const id2)
+ {
+ if (id1 == id2)
+ return 0;
+ sl::StrFn::Str const * key1, * key2;
+ bool bDeleteKey1 = false;
+ KeysById(id1, id2, key1, key2, bDeleteKey1);
+ int const primaryRes = m_StrFn.PrimaryCompare(m_StrFn.m_pData, key1, key2);
+ if (bFullCompare && primaryRes == 0)
+ {
+ int const secondaryRes = m_StrFn.SecondaryCompare(m_StrFn.m_pData, key1, key2);
+ if (bDeleteKey1)
+ m_StrFn.Destroy(key1);
+ return secondaryRes;
+ }
+ else
+ {
+ if (bDeleteKey1)
+ m_StrFn.Destroy(key1);
+ return primaryRes;
+ }
+ }
+
+ bool operator () (uint32_t const id1, uint32_t const id2)
+ {
+ return Compare(id1, id2) < 0;
+ }
+
+ private:
+
+ inline void KeysById(uint32_t const id1, uint32_t const id2,
+ sl::StrFn::Str const * & key1,
+ sl::StrFn::Str const * & key2,
+ bool & bDeleteKey1)
+ {
+ bDeleteKey1 = false;
+
+ if (id1 == DIC_ID_MAGIC_EMPTY_VALUE)
+ key1 = m_Special;
+ else
+ {
+ bool found = false;
+ sl::StrFn::Str const * & s = m_Cache.Find(id1, found);
+ if (!found)
+ {
+ if (s != NULL)
+ m_StrFn.Destroy(s);
+ s = CreateKeyById(id1);
+ }
+ key1 = s;
+ }
+
+ if (id2 == DIC_ID_MAGIC_EMPTY_VALUE)
+ key2 = m_Special;
+ else
+ {
+ bool found = false;
+ sl::StrFn::Str const * & s = m_Cache.Find(id2, found);
+ if (!found)
+ {
+ if (s != NULL)
+ {
+ if (s == key1)
+ bDeleteKey1 = true;
+ else
+ m_StrFn.Destroy(s);
+ }
+ s = CreateKeyById(id2);
+ }
+ key2 = s;
+ }
+ }
+
+ inline sl::StrFn::Str const * CreateKeyById(uint32_t const id)
+ {
+ ASSERT_LESS(id, DIC_ID_MAGIC_EMPTY_VALUE, ());
+ string keyStr;
+ m_Dic.KeyById(id, keyStr);
+ return m_StrFn.Create(&keyStr[0], keyStr.size());
+ }
+
+ sl::Dictionary const & m_Dic;
+ sl::StrFn m_StrFn;
+ sl::StrFn::Str const * m_Special;
+ my::Cache<uint32_t, sl::StrFn::Str const *> m_Cache;
+ };
+
+ // Hack: global data, used for sorting with c-style timsort().
+ static DictCollIdLess<true> * g_pSortedIndexDictCollIdLessForBuild = NULL;
+
+ template <class T> struct LessRefProxy
+ {
+ T & m_F;
+ explicit LessRefProxy(T & f) : m_F(f) {}
+ inline bool operator () (SortedIndex::DicId a, SortedIndex::DicId b) const
+ {
+ return m_F(a.Value(), b.Value());
+ }
+
+ // Hack: used for sorting with c-style timsort().
+ static int GlobalCompareForSort(void const * pVoidA, void const * pVoidB)
+ {
+ SortedIndex::DicId const * pA = static_cast<SortedIndex::DicId const *>(pVoidA);
+ SortedIndex::DicId const * pB = static_cast<SortedIndex::DicId const *>(pVoidB);
+ return g_pSortedIndexDictCollIdLessForBuild->Compare(pA->Value(), pB->Value());
+ }
+ };
+ template <class T> LessRefProxy<T> MakeLessRefProxy(T & f) { return LessRefProxy<T>(f); }
+
+} // namespace impl
+} // namespace sl
+
+sl::SortedIndex::Pos sl::SortedIndex::PrefixSearch(string const & prefix)
+{
+ impl::DictCollIdLess<false> compareLess(m_Dictionary, m_StrFn, prefix);
+ return lower_bound(m_SortedVector.begin(), m_SortedVector.end(), DIC_ID_MAGIC_EMPTY_VALUE,
+ MakeLessRefProxy(compareLess))
+ - m_SortedVector.begin();
+}
+
+void sl::SortedIndex::Build(sl::Dictionary const & dictionary,
+ StrFn const & strFn,
+ string const & indexPathPrefix)
+{
+ LOG(LINFO, ("Building sorted index."));
+
+ // Initializing.
+ vector<DicId> ids(dictionary.KeyCount());
+ for (size_t i = 0; i < ids.size(); ++i)
+ ids[i] = i;
+
+ // Sorting.
+ {
+ sl::impl::DictCollIdLess<true> compareLess(dictionary, strFn);
+
+ sl::impl::g_pSortedIndexDictCollIdLessForBuild = &compareLess;
+ timsort(&ids[0], ids.size(), sizeof(ids[0]),
+ &sl::impl::LessRefProxy<sl::impl::DictCollIdLess<true> >::GlobalCompareForSort);
+ sl::impl::g_pSortedIndexDictCollIdLessForBuild = NULL;
+
+ // sort() or stable_sort() are nice but slow.
+ // sort() 150 milliseconds
+ // stable_sort() 50 milliseconds
+ // timsort() 9 milliseconds
+ // stable_sort(ids.begin(), ids.end(), MakeLessRefProxy(compareLess));
+ }
+
+ FileWriter idxWriter((indexPathPrefix + ".idx").c_str());
+ idxWriter.Write(&ids[0], ids.size() * sizeof(ids[0]));
+
+ LOG(LINFO, ("Building sorted index done."));
+}
diff --git a/words/sloynik_index.hpp b/words/sloynik_index.hpp
new file mode 100644
index 0000000000..192a1bddae
--- /dev/null
+++ b/words/sloynik_index.hpp
@@ -0,0 +1,75 @@
+#pragma once
+#include "common.hpp"
+#include "dictionary.hpp"
+#include "../coding_sloynik/polymorph_reader.hpp"
+#include "../coding/dd_vector.hpp"
+#include "../coding/file_reader.hpp"
+#include "../base/base.hpp"
+#include "../std/function.hpp"
+#include "../std/scoped_ptr.hpp"
+#include "../std/utility.hpp"
+#include "../std/vector.hpp"
+
+class Reader;
+class Writer;
+
+namespace sl
+{
+
+// Sorted index of keys.
+// Two locale-aware comparators are used: primary and secondary.
+// Secondary comporator is only used to compare keys, which are equivalent
+// according to primary comparator.
+// For search only primary comporator is used.
+// Do determine the order of keys to display, primary + secondary comporators are used.
+// To collect articles for the same key, primary + secondary comporators are used.
+class SortedIndex
+{
+public:
+ typedef uint32_t Pos;
+
+ SortedIndex(Dictionary const & dictionary, Reader const * pIndexReader, StrFn const & strFn);
+
+ // Id of the smallest key that is equal or grater that prefix.
+ Pos PrefixSearch(string const & prefix);
+
+ Dictionary::Id KeyIdByPos(Pos pos) const;
+
+ Dictionary const & GetDictionary() const;
+
+ // Build index.
+ static void Build(Dictionary const & dictionary,
+ StrFn const & strFn,
+ string const & indexPathPrefix);
+private:
+
+#pragma pack(push, 1)
+ class DicId
+ {
+ public:
+ DicId() : m_Lo(0), m_Mi(0), m_Hi(0) {}
+ DicId(uint32_t x) : m_Lo(x & 0xFF), m_Mi((x >> 8) & 0xFF), m_Hi((x >> 16) & 0xFF)
+ {
+ ASSERT_LESS_OR_EQUAL(x, 0xFFFFFF, ());
+ }
+
+ uint32_t Value() const
+ {
+ return (uint32_t(m_Hi) << 16) | (uint32_t(m_Mi) << 8) | m_Lo;
+ }
+
+ private:
+ uint8_t m_Lo, m_Mi, m_Hi;
+ };
+#pragma pack(pop)
+ friend class LessRefProxy;
+
+ typedef DDVector<DicId, PolymorphReader> DDVectorType;
+
+ Dictionary const & m_Dictionary; // Dictionary.
+ StrFn m_StrFn; // Primary and secondary comparison functions.
+ DDVectorType m_SortedVector; // Sorted array of key ids.
+};
+
+
+}
diff --git a/words/words.pro b/words/words.pro
new file mode 100644
index 0000000000..4e848b4a59
--- /dev/null
+++ b/words/words.pro
@@ -0,0 +1,26 @@
+TARGET = words
+TEMPLATE = lib
+CONFIG += staticlib
+
+SLOYNIK_DIR = ..
+DEPENDENCIES = bzip2 zlib base coding coding_sloynik
+
+include($$SLOYNIK_DIR/sloynik_common.pri)
+
+HEADERS += \
+ common.hpp \
+ dictionary.hpp \
+ slof.hpp \
+ slof_dictionary.hpp \
+ sloynik_engine.hpp \
+ sloynik_index.hpp
+
+SOURCES += \
+ slof_dictionary.cpp \
+ sloynik_engine.cpp \
+ sloynik_index.cpp
+
+OTHER_FILES += \
+ ../bugs.txt
+
+
diff --git a/words/words_tests/dictionary_mock.hpp b/words/words_tests/dictionary_mock.hpp
new file mode 100644
index 0000000000..aab8161084
--- /dev/null
+++ b/words/words_tests/dictionary_mock.hpp
@@ -0,0 +1,27 @@
+#pragma once
+#include "../dictionary.hpp"
+#include "../../base/assert.hpp"
+#include "../../std/string.hpp"
+#include "../../std/vector.hpp"
+
+class DictionaryMock : public sl::Dictionary
+{
+public:
+ void Add(string const & key, string const & article)
+ {
+ m_Keys.push_back(key);
+ m_Articles.push_back(article);
+ }
+
+ Id KeyCount() const
+ {
+ ASSERT_EQUAL(m_Keys.size(), m_Articles.size(), ());
+ return m_Keys.size();
+ }
+
+ void KeyById(Id id, string & key) const { key = m_Keys[id]; }
+ void ArticleById(Id id, string & article) const { article = m_Articles[id]; }
+
+private:
+ vector<string> m_Keys, m_Articles;
+};
diff --git a/words/words_tests/sorted_index_test.cpp b/words/words_tests/sorted_index_test.cpp
new file mode 100644
index 0000000000..65c03448db
--- /dev/null
+++ b/words/words_tests/sorted_index_test.cpp
@@ -0,0 +1,125 @@
+#include "../../testing/testing.hpp"
+#include "../sloynik_index.hpp"
+#include "dictionary_mock.hpp"
+#include "../../std/algorithm.hpp"
+#include "../../std/bind.hpp"
+#include "../../std/map.hpp"
+#include "../../std/set.hpp"
+#include "../../std/string.hpp"
+#include "../../std/vector.hpp"
+
+namespace
+{
+ set<sl::StrFn::Str const *> g_AllocatedStrSet;
+
+ sl::StrFn::Str const * StrCreate(char const * utf8Data, uint32_t size)
+ {
+ sl::StrFn::Str const * res = reinterpret_cast<sl::StrFn::Str *>(new string(utf8Data, size));
+ CHECK(g_AllocatedStrSet.insert(res).second, ());
+ return res;
+ }
+
+ void StrDestroy(sl::StrFn::Str const * s)
+ {
+ if (s)
+ {
+ CHECK(g_AllocatedStrSet.count(s), ());
+ g_AllocatedStrSet.erase(s);
+ delete reinterpret_cast<string const *>(s);
+ }
+ }
+
+ int StrSecondaryCompare(void *, sl::StrFn::Str const * pa, sl::StrFn::Str const * pb)
+ {
+ string const & a = *reinterpret_cast<string const *>(pa);
+ string const & b = *reinterpret_cast<string const *>(pb);
+ return a == b ? 0 : (a < b ? -1 : 1);
+ }
+
+ int StrPrimaryCompare(void *, sl::StrFn::Str const * pa, sl::StrFn::Str const * pb)
+ {
+ string s1(*reinterpret_cast<string const *>(pa));
+ string s2(*reinterpret_cast<string const *>(pb));
+ std::use_facet<std::ctype<char> >(std::locale()).tolower(&s1[0], &s1[0] + s1.size());
+ std::use_facet<std::ctype<char> >(std::locale()).tolower(&s2[0], &s2[0] + s2.size());
+ return s1 == s2 ? 0 : (s1 < s2 ? -1 : 1);
+ }
+
+ sl::StrFn StrFnForTest()
+ {
+ sl::StrFn strFn;
+ strFn.Create = StrCreate;
+ strFn.Destroy = StrDestroy;
+ strFn.PrimaryCompare = StrPrimaryCompare;
+ strFn.SecondaryCompare = StrSecondaryCompare;
+ strFn.m_PrimaryCompareId = 1;
+ strFn.m_SecondaryCompareId = 2;
+ strFn.m_pData = NULL;
+ return strFn;
+ }
+
+ void PushBackArticleIntoVector(vector<string> & v,
+ sl::Dictionary const * pDic,
+ sl::Dictionary::Id id)
+ {
+ v.push_back("");
+ pDic->ArticleById(id, v.back());
+ }
+
+ string KeyByIndexId(sl::SortedIndex const & idx, sl::SortedIndex::Pos id)
+ {
+ string key;
+ idx.GetDictionary().KeyById(idx.KeyIdByPos(id), key);
+ return key;
+ }
+
+ string ArticleByIndexId(sl::SortedIndex const & idx, sl::SortedIndex::Pos id)
+ {
+ string article;
+ idx.GetDictionary().ArticleById(idx.KeyIdByPos(id), article);
+ return article;
+ }
+
+ void SetupDictionary(DictionaryMock & dictionary)
+ {
+ dictionary.Add("Hello", "Hello0"); // 0
+ dictionary.Add("hello", "hello0"); // 1
+ dictionary.Add("World", "World0"); // 2
+ dictionary.Add("He", "He0"); // 3
+ dictionary.Add("abc", "abc1"); // 4
+ }
+}
+
+UNIT_TEST(SortedIndex_Smoke)
+{
+ string const & filePrefix = "sorted_index_smoke_test";
+ DictionaryMock dictionary;
+ SetupDictionary(dictionary);
+ sl::StrFn strFn = StrFnForTest();
+ sl::SortedIndex::Build(dictionary, strFn, filePrefix);
+ sl::SortedIndex idx(dictionary, new FileReader(filePrefix + ".idx"), strFn);
+ TEST_EQUAL(dictionary.KeyCount(), 5, ());
+ TEST_EQUAL(KeyByIndexId(idx, 0), "abc", ());
+ TEST_EQUAL(KeyByIndexId(idx, 1), "He", ());
+ TEST_EQUAL(KeyByIndexId(idx, 2), "Hello", ());
+ TEST_EQUAL(KeyByIndexId(idx, 3), "hello", ());
+ TEST_EQUAL(KeyByIndexId(idx, 4), "World", ());
+ TEST_EQUAL(ArticleByIndexId(idx, 0), "abc1", ());
+ TEST_EQUAL(ArticleByIndexId(idx, 1), "He0", ());
+ TEST_EQUAL(ArticleByIndexId(idx, 2), "Hello0", ());
+ TEST_EQUAL(ArticleByIndexId(idx, 3), "hello0", ());
+ TEST_EQUAL(ArticleByIndexId(idx, 4), "World0", ());
+ TEST_EQUAL(idx.PrefixSearch(""), 0, ());
+ TEST_EQUAL(idx.PrefixSearch("h"), 1, ());
+ TEST_EQUAL(idx.PrefixSearch("H"), 1, ());
+ TEST_EQUAL(idx.PrefixSearch("He"), 1, ());
+ TEST_EQUAL(idx.PrefixSearch("he"), 1, ());
+ TEST_EQUAL(idx.PrefixSearch("hea"), 2, ());
+ TEST_EQUAL(idx.PrefixSearch("hel"), 2, ());
+ TEST_EQUAL(idx.PrefixSearch("Hello"), 2, ());
+ TEST_EQUAL(idx.PrefixSearch("W"), 4, ());
+ TEST_EQUAL(idx.PrefixSearch("zzz"), 5, ());
+ remove((filePrefix + ".idx").c_str());
+ remove((filePrefix + ".idx").c_str());
+ TEST(g_AllocatedStrSet.empty(), ());
+}
diff --git a/words/words_tests/words_tests.pro b/words/words_tests/words_tests.pro
new file mode 100644
index 0000000000..f1ca3ed47e
--- /dev/null
+++ b/words/words_tests/words_tests.pro
@@ -0,0 +1,15 @@
+TARGET = words_tests
+TEMPLATE = app
+CONFIG += console
+CONFIG -= app_bundle
+
+SLOYNIK_DIR = ../..
+DEPENDENCIES = bzip2 zlib base coding coding_sloynik words
+
+include($$SLOYNIK_DIR/sloynik_common.pri)
+
+SOURCES += $$SLOYNIK_DIR/testing/testingmain.cpp \
+ sorted_index_test.cpp \
+
+HEADERS += \
+ dictionary_mock.hpp \