Add sloynik to omim!! Yea!

author: Yury Melnichek <melnichek@gmail.com> 2011-03-06 01:12:21 +0300
committer: Alex Zolotarev <alex@maps.me> 2015-09-23 01:12:57 +0300
commit: 5d9239a1edede8693c69119e282017d965d2c326 (patch)
tree: 2ac33f5cf71ee8f3814de8f2dfe3688e8c3820e1 /words
parent: a5ad66614265259763a1ed849db7ae8e918e8a3c (diff)
13 files changed, 851 insertions, 0 deletions
diff --git a/words/common.hpp b/words/common.hpp
new file mode 100644
index 0000000000..142379beeb
--- /dev/null
+++ b/words/common.hpp
@@ -0,0 +1,21 @@
+#pragma once
+#include "../base/base.hpp"
+
+namespace sl
+{
+
+struct StrFn
+{
+  struct Str; // Never defined.
+
+  Str const * (* Create)(char const * utf8Data, uint32_t size);
+  void (* Destroy)(Str const * s);
+  int (* PrimaryCompare)(void * pData, Str const * a, Str const * b);
+  int (* SecondaryCompare)(void * pData, Str const * a, Str const * b);
+
+  void * m_pData;
+  uint64_t m_PrimaryCompareId;
+  uint64_t m_SecondaryCompareId;
+};
+
+}
diff --git a/words/dictionary.hpp b/words/dictionary.hpp
new file mode 100644
index 0000000000..25ffe978f0
--- /dev/null
+++ b/words/dictionary.hpp
@@ -0,0 +1,34 @@
+#pragma once
+#include "../base/base.hpp"
+#include "../std/string.hpp"
+#include "../base/base.hpp"
+#include "../base/exception.hpp"
+#include "../std/utility.hpp"
+
+namespace sl
+{
+
+class Dictionary
+{
+public:
+  DECLARE_EXCEPTION(Exception, RootException);
+  DECLARE_EXCEPTION(OpenDictionaryException, Exception);
+  DECLARE_EXCEPTION(OpenDictionaryNewerVersionException, Exception);
+  DECLARE_EXCEPTION(BrokenDictionaryException, Exception);
+
+  // Key id, which is 0-based index of key in sorted order.
+  typedef uint32_t Id;
+
+  virtual ~Dictionary() {}
+
+  // Number of keys in the dictionary.
+  virtual Id KeyCount() const = 0;
+
+  // Get key by id.
+  virtual void KeyById(Id id, string & key) const = 0;
+
+  // Get article by id.
+  virtual void ArticleById(Id id, string & article) const = 0;
+};
+
+}
diff --git a/words/slof.hpp b/words/slof.hpp
new file mode 100644
index 0000000000..b2edc4e809
--- /dev/null
+++ b/words/slof.hpp
@@ -0,0 +1,43 @@
+#pragma once
+// slof = sloynik format.
+//
+// VU = VarUint.
+//
+// Offset:               Format:
+//
+// 0                     [Header]
+// m_ArticleOffset       [VU ChunkSize] [Compressed article chunk 0]
+//                       ..
+//                       [VU ChunkSize] [Compressed article chunk n-1]
+// m_KeyDataOffset       [VU KeySize] [Key 0]
+//                       ..
+//                       [VU KeySize] [Key m_KeyCount-1]
+// m_KeyArticleIdOffset  [VU article 0 chunk offset]                [VU article 0 number in chunk]
+//                       ..
+//                       [VU article m_ArticleCount-1 chunk offset] [VU article * number in chunk]
+//
+// ArticleId = ([Offset of article chunk] << 24) + [Offset of article in uncompressed chunk]
+
+#include "../coding/endianness.hpp"
+#include "../base/base.hpp"
+
+namespace sl
+{
+
+#pragma pack(push, 1)
+struct SlofHeader
+{
+  uint32_t m_Signature;           // = "slof"
+  uint16_t m_MajorVersion;        // Major version. Changing it breaks compatibility.
+  uint16_t m_MinorVersion;        // Minor version. Changing it doesn't break compatibility.
+
+  uint32_t m_KeyCount;            // Number of keys.
+  uint32_t m_ArticleCount;        // Number of articles.
+
+  uint64_t m_KeyIndexOffset;      // Offset of key cummulative lengthes.
+  uint64_t m_KeyDataOffset;       // Offset of key data.
+  uint64_t m_ArticleOffset;       // Offset of article data.
+};
+#pragma pack(pop)
+
+}
diff --git a/words/slof_dictionary.cpp b/words/slof_dictionary.cpp
new file mode 100644
index 0000000000..bf1ff8543e
--- /dev/null
+++ b/words/slof_dictionary.cpp
@@ -0,0 +1,99 @@
+#include "slof_dictionary.hpp"
+#include "../coding_sloynik/bzip2_compressor.hpp"
+#include "../coding/byte_stream.hpp"
+#include "../coding/endianness.hpp"
+#include "../coding/reader.hpp"
+#include "../coding/varint.hpp"
+#include "../base/logging.hpp"
+#include "../base/base.hpp"
+#include "../std/utility.hpp"
+
+namespace
+{
+  char const * SkipVarUint(char const * p)
+  {
+    while ((*p) & 128)
+      ++p;
+    return ++p;
+  }
+}
+
+sl::SlofDictionary::SlofDictionary(Reader const * pReader)
+      : m_pReader(pReader), m_Decompressor(&DecompressBZip2IntoFixedSize)
+{
+  Init();
+}
+
+sl::SlofDictionary::SlofDictionary(
+    Reader const * pReader, function<void (const char *, size_t, char *, size_t)> decompressor)
+      : m_pReader(pReader), m_Decompressor(decompressor)
+{
+  Init();
+}
+
+void sl::SlofDictionary::Init()
+{
+  m_pReader->Read(0, &m_Header, sizeof(m_Header));
+  if (m_Header.m_MajorVersion != 1)
+    MYTHROW(OpenDictionaryNewerVersionException, (m_Header.m_MajorVersion));
+}
+
+sl::SlofDictionary::~SlofDictionary()
+{
+}
+
+sl::Dictionary::Id sl::SlofDictionary::KeyCount() const
+{
+  return m_Header.m_KeyCount;
+}
+
+void sl::SlofDictionary::ReadKeyData(sl::Dictionary::Id id, string & data) const
+{
+  pair<uint32_t, uint32_t> offsets;
+  m_pReader->Read(m_Header.m_KeyIndexOffset + id * 4, &offsets, 8);
+  offsets.first = SwapIfBigEndian(offsets.first);
+  offsets.second = SwapIfBigEndian(offsets.second);
+  // Add 2 trailing zeroes, to be sure that reading varint doesn't hang if file is corrupted.
+  data.resize(offsets.second - offsets.first + 2);
+  m_pReader->Read(m_Header.m_KeyDataOffset + offsets.first,
+                  &data[0], offsets.second - offsets.first);
+}
+
+void sl::SlofDictionary::KeyById(sl::Dictionary::Id id, string & key) const
+{
+  string keyData;
+  ReadKeyData(id, keyData);
+  char const * pBeg = SkipVarUint(SkipVarUint(&keyData[0]));
+  char const * pLast = &keyData[keyData.size() - 1];
+  // ReadKeyData adds trailing zeroes, so that reading VarUint doesn't hang up in case of
+  // corrupted data. Strip them.
+  while (pLast >= pBeg && *pLast == 0)
+    --pLast;
+  key.assign(pBeg, pLast + 1);
+}
+
+void sl::SlofDictionary::ArticleById(sl::Dictionary::Id id, string & article) const
+{
+  string keyData;
+  ReadKeyData(id, keyData);
+  ArrayByteSource keyDataSource(&keyData[0]);
+  uint64_t const articleChunkPos = ReadVarUint<uint64_t>(keyDataSource);
+  uint32_t const articleNumInChunk = ReadVarUint<uint32_t>(keyDataSource);
+
+  uint32_t const chunkSize =
+      ReadPrimitiveFromPos<uint32_t>(*m_pReader, m_Header.m_ArticleOffset + articleChunkPos);
+  string chunk(chunkSize, 0);
+  m_pReader->Read(m_Header.m_ArticleOffset + articleChunkPos + 4, &chunk[0], chunkSize);
+  ArrayByteSource chunkSource(&chunk[0]);
+  uint32_t chunkHeaderSize = ReadVarUint<uint32_t>(chunkSource);
+  chunkHeaderSize += static_cast<char const *>(chunkSource.Ptr()) - &chunk[0];
+  uint32_t const decompressedChunkSize = ReadVarUint<uint32_t>(chunkSource);
+  uint32_t articleBegInChunk = 0;
+  for (uint32_t i = 0; i < articleNumInChunk; ++i)
+    articleBegInChunk += ReadVarUint<uint32_t>(chunkSource);
+  uint32_t const articleSizeInChunk = ReadVarUint<uint32_t>(chunkSource);
+  vector<char> decompressedChunk(decompressedChunkSize);
+  m_Decompressor(&chunk[chunkHeaderSize], chunkSize - chunkHeaderSize,
+                 &decompressedChunk[0], decompressedChunkSize);
+  article.assign(&decompressedChunk[articleBegInChunk], articleSizeInChunk);
+}
diff --git a/words/slof_dictionary.hpp b/words/slof_dictionary.hpp
new file mode 100644
index 0000000000..9c9fc00087
--- /dev/null
+++ b/words/slof_dictionary.hpp
@@ -0,0 +1,32 @@
+#pragma once
+#include "dictionary.hpp"
+#include "slof.hpp"
+#include "../std/function.hpp"
+#include "../std/scoped_ptr.hpp"
+
+class Reader;
+
+namespace sl
+{
+
+class SlofDictionary : public Dictionary
+{
+public:
+  // Takes ownership of pReader and deletes it, even if exception is thrown.
+  explicit SlofDictionary(Reader const * pReader);
+  // Takes ownership of pReader and deletes it, even if exception is thrown.
+  SlofDictionary(Reader const * pReader,
+                 function<void (char const *, size_t, char *, size_t)> decompressor);
+  ~SlofDictionary();
+  Id KeyCount() const;
+  void KeyById(Id id, string & key) const;
+  void ArticleById(Id id, string & article) const;
+private:
+  void Init();
+  void ReadKeyData(Id id, string & data) const;
+  scoped_ptr<Reader const> m_pReader;
+  function<void (char const *, size_t, char *, size_t)> m_Decompressor;
+  SlofHeader m_Header;
+};
+
+}
diff --git a/words/sloynik_engine.cpp b/words/sloynik_engine.cpp
new file mode 100644
index 0000000000..f7e19811e9
--- /dev/null
+++ b/words/sloynik_engine.cpp
@@ -0,0 +1,72 @@
+#include "sloynik_engine.hpp"
+#include "slof_dictionary.hpp"
+#include "sloynik_index.hpp"
+#include "../coding/file_reader.hpp"
+#include "../coding/file_writer.hpp"
+#include "../base/logging.hpp"
+#include "../base/macros.hpp"
+
+sl::SloynikEngine::SloynikEngine(string const & dictionaryPath,
+                                 string const & indexPath,
+                                 StrFn const & strFn)
+{
+  m_pDictionary.reset(new sl::SlofDictionary(new FileReader(dictionaryPath)));
+  vector<uint64_t> stamp;
+  stamp.push_back(strFn.m_PrimaryCompareId);
+  stamp.push_back(strFn.m_SecondaryCompareId);
+  string const stampPath = indexPath + ".stamp";
+  bool needIndexBuild = false;
+  try
+  {
+    FileReader stampReader(stampPath);
+    if (stampReader.Size() != stamp.size() * sizeof(stamp[0]))
+      needIndexBuild = true;
+    else
+    {
+      vector<uint64_t> currentStamp(stamp.size());
+      stampReader.Read(0, &currentStamp[0], stamp.size() * sizeof(stamp[0]));
+      if (currentStamp != stamp)
+        needIndexBuild = true;
+    }
+  }
+  catch (RootException &)
+  {
+    needIndexBuild = true;
+  }
+  LOG(LINFO, ("Started sloynik engine. Words in the dictionary:", m_pDictionary->KeyCount()));
+  // Uncomment to always rebuild the index: needIndexBuild = true;
+  if (needIndexBuild)
+  {
+    FileWriter::DeleteFile(stampPath);
+    sl::SortedIndex::Build(*m_pDictionary, strFn, indexPath);
+
+    FileWriter stampWriter(stampPath);
+    stampWriter.Write(&stamp[0], stamp.size() * sizeof(stamp[0]));
+  }
+  m_pIndexReader.reset(new FileReader(indexPath + ".idx"));
+  m_pSortedIndex.reset(new sl::SortedIndex(*m_pDictionary, m_pIndexReader.get(), strFn));
+}
+
+sl::SloynikEngine::~SloynikEngine()
+{
+}
+
+void sl::SloynikEngine::Search(string const & prefix, sl::SloynikEngine::SearchResult & res) const
+{
+  res.m_FirstMatched = m_pSortedIndex->PrefixSearch(prefix);
+}
+
+void sl::SloynikEngine::GetWordInfo(WordId word, sl::SloynikEngine::WordInfo & res) const
+{
+  m_pDictionary->KeyById(m_pSortedIndex->KeyIdByPos(word), res.m_Word);
+}
+
+uint32_t sl::SloynikEngine::WordCount() const
+{
+  return m_pDictionary->KeyCount();
+}
+
+void sl::SloynikEngine::GetArticleData(WordId wordId, ArticleData & data) const
+{
+  m_pDictionary->ArticleById(m_pSortedIndex->KeyIdByPos(wordId), data.m_HTML);
+}
diff --git a/words/sloynik_engine.hpp b/words/sloynik_engine.hpp
new file mode 100644
index 0000000000..9e83686db8
--- /dev/null
+++ b/words/sloynik_engine.hpp
@@ -0,0 +1,62 @@
+#pragma once
+#include "common.hpp"
+#include "../base/base.hpp"
+#include "../std/function.hpp"
+#include "../std/scoped_ptr.hpp"
+#include "../std/string.hpp"
+#include "../std/utility.hpp"
+
+class FileReader;
+
+namespace sl
+{
+
+class Dictionary;
+class SortedIndex;
+
+class SloynikEngine
+{
+public:
+  SloynikEngine(string const & dictionaryPath,
+                string const & indexPath,
+                sl::StrFn const & strFn);
+
+  ~SloynikEngine();
+
+  typedef uint32_t WordId;
+
+  struct WordInfo
+  {
+    string m_Word;
+  };
+
+  void GetWordInfo(WordId word, WordInfo & res) const;
+
+  struct SearchResult
+  {
+    WordId m_FirstMatched;
+  };
+
+  void Search(string const & prefix, SearchResult & res) const;
+
+  uint32_t WordCount() const;
+
+  struct ArticleData
+  {
+    string m_HTML;
+
+    void swap(ArticleData & o)
+    {
+      m_HTML.swap(o.m_HTML);
+    }
+  };
+
+  void GetArticleData(WordId word, ArticleData & data) const;
+
+private:
+  scoped_ptr<sl::Dictionary> m_pDictionary;
+  scoped_ptr<FileReader> m_pIndexReader;
+  scoped_ptr<sl::SortedIndex> m_pSortedIndex;
+};
+
+}
diff --git a/words/sloynik_index.cpp b/words/sloynik_index.cpp
new file mode 100644
index 0000000000..ff81a7a84d
--- /dev/null
+++ b/words/sloynik_index.cpp
@@ -0,0 +1,220 @@
+#include "sloynik_index.hpp"
+#include "dictionary.hpp"
+#include "../coding/file_writer.hpp"
+#include "../base/assert.hpp"
+#include "../base/logging.hpp"
+#include "../base/cache.hpp"
+#include "../std/bind.hpp"
+
+extern "C" {
+  #include "../coding_sloynik/timsort/timsort.h"
+}
+
+
+#define FILE_SORTER_LOG_BATCH_SIZE 11
+
+sl::SortedIndex::SortedIndex(Dictionary const & dictionary,
+                             Reader const * pIndexReader,
+                             StrFn const & strFn)
+  : m_Dictionary(dictionary), m_StrFn(strFn),
+    m_SortedVector(PolymorphReader(pIndexReader), pIndexReader->Size() / sizeof(DicId))
+{
+  STATIC_ASSERT(sizeof(sl::SortedIndex::DicId) == 3);
+}
+
+sl::Dictionary const & sl::SortedIndex::GetDictionary() const
+{
+  return m_Dictionary;
+}
+
+sl::Dictionary::Id sl::SortedIndex::KeyIdByPos(Pos pos) const
+{
+  return m_SortedVector[pos].Value();
+}
+
+namespace sl
+{
+namespace impl
+{
+
+  #define DIC_ID_MAGIC_EMPTY_VALUE 0xFFFFFF
+
+  // Compare 2 values of Dictionary::Id using comparator provided.
+  // A special value of Dictionary::Id(DIC_ID_MAGIC_EMPTY_VALUE) is used
+  // to compare with a string provided in constructor.
+  template <bool bFullCompare>
+  class DictCollIdLess
+  {
+  private:
+    DictCollIdLess(DictCollIdLess<bFullCompare> const &);
+    DictCollIdLess<bFullCompare> & operator = (DictCollIdLess<bFullCompare> const &);
+  public:
+    DictCollIdLess(sl::Dictionary const & dic, sl::StrFn const & strFn)
+    : m_Dic(dic), m_StrFn(strFn), m_Special(NULL), m_Cache(FILE_SORTER_LOG_BATCH_SIZE + 1)
+    {
+    }
+
+    DictCollIdLess(sl::Dictionary const & dic, sl::StrFn const & strFn, string const & special)
+    : m_Dic(dic), m_StrFn(strFn), m_Special(strFn.Create(&special[0], special.size())), m_Cache(3)
+    {
+    }
+
+    ~DictCollIdLess()
+    {
+      if (m_Special)
+        m_StrFn.Destroy(m_Special);
+      m_Cache.ForEachValue(m_StrFn.Destroy);
+    }
+
+    int Compare(uint32_t const id1, uint32_t const id2)
+    {
+      if (id1 == id2)
+        return 0;
+      sl::StrFn::Str const * key1, * key2;
+      bool bDeleteKey1 = false;
+      KeysById(id1, id2, key1, key2, bDeleteKey1);
+      int const primaryRes = m_StrFn.PrimaryCompare(m_StrFn.m_pData, key1, key2);
+      if (bFullCompare && primaryRes == 0)
+      {
+        int const secondaryRes = m_StrFn.SecondaryCompare(m_StrFn.m_pData, key1, key2);
+        if (bDeleteKey1)
+          m_StrFn.Destroy(key1);
+        return secondaryRes;
+      }
+      else
+      {
+        if (bDeleteKey1)
+          m_StrFn.Destroy(key1);
+        return primaryRes;
+      }
+    }
+
+    bool operator () (uint32_t const id1, uint32_t const id2)
+    {
+      return Compare(id1, id2) < 0;
+    }
+
+  private:
+
+    inline void KeysById(uint32_t const id1, uint32_t const id2,
+                         sl::StrFn::Str const * & key1,
+                         sl::StrFn::Str const * & key2,
+                         bool & bDeleteKey1)
+    {
+      bDeleteKey1 = false;
+
+      if (id1 == DIC_ID_MAGIC_EMPTY_VALUE)
+        key1 = m_Special;
+      else
+      {
+        bool found = false;
+        sl::StrFn::Str const * & s = m_Cache.Find(id1, found);
+        if (!found)
+        {
+          if (s != NULL)
+            m_StrFn.Destroy(s);
+          s = CreateKeyById(id1);
+        }
+        key1 = s;
+      }
+
+      if (id2 == DIC_ID_MAGIC_EMPTY_VALUE)
+        key2 = m_Special;
+      else
+      {
+        bool found = false;
+        sl::StrFn::Str const * & s = m_Cache.Find(id2, found);
+        if (!found)
+        {
+          if (s != NULL)
+          {
+            if (s == key1)
+              bDeleteKey1 = true;
+            else
+              m_StrFn.Destroy(s);
+          }
+          s = CreateKeyById(id2);
+        }
+        key2 = s;
+      }
+    }
+
+    inline sl::StrFn::Str const * CreateKeyById(uint32_t const id)
+    {
+      ASSERT_LESS(id, DIC_ID_MAGIC_EMPTY_VALUE, ());
+      string keyStr;
+      m_Dic.KeyById(id, keyStr);
+      return m_StrFn.Create(&keyStr[0], keyStr.size());
+    }
+
+    sl::Dictionary const & m_Dic;
+    sl::StrFn m_StrFn;
+    sl::StrFn::Str const * m_Special;
+    my::Cache<uint32_t, sl::StrFn::Str const *> m_Cache;
+  };
+
+  // Hack: global data, used for sorting with c-style timsort().
+  static DictCollIdLess<true> * g_pSortedIndexDictCollIdLessForBuild = NULL;
+
+  template <class T> struct LessRefProxy
+  {
+    T & m_F;
+    explicit LessRefProxy(T & f) : m_F(f) {}
+    inline bool operator () (SortedIndex::DicId a, SortedIndex::DicId b) const
+    {
+      return m_F(a.Value(), b.Value());
+    }
+
+    // Hack: used for sorting with c-style timsort().
+    static int GlobalCompareForSort(void const * pVoidA, void const * pVoidB)
+    {
+      SortedIndex::DicId const * pA = static_cast<SortedIndex::DicId const *>(pVoidA);
+      SortedIndex::DicId const * pB = static_cast<SortedIndex::DicId const *>(pVoidB);
+      return g_pSortedIndexDictCollIdLessForBuild->Compare(pA->Value(), pB->Value());
+    }
+  };
+  template <class T> LessRefProxy<T> MakeLessRefProxy(T & f) { return LessRefProxy<T>(f); }
+
+} // namespace impl
+} // namespace sl
+
+sl::SortedIndex::Pos sl::SortedIndex::PrefixSearch(string const & prefix)
+{
+  impl::DictCollIdLess<false> compareLess(m_Dictionary, m_StrFn, prefix);
+  return lower_bound(m_SortedVector.begin(), m_SortedVector.end(), DIC_ID_MAGIC_EMPTY_VALUE,
+                     MakeLessRefProxy(compareLess))
+         - m_SortedVector.begin();
+}
+
+void sl::SortedIndex::Build(sl::Dictionary const & dictionary,
+                            StrFn const & strFn,
+                            string const & indexPathPrefix)
+{
+  LOG(LINFO, ("Building sorted index."));
+
+  // Initializing.
+  vector<DicId> ids(dictionary.KeyCount());
+  for (size_t i = 0; i < ids.size(); ++i)
+    ids[i] = i;
+
+  // Sorting.
+  {
+    sl::impl::DictCollIdLess<true> compareLess(dictionary, strFn);
+
+    sl::impl::g_pSortedIndexDictCollIdLessForBuild = &compareLess;
+    timsort(&ids[0], ids.size(), sizeof(ids[0]),
+            &sl::impl::LessRefProxy<sl::impl::DictCollIdLess<true> >::GlobalCompareForSort);
+    sl::impl::g_pSortedIndexDictCollIdLessForBuild = NULL;
+
+    // sort() or stable_sort() are nice but slow.
+    // sort()            150 milliseconds
+    // stable_sort()      50 milliseconds
+    // timsort()           9 milliseconds
+    // stable_sort(ids.begin(), ids.end(), MakeLessRefProxy(compareLess));
+  }
+
+  FileWriter idxWriter((indexPathPrefix + ".idx").c_str());
+  idxWriter.Write(&ids[0], ids.size() * sizeof(ids[0]));
+
+  LOG(LINFO, ("Building sorted index done."));
+}
diff --git a/words/sloynik_index.hpp b/words/sloynik_index.hpp
new file mode 100644
index 0000000000..192a1bddae
--- /dev/null
+++ b/words/sloynik_index.hpp
@@ -0,0 +1,75 @@
+#pragma once
+#include "common.hpp"
+#include "dictionary.hpp"
+#include "../coding_sloynik/polymorph_reader.hpp"
+#include "../coding/dd_vector.hpp"
+#include "../coding/file_reader.hpp"
+#include "../base/base.hpp"
+#include "../std/function.hpp"
+#include "../std/scoped_ptr.hpp"
+#include "../std/utility.hpp"
+#include "../std/vector.hpp"
+
+class Reader;
+class Writer;
+
+namespace sl
+{
+
+// Sorted index of keys.
+// Two locale-aware comparators are used: primary and secondary.
+// Secondary comporator is only used to compare keys, which are equivalent
+// according to primary comparator.
+// For search only primary comporator is used.
+// Do determine the order of keys to display, primary + secondary comporators are used.
+// To collect articles for the same key, primary + secondary comporators are used.
+class SortedIndex
+{
+public:
+  typedef uint32_t Pos;
+
+  SortedIndex(Dictionary const & dictionary, Reader const * pIndexReader, StrFn const & strFn);
+
+  // Id of the smallest key that is equal or grater that prefix.
+  Pos PrefixSearch(string const & prefix);
+
+  Dictionary::Id KeyIdByPos(Pos pos) const;
+
+  Dictionary const & GetDictionary() const;
+
+  // Build index.
+  static void Build(Dictionary const & dictionary,
+                    StrFn const & strFn,
+                    string const & indexPathPrefix);
+private:
+
+#pragma pack(push, 1)
+  class DicId
+  {
+  public:
+    DicId() : m_Lo(0), m_Mi(0), m_Hi(0) {}
+    DicId(uint32_t x) : m_Lo(x & 0xFF), m_Mi((x >> 8) & 0xFF), m_Hi((x >> 16) & 0xFF)
+    {
+      ASSERT_LESS_OR_EQUAL(x, 0xFFFFFF, ());
+    }
+
+    uint32_t Value() const
+    {
+      return (uint32_t(m_Hi) << 16) | (uint32_t(m_Mi) << 8) | m_Lo;
+    }
+
+  private:
+    uint8_t m_Lo, m_Mi, m_Hi;
+  };
+#pragma pack(pop)
+  friend class LessRefProxy;
+
+  typedef DDVector<DicId, PolymorphReader> DDVectorType;
+
+  Dictionary const & m_Dictionary;  // Dictionary.
+  StrFn m_StrFn;                    // Primary and secondary comparison functions.
+  DDVectorType m_SortedVector;      // Sorted array of key ids.
+};
+
+
+}
diff --git a/words/words.pro b/words/words.pro
new file mode 100644
index 0000000000..4e848b4a59
--- /dev/null
+++ b/words/words.pro
@@ -0,0 +1,26 @@
+TARGET = words
+TEMPLATE = lib
+CONFIG += staticlib
+
+SLOYNIK_DIR = ..
+DEPENDENCIES = bzip2 zlib base coding coding_sloynik
+
+include($$SLOYNIK_DIR/sloynik_common.pri)
+
+HEADERS += \
+  common.hpp \
+  dictionary.hpp \
+  slof.hpp \
+  slof_dictionary.hpp \
+  sloynik_engine.hpp \
+  sloynik_index.hpp
+
+SOURCES += \
+  slof_dictionary.cpp \
+  sloynik_engine.cpp \
+  sloynik_index.cpp
+
+OTHER_FILES += \
+  ../bugs.txt
+
+
diff --git a/words/words_tests/dictionary_mock.hpp b/words/words_tests/dictionary_mock.hpp
new file mode 100644
index 0000000000..aab8161084
--- /dev/null
+++ b/words/words_tests/dictionary_mock.hpp
@@ -0,0 +1,27 @@
+#pragma once
+#include "../dictionary.hpp"
+#include "../../base/assert.hpp"
+#include "../../std/string.hpp"
+#include "../../std/vector.hpp"
+
+class DictionaryMock : public sl::Dictionary
+{
+public:
+  void Add(string const & key, string const & article)
+  {
+    m_Keys.push_back(key);
+    m_Articles.push_back(article);
+  }
+
+  Id KeyCount() const
+  {
+    ASSERT_EQUAL(m_Keys.size(), m_Articles.size(), ());
+    return m_Keys.size();
+  }
+
+  void KeyById(Id id, string & key) const { key = m_Keys[id]; }
+  void ArticleById(Id id, string & article) const { article = m_Articles[id]; }
+
+private:
+  vector<string> m_Keys, m_Articles;
+};
diff --git a/words/words_tests/sorted_index_test.cpp b/words/words_tests/sorted_index_test.cpp
new file mode 100644
index 0000000000..65c03448db
--- /dev/null
+++ b/words/words_tests/sorted_index_test.cpp
@@ -0,0 +1,125 @@
+#include "../../testing/testing.hpp"
+#include "../sloynik_index.hpp"
+#include "dictionary_mock.hpp"
+#include "../../std/algorithm.hpp"
+#include "../../std/bind.hpp"
+#include "../../std/map.hpp"
+#include "../../std/set.hpp"
+#include "../../std/string.hpp"
+#include "../../std/vector.hpp"
+
+namespace
+{
+  set<sl::StrFn::Str const *> g_AllocatedStrSet;
+
+  sl::StrFn::Str const * StrCreate(char const * utf8Data, uint32_t size)
+  {
+    sl::StrFn::Str const * res = reinterpret_cast<sl::StrFn::Str *>(new string(utf8Data, size));
+    CHECK(g_AllocatedStrSet.insert(res).second, ());
+    return res;
+  }
+
+  void StrDestroy(sl::StrFn::Str const * s)
+  {
+    if (s)
+    {
+      CHECK(g_AllocatedStrSet.count(s), ());
+      g_AllocatedStrSet.erase(s);
+      delete reinterpret_cast<string const *>(s);
+    }
+  }
+
+  int StrSecondaryCompare(void *, sl::StrFn::Str const * pa, sl::StrFn::Str const * pb)
+  {
+    string const & a = *reinterpret_cast<string const *>(pa);
+    string const & b = *reinterpret_cast<string const *>(pb);
+    return a == b ? 0 : (a < b ? -1 : 1);
+  }
+
+  int StrPrimaryCompare(void *, sl::StrFn::Str const * pa, sl::StrFn::Str const * pb)
+  {
+    string s1(*reinterpret_cast<string const *>(pa));
+    string s2(*reinterpret_cast<string const *>(pb));
+    std::use_facet<std::ctype<char> >(std::locale()).tolower(&s1[0], &s1[0] + s1.size());
+    std::use_facet<std::ctype<char> >(std::locale()).tolower(&s2[0], &s2[0] + s2.size());
+    return s1 == s2 ? 0 : (s1 < s2 ? -1 : 1);
+  }
+
+  sl::StrFn StrFnForTest()
+  {
+    sl::StrFn strFn;
+    strFn.Create = StrCreate;
+    strFn.Destroy = StrDestroy;
+    strFn.PrimaryCompare = StrPrimaryCompare;
+    strFn.SecondaryCompare = StrSecondaryCompare;
+    strFn.m_PrimaryCompareId = 1;
+    strFn.m_SecondaryCompareId = 2;
+    strFn.m_pData = NULL;
+    return strFn;
+  }
+
+  void PushBackArticleIntoVector(vector<string> & v,
+                                 sl::Dictionary const * pDic,
+                                 sl::Dictionary::Id id)
+  {
+    v.push_back("");
+    pDic->ArticleById(id, v.back());
+  }
+
+  string KeyByIndexId(sl::SortedIndex const & idx, sl::SortedIndex::Pos id)
+  {
+    string key;
+    idx.GetDictionary().KeyById(idx.KeyIdByPos(id), key);
+    return key;
+  }
+
+  string ArticleByIndexId(sl::SortedIndex const & idx, sl::SortedIndex::Pos id)
+  {
+    string article;
+    idx.GetDictionary().ArticleById(idx.KeyIdByPos(id), article);
+    return article;
+  }
+
+  void SetupDictionary(DictionaryMock & dictionary)
+  {
+    dictionary.Add("Hello", "Hello0"); // 0
+    dictionary.Add("hello", "hello0"); // 1
+    dictionary.Add("World", "World0"); // 2
+    dictionary.Add("He", "He0");       // 3
+    dictionary.Add("abc", "abc1");     // 4
+  }
+}
+
+UNIT_TEST(SortedIndex_Smoke)
+{
+  string const & filePrefix = "sorted_index_smoke_test";
+  DictionaryMock dictionary;
+  SetupDictionary(dictionary);
+  sl::StrFn strFn = StrFnForTest();
+  sl::SortedIndex::Build(dictionary, strFn, filePrefix);
+  sl::SortedIndex idx(dictionary, new FileReader(filePrefix + ".idx"), strFn);
+  TEST_EQUAL(dictionary.KeyCount(), 5, ());
+  TEST_EQUAL(KeyByIndexId(idx, 0), "abc", ());
+  TEST_EQUAL(KeyByIndexId(idx, 1), "He", ());
+  TEST_EQUAL(KeyByIndexId(idx, 2), "Hello", ());
+  TEST_EQUAL(KeyByIndexId(idx, 3), "hello", ());
+  TEST_EQUAL(KeyByIndexId(idx, 4), "World", ());
+  TEST_EQUAL(ArticleByIndexId(idx, 0), "abc1", ());
+  TEST_EQUAL(ArticleByIndexId(idx, 1), "He0", ());
+  TEST_EQUAL(ArticleByIndexId(idx, 2), "Hello0", ());
+  TEST_EQUAL(ArticleByIndexId(idx, 3), "hello0", ());
+  TEST_EQUAL(ArticleByIndexId(idx, 4), "World0", ());
+  TEST_EQUAL(idx.PrefixSearch(""), 0, ());
+  TEST_EQUAL(idx.PrefixSearch("h"), 1, ());
+  TEST_EQUAL(idx.PrefixSearch("H"), 1, ());
+  TEST_EQUAL(idx.PrefixSearch("He"), 1, ());
+  TEST_EQUAL(idx.PrefixSearch("he"), 1, ());
+  TEST_EQUAL(idx.PrefixSearch("hea"), 2, ());
+  TEST_EQUAL(idx.PrefixSearch("hel"), 2, ());
+  TEST_EQUAL(idx.PrefixSearch("Hello"), 2, ());
+  TEST_EQUAL(idx.PrefixSearch("W"), 4, ());
+  TEST_EQUAL(idx.PrefixSearch("zzz"), 5, ());
+  remove((filePrefix + ".idx").c_str());
+  remove((filePrefix + ".idx").c_str());
+  TEST(g_AllocatedStrSet.empty(), ());
+}
diff --git a/words/words_tests/words_tests.pro b/words/words_tests/words_tests.pro
new file mode 100644
index 0000000000..f1ca3ed47e
--- /dev/null
+++ b/words/words_tests/words_tests.pro
@@ -0,0 +1,15 @@
+TARGET = words_tests
+TEMPLATE = app
+CONFIG += console
+CONFIG -= app_bundle
+
+SLOYNIK_DIR = ../..
+DEPENDENCIES = bzip2 zlib base coding coding_sloynik words
+
+include($$SLOYNIK_DIR/sloynik_common.pri)
+
+SOURCES += $$SLOYNIK_DIR/testing/testingmain.cpp \
+  sorted_index_test.cpp \
+
+HEADERS += \
+  dictionary_mock.hpp \
author	Yury Melnichek <melnichek@gmail.com>	2011-03-06 01:12:21 +0300
committer	Alex Zolotarev <alex@maps.me>	2015-09-23 01:12:57 +0300
commit	5d9239a1edede8693c69119e282017d965d2c326 (patch)
tree	2ac33f5cf71ee8f3814de8f2dfe3688e8c3820e1 /words
parent	a5ad66614265259763a1ed849db7ae8e918e8a3c (diff)