diff options
author | Yury Melnichek <melnichek@gmail.com> | 2011-03-06 01:12:21 +0300 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2015-09-23 01:12:57 +0300 |
commit | 5d9239a1edede8693c69119e282017d965d2c326 (patch) | |
tree | 2ac33f5cf71ee8f3814de8f2dfe3688e8c3820e1 /words | |
parent | a5ad66614265259763a1ed849db7ae8e918e8a3c (diff) |
Add sloynik to omim!! Yea!
Diffstat (limited to 'words')
-rw-r--r-- | words/common.hpp | 21 | ||||
-rw-r--r-- | words/dictionary.hpp | 34 | ||||
-rw-r--r-- | words/slof.hpp | 43 | ||||
-rw-r--r-- | words/slof_dictionary.cpp | 99 | ||||
-rw-r--r-- | words/slof_dictionary.hpp | 32 | ||||
-rw-r--r-- | words/sloynik_engine.cpp | 72 | ||||
-rw-r--r-- | words/sloynik_engine.hpp | 62 | ||||
-rw-r--r-- | words/sloynik_index.cpp | 220 | ||||
-rw-r--r-- | words/sloynik_index.hpp | 75 | ||||
-rw-r--r-- | words/words.pro | 26 | ||||
-rw-r--r-- | words/words_tests/dictionary_mock.hpp | 27 | ||||
-rw-r--r-- | words/words_tests/sorted_index_test.cpp | 125 | ||||
-rw-r--r-- | words/words_tests/words_tests.pro | 15 |
13 files changed, 851 insertions, 0 deletions
diff --git a/words/common.hpp b/words/common.hpp new file mode 100644 index 0000000000..142379beeb --- /dev/null +++ b/words/common.hpp @@ -0,0 +1,21 @@ +#pragma once +#include "../base/base.hpp" + +namespace sl +{ + +struct StrFn +{ + struct Str; // Never defined. + + Str const * (* Create)(char const * utf8Data, uint32_t size); + void (* Destroy)(Str const * s); + int (* PrimaryCompare)(void * pData, Str const * a, Str const * b); + int (* SecondaryCompare)(void * pData, Str const * a, Str const * b); + + void * m_pData; + uint64_t m_PrimaryCompareId; + uint64_t m_SecondaryCompareId; +}; + +} diff --git a/words/dictionary.hpp b/words/dictionary.hpp new file mode 100644 index 0000000000..25ffe978f0 --- /dev/null +++ b/words/dictionary.hpp @@ -0,0 +1,34 @@ +#pragma once +#include "../base/base.hpp" +#include "../std/string.hpp" +#include "../base/base.hpp" +#include "../base/exception.hpp" +#include "../std/utility.hpp" + +namespace sl +{ + +class Dictionary +{ +public: + DECLARE_EXCEPTION(Exception, RootException); + DECLARE_EXCEPTION(OpenDictionaryException, Exception); + DECLARE_EXCEPTION(OpenDictionaryNewerVersionException, Exception); + DECLARE_EXCEPTION(BrokenDictionaryException, Exception); + + // Key id, which is 0-based index of key in sorted order. + typedef uint32_t Id; + + virtual ~Dictionary() {} + + // Number of keys in the dictionary. + virtual Id KeyCount() const = 0; + + // Get key by id. + virtual void KeyById(Id id, string & key) const = 0; + + // Get article by id. + virtual void ArticleById(Id id, string & article) const = 0; +}; + +} diff --git a/words/slof.hpp b/words/slof.hpp new file mode 100644 index 0000000000..b2edc4e809 --- /dev/null +++ b/words/slof.hpp @@ -0,0 +1,43 @@ +#pragma once +// slof = sloynik format. +// +// VU = VarUint. +// +// Offset: Format: +// +// 0 [Header] +// m_ArticleOffset [VU ChunkSize] [Compressed article chunk 0] +// .. +// [VU ChunkSize] [Compressed article chunk n-1] +// m_KeyDataOffset [VU KeySize] [Key 0] +// .. +// [VU KeySize] [Key m_KeyCount-1] +// m_KeyArticleIdOffset [VU article 0 chunk offset] [VU article 0 number in chunk] +// .. +// [VU article m_ArticleCount-1 chunk offset] [VU article * number in chunk] +// +// ArticleId = ([Offset of article chunk] << 24) + [Offset of article in uncompressed chunk] + +#include "../coding/endianness.hpp" +#include "../base/base.hpp" + +namespace sl +{ + +#pragma pack(push, 1) +struct SlofHeader +{ + uint32_t m_Signature; // = "slof" + uint16_t m_MajorVersion; // Major version. Changing it breaks compatibility. + uint16_t m_MinorVersion; // Minor version. Changing it doesn't break compatibility. + + uint32_t m_KeyCount; // Number of keys. + uint32_t m_ArticleCount; // Number of articles. + + uint64_t m_KeyIndexOffset; // Offset of key cummulative lengthes. + uint64_t m_KeyDataOffset; // Offset of key data. + uint64_t m_ArticleOffset; // Offset of article data. +}; +#pragma pack(pop) + +} diff --git a/words/slof_dictionary.cpp b/words/slof_dictionary.cpp new file mode 100644 index 0000000000..bf1ff8543e --- /dev/null +++ b/words/slof_dictionary.cpp @@ -0,0 +1,99 @@ +#include "slof_dictionary.hpp" +#include "../coding_sloynik/bzip2_compressor.hpp" +#include "../coding/byte_stream.hpp" +#include "../coding/endianness.hpp" +#include "../coding/reader.hpp" +#include "../coding/varint.hpp" +#include "../base/logging.hpp" +#include "../base/base.hpp" +#include "../std/utility.hpp" + +namespace +{ + char const * SkipVarUint(char const * p) + { + while ((*p) & 128) + ++p; + return ++p; + } +} + +sl::SlofDictionary::SlofDictionary(Reader const * pReader) + : m_pReader(pReader), m_Decompressor(&DecompressBZip2IntoFixedSize) +{ + Init(); +} + +sl::SlofDictionary::SlofDictionary( + Reader const * pReader, function<void (const char *, size_t, char *, size_t)> decompressor) + : m_pReader(pReader), m_Decompressor(decompressor) +{ + Init(); +} + +void sl::SlofDictionary::Init() +{ + m_pReader->Read(0, &m_Header, sizeof(m_Header)); + if (m_Header.m_MajorVersion != 1) + MYTHROW(OpenDictionaryNewerVersionException, (m_Header.m_MajorVersion)); +} + +sl::SlofDictionary::~SlofDictionary() +{ +} + +sl::Dictionary::Id sl::SlofDictionary::KeyCount() const +{ + return m_Header.m_KeyCount; +} + +void sl::SlofDictionary::ReadKeyData(sl::Dictionary::Id id, string & data) const +{ + pair<uint32_t, uint32_t> offsets; + m_pReader->Read(m_Header.m_KeyIndexOffset + id * 4, &offsets, 8); + offsets.first = SwapIfBigEndian(offsets.first); + offsets.second = SwapIfBigEndian(offsets.second); + // Add 2 trailing zeroes, to be sure that reading varint doesn't hang if file is corrupted. + data.resize(offsets.second - offsets.first + 2); + m_pReader->Read(m_Header.m_KeyDataOffset + offsets.first, + &data[0], offsets.second - offsets.first); +} + +void sl::SlofDictionary::KeyById(sl::Dictionary::Id id, string & key) const +{ + string keyData; + ReadKeyData(id, keyData); + char const * pBeg = SkipVarUint(SkipVarUint(&keyData[0])); + char const * pLast = &keyData[keyData.size() - 1]; + // ReadKeyData adds trailing zeroes, so that reading VarUint doesn't hang up in case of + // corrupted data. Strip them. + while (pLast >= pBeg && *pLast == 0) + --pLast; + key.assign(pBeg, pLast + 1); +} + +void sl::SlofDictionary::ArticleById(sl::Dictionary::Id id, string & article) const +{ + string keyData; + ReadKeyData(id, keyData); + ArrayByteSource keyDataSource(&keyData[0]); + uint64_t const articleChunkPos = ReadVarUint<uint64_t>(keyDataSource); + uint32_t const articleNumInChunk = ReadVarUint<uint32_t>(keyDataSource); + + uint32_t const chunkSize = + ReadPrimitiveFromPos<uint32_t>(*m_pReader, m_Header.m_ArticleOffset + articleChunkPos); + string chunk(chunkSize, 0); + m_pReader->Read(m_Header.m_ArticleOffset + articleChunkPos + 4, &chunk[0], chunkSize); + ArrayByteSource chunkSource(&chunk[0]); + uint32_t chunkHeaderSize = ReadVarUint<uint32_t>(chunkSource); + chunkHeaderSize += static_cast<char const *>(chunkSource.Ptr()) - &chunk[0]; + uint32_t const decompressedChunkSize = ReadVarUint<uint32_t>(chunkSource); + uint32_t articleBegInChunk = 0; + for (uint32_t i = 0; i < articleNumInChunk; ++i) + articleBegInChunk += ReadVarUint<uint32_t>(chunkSource); + uint32_t const articleSizeInChunk = ReadVarUint<uint32_t>(chunkSource); + vector<char> decompressedChunk(decompressedChunkSize); + m_Decompressor(&chunk[chunkHeaderSize], chunkSize - chunkHeaderSize, + &decompressedChunk[0], decompressedChunkSize); + article.assign(&decompressedChunk[articleBegInChunk], articleSizeInChunk); +} diff --git a/words/slof_dictionary.hpp b/words/slof_dictionary.hpp new file mode 100644 index 0000000000..9c9fc00087 --- /dev/null +++ b/words/slof_dictionary.hpp @@ -0,0 +1,32 @@ +#pragma once +#include "dictionary.hpp" +#include "slof.hpp" +#include "../std/function.hpp" +#include "../std/scoped_ptr.hpp" + +class Reader; + +namespace sl +{ + +class SlofDictionary : public Dictionary +{ +public: + // Takes ownership of pReader and deletes it, even if exception is thrown. + explicit SlofDictionary(Reader const * pReader); + // Takes ownership of pReader and deletes it, even if exception is thrown. + SlofDictionary(Reader const * pReader, + function<void (char const *, size_t, char *, size_t)> decompressor); + ~SlofDictionary(); + Id KeyCount() const; + void KeyById(Id id, string & key) const; + void ArticleById(Id id, string & article) const; +private: + void Init(); + void ReadKeyData(Id id, string & data) const; + scoped_ptr<Reader const> m_pReader; + function<void (char const *, size_t, char *, size_t)> m_Decompressor; + SlofHeader m_Header; +}; + +} diff --git a/words/sloynik_engine.cpp b/words/sloynik_engine.cpp new file mode 100644 index 0000000000..f7e19811e9 --- /dev/null +++ b/words/sloynik_engine.cpp @@ -0,0 +1,72 @@ +#include "sloynik_engine.hpp" +#include "slof_dictionary.hpp" +#include "sloynik_index.hpp" +#include "../coding/file_reader.hpp" +#include "../coding/file_writer.hpp" +#include "../base/logging.hpp" +#include "../base/macros.hpp" + +sl::SloynikEngine::SloynikEngine(string const & dictionaryPath, + string const & indexPath, + StrFn const & strFn) +{ + m_pDictionary.reset(new sl::SlofDictionary(new FileReader(dictionaryPath))); + vector<uint64_t> stamp; + stamp.push_back(strFn.m_PrimaryCompareId); + stamp.push_back(strFn.m_SecondaryCompareId); + string const stampPath = indexPath + ".stamp"; + bool needIndexBuild = false; + try + { + FileReader stampReader(stampPath); + if (stampReader.Size() != stamp.size() * sizeof(stamp[0])) + needIndexBuild = true; + else + { + vector<uint64_t> currentStamp(stamp.size()); + stampReader.Read(0, ¤tStamp[0], stamp.size() * sizeof(stamp[0])); + if (currentStamp != stamp) + needIndexBuild = true; + } + } + catch (RootException &) + { + needIndexBuild = true; + } + LOG(LINFO, ("Started sloynik engine. Words in the dictionary:", m_pDictionary->KeyCount())); + // Uncomment to always rebuild the index: needIndexBuild = true; + if (needIndexBuild) + { + FileWriter::DeleteFile(stampPath); + sl::SortedIndex::Build(*m_pDictionary, strFn, indexPath); + + FileWriter stampWriter(stampPath); + stampWriter.Write(&stamp[0], stamp.size() * sizeof(stamp[0])); + } + m_pIndexReader.reset(new FileReader(indexPath + ".idx")); + m_pSortedIndex.reset(new sl::SortedIndex(*m_pDictionary, m_pIndexReader.get(), strFn)); +} + +sl::SloynikEngine::~SloynikEngine() +{ +} + +void sl::SloynikEngine::Search(string const & prefix, sl::SloynikEngine::SearchResult & res) const +{ + res.m_FirstMatched = m_pSortedIndex->PrefixSearch(prefix); +} + +void sl::SloynikEngine::GetWordInfo(WordId word, sl::SloynikEngine::WordInfo & res) const +{ + m_pDictionary->KeyById(m_pSortedIndex->KeyIdByPos(word), res.m_Word); +} + +uint32_t sl::SloynikEngine::WordCount() const +{ + return m_pDictionary->KeyCount(); +} + +void sl::SloynikEngine::GetArticleData(WordId wordId, ArticleData & data) const +{ + m_pDictionary->ArticleById(m_pSortedIndex->KeyIdByPos(wordId), data.m_HTML); +} diff --git a/words/sloynik_engine.hpp b/words/sloynik_engine.hpp new file mode 100644 index 0000000000..9e83686db8 --- /dev/null +++ b/words/sloynik_engine.hpp @@ -0,0 +1,62 @@ +#pragma once +#include "common.hpp" +#include "../base/base.hpp" +#include "../std/function.hpp" +#include "../std/scoped_ptr.hpp" +#include "../std/string.hpp" +#include "../std/utility.hpp" + +class FileReader; + +namespace sl +{ + +class Dictionary; +class SortedIndex; + +class SloynikEngine +{ +public: + SloynikEngine(string const & dictionaryPath, + string const & indexPath, + sl::StrFn const & strFn); + + ~SloynikEngine(); + + typedef uint32_t WordId; + + struct WordInfo + { + string m_Word; + }; + + void GetWordInfo(WordId word, WordInfo & res) const; + + struct SearchResult + { + WordId m_FirstMatched; + }; + + void Search(string const & prefix, SearchResult & res) const; + + uint32_t WordCount() const; + + struct ArticleData + { + string m_HTML; + + void swap(ArticleData & o) + { + m_HTML.swap(o.m_HTML); + } + }; + + void GetArticleData(WordId word, ArticleData & data) const; + +private: + scoped_ptr<sl::Dictionary> m_pDictionary; + scoped_ptr<FileReader> m_pIndexReader; + scoped_ptr<sl::SortedIndex> m_pSortedIndex; +}; + +} diff --git a/words/sloynik_index.cpp b/words/sloynik_index.cpp new file mode 100644 index 0000000000..ff81a7a84d --- /dev/null +++ b/words/sloynik_index.cpp @@ -0,0 +1,220 @@ +#include "sloynik_index.hpp" +#include "dictionary.hpp" +#include "../coding/file_writer.hpp" +#include "../base/assert.hpp" +#include "../base/logging.hpp" +#include "../base/cache.hpp" +#include "../std/bind.hpp" + +extern "C" { + #include "../coding_sloynik/timsort/timsort.h" +} + + +#define FILE_SORTER_LOG_BATCH_SIZE 11 + +sl::SortedIndex::SortedIndex(Dictionary const & dictionary, + Reader const * pIndexReader, + StrFn const & strFn) + : m_Dictionary(dictionary), m_StrFn(strFn), + m_SortedVector(PolymorphReader(pIndexReader), pIndexReader->Size() / sizeof(DicId)) +{ + STATIC_ASSERT(sizeof(sl::SortedIndex::DicId) == 3); +} + +sl::Dictionary const & sl::SortedIndex::GetDictionary() const +{ + return m_Dictionary; +} + +sl::Dictionary::Id sl::SortedIndex::KeyIdByPos(Pos pos) const +{ + return m_SortedVector[pos].Value(); +} + +namespace sl +{ +namespace impl +{ + + #define DIC_ID_MAGIC_EMPTY_VALUE 0xFFFFFF + + // Compare 2 values of Dictionary::Id using comparator provided. + // A special value of Dictionary::Id(DIC_ID_MAGIC_EMPTY_VALUE) is used + // to compare with a string provided in constructor. + template <bool bFullCompare> + class DictCollIdLess + { + private: + DictCollIdLess(DictCollIdLess<bFullCompare> const &); + DictCollIdLess<bFullCompare> & operator = (DictCollIdLess<bFullCompare> const &); + public: + DictCollIdLess(sl::Dictionary const & dic, sl::StrFn const & strFn) + : m_Dic(dic), m_StrFn(strFn), m_Special(NULL), m_Cache(FILE_SORTER_LOG_BATCH_SIZE + 1) + { + } + + DictCollIdLess(sl::Dictionary const & dic, sl::StrFn const & strFn, string const & special) + : m_Dic(dic), m_StrFn(strFn), m_Special(strFn.Create(&special[0], special.size())), m_Cache(3) + { + } + + ~DictCollIdLess() + { + if (m_Special) + m_StrFn.Destroy(m_Special); + m_Cache.ForEachValue(m_StrFn.Destroy); + } + + int Compare(uint32_t const id1, uint32_t const id2) + { + if (id1 == id2) + return 0; + sl::StrFn::Str const * key1, * key2; + bool bDeleteKey1 = false; + KeysById(id1, id2, key1, key2, bDeleteKey1); + int const primaryRes = m_StrFn.PrimaryCompare(m_StrFn.m_pData, key1, key2); + if (bFullCompare && primaryRes == 0) + { + int const secondaryRes = m_StrFn.SecondaryCompare(m_StrFn.m_pData, key1, key2); + if (bDeleteKey1) + m_StrFn.Destroy(key1); + return secondaryRes; + } + else + { + if (bDeleteKey1) + m_StrFn.Destroy(key1); + return primaryRes; + } + } + + bool operator () (uint32_t const id1, uint32_t const id2) + { + return Compare(id1, id2) < 0; + } + + private: + + inline void KeysById(uint32_t const id1, uint32_t const id2, + sl::StrFn::Str const * & key1, + sl::StrFn::Str const * & key2, + bool & bDeleteKey1) + { + bDeleteKey1 = false; + + if (id1 == DIC_ID_MAGIC_EMPTY_VALUE) + key1 = m_Special; + else + { + bool found = false; + sl::StrFn::Str const * & s = m_Cache.Find(id1, found); + if (!found) + { + if (s != NULL) + m_StrFn.Destroy(s); + s = CreateKeyById(id1); + } + key1 = s; + } + + if (id2 == DIC_ID_MAGIC_EMPTY_VALUE) + key2 = m_Special; + else + { + bool found = false; + sl::StrFn::Str const * & s = m_Cache.Find(id2, found); + if (!found) + { + if (s != NULL) + { + if (s == key1) + bDeleteKey1 = true; + else + m_StrFn.Destroy(s); + } + s = CreateKeyById(id2); + } + key2 = s; + } + } + + inline sl::StrFn::Str const * CreateKeyById(uint32_t const id) + { + ASSERT_LESS(id, DIC_ID_MAGIC_EMPTY_VALUE, ()); + string keyStr; + m_Dic.KeyById(id, keyStr); + return m_StrFn.Create(&keyStr[0], keyStr.size()); + } + + sl::Dictionary const & m_Dic; + sl::StrFn m_StrFn; + sl::StrFn::Str const * m_Special; + my::Cache<uint32_t, sl::StrFn::Str const *> m_Cache; + }; + + // Hack: global data, used for sorting with c-style timsort(). + static DictCollIdLess<true> * g_pSortedIndexDictCollIdLessForBuild = NULL; + + template <class T> struct LessRefProxy + { + T & m_F; + explicit LessRefProxy(T & f) : m_F(f) {} + inline bool operator () (SortedIndex::DicId a, SortedIndex::DicId b) const + { + return m_F(a.Value(), b.Value()); + } + + // Hack: used for sorting with c-style timsort(). + static int GlobalCompareForSort(void const * pVoidA, void const * pVoidB) + { + SortedIndex::DicId const * pA = static_cast<SortedIndex::DicId const *>(pVoidA); + SortedIndex::DicId const * pB = static_cast<SortedIndex::DicId const *>(pVoidB); + return g_pSortedIndexDictCollIdLessForBuild->Compare(pA->Value(), pB->Value()); + } + }; + template <class T> LessRefProxy<T> MakeLessRefProxy(T & f) { return LessRefProxy<T>(f); } + +} // namespace impl +} // namespace sl + +sl::SortedIndex::Pos sl::SortedIndex::PrefixSearch(string const & prefix) +{ + impl::DictCollIdLess<false> compareLess(m_Dictionary, m_StrFn, prefix); + return lower_bound(m_SortedVector.begin(), m_SortedVector.end(), DIC_ID_MAGIC_EMPTY_VALUE, + MakeLessRefProxy(compareLess)) + - m_SortedVector.begin(); +} + +void sl::SortedIndex::Build(sl::Dictionary const & dictionary, + StrFn const & strFn, + string const & indexPathPrefix) +{ + LOG(LINFO, ("Building sorted index.")); + + // Initializing. + vector<DicId> ids(dictionary.KeyCount()); + for (size_t i = 0; i < ids.size(); ++i) + ids[i] = i; + + // Sorting. + { + sl::impl::DictCollIdLess<true> compareLess(dictionary, strFn); + + sl::impl::g_pSortedIndexDictCollIdLessForBuild = &compareLess; + timsort(&ids[0], ids.size(), sizeof(ids[0]), + &sl::impl::LessRefProxy<sl::impl::DictCollIdLess<true> >::GlobalCompareForSort); + sl::impl::g_pSortedIndexDictCollIdLessForBuild = NULL; + + // sort() or stable_sort() are nice but slow. + // sort() 150 milliseconds + // stable_sort() 50 milliseconds + // timsort() 9 milliseconds + // stable_sort(ids.begin(), ids.end(), MakeLessRefProxy(compareLess)); + } + + FileWriter idxWriter((indexPathPrefix + ".idx").c_str()); + idxWriter.Write(&ids[0], ids.size() * sizeof(ids[0])); + + LOG(LINFO, ("Building sorted index done.")); +} diff --git a/words/sloynik_index.hpp b/words/sloynik_index.hpp new file mode 100644 index 0000000000..192a1bddae --- /dev/null +++ b/words/sloynik_index.hpp @@ -0,0 +1,75 @@ +#pragma once +#include "common.hpp" +#include "dictionary.hpp" +#include "../coding_sloynik/polymorph_reader.hpp" +#include "../coding/dd_vector.hpp" +#include "../coding/file_reader.hpp" +#include "../base/base.hpp" +#include "../std/function.hpp" +#include "../std/scoped_ptr.hpp" +#include "../std/utility.hpp" +#include "../std/vector.hpp" + +class Reader; +class Writer; + +namespace sl +{ + +// Sorted index of keys. +// Two locale-aware comparators are used: primary and secondary. +// Secondary comporator is only used to compare keys, which are equivalent +// according to primary comparator. +// For search only primary comporator is used. +// Do determine the order of keys to display, primary + secondary comporators are used. +// To collect articles for the same key, primary + secondary comporators are used. +class SortedIndex +{ +public: + typedef uint32_t Pos; + + SortedIndex(Dictionary const & dictionary, Reader const * pIndexReader, StrFn const & strFn); + + // Id of the smallest key that is equal or grater that prefix. + Pos PrefixSearch(string const & prefix); + + Dictionary::Id KeyIdByPos(Pos pos) const; + + Dictionary const & GetDictionary() const; + + // Build index. + static void Build(Dictionary const & dictionary, + StrFn const & strFn, + string const & indexPathPrefix); +private: + +#pragma pack(push, 1) + class DicId + { + public: + DicId() : m_Lo(0), m_Mi(0), m_Hi(0) {} + DicId(uint32_t x) : m_Lo(x & 0xFF), m_Mi((x >> 8) & 0xFF), m_Hi((x >> 16) & 0xFF) + { + ASSERT_LESS_OR_EQUAL(x, 0xFFFFFF, ()); + } + + uint32_t Value() const + { + return (uint32_t(m_Hi) << 16) | (uint32_t(m_Mi) << 8) | m_Lo; + } + + private: + uint8_t m_Lo, m_Mi, m_Hi; + }; +#pragma pack(pop) + friend class LessRefProxy; + + typedef DDVector<DicId, PolymorphReader> DDVectorType; + + Dictionary const & m_Dictionary; // Dictionary. + StrFn m_StrFn; // Primary and secondary comparison functions. + DDVectorType m_SortedVector; // Sorted array of key ids. +}; + + +} diff --git a/words/words.pro b/words/words.pro new file mode 100644 index 0000000000..4e848b4a59 --- /dev/null +++ b/words/words.pro @@ -0,0 +1,26 @@ +TARGET = words +TEMPLATE = lib +CONFIG += staticlib + +SLOYNIK_DIR = .. +DEPENDENCIES = bzip2 zlib base coding coding_sloynik + +include($$SLOYNIK_DIR/sloynik_common.pri) + +HEADERS += \ + common.hpp \ + dictionary.hpp \ + slof.hpp \ + slof_dictionary.hpp \ + sloynik_engine.hpp \ + sloynik_index.hpp + +SOURCES += \ + slof_dictionary.cpp \ + sloynik_engine.cpp \ + sloynik_index.cpp + +OTHER_FILES += \ + ../bugs.txt + + diff --git a/words/words_tests/dictionary_mock.hpp b/words/words_tests/dictionary_mock.hpp new file mode 100644 index 0000000000..aab8161084 --- /dev/null +++ b/words/words_tests/dictionary_mock.hpp @@ -0,0 +1,27 @@ +#pragma once +#include "../dictionary.hpp" +#include "../../base/assert.hpp" +#include "../../std/string.hpp" +#include "../../std/vector.hpp" + +class DictionaryMock : public sl::Dictionary +{ +public: + void Add(string const & key, string const & article) + { + m_Keys.push_back(key); + m_Articles.push_back(article); + } + + Id KeyCount() const + { + ASSERT_EQUAL(m_Keys.size(), m_Articles.size(), ()); + return m_Keys.size(); + } + + void KeyById(Id id, string & key) const { key = m_Keys[id]; } + void ArticleById(Id id, string & article) const { article = m_Articles[id]; } + +private: + vector<string> m_Keys, m_Articles; +}; diff --git a/words/words_tests/sorted_index_test.cpp b/words/words_tests/sorted_index_test.cpp new file mode 100644 index 0000000000..65c03448db --- /dev/null +++ b/words/words_tests/sorted_index_test.cpp @@ -0,0 +1,125 @@ +#include "../../testing/testing.hpp" +#include "../sloynik_index.hpp" +#include "dictionary_mock.hpp" +#include "../../std/algorithm.hpp" +#include "../../std/bind.hpp" +#include "../../std/map.hpp" +#include "../../std/set.hpp" +#include "../../std/string.hpp" +#include "../../std/vector.hpp" + +namespace +{ + set<sl::StrFn::Str const *> g_AllocatedStrSet; + + sl::StrFn::Str const * StrCreate(char const * utf8Data, uint32_t size) + { + sl::StrFn::Str const * res = reinterpret_cast<sl::StrFn::Str *>(new string(utf8Data, size)); + CHECK(g_AllocatedStrSet.insert(res).second, ()); + return res; + } + + void StrDestroy(sl::StrFn::Str const * s) + { + if (s) + { + CHECK(g_AllocatedStrSet.count(s), ()); + g_AllocatedStrSet.erase(s); + delete reinterpret_cast<string const *>(s); + } + } + + int StrSecondaryCompare(void *, sl::StrFn::Str const * pa, sl::StrFn::Str const * pb) + { + string const & a = *reinterpret_cast<string const *>(pa); + string const & b = *reinterpret_cast<string const *>(pb); + return a == b ? 0 : (a < b ? -1 : 1); + } + + int StrPrimaryCompare(void *, sl::StrFn::Str const * pa, sl::StrFn::Str const * pb) + { + string s1(*reinterpret_cast<string const *>(pa)); + string s2(*reinterpret_cast<string const *>(pb)); + std::use_facet<std::ctype<char> >(std::locale()).tolower(&s1[0], &s1[0] + s1.size()); + std::use_facet<std::ctype<char> >(std::locale()).tolower(&s2[0], &s2[0] + s2.size()); + return s1 == s2 ? 0 : (s1 < s2 ? -1 : 1); + } + + sl::StrFn StrFnForTest() + { + sl::StrFn strFn; + strFn.Create = StrCreate; + strFn.Destroy = StrDestroy; + strFn.PrimaryCompare = StrPrimaryCompare; + strFn.SecondaryCompare = StrSecondaryCompare; + strFn.m_PrimaryCompareId = 1; + strFn.m_SecondaryCompareId = 2; + strFn.m_pData = NULL; + return strFn; + } + + void PushBackArticleIntoVector(vector<string> & v, + sl::Dictionary const * pDic, + sl::Dictionary::Id id) + { + v.push_back(""); + pDic->ArticleById(id, v.back()); + } + + string KeyByIndexId(sl::SortedIndex const & idx, sl::SortedIndex::Pos id) + { + string key; + idx.GetDictionary().KeyById(idx.KeyIdByPos(id), key); + return key; + } + + string ArticleByIndexId(sl::SortedIndex const & idx, sl::SortedIndex::Pos id) + { + string article; + idx.GetDictionary().ArticleById(idx.KeyIdByPos(id), article); + return article; + } + + void SetupDictionary(DictionaryMock & dictionary) + { + dictionary.Add("Hello", "Hello0"); // 0 + dictionary.Add("hello", "hello0"); // 1 + dictionary.Add("World", "World0"); // 2 + dictionary.Add("He", "He0"); // 3 + dictionary.Add("abc", "abc1"); // 4 + } +} + +UNIT_TEST(SortedIndex_Smoke) +{ + string const & filePrefix = "sorted_index_smoke_test"; + DictionaryMock dictionary; + SetupDictionary(dictionary); + sl::StrFn strFn = StrFnForTest(); + sl::SortedIndex::Build(dictionary, strFn, filePrefix); + sl::SortedIndex idx(dictionary, new FileReader(filePrefix + ".idx"), strFn); + TEST_EQUAL(dictionary.KeyCount(), 5, ()); + TEST_EQUAL(KeyByIndexId(idx, 0), "abc", ()); + TEST_EQUAL(KeyByIndexId(idx, 1), "He", ()); + TEST_EQUAL(KeyByIndexId(idx, 2), "Hello", ()); + TEST_EQUAL(KeyByIndexId(idx, 3), "hello", ()); + TEST_EQUAL(KeyByIndexId(idx, 4), "World", ()); + TEST_EQUAL(ArticleByIndexId(idx, 0), "abc1", ()); + TEST_EQUAL(ArticleByIndexId(idx, 1), "He0", ()); + TEST_EQUAL(ArticleByIndexId(idx, 2), "Hello0", ()); + TEST_EQUAL(ArticleByIndexId(idx, 3), "hello0", ()); + TEST_EQUAL(ArticleByIndexId(idx, 4), "World0", ()); + TEST_EQUAL(idx.PrefixSearch(""), 0, ()); + TEST_EQUAL(idx.PrefixSearch("h"), 1, ()); + TEST_EQUAL(idx.PrefixSearch("H"), 1, ()); + TEST_EQUAL(idx.PrefixSearch("He"), 1, ()); + TEST_EQUAL(idx.PrefixSearch("he"), 1, ()); + TEST_EQUAL(idx.PrefixSearch("hea"), 2, ()); + TEST_EQUAL(idx.PrefixSearch("hel"), 2, ()); + TEST_EQUAL(idx.PrefixSearch("Hello"), 2, ()); + TEST_EQUAL(idx.PrefixSearch("W"), 4, ()); + TEST_EQUAL(idx.PrefixSearch("zzz"), 5, ()); + remove((filePrefix + ".idx").c_str()); + remove((filePrefix + ".idx").c_str()); + TEST(g_AllocatedStrSet.empty(), ()); +} diff --git a/words/words_tests/words_tests.pro b/words/words_tests/words_tests.pro new file mode 100644 index 0000000000..f1ca3ed47e --- /dev/null +++ b/words/words_tests/words_tests.pro @@ -0,0 +1,15 @@ +TARGET = words_tests +TEMPLATE = app +CONFIG += console +CONFIG -= app_bundle + +SLOYNIK_DIR = ../.. +DEPENDENCIES = bzip2 zlib base coding coding_sloynik words + +include($$SLOYNIK_DIR/sloynik_common.pri) + +SOURCES += $$SLOYNIK_DIR/testing/testingmain.cpp \ + sorted_index_test.cpp \ + +HEADERS += \ + dictionary_mock.hpp \ |