Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Maxim Pimenov <m@maps.me> 2016-04-26 17:02:16 +0300
committer Maxim Pimenov <m@maps.me> 2016-04-26 18:54:48 +0300
commit 9243f3130a86b17f140399d26fe24a5ee124a64f (patch)
tree 60ba1c5f803a7073de27eb73ca1045b304466dce /indexer
parent 360bafbc2907788a3893792fb6dd366d3eb48e44 (diff)
Review fixes.
Diffstat (limited to 'indexer')
-rw-r--r-- indexer/categories_holder.cpp 4
-rw-r--r-- indexer/categories_holder.hpp 4
-rw-r--r-- indexer/categories_index.cpp 79
-rw-r--r-- indexer/categories_index.hpp 11
-rw-r--r-- indexer/indexer_tests/categories_test.cpp 108
5 files changed, 150 insertions, 56 deletions
diff --git a/indexer/categories_holder.cpp b/indexer/categories_holder.cpp
index 8b49341ab8..cf96e2a223 100644
--- a/indexer/categories_holder.cpp
+++ b/indexer/categories_holder.cpp
@@ -277,10 +277,12 @@ int8_t CategoriesHolder::MapLocaleToInteger(string const & locale)
{"he", 29 },
{"sw", 30 }
};
- ASSERT_EQUAL(ARRAY_SIZE(mapping), kNumLanguages, ());
+ static_assert(ARRAY_SIZE(mapping) == kNumLanguages, "");
for (size_t i = 0; i < kNumLanguages; ++i)
+ {
if (locale.find(mapping[i].m_name) == 0)
return mapping[i].m_code;
+ }
// Special cases for different Chinese variations
if (locale.find("zh") == 0)
diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp
index 68b543e003..bba60a738c 100644
--- a/indexer/categories_holder.hpp
+++ b/indexer/categories_holder.hpp
@@ -60,8 +60,8 @@ public:
template <class ToDo>
void ForEachTypeAndCategory(ToDo && toDo) const
{
- for (IteratorT i = m_type2cat.begin(); i != m_type2cat.end(); ++i)
- toDo(i->first, *i->second);
+ for (auto const it : m_type2cat)
+ toDo(it.first, *it.second);
}
template <class ToDo>
diff --git a/indexer/categories_index.cpp b/indexer/categories_index.cpp
index 344c2b3721..0c39adc53b 100644
--- a/indexer/categories_index.cpp
+++ b/indexer/categories_index.cpp
@@ -1,34 +1,60 @@
#include "categories_index.hpp"
+#include "search_delimiters.hpp"
+#include "search_string_utils.hpp"
+#include "base/assert.hpp"
+#include "base/stl_add.hpp"
#include "base/stl_helpers.hpp"
+#include "base/string_utils.hpp"
#include "std/algorithm.hpp"
#include "std/set.hpp"
namespace
{
-void AddAllSubstrings(my::MemTrie<string, uint32_t> & trie, string const & s, uint32_t value)
+void AddAllNonemptySubstrings(my::MemTrie<string, uint32_t> & trie, string const & s,
+ uint32_t value)
{
+ ASSERT(!s.empty(), ());
for (size_t i = 0; i < s.length(); ++i)
{
string t;
for (size_t j = i; j < s.length(); ++j)
{
- t.append(1, s[j]);
+ t.push_back(s[j]);
trie.Add(t, value);
}
}
}
+
+template<typename TF>
+void ForEachToken(string const & s, TF && fn)
+{
+ vector<strings::UniString> tokens;
+ SplitUniString(search::NormalizeAndSimplifyString(s), MakeBackInsertFunctor(tokens), search::Delimiters());
+ for (auto const & token : tokens)
+ fn(strings::ToUtf8(token));
+}
+
+void TokenizeAndAddAllSubstrings(my::MemTrie<string, uint32_t> & trie, string const & s,
+ uint32_t value)
+{
+ auto fn = [&](string const & token)
+ {
+ AddAllNonemptySubstrings(trie, token, value);
+ };
+ ForEachToken(s, fn);
+}
} // namespace
namespace indexer
{
void CategoriesIndex::AddCategoryByTypeAndLang(uint32_t type, int8_t lang)
{
- m_catHolder.ForEachNameByType(type, [&](CategoriesHolder::Category::Name const & name)
+ m_catHolder.ForEachNameByType(type, [&](TCategory::Name const & name)
{
if (name.m_locale == lang)
- AddAllSubstrings(m_trie, name.m_name, type);
+ TokenizeAndAddAllSubstrings(m_trie, name.m_name, type);
});
}
@@ -40,45 +66,62 @@ void CategoriesIndex::AddCategoryByTypeAllLangs(uint32_t type)
void CategoriesIndex::AddAllCategoriesInLang(int8_t lang)
{
- m_catHolder.ForEachTypeAndCategory([&](uint32_t type, Category const & cat)
+ m_catHolder.ForEachTypeAndCategory([&](uint32_t type, TCategory const & cat)
{
for (auto const & name : cat.m_synonyms)
{
if (name.m_locale == lang)
- AddAllSubstrings(m_trie, name.m_name, type);
+ TokenizeAndAddAllSubstrings(m_trie, name.m_name, type);
}
});
}
-void CategoriesIndex::AddAllCategoriesAllLangs()
+void CategoriesIndex::AddAllCategoriesInAllLangs()
{
- m_catHolder.ForEachTypeAndCategory([this](uint32_t type, Category const & cat)
+ m_catHolder.ForEachTypeAndCategory([this](uint32_t type, TCategory const & cat)
{
for (auto const & name : cat.m_synonyms)
- AddAllSubstrings(m_trie, name.m_name, type);
+ TokenizeAndAddAllSubstrings(m_trie, name.m_name, type);
});
}
-void CategoriesIndex::GetCategories(string const & query, vector<Category> & result)
+void CategoriesIndex::GetCategories(string const & query, vector<TCategory> & result) const
{
vector<uint32_t> types;
GetAssociatedTypes(query, types);
my::SortUnique(types);
- m_catHolder.ForEachTypeAndCategory([&](uint32_t type, Category const & cat)
+ m_catHolder.ForEachTypeAndCategory([&](uint32_t type, TCategory const & cat)
{
if (binary_search(types.begin(), types.end(), type))
result.push_back(cat);
});
}
-void CategoriesIndex::GetAssociatedTypes(string const & query, vector<uint32_t> & result)
+void CategoriesIndex::GetAssociatedTypes(string const & query, vector<uint32_t> & result) const
{
- set<uint32_t> types;
- auto fn = [&](string const & s, uint32_t type)
+ bool first = true;
+ set<uint32_t> intersection;
+ ForEachToken(query, [&](string const & token)
{
- types.insert(type);
- };
- m_trie.ForEachInSubtree(query, fn);
- result.insert(result.end(), types.begin(), types.end());
+ set<uint32_t> types;
+ auto fn = [&](string const &, uint32_t type)
+ {
+ types.insert(type);
+ };
+ m_trie.ForEachInSubtree(token, fn);
+ if (first)
+ {
+ intersection.swap(types);
+ }
+ else
+ {
+ set<uint32_t> tmp;
+ set_intersection(intersection.begin(),intersection.end(),types.begin(),types.end(),inserter(tmp,tmp.begin()));
+ intersection.swap(tmp);
+ }
+ first = false;
+ });
+
+ result.insert(result.end(), intersection.begin(), intersection.end());
}
} // namespace indexer
diff --git a/indexer/categories_index.hpp b/indexer/categories_index.hpp
index 0429976096..09859499ad 100644
--- a/indexer/categories_index.hpp
+++ b/indexer/categories_index.hpp
@@ -16,7 +16,7 @@ namespace indexer
class CategoriesIndex
{
public:
- using Category = CategoriesHolder::Category;
+ using TCategory = CategoriesHolder::Category;
CategoriesIndex() : m_catHolder(GetDefaultCategories()) {}
@@ -36,21 +36,22 @@ public:
void AddAllCategoriesInLang(int8_t lang);
// Adds all categories from data/classificator.txt.
- void AddAllCategoriesAllLangs();
+ void AddAllCategoriesInAllLangs();
// Returns all categories that have |query| as a substring. Note
// that all synonyms for a category are contained in a returned
// value even if only one language was used when adding this
// category's name to index.
// Beware weird results when query is a malformed UTF-8 string.
- void GetCategories(string const & query, vector<Category> & result);
+ void GetCategories(string const & query, vector<TCategory> & result) const;
// Returns all types that match to categories that have |query| as substring.
// Beware weird results when query is a malformed UTF-8 string.
- void GetAssociatedTypes(string const & query, vector<uint32_t> & result);
+ // Note: no types are returned if the query is empty.
+ void GetAssociatedTypes(string const & query, vector<uint32_t> & result) const;
#ifdef DEBUG
- int GetNumTrieNodes() { return m_trie.GetNumNodes(); }
+ inline int GetNumTrieNodes() const { return m_trie.GetNumNodes(); }
#endif
private:
diff --git a/indexer/indexer_tests/categories_test.cpp b/indexer/indexer_tests/categories_test.cpp
index 3e46c2834c..d7b340d305 100644
--- a/indexer/indexer_tests/categories_test.cpp
+++ b/indexer/indexer_tests/categories_test.cpp
@@ -16,7 +16,7 @@
using namespace indexer;
-char const * g_testCategoriesTxt =
+char const g_testCategoriesTxt[] =
"amenity-bench\n"
"en:1bench|sit down|to sit\n"
"de:0bank|auf die strafbank schicken\n"
@@ -94,7 +94,7 @@ UNIT_TEST(LoadCategories)
{
classificator::Load();
- CategoriesHolder h(make_unique<MemReader>(g_testCategoriesTxt, strlen(g_testCategoriesTxt)));
+ CategoriesHolder h(make_unique<MemReader>(g_testCategoriesTxt, sizeof(g_testCategoriesTxt) - 1));
size_t count = 0;
Checker f(count);
h.ForEachCategory(f);
@@ -105,62 +105,110 @@ UNIT_TEST(CategoriesIndex_Smoke)
{
classificator::Load();
- CategoriesHolder catHolder(
- make_unique<MemReader>(g_testCategoriesTxt, strlen(g_testCategoriesTxt)));
- CategoriesIndex catIndex(catHolder);
-
+ CategoriesHolder holder(
+ make_unique<MemReader>(g_testCategoriesTxt, sizeof(g_testCategoriesTxt) - 1));
+ CategoriesIndex index(holder);
+
uint32_t type1 = classif().GetTypeByPath({"amenity", "bench"});
uint32_t type2 = classif().GetTypeByPath({"place", "village"});
if (type1 > type2)
swap(type1, type2);
int8_t lang1 = CategoriesHolder::MapLocaleToInteger("en");
int8_t lang2 = CategoriesHolder::MapLocaleToInteger("de");
-
- auto testTypes = [&](string const & query, vector<uint32_t> && expected)
+
+ auto testTypes = [&](string const & query, vector<uint32_t> const & expected)
{
vector<uint32_t> result;
- catIndex.GetAssociatedTypes(query, result);
+ index.GetAssociatedTypes(query, result);
TEST_EQUAL(result, expected, (query));
};
-
- catIndex.AddCategoryByTypeAndLang(type1, lang1);
+
+ index.AddCategoryByTypeAndLang(type1, lang1);
testTypes("bench", {type1});
+ testTypes("BENCH", {type1});
testTypes("down", {type1});
testTypes("benck", {});
testTypes("strafbank", {});
- catIndex.AddCategoryByTypeAndLang(type1, lang2);
+ index.AddCategoryByTypeAndLang(type1, lang2);
testTypes("strafbank", {type1});
- catIndex.AddCategoryByTypeAndLang(type2, lang1);
+ testTypes("ie strafbank sc", {type1});
+ testTypes("rafb", {type1});
+ index.AddCategoryByTypeAndLang(type2, lang1);
testTypes("i", {type1, type2});
+
+ CategoriesIndex fullIndex(holder);
+ fullIndex.AddCategoryByTypeAllLangs(type1);
+ fullIndex.AddCategoryByTypeAllLangs(type2);
+ vector<CategoriesHolder::Category> cats;
+
+ // The letter 'a' matches "strafbank" and "village".
+ // One language is not enough.
+ fullIndex.GetCategories("a", cats);
+
+ TEST_EQUAL(cats.size(), 2, ());
+
+ TEST_EQUAL(cats[0].m_synonyms.size(), 8, ());
+ TEST_EQUAL(cats[0].m_synonyms[4].m_locale, CategoriesHolder::MapLocaleToInteger("de"), ());
+ TEST_EQUAL(cats[0].m_synonyms[4].m_name, "auf die strafbank schicken", ());
+
+ TEST_EQUAL(cats[1].m_synonyms.size(), 3, ());
+ TEST_EQUAL(cats[1].m_synonyms[0].m_locale, CategoriesHolder::MapLocaleToInteger("en"), ());
+ TEST_EQUAL(cats[1].m_synonyms[0].m_name, "village", ());
}
-UNIT_TEST(CategoriesIndex_AllCategories)
+UNIT_TEST(CategoriesIndex_MultipleTokens)
{
+ char const kCategories[] =
+ "shop-bakery\n"
+ "en:shop of buns\n"
+ "\n"
+ "shop-butcher\n"
+ "en:shop of meat";
+
classificator::Load();
+ CategoriesHolder holder(make_unique<MemReader>(kCategories, sizeof(kCategories) - 1));
+ CategoriesIndex index(holder);
+
+ index.AddAllCategoriesInAllLangs();
+ auto testTypes = [&](string const & query, vector<uint32_t> const & expected)
+ {
+ vector<uint32_t> result;
+ index.GetAssociatedTypes(query, result);
+ TEST_EQUAL(result, expected, (query));
+ };
- CategoriesIndex catIndex;
+ uint32_t type1 = classif().GetTypeByPath({"shop", "bakery"});
+ uint32_t type2 = classif().GetTypeByPath({"shop", "butcher"});
+ if (type1 > type2)
+ swap(type1, type2);
+
+ testTypes("shop", {type1, type2});
+ testTypes("shop buns", {type1});
+ testTypes("shop meat", {type2});
+}
- catIndex.AddAllCategoriesAllLangs();
- vector<uint32_t> types;
- catIndex.GetAssociatedTypes("", types);
- TEST_LESS(types.size(), 300, ());
#ifdef DEBUG
- TEST_LESS(catIndex.GetNumTrieNodes(), 400000, ());
-#endif
+// A check that this data structure is not too heavy.
+UNIT_TEST(CategoriesIndex_AllCategories)
+{
+ classificator::Load();
+
+ CategoriesIndex index;
+
+ index.AddAllCategoriesInAllLangs();
+ TEST_LESS(index.GetNumTrieNodes(), 250000, ());
}
+#endif
+#ifdef DEBUG
+// A check that this data structure is not too heavy.
UNIT_TEST(CategoriesIndex_AllCategoriesEnglishName)
{
classificator::Load();
- CategoriesIndex catIndex;
+ CategoriesIndex index;
- catIndex.AddAllCategoriesInLang(CategoriesHolder::MapLocaleToInteger("en"));
- vector<uint32_t> types;
- catIndex.GetAssociatedTypes("", types);
- my::SortUnique(types);
- TEST_LESS(types.size(), 300, ());
-#ifdef DEBUG
- TEST_LESS(catIndex.GetNumTrieNodes(), 10000, ());
-#endif
+ index.AddAllCategoriesInLang(CategoriesHolder::MapLocaleToInteger("en"));
+ TEST_LESS(index.GetNumTrieNodes(), 6000, ());
}
+#endif