Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMaxim Pimenov <m@maps.me>2018-12-24 20:19:10 +0300
committerTatiana Yan <tatiana.kondakova@gmail.com>2018-12-26 14:00:57 +0300
commitc00402fdd220ba51153c2baa46a565e2d2eef377 (patch)
tree1b5b0a32e820cef1d1c02eea6e3793518414b370 /geocoder
parent087b82dbe94377f8a0bc6df3fad2b2e3a20c10c4 (diff)
[geocoder] Separated Hierarchy and Index.
Diffstat (limited to 'geocoder')
-rw-r--r--geocoder/CMakeLists.txt2
-rw-r--r--geocoder/geocoder.cpp14
-rw-r--r--geocoder/geocoder.hpp5
-rw-r--r--geocoder/geocoder_tests/geocoder_tests.cpp2
-rw-r--r--geocoder/hierarchy.cpp127
-rw-r--r--geocoder/hierarchy.hpp31
-rw-r--r--geocoder/index.cpp127
-rw-r--r--geocoder/index.hpp48
8 files changed, 226 insertions, 130 deletions
diff --git a/geocoder/CMakeLists.txt b/geocoder/CMakeLists.txt
index d10f7ce489..a90fa2a41a 100644
--- a/geocoder/CMakeLists.txt
+++ b/geocoder/CMakeLists.txt
@@ -9,6 +9,8 @@ set(
geocoder.hpp
hierarchy.cpp
hierarchy.hpp
+ index.cpp
+ index.hpp
result.cpp
result.hpp
types.cpp
diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp
index 98698d0b9d..6b9a091877 100644
--- a/geocoder/geocoder.cpp
+++ b/geocoder/geocoder.cpp
@@ -209,7 +209,10 @@ vector<Geocoder::Layer> & Geocoder::Context::GetLayers() { return m_layers; }
vector<Geocoder::Layer> const & Geocoder::Context::GetLayers() const { return m_layers; }
// Geocoder ----------------------------------------------------------------------------------------
-Geocoder::Geocoder(string const & pathToJsonHierarchy) : m_hierarchy(pathToJsonHierarchy) {}
+Geocoder::Geocoder(string const & pathToJsonHierarchy)
+ : m_hierarchy(pathToJsonHierarchy), m_index(m_hierarchy)
+{
+}
void Geocoder::ProcessQuery(string const & query, vector<Result> & results) const
{
@@ -227,6 +230,8 @@ void Geocoder::ProcessQuery(string const & query, vector<Result> & results) cons
Hierarchy const & Geocoder::GetHierarchy() const { return m_hierarchy; }
+Index const & Geocoder::GetIndex() const { return m_index; }
+
void Geocoder::Go(Context & ctx, Type type) const
{
if (ctx.GetNumTokens() == 0)
@@ -313,7 +318,10 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer
for (auto const & se : layer.m_entries)
{
- for (auto const & be : se->m_buildingsOnStreet)
+ auto const * buildings = m_index.GetBuildingsOnStreet(se->m_osmId);
+ if (buildings == nullptr)
+ continue;
+ for (auto const & be : *buildings)
{
auto const bt = static_cast<size_t>(Type::Building);
auto const & realHN = MakeHouseNumber(be->m_address[bt]);
@@ -326,7 +334,7 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer
void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
Layer & curLayer) const
{
- auto const * entries = m_hierarchy.GetEntries(subquery);
+ auto const * entries = m_index.GetEntries(subquery);
if (!entries || entries->empty())
return;
diff --git a/geocoder/geocoder.hpp b/geocoder/geocoder.hpp
index 4ff93fd3af..ccf8e1be60 100644
--- a/geocoder/geocoder.hpp
+++ b/geocoder/geocoder.hpp
@@ -2,6 +2,7 @@
#include "geocoder/beam.hpp"
#include "geocoder/hierarchy.hpp"
+#include "geocoder/index.hpp"
#include "geocoder/result.hpp"
#include "geocoder/types.hpp"
@@ -123,6 +124,8 @@ public:
Hierarchy const & GetHierarchy() const;
+ Index const & GetIndex() const;
+
private:
void Go(Context & ctx, Type type) const;
@@ -132,5 +135,7 @@ private:
Layer & curLayer) const;
Hierarchy m_hierarchy;
+
+ Index m_index;
};
} // namespace geocoder
diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp
index 504dcf9b32..111c6f718a 100644
--- a/geocoder/geocoder_tests/geocoder_tests.cpp
+++ b/geocoder/geocoder_tests/geocoder_tests.cpp
@@ -72,7 +72,7 @@ UNIT_TEST(Geocoder_Hierarchy)
ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData);
Geocoder geocoder(regionsJsonFile.GetFullPath());
- auto entries = geocoder.GetHierarchy().GetEntries({("florencia")});
+ auto entries = geocoder.GetIndex().GetEntries({("florencia")});
TEST(entries, ());
TEST_EQUAL(entries->size(), 1, ());
diff --git a/geocoder/hierarchy.cpp b/geocoder/hierarchy.cpp
index bf8c43ecab..e0457166ee 100644
--- a/geocoder/hierarchy.cpp
+++ b/geocoder/hierarchy.cpp
@@ -2,7 +2,6 @@
#include "indexer/search_string_utils.hpp"
-#include "base/assert.hpp"
#include "base/exception.hpp"
#include "base/logging.hpp"
#include "base/macros.hpp"
@@ -19,11 +18,6 @@ namespace
{
// Information will be logged for every |kLogBatch| entries.
size_t const kLogBatch = 100000;
-
-string MakeIndexKey(geocoder::Tokens const & tokens)
-{
- return strings::JoinStrings(tokens, " ");
-}
} // namespace
namespace geocoder
@@ -123,7 +117,7 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
if (line.empty())
continue;
- auto i = line.find(' ');
+ auto const i = line.find(' ');
int64_t encodedId;
if (i == string::npos || !strings::to_any(line.substr(0, i), encodedId))
{
@@ -147,24 +141,39 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
if (stats.m_numLoaded % kLogBatch == 0)
LOG(LINFO, ("Read", stats.m_numLoaded, "entries"));
- m_entriesStorage.emplace_back(move(entry));
+ m_entries.emplace_back(move(entry));
}
if (stats.m_numLoaded % kLogBatch != 0)
LOG(LINFO, ("Read", stats.m_numLoaded, "entries"));
LOG(LINFO, ("Sorting entries..."));
- sort(m_entriesStorage.begin(), m_entriesStorage.end());
+ sort(m_entries.begin(), m_entries.end());
- LOG(LINFO, ("Indexing entries..."));
- IndexEntries();
- LOG(LINFO, ("Indexing houses..."));
- IndexHouses();
+ {
+ size_t i = 0;
+ while (i < m_entries.size())
+ {
+ size_t j = i + 1;
+ while (j < m_entries.size() && m_entries[i].m_osmId == m_entries[j].m_osmId)
+ ++j;
+ if (j != i + 1)
+ {
+ ++stats.m_duplicateOsmIds;
+ // todo Remove the cast when the hierarchies no longer contain negative keys.
+ LOG(LDEBUG,
+ ("Duplicate osm id:", static_cast<int64_t>(m_entries[i].m_osmId.GetEncodedId()), "(",
+ m_entries[i].m_osmId, ")", "occurs as a key in", j - i, "key-value entries."));
+ }
+ i = j;
+ }
+ }
LOG(LINFO, ("Finished reading and indexing the hierarchy. Stats:"));
- LOG(LINFO, ("Entries indexed:", stats.m_numLoaded));
+ LOG(LINFO, ("Entries loaded:", stats.m_numLoaded));
LOG(LINFO, ("Corrupted json lines:", stats.m_badJsons));
LOG(LINFO, ("Unreadable base::GeoObjectIds:", stats.m_badOsmIds));
+ LOG(LINFO, ("Duplicate base::GeoObjectIds:", stats.m_duplicateOsmIds));
LOG(LINFO, ("Entries with duplicate address parts:", stats.m_duplicateAddresses));
LOG(LINFO, ("Entries without address:", stats.m_emptyAddresses));
LOG(LINFO, ("Entries without names:", stats.m_emptyNames));
@@ -173,13 +182,9 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
LOG(LINFO, ("(End of stats.)"));
}
-vector<Hierarchy::Entry *> const * const Hierarchy::GetEntries(Tokens const & tokens) const
+vector<Hierarchy::Entry> const & Hierarchy::GetEntries() const
{
- auto it = m_entriesByTokens.find(MakeIndexKey(tokens));
- if (it == m_entriesByTokens.end())
- return {};
-
- return &it->second;
+ return m_entries;
}
Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & osmId) const
@@ -188,89 +193,11 @@ Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & o
return e.m_osmId < id;
};
- auto it = lower_bound(m_entriesStorage.begin(), m_entriesStorage.end(), osmId, cmp);
+ auto const it = lower_bound(m_entries.begin(), m_entries.end(), osmId, cmp);
- if (it == m_entriesStorage.end() || it->m_osmId != osmId)
+ if (it == m_entries.end() || it->m_osmId != osmId)
return nullptr;
return &(*it);
}
-
-void Hierarchy::IndexEntries()
-{
- size_t numIndexed = 0;
- for (Entry & e : m_entriesStorage)
- {
- // The entry is indexed only by its address.
- // todo(@m) Index it by name too.
- if (e.m_type == Type::Count)
- continue;
-
- if (e.m_type == Type::Street)
- {
- IndexStreet(e);
- }
- else
- {
- size_t const t = static_cast<size_t>(e.m_type);
- m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
- }
-
- // Index every token but do not index prefixes.
- // for (auto const & tok : entry.m_address[t])
- // m_entriesByTokens[{tok}].emplace_back(entry);
-
- ++numIndexed;
- if (numIndexed % kLogBatch == 0)
- LOG(LINFO, ("Indexed", numIndexed, "entries"));
- }
-
- if (numIndexed % kLogBatch != 0)
- LOG(LINFO, ("Indexed", numIndexed, "entries"));
-}
-
-void Hierarchy::IndexStreet(Entry & e)
-{
- CHECK_EQUAL(e.m_type, Type::Street, ());
- size_t const t = static_cast<size_t>(e.m_type);
- m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
-
- for (size_t i = 0; i < e.m_address[t].size(); ++i)
- {
- if (!search::IsStreetSynonym(strings::MakeUniString(e.m_address[t][i])))
- continue;
- auto addr = e.m_address[t];
- addr.erase(addr.begin() + i);
- m_entriesByTokens[MakeIndexKey(addr)].emplace_back(&e);
- }
-}
-
-void Hierarchy::IndexHouses()
-{
- size_t numIndexed = 0;
- for (auto const & be : m_entriesStorage)
- {
- if (be.m_type != Type::Building)
- continue;
-
- size_t const t = static_cast<size_t>(Type::Street);
-
- auto const * streetCandidates = GetEntries(be.m_address[t]);
- if (streetCandidates == nullptr)
- continue;
-
- for (auto & se : *streetCandidates)
- {
- if (se->IsParentTo(be))
- se->m_buildingsOnStreet.emplace_back(&be);
- }
-
- ++numIndexed;
- if (numIndexed % kLogBatch == 0)
- LOG(LINFO, ("Indexed", numIndexed, "houses"));
- }
-
- if (numIndexed % kLogBatch != 0)
- LOG(LINFO, ("Indexed", numIndexed, "houses"));
-}
} // namespace geocoder
diff --git a/geocoder/hierarchy.hpp b/geocoder/hierarchy.hpp
index 0197279d4e..6bf1535843 100644
--- a/geocoder/hierarchy.hpp
+++ b/geocoder/hierarchy.hpp
@@ -8,8 +8,6 @@
#include <cstddef>
#include <cstdint>
#include <string>
-#include <unordered_map>
-#include <utility>
#include <vector>
#include "3party/jansson/myjansson.hpp"
@@ -30,6 +28,9 @@ public:
// Number of entries with unreadable base::GeoObjectIds.
uint64_t m_badOsmIds = 0;
+ // Number of base::GeoObjectsIds that occur as keys in at least two entries.
+ uint64_t m_duplicateOsmIds = 0;
+
// Number of entries with duplicate subfields in the address field.
uint64_t m_duplicateAddresses = 0;
@@ -72,37 +73,15 @@ public:
// The address fields of this entry, one per Type.
std::array<Tokens, static_cast<size_t>(Type::Count) + 1> m_address;
-
- // List of houses that belong to the street that is desribed by this entry.
- // Only valid if |m_type| is Type::Street.
- std::vector<Entry const *> m_buildingsOnStreet;
};
explicit Hierarchy(std::string const & pathToJsonHierarchy);
- // Returns a pointer to entries whose names exactly match |tokens|
- // (the order matters) or nullptr if there are no such entries.
- //
- // todo This method (and the whole class, in fact) is in the
- // prototype stage and may be too slow. Proper indexing should
- // be implemented to perform this type of queries.
- std::vector<Entry *> const * const GetEntries(Tokens const & tokens) const;
+ std::vector<Entry> const & GetEntries() const;
Entry const * GetEntryForOsmId(base::GeoObjectId const & osmId) const;
private:
- // Adds address information of entries to the index.
- void IndexEntries();
-
- // Adds the street |e| to the index, with and without synonyms
- // of the word "street".
- void IndexStreet(Entry & e);
-
- // Fills |m_buildingsOnStreet| field for all street entries.
- void IndexHouses();
-
- std::unordered_map<std::string, std::vector<Entry *>> m_entriesByTokens;
-
- std::vector<Entry> m_entriesStorage;
+ std::vector<Entry> m_entries;
};
} // namespace geocoder
diff --git a/geocoder/index.cpp b/geocoder/index.cpp
new file mode 100644
index 0000000000..512668dfdf
--- /dev/null
+++ b/geocoder/index.cpp
@@ -0,0 +1,127 @@
+#include "geocoder/index.hpp"
+
+#include "geocoder/types.hpp"
+
+#include "indexer/search_string_utils.hpp"
+
+#include "base/assert.hpp"
+#include "base/logging.hpp"
+#include "base/string_utils.hpp"
+
+using namespace std;
+
+namespace
+{
+// Information will be logged for every |kLogBatch| entries.
+size_t const kLogBatch = 100000;
+
+string MakeIndexKey(geocoder::Tokens const & tokens) { return strings::JoinStrings(tokens, " "); }
+} // namespace
+
+namespace geocoder
+{
+Index::Index(Hierarchy const & hierarchy) : m_entries(hierarchy.GetEntries())
+{
+ LOG(LINFO, ("Indexing entries..."));
+ AddEntries();
+ LOG(LINFO, ("Indexing houses..."));
+ AddHouses();
+}
+
+vector<Index::EntryPtr> const * const Index::GetEntries(Tokens const & tokens) const
+{
+ auto const it = m_entriesByTokens.find(MakeIndexKey(tokens));
+ if (it == m_entriesByTokens.end())
+ return {};
+
+ return &it->second;
+}
+
+vector<Index::EntryPtr> const * const Index::GetBuildingsOnStreet(
+ base::GeoObjectId const & osmId) const
+{
+ auto const it = m_buildingsOnStreet.find(osmId);
+ if (it == m_buildingsOnStreet.end())
+ return {};
+
+ return &it->second;
+}
+
+void Index::AddEntries()
+{
+ size_t numIndexed = 0;
+ for (auto const & e : m_entries)
+ {
+ // The entry is indexed only by its address.
+ // todo(@m) Index it by name too.
+ if (e.m_type == Type::Count)
+ continue;
+
+ if (e.m_type == Type::Street)
+ {
+ AddStreet(e);
+ }
+ else
+ {
+ size_t const t = static_cast<size_t>(e.m_type);
+ m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
+ }
+
+ // Index every token but do not index prefixes.
+ // for (auto const & tok : entry.m_address[t])
+ // m_entriesByTokens[{tok}].emplace_back(entry);
+
+ ++numIndexed;
+ if (numIndexed % kLogBatch == 0)
+ LOG(LINFO, ("Indexed", numIndexed, "entries"));
+ }
+
+ if (numIndexed % kLogBatch != 0)
+ LOG(LINFO, ("Indexed", numIndexed, "entries"));
+}
+
+void Index::AddStreet(Hierarchy::Entry const & e)
+{
+ CHECK_EQUAL(e.m_type, Type::Street, ());
+ size_t const t = static_cast<size_t>(e.m_type);
+ m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
+
+ for (size_t i = 0; i < e.m_address[t].size(); ++i)
+ {
+ if (!search::IsStreetSynonym(strings::MakeUniString(e.m_address[t][i])))
+ continue;
+ auto addr = e.m_address[t];
+ addr.erase(addr.begin() + i);
+ m_entriesByTokens[MakeIndexKey(addr)].emplace_back(&e);
+ }
+}
+
+void Index::AddHouses()
+{
+ size_t numIndexed = 0;
+ for (auto & be : m_entries)
+ {
+ if (be.m_type != Type::Building)
+ continue;
+
+ size_t const t = static_cast<size_t>(Type::Street);
+
+ auto const * streetCandidates = GetEntries(be.m_address[t]);
+ if (streetCandidates == nullptr)
+ continue;
+
+ for (auto & se : *streetCandidates)
+ {
+ if (se->IsParentTo(be))
+ m_buildingsOnStreet[se->m_osmId].emplace_back(&be);
+ }
+
+ ++numIndexed;
+ if (numIndexed % kLogBatch == 0)
+ LOG(LINFO, ("Indexed", numIndexed, "houses"));
+ }
+
+ if (numIndexed % kLogBatch != 0)
+ LOG(LINFO, ("Indexed", numIndexed, "houses"));
+}
+} // namespace geocoder
diff --git a/geocoder/index.hpp b/geocoder/index.hpp
new file mode 100644
index 0000000000..569ebb2e40
--- /dev/null
+++ b/geocoder/index.hpp
@@ -0,0 +1,48 @@
+#pragma once
+
+#include "geocoder/hierarchy.hpp"
+
+#include "base/geo_object_id.hpp"
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace geocoder
+{
+class Index
+{
+public:
+ using EntryPtr = Hierarchy::Entry const *;
+
+ explicit Index(Hierarchy const & hierarchy);
+
+ // Returns a pointer to entries whose names exactly match |tokens|
+ // (the order matters) or nullptr if there are no such entries.
+ //
+ // todo This method (and the whole class, in fact) is in the
+ // prototype stage and may be too slow. Proper indexing should
+ // be implemented to perform this type of queries.
+ std::vector<EntryPtr> const * const GetEntries(Tokens const & tokens) const;
+
+ std::vector<EntryPtr> const * const GetBuildingsOnStreet(base::GeoObjectId const & osmId) const;
+
+private:
+ // Adds address information of entries to the index.
+ void AddEntries();
+
+ // Adds the street |e| to the index, with and without synonyms
+ // of the word "street".
+ void AddStreet(Hierarchy::Entry const & e);
+
+ // Fills the |m_buildingsOnStreet| field.
+ void AddHouses();
+
+ std::vector<Hierarchy::Entry> const & m_entries;
+
+ std::unordered_map<std::string, std::vector<EntryPtr>> m_entriesByTokens;
+
+ // Lists of houses grouped by the streets they belong to.
+ std::unordered_map<base::GeoObjectId, std::vector<EntryPtr>> m_buildingsOnStreet;
+};
+} // namespace geocoder