Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/base
diff options
context:
space:
mode:
authorYuri Gorshenin <y@maps.me>2016-11-09 15:48:34 +0300
committerYuri Gorshenin <y@maps.me>2016-11-15 17:32:54 +0300
commitdbc93bdea03bdb33527e54fc3ebff69f7074318c (patch)
tree99df440d6608074107a9710f86ef832810405138 /base
parent71e7508675311ba3c2f3ea6b52e68d5e499f1dc2 (diff)
[search] Switch to DFAs in search index retrieval.
Diffstat (limited to 'base')
-rw-r--r--base/CMakeLists.txt8
-rw-r--r--base/base.pro5
-rw-r--r--base/base_tests/CMakeLists.txt2
-rw-r--r--base/base_tests/base_tests.pro1
-rw-r--r--base/base_tests/levenshtein_dfa_test.cpp8
-rw-r--r--base/base_tests/uni_string_dfa_test.cpp90
-rw-r--r--base/dfa_helpers.hpp66
-rw-r--r--base/levenshtein_dfa.hpp18
-rw-r--r--base/string_utils.hpp33
-rw-r--r--base/uni_string_dfa.cpp41
-rw-r--r--base/uni_string_dfa.hpp39
11 files changed, 277 insertions, 34 deletions
diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt
index 89acc66ef7..e71d7f4a6b 100644
--- a/base/CMakeLists.txt
+++ b/base/CMakeLists.txt
@@ -20,10 +20,13 @@ set(
const_helper.hpp
deferred_task.cpp
deferred_task.hpp
+ dfa_helpers.hpp
exception.cpp
exception.hpp
gmtime.cpp
gmtime.hpp
+ levenshtein_dfa.cpp
+ levenshtein_dfa.hpp
limited_priority_queue.hpp
logging.cpp
logging.hpp
@@ -71,12 +74,15 @@ set(
threaded_container.hpp
threaded_list.hpp
threaded_priority_queue.hpp
+ internal/message.cpp
+ internal/message.hpp
timegm.cpp
timegm.hpp
timer.cpp
timer.hpp
+ uni_string_dfa.cpp
+ uni_string_dfa.hpp
worker_thread.hpp
- internal/message.cpp
)
add_library(${PROJECT_NAME} ${SRC})
diff --git a/base/base.pro b/base/base.pro
index 9f3fe820b7..31f3f5bd88 100644
--- a/base/base.pro
+++ b/base/base.pro
@@ -32,6 +32,7 @@ SOURCES += \
time_samples.cpp \
timegm.cpp \
timer.cpp \
+ uni_string_dfa.cpp \
HEADERS += \
SRC_FIRST.hpp \
@@ -46,9 +47,10 @@ HEADERS += \
condition.hpp \
const_helper.hpp \
deferred_task.hpp \
+ dfa_helpers.hpp \
exception.hpp \
gmtime.hpp \
- internal/messagex.hpp \
+ internal/message.hpp \
levenshtein_dfa.hpp \
limited_priority_queue.hpp \
logging.hpp \
@@ -87,4 +89,5 @@ HEADERS += \
time_samples.hpp \
timegm.hpp \
timer.hpp \
+ uni_string_dfa.hpp \
worker_thread.hpp \
diff --git a/base/base_tests/CMakeLists.txt b/base/base_tests/CMakeLists.txt
index 3120b52052..0fa0ec6b3e 100644
--- a/base/base_tests/CMakeLists.txt
+++ b/base/base_tests/CMakeLists.txt
@@ -12,6 +12,7 @@ set(
condition_test.cpp
const_helper.cpp
containers_test.cpp
+ levenshtein_dfa_test.cpp
logging_test.cpp
math_test.cpp
matrix_test.cpp
@@ -33,6 +34,7 @@ set(
threads_test.cpp
timegm_test.cpp
timer_test.cpp
+ uni_string_dfa_test.cpp
worker_thread_test.cpp
)
diff --git a/base/base_tests/base_tests.pro b/base/base_tests/base_tests.pro
index 1a4f2bb9ec..c8708b84d4 100644
--- a/base/base_tests/base_tests.pro
+++ b/base/base_tests/base_tests.pro
@@ -44,6 +44,7 @@ SOURCES += \
threads_test.cpp \
timegm_test.cpp \
timer_test.cpp \
+ uni_string_dfa_test.cpp \
worker_thread_test.cpp \
HEADERS +=
diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp
index c671b1aa1e..f3cd896fa2 100644
--- a/base/base_tests/levenshtein_dfa_test.cpp
+++ b/base/base_tests/levenshtein_dfa_test.cpp
@@ -1,5 +1,6 @@
#include "testing/testing.hpp"
+#include "base/dfa_helpers.hpp"
#include "base/levenshtein_dfa.hpp"
#include "std/string.hpp"
@@ -18,7 +19,7 @@ enum class Status
Status GetStatus(LevenshteinDFA const & dfa, string const & s)
{
auto it = dfa.Begin();
- it.Move(s);
+ DFAMove(it, s);
if (it.Accepts())
return Status::Accepts;
if (it.Rejects())
@@ -86,5 +87,10 @@ UNIT_TEST(LevenshteinDFA_Smoke)
TEST(Accepts(dfa, "ленигнрадский"), ());
TEST(Rejects(dfa, "ленинский"), ());
}
+
+ {
+ LevenshteinDFA dfa("atm", 1 /* maxErrors */);
+ TEST(Rejects(dfa, "san"), ());
+ }
}
} // namespace
diff --git a/base/base_tests/uni_string_dfa_test.cpp b/base/base_tests/uni_string_dfa_test.cpp
new file mode 100644
index 0000000000..33191e18dd
--- /dev/null
+++ b/base/base_tests/uni_string_dfa_test.cpp
@@ -0,0 +1,90 @@
+#include "testing/testing.hpp"
+
+#include "base/dfa_helpers.hpp"
+#include "base/string_utils.hpp"
+#include "base/uni_string_dfa.hpp"
+
+using namespace strings;
+
+namespace
+{
+UNIT_TEST(UniStringDFA_Smoke)
+{
+ {
+ UniStringDFA dfa("");
+
+ auto it = dfa.Begin();
+ TEST(it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "a");
+ TEST(!it.Accepts(), ());
+ TEST(it.Rejects(), ());
+ }
+
+ {
+ UniStringDFA dfa("абв");
+
+ auto it = dfa.Begin();
+ TEST(!it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "а");
+ TEST(!it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "б");
+ TEST(!it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "в");
+ TEST(it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "г");
+ TEST(!it.Accepts(), ());
+ TEST(it.Rejects(), ());
+ }
+
+ {
+ UniStringDFA dfa("абв");
+
+ auto it = dfa.Begin();
+ TEST(!it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "а");
+ TEST(!it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "г");
+ TEST(!it.Accepts(), ());
+ TEST(it.Rejects(), ());
+ }
+}
+
+UNIT_TEST(UniStringDFA_Prefix)
+{
+ {
+ PrefixDFAModifier<UniStringDFA> dfa(UniStringDFA("abc"));
+
+ auto it = dfa.Begin();
+ DFAMove(it, "ab");
+
+ TEST(!it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "c");
+ TEST(it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "d");
+ TEST(it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ DFAMove(it, "efghijk");
+ TEST(it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+ }
+}
+} // namespace
diff --git a/base/dfa_helpers.hpp b/base/dfa_helpers.hpp
new file mode 100644
index 0000000000..5815f380f9
--- /dev/null
+++ b/base/dfa_helpers.hpp
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "base/string_utils.hpp"
+
+#include "std/string.hpp"
+
+namespace strings
+{
+template <typename DFA>
+class PrefixDFAModifier
+{
+public:
+ class Iterator
+ {
+ public:
+ Iterator & Move(strings::UniChar c)
+ {
+ if (Accepts() || Rejects())
+ return *this;
+
+ m_it.Move(c);
+ if (m_it.Accepts())
+ m_accepts = true;
+
+ return *this;
+ }
+
+ bool Accepts() const { return m_accepts; }
+ bool Rejects() const { return !Accepts() && m_it.Rejects(); }
+
+ private:
+ friend class PrefixDFAModifier;
+
+ Iterator(typename DFA::Iterator it) : m_it(it), m_accepts(m_it.Accepts()) {}
+
+ typename DFA::Iterator m_it;
+ bool m_accepts;
+ };
+
+ explicit PrefixDFAModifier(DFA const & dfa) : m_dfa(dfa) {}
+
+ Iterator Begin() const { return Iterator(m_dfa.Begin()); }
+
+private:
+ DFA const m_dfa;
+};
+
+template <typename DFAIt, typename It>
+void DFAMove(DFAIt & it, It begin, It end)
+{
+ for (; begin != end; ++begin)
+ it.Move(*begin);
+}
+
+template <typename DFAIt>
+void DFAMove(DFAIt & it, UniString const & s)
+{
+ DFAMove(it, s.begin(), s.end());
+}
+
+template <typename DFAIt>
+void DFAMove(DFAIt & it, string const & s)
+{
+ DFAMove(it, MakeUniString(s));
+}
+} // namespace strings
diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp
index cef354c458..bdf5ca5b30 100644
--- a/base/levenshtein_dfa.hpp
+++ b/base/levenshtein_dfa.hpp
@@ -71,29 +71,13 @@ public:
return *this;
}
- inline Iterator & Move(UniString const & s) { return Move(s.begin(), s.end()); }
-
- inline Iterator & Move(string const & s)
- {
- UniString us = MakeUniString(s);
- return Move(us);
- }
-
- template <typename It>
- Iterator & Move(It begin, It end)
- {
- for (; begin != end; ++begin)
- Move(*begin);
- return *this;
- }
-
bool Accepts() const { return m_dfa.IsAccepting(m_s); }
bool Rejects() const { return m_dfa.IsRejecting(m_s); }
private:
friend class LevenshteinDFA;
- Iterator(LevenshteinDFA const & dfa) : m_s(kStartingState), m_dfa(dfa) {}
+ explicit Iterator(LevenshteinDFA const & dfa) : m_s(kStartingState), m_dfa(dfa) {}
size_t m_s;
LevenshteinDFA const & m_dfa;
diff --git a/base/string_utils.hpp b/base/string_utils.hpp
index 114ce0f5d8..7875c55890 100644
--- a/base/string_utils.hpp
+++ b/base/string_utils.hpp
@@ -484,40 +484,45 @@ void ForEachMatched(string const & s, regex const & regex, TFn && fn)
fn(*cur);
}
-// Computes the minimum number of insertions, deletions and alterations
-// of characters needed to transform one string into another.
-// The function works in O(length1 * length2) time and memory
-// where length1 and length2 are the lengths of the argument strings.
-// See https://en.wikipedia.org/wiki/Levenshtein_distance.
-// One string is [b1, e1) and the other is [b2, e2). The iterator
-// form is chosen to fit both std::string and strings::UniString.
-// This function does not normalize either of the strings.
+// Computes the minimum number of insertions, deletions and
+// alterations of characters needed to transform one string into
+// another. The function works in O(length1 * length2) time and
+// O(min(length1, length2)) memory where length1 and length2 are the
+// lengths of the argument strings. See
+// https://en.wikipedia.org/wiki/Levenshtein_distance. One string is
+// [b1, e1) and the other is [b2, e2). The iterator form is chosen to
+// fit both std::string and strings::UniString. This function does
+// not normalize either of the strings.
template <typename TIter>
size_t EditDistance(TIter const & b1, TIter const & e1, TIter const & b2, TIter const & e2)
{
size_t const n = distance(b1, e1);
size_t const m = distance(b2, e2);
- // |cur| and |prev| are current and previous rows of the
+
+ if (m > n)
+ return EditDistance(b2, e2, b1, e1);
+
+ // |curr| and |prev| are current and previous rows of the
// dynamic programming table.
vector<size_t> prev(m + 1);
- vector<size_t> cur(m + 1);
+ vector<size_t> curr(m + 1);
for (size_t j = 0; j <= m; j++)
prev[j] = j;
auto it1 = b1;
// 1-based to avoid corner cases.
for (size_t i = 1; i <= n; ++i, ++it1)
{
- cur[0] = i;
+ curr[0] = i;
auto const & c1 = *it1;
auto it2 = b2;
for (size_t j = 1; j <= m; ++j, ++it2)
{
auto const & c2 = *it2;
- cur[j] = min(cur[j - 1], prev[j]) + 1;
- cur[j] = min(cur[j], prev[j - 1] + (c1 == c2 ? 0 : 1));
+ curr[j] = min(curr[j - 1], prev[j]) + 1;
+ curr[j] = min(curr[j], prev[j - 1] + (c1 == c2 ? 0 : 1));
}
- prev.swap(cur);
+ prev.swap(curr);
}
return prev[m];
}
diff --git a/base/uni_string_dfa.cpp b/base/uni_string_dfa.cpp
new file mode 100644
index 0000000000..4106bb8717
--- /dev/null
+++ b/base/uni_string_dfa.cpp
@@ -0,0 +1,41 @@
+#include "base/uni_string_dfa.hpp"
+
+#include "base/assert.hpp"
+
+namespace strings
+{
+// UniStringDFA::Iterator --------------------------------------------------------------------------
+UniStringDFA::Iterator::Iterator(UniString const & s) : m_s(s), m_pos(0), m_rejected(false) {}
+
+UniStringDFA::Iterator & UniStringDFA::Iterator::Move(UniChar c)
+{
+ if (Accepts())
+ {
+ m_rejected = true;
+ return *this;
+ }
+
+ if (Rejects())
+ return *this;
+
+ ASSERT_LESS(m_pos, m_s.size(), ());
+ if (m_s[m_pos] != c)
+ {
+ m_rejected = true;
+ return *this;
+ }
+
+ ++m_pos;
+ return *this;
+}
+
+// UniStringDFA::UniStringDFA ----------------------------------------------------------------------
+UniStringDFA::UniStringDFA(UniString const & s) : m_s(s) {}
+
+UniStringDFA::UniStringDFA(string const & s): UniStringDFA(MakeUniString(s)) {}
+
+UniStringDFA::Iterator UniStringDFA::Begin() const
+{
+ return Iterator(m_s);
+}
+} // namespace strings
diff --git a/base/uni_string_dfa.hpp b/base/uni_string_dfa.hpp
new file mode 100644
index 0000000000..f7e75085a1
--- /dev/null
+++ b/base/uni_string_dfa.hpp
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "base/logging.hpp"
+#include "base/string_utils.hpp"
+
+#include "std/string.hpp"
+
+namespace strings
+{
+class UniStringDFA
+{
+public:
+ class Iterator
+ {
+ public:
+ Iterator & Move(UniChar c);
+
+ inline bool Accepts() const { return !Rejects() && m_pos == m_s.size(); }
+ inline bool Rejects() const { return m_rejected; }
+
+ private:
+ friend class UniStringDFA;
+
+ Iterator(UniString const & s);
+
+ UniString const & m_s;
+ size_t m_pos;
+ bool m_rejected;
+ };
+
+ explicit UniStringDFA(UniString const & s);
+ explicit UniStringDFA(string const & s);
+
+ Iterator Begin() const;
+
+private:
+ UniString const m_s;
+};
+} // namespace strings