Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/base
diff options
context:
space:
mode:
authorYuri Gorshenin <y@maps.me>2016-11-03 15:26:10 +0300
committerYuri Gorshenin <y@maps.me>2016-11-07 14:36:19 +0300
commit59fdb701e0b6afd4e44be1ee8d319a93224c1688 (patch)
tree3751387bdb005414e7a25323896ead875a2a3bde /base
parentdf752eb5e31499c7cb7377b7ef0f20e8c154ef63 (diff)
[base] Levenshtein DFA.
Diffstat (limited to 'base')
-rw-r--r--base/base.pro2
-rw-r--r--base/base_tests/base_tests.pro1
-rw-r--r--base/base_tests/levenshtein_dfa_test.cpp90
-rw-r--r--base/levenshtein_dfa.cpp307
-rw-r--r--base/levenshtein_dfa.hpp124
5 files changed, 524 insertions, 0 deletions
diff --git a/base/base.pro b/base/base.pro
index 803d1b544a..9f3fe820b7 100644
--- a/base/base.pro
+++ b/base/base.pro
@@ -14,6 +14,7 @@ SOURCES += \
exception.cpp \
gmtime.cpp \
internal/message.cpp \
+ levenshtein_dfa.cpp \
logging.cpp \
lower_case.cpp \
normalize_unicode.cpp \
@@ -48,6 +49,7 @@ HEADERS += \
exception.hpp \
gmtime.hpp \
internal/messagex.hpp \
+ levenshtein_dfa.hpp \
limited_priority_queue.hpp \
logging.hpp \
macros.hpp \
diff --git a/base/base_tests/base_tests.pro b/base/base_tests/base_tests.pro
index 7fa3e5a31b..1a4f2bb9ec 100644
--- a/base/base_tests/base_tests.pro
+++ b/base/base_tests/base_tests.pro
@@ -22,6 +22,7 @@ SOURCES += \
condition_test.cpp \
const_helper.cpp \
containers_test.cpp \
+ levenshtein_dfa_test.cpp \
logging_test.cpp \
newtype_test.cpp \
math_test.cpp \
diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp
new file mode 100644
index 0000000000..c671b1aa1e
--- /dev/null
+++ b/base/base_tests/levenshtein_dfa_test.cpp
@@ -0,0 +1,90 @@
+#include "testing/testing.hpp"
+
+#include "base/levenshtein_dfa.hpp"
+
+#include "std/string.hpp"
+
+using namespace strings;
+
+namespace
+{
+enum class Status
+{
+ Accepts,
+ Rejects,
+ Intermediate
+};
+
+Status GetStatus(LevenshteinDFA const & dfa, string const & s)
+{
+ auto it = dfa.Begin();
+ it.Move(s);
+ if (it.Accepts())
+ return Status::Accepts;
+ if (it.Rejects())
+ return Status::Rejects;
+ return Status::Intermediate;
+}
+
+bool Accepts(LevenshteinDFA const & dfa, string const & s)
+{
+ return GetStatus(dfa, s) == Status::Accepts;
+}
+
+bool Rejects(LevenshteinDFA const & dfa, string const & s)
+{
+ return GetStatus(dfa, s) == Status::Rejects;
+}
+
+bool Intermediate(LevenshteinDFA const & dfa, string const & s)
+{
+ return GetStatus(dfa, s) == Status::Intermediate;
+}
+
+UNIT_TEST(LevenshteinDFA_Smoke)
+{
+ {
+ LevenshteinDFA dfa("", 0 /* maxErrors */);
+
+ auto it = dfa.Begin();
+ TEST(it.Accepts(), ());
+ TEST(!it.Rejects(), ());
+
+ it.Move('a');
+ TEST(!it.Accepts(), ());
+ TEST(it.Rejects(), ());
+
+ it.Move('b');
+ TEST(!it.Accepts(), ());
+ TEST(it.Rejects(), ());
+ }
+
+ {
+ LevenshteinDFA dfa("abc", 1 /* maxErrors */);
+
+ TEST(Accepts(dfa, "ab"), ());
+ TEST(Accepts(dfa, "abd"), ());
+ TEST(Accepts(dfa, "abcd"), ());
+ TEST(Accepts(dfa, "bbc"), ());
+
+ TEST(Rejects(dfa, "cba"), ());
+ TEST(Rejects(dfa, "abcde"), ());
+
+ TEST(Accepts(dfa, "ac"), ());
+ TEST(Intermediate(dfa, "acb"), ()); // transpositions are not supported yet
+ TEST(Accepts(dfa, "acbc"), ());
+ TEST(Rejects(dfa, "acbd"), ());
+
+ TEST(Intermediate(dfa, "a"), ());
+ }
+
+ {
+ LevenshteinDFA dfa("ленинградский", 2 /* maxErrors */);
+
+ TEST(Accepts(dfa, "ленинградский"), ());
+ TEST(Accepts(dfa, "ленингадский"), ());
+ TEST(Accepts(dfa, "ленигнрадский"), ());
+ TEST(Rejects(dfa, "ленинский"), ());
+ }
+}
+} // namespace
diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp
new file mode 100644
index 0000000000..c0741c7aa8
--- /dev/null
+++ b/base/levenshtein_dfa.cpp
@@ -0,0 +1,307 @@
+#include "base/levenshtein_dfa.hpp"
+
+#include "base/assert.hpp"
+#include "base/logging.hpp"
+#include "base/stl_helpers.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/queue.hpp"
+#include "std/set.hpp"
+#include "std/sstream.hpp"
+#include "std/vector.hpp"
+
+namespace strings
+{
+namespace
+{
+class TransitionTable
+{
+public:
+ TransitionTable(UniString const & s, uint8_t maxErrors)
+ : m_s(s), m_size(s.size()), m_maxErrors(maxErrors)
+ {
+ }
+
+ void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t)
+ {
+ t.Clear();
+ for (auto const & p : s.m_positions)
+ GetMoves(p, c, t);
+ t.Normalize();
+ }
+
+private:
+ void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t)
+ {
+ auto & ps = t.m_positions;
+
+ if (p.m_numErrors < m_maxErrors)
+ {
+ if (p.m_offset + 1 < m_size)
+ {
+ size_t i;
+ if (FindRelevant(p, c, i))
+ {
+ if (i == 0)
+ {
+ ps.emplace_back(p.m_offset + 1, p.m_numErrors);
+ }
+ else
+ {
+ ps.emplace_back(p.m_offset, p.m_numErrors + 1);
+ ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
+ ps.emplace_back(p.m_offset + i + 1, p.m_numErrors + i);
+ }
+ }
+ else
+ {
+ ps.emplace_back(p.m_offset, p.m_numErrors + 1);
+ ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
+ }
+ }
+ else if (p.m_offset + 1 == m_size)
+ {
+ if (m_s[p.m_offset] == c)
+ {
+ ps.emplace_back(p.m_offset + 1, p.m_numErrors);
+ }
+ else
+ {
+ ps.emplace_back(p.m_offset, p.m_numErrors + 1);
+ ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
+ }
+ }
+ else
+ {
+ ps.emplace_back(p.m_offset, p.m_numErrors + 1);
+ }
+ }
+ else if (p.m_offset < m_size && m_s[p.m_offset] == c)
+ {
+ ps.emplace_back(p.m_offset + 1, p.m_numErrors);
+ }
+ }
+
+ bool FindRelevant(LevenshteinDFA::Position const & p, UniChar c, size_t & i) const
+ {
+ size_t const limit =
+ min(m_size - p.m_offset, static_cast<size_t>(m_maxErrors - p.m_numErrors) + 1);
+
+ for (i = 0; i < limit; ++i)
+ {
+ if (m_s[p.m_offset + i] == c)
+ return true;
+ }
+ return false;
+ }
+
+ UniString const & m_s;
+ size_t const m_size;
+ uint8_t const m_maxErrors;
+};
+} // namespace
+
+// LevenshteinDFA::Position ------------------------------------------------------------------------
+LevenshteinDFA::Position::Position(size_t offset, uint8_t numErrors)
+ : m_offset(offset), m_numErrors(numErrors)
+{
+}
+
+bool LevenshteinDFA::Position::SumsumedBy(Position const & rhs) const
+{
+ if (m_numErrors <= rhs.m_numErrors)
+ return false;
+
+ size_t const delta = m_numErrors - rhs.m_numErrors;
+ if (m_offset + delta < delta)
+ return true;
+ if (rhs.m_offset + delta < delta)
+ return true;
+ return m_offset + delta >= rhs.m_offset || m_offset <= rhs.m_offset + delta;
+}
+
+bool LevenshteinDFA::Position::operator<(Position const & rhs) const
+{
+ if (m_offset != rhs.m_offset)
+ return m_offset < rhs.m_offset;
+ return m_numErrors < rhs.m_numErrors;
+}
+
+bool LevenshteinDFA::Position::operator==(Position const & rhs) const
+{
+ return m_offset == rhs.m_offset && m_numErrors == rhs.m_numErrors;
+}
+
+// LevenshteinDFA::State ---------------------------------------------------------------------------
+// static
+LevenshteinDFA::State LevenshteinDFA::State::MakeStart()
+{
+ State state;
+ state.m_positions.emplace_back(0 /* offset */, 0 /* numErrors */);
+ return state;
+}
+
+void LevenshteinDFA::State::Normalize()
+{
+ size_t j = m_positions.size();
+ for (size_t i = 0; i < j; ++i)
+ {
+ auto const & cur = m_positions[i];
+
+ auto it = find_if(m_positions.begin(), m_positions.begin() + j,
+ [&](Position const & rhs) { return cur.SumsumedBy(rhs); });
+ if (it != m_positions.begin() + j)
+ {
+ ASSERT_GREATER(j, 0, ());
+ --j;
+ swap(m_positions[i], m_positions[j]);
+ }
+ }
+
+ m_positions.erase(m_positions.begin() + j, m_positions.end());
+ my::SortUnique(m_positions);
+}
+
+// LevenshteinDFA ----------------------------------------------------------------------------------
+LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
+ : m_size(s.size()), m_maxErrors(maxErrors), m_rejecting(0)
+{
+ m_alphabet.assign(s.begin(), s.end());
+ my::SortUnique(m_alphabet);
+
+ UniChar missed = 0;
+ for (size_t i = 0; i < m_alphabet.size() && missed >= m_alphabet[i]; ++i)
+ {
+ if (missed == m_alphabet[i])
+ ++missed;
+ }
+ m_alphabet.push_back(missed);
+
+ TransitionTable table(s, maxErrors);
+ queue<pair<State, size_t>> states;
+ states.emplace(State::MakeStart(), 0);
+
+ map<State, size_t> visited;
+ visited[states.front().first] = states.front().second;
+
+ m_transitions.emplace_back(m_alphabet.size());
+
+ while (!states.empty())
+ {
+ auto const p = states.front();
+ states.pop();
+
+ auto const & curr = p.first;
+ auto const id = p.second;
+
+ ASSERT(IsValid(curr), (curr));
+
+ if (IsAccepting(curr))
+ m_accepting.insert(id);
+ if (IsRejecting(curr))
+ {
+ ASSERT(m_rejecting == 0 || m_rejecting == id,
+ ("Rejecting state must be set once:", m_rejecting, id));
+ m_rejecting = id;
+ }
+
+ ASSERT_LESS(id, m_transitions.size(), ());
+
+ for (size_t i = 0; i < m_alphabet.size(); ++i)
+ {
+ State next;
+ table.Move(curr, m_alphabet[i], next);
+
+ size_t nid;
+ auto const it = visited.find(next);
+ if (it == visited.end())
+ {
+ nid = visited.size();
+ visited[next] = nid;
+ m_transitions.emplace_back(m_alphabet.size());
+ states.emplace(next, nid);
+
+ ASSERT_EQUAL(visited.size(), m_transitions.size(), ());
+ }
+ else
+ {
+ nid = it->second;
+ }
+
+ m_transitions[id][i] = nid;
+ }
+ }
+
+ ASSERT(m_rejecting != 0, ("Rejecting state is not set"));
+}
+
+LevenshteinDFA::LevenshteinDFA(string const & s, uint8_t maxErrors)
+ : LevenshteinDFA(MakeUniString(s), maxErrors)
+{
+}
+
+bool LevenshteinDFA::IsValid(Position const & p) const
+{
+ return p.m_offset <= m_size && p.m_numErrors <= m_maxErrors;
+}
+
+bool LevenshteinDFA::IsValid(State const & s) const
+{
+ for (auto const & p : s.m_positions)
+ {
+ if (!IsValid(p))
+ return false;
+ }
+ return true;
+}
+
+bool LevenshteinDFA::IsAccepting(Position const & p) const
+{
+ return m_size - p.m_offset <= m_maxErrors - p.m_numErrors;
+}
+
+bool LevenshteinDFA::IsAccepting(State const & s) const
+{
+ for (auto const & p : s.m_positions)
+ {
+ if (IsAccepting(p))
+ return true;
+ }
+ return false;
+}
+
+size_t LevenshteinDFA::Move(size_t s, UniChar c) const
+{
+ ASSERT_GREATER(m_alphabet.size(), 0, ());
+ ASSERT(is_sorted(m_alphabet.begin(), m_alphabet.end() - 1), ());
+
+ size_t i;
+ auto const it = lower_bound(m_alphabet.begin(), m_alphabet.end() - 1, c);
+ if (it == m_alphabet.end() - 1 || *it != c)
+ i = m_alphabet.size() - 1;
+ else
+ i = distance(m_alphabet.begin(), it);
+
+ return m_transitions[s][i];
+}
+
+string DebugPrint(LevenshteinDFA::Position const & p)
+{
+ ostringstream os;
+ os << "Position [" << p.m_offset << ", " << static_cast<uint32_t>(p.m_numErrors) << "]";
+ return os.str();
+}
+
+string DebugPrint(LevenshteinDFA::State const & s)
+{
+ ostringstream os;
+ os << "State [";
+ for (size_t i = 0; i < s.m_positions.size(); ++i)
+ {
+ os << DebugPrint(s.m_positions[i]);
+ if (i + 1 != s.m_positions.size())
+ os << ", ";
+ }
+ return os.str();
+}
+} // namespace strings
diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp
new file mode 100644
index 0000000000..2632a73135
--- /dev/null
+++ b/base/levenshtein_dfa.hpp
@@ -0,0 +1,124 @@
+#pragma once
+
+#include "base/string_utils.hpp"
+
+#include "std/cstdint.hpp"
+#include "std/unordered_set.hpp"
+
+namespace strings
+{
+// This class represends a DFA recognizing a language consisting of
+// all words that are close enough to a given string, in terms of
+// Levenshtein distance. The code is based on the work "Fast String
+// Correction with Levenshtein-Automata" by Klaus U. Schulz and Stoyan
+// Mihov.
+//
+// *NOTE* The class *IS* thread-safe.
+class LevenshteinDFA
+{
+public:
+ struct Position
+ {
+ Position() = default;
+ Position(size_t offset, uint8_t numErrors);
+
+ bool SumsumedBy(Position const & rhs) const;
+
+ bool operator<(Position const & rhs) const;
+ bool operator==(Position const & rhs) const;
+
+ size_t m_offset = 0;
+ uint8_t m_numErrors = 0;
+ };
+
+ struct State
+ {
+ static State MakeStart();
+
+ void Normalize();
+ inline void Clear() { m_positions.clear(); }
+
+ inline bool operator<(State const & rhs) const { return m_positions < rhs.m_positions; }
+
+ vector<Position> m_positions;
+ };
+
+ // An iterator to the current state in the DFA.
+ //
+ // *NOTE* The class *IS NOT* thread safe. Moreover, it should not be
+ // used after destruction of the corresponding DFA.
+ class Iterator
+ {
+ public:
+ Iterator & Move(UniChar c)
+ {
+ m_s = m_dfa.Move(m_s, c);
+ return *this;
+ }
+
+ inline Iterator & Move(UniString const & s) { return Move(s.begin(), s.end()); }
+
+ inline Iterator & Move(string const & s)
+ {
+ UniString us = MakeUniString(s);
+ return Move(us);
+ }
+
+ template <typename It>
+ Iterator & Move(It begin, It end)
+ {
+ for (; begin != end; ++begin)
+ Move(*begin);
+ return *this;
+ }
+
+ bool Accepts() const { return m_dfa.IsAccepting(m_s); }
+ bool Rejects() const { return m_dfa.IsRejecting(m_s); }
+
+ private:
+ friend class LevenshteinDFA;
+
+ Iterator(LevenshteinDFA const & dfa) : m_s(0), m_dfa(dfa) {}
+
+ size_t m_s = 0;
+ LevenshteinDFA const & m_dfa;
+ };
+
+ LevenshteinDFA(UniString const & s, uint8_t maxErrors);
+ LevenshteinDFA(string const & s, uint8_t maxErrors);
+
+ inline Iterator Begin() const { return Iterator(*this); }
+
+ size_t GetNumStates() const { return m_transitions.size(); }
+ size_t GetAlphabetSize() const { return m_alphabet.size(); }
+
+private:
+ friend class Iterator;
+
+ size_t TransformChar(UniChar c) const;
+
+ bool IsValid(Position const & p) const;
+ bool IsValid(State const & s) const;
+
+ bool IsAccepting(Position const & p) const;
+ bool IsAccepting(State const & s) const;
+ inline bool IsAccepting(size_t s) const { return m_accepting.count(s) != 0; }
+
+ inline bool IsRejecting(State const & s) const { return s.m_positions.empty(); }
+ inline bool IsRejecting(size_t s) const { return s == m_rejecting; }
+
+ size_t Move(size_t s, UniChar c) const;
+
+ size_t const m_size;
+ uint8_t const m_maxErrors;
+
+ vector<UniChar> m_alphabet;
+
+ vector<vector<size_t>> m_transitions;
+ unordered_set<size_t> m_accepting;
+ size_t m_rejecting;
+};
+
+string DebugPrint(LevenshteinDFA::Position const & p);
+string DebugPrint(LevenshteinDFA::State const & s);
+} // namespace strings