Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/base
diff options
context:
space:
mode:
authorYuri Gorshenin <y@maps.me>2016-11-07 18:21:32 +0300
committerYuri Gorshenin <y@maps.me>2016-11-08 19:13:32 +0300
commit0c4e5578a10d9bbb6cb4d64f794a9dc8cef50f15 (patch)
tree6f22395bf5dac99be201d4da9e49647314434e7d /base
parent59fdb701e0b6afd4e44be1ee8d319a93224c1688 (diff)
Review fixes.
Diffstat (limited to 'base')
-rw-r--r--base/levenshtein_dfa.cpp137
-rw-r--r--base/levenshtein_dfa.hpp32
2 files changed, 83 insertions, 86 deletions
diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp
index c0741c7aa8..6c78c4f2a4 100644
--- a/base/levenshtein_dfa.cpp
+++ b/base/levenshtein_dfa.cpp
@@ -1,7 +1,6 @@
#include "base/levenshtein_dfa.hpp"
#include "base/assert.hpp"
-#include "base/logging.hpp"
#include "base/stl_helpers.hpp"
#include "std/algorithm.hpp"
@@ -35,50 +34,28 @@ private:
{
auto & ps = t.m_positions;
- if (p.m_numErrors < m_maxErrors)
+ if (p.m_offset < m_size && m_s[p.m_offset] == c)
{
- if (p.m_offset + 1 < m_size)
- {
- size_t i;
- if (FindRelevant(p, c, i))
- {
- if (i == 0)
- {
- ps.emplace_back(p.m_offset + 1, p.m_numErrors);
- }
- else
- {
- ps.emplace_back(p.m_offset, p.m_numErrors + 1);
- ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
- ps.emplace_back(p.m_offset + i + 1, p.m_numErrors + i);
- }
- }
- else
- {
- ps.emplace_back(p.m_offset, p.m_numErrors + 1);
- ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
- }
- }
- else if (p.m_offset + 1 == m_size)
- {
- if (m_s[p.m_offset] == c)
- {
- ps.emplace_back(p.m_offset + 1, p.m_numErrors);
- }
- else
- {
- ps.emplace_back(p.m_offset, p.m_numErrors + 1);
- ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
- }
- }
- else
- {
- ps.emplace_back(p.m_offset, p.m_numErrors + 1);
- }
+ ps.emplace_back(p.m_offset + 1, p.m_numErrors);
+ return;
}
- else if (p.m_offset < m_size && m_s[p.m_offset] == c)
+
+ if (p.m_numErrors == m_maxErrors)
+ return;
+
+ ps.emplace_back(p.m_offset, p.m_numErrors + 1);
+
+ if (p.m_offset == m_size)
+ return;
+
+ ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
+
+ size_t i;
+ if (FindRelevant(p, c, i))
{
- ps.emplace_back(p.m_offset + 1, p.m_numErrors);
+ ASSERT_GREATER(i, 0, (i));
+ ASSERT_LESS_OR_EQUAL(p.m_offset + i + 1, m_size, ());
+ ps.emplace_back(p.m_offset + i + 1, p.m_numErrors + i);
}
}
@@ -101,23 +78,26 @@ private:
};
} // namespace
+// LevenshteinDFA ----------------------------------------------------------------------------------
+// static
+size_t const LevenshteinDFA::kStartingState = 0;
+size_t const LevenshteinDFA::kRejectingState = 1;
+
// LevenshteinDFA::Position ------------------------------------------------------------------------
LevenshteinDFA::Position::Position(size_t offset, uint8_t numErrors)
: m_offset(offset), m_numErrors(numErrors)
{
}
-bool LevenshteinDFA::Position::SumsumedBy(Position const & rhs) const
+bool LevenshteinDFA::Position::SubsumedBy(Position const & rhs) const
{
if (m_numErrors <= rhs.m_numErrors)
return false;
- size_t const delta = m_numErrors - rhs.m_numErrors;
- if (m_offset + delta < delta)
- return true;
- if (rhs.m_offset + delta < delta)
- return true;
- return m_offset + delta >= rhs.m_offset || m_offset <= rhs.m_offset + delta;
+ size_t const u = m_offset < rhs.m_offset ? rhs.m_offset - m_offset : m_offset - rhs.m_offset;
+ size_t const v = m_numErrors - rhs.m_numErrors;
+
+ return u <= v;
}
bool LevenshteinDFA::Position::operator<(Position const & rhs) const
@@ -141,6 +121,12 @@ LevenshteinDFA::State LevenshteinDFA::State::MakeStart()
return state;
}
+// static
+LevenshteinDFA::State LevenshteinDFA::State::MakeRejecting()
+{
+ return State();
+}
+
void LevenshteinDFA::State::Normalize()
{
size_t j = m_positions.size();
@@ -149,7 +135,7 @@ void LevenshteinDFA::State::Normalize()
auto const & cur = m_positions[i];
auto it = find_if(m_positions.begin(), m_positions.begin() + j,
- [&](Position const & rhs) { return cur.SumsumedBy(rhs); });
+ [&](Position const & rhs) { return cur.SubsumedBy(rhs); });
if (it != m_positions.begin() + j)
{
ASSERT_GREATER(j, 0, ());
@@ -163,8 +149,9 @@ void LevenshteinDFA::State::Normalize()
}
// LevenshteinDFA ----------------------------------------------------------------------------------
+// static
LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
- : m_size(s.size()), m_maxErrors(maxErrors), m_rejecting(0)
+ : m_size(s.size()), m_maxErrors(maxErrors)
{
m_alphabet.assign(s.begin(), s.end());
my::SortUnique(m_alphabet);
@@ -177,35 +164,36 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
}
m_alphabet.push_back(missed);
- TransitionTable table(s, maxErrors);
- queue<pair<State, size_t>> states;
- states.emplace(State::MakeStart(), 0);
-
+ queue<State> states;
map<State, size_t> visited;
- visited[states.front().first] = states.front().second;
- m_transitions.emplace_back(m_alphabet.size());
+ auto pushState = [&states, &visited, this](State const & state, size_t id)
+ {
+ ASSERT_EQUAL(id, m_transitions.size(), ());
+ ASSERT_EQUAL(visited.count(state), 0, (state, id));
+
+ states.emplace(state);
+ visited[state] = id;
+ m_transitions.emplace_back(m_alphabet.size());
+ };
+
+ pushState(State::MakeStart(), kStartingState);
+ pushState(State::MakeRejecting(), kRejectingState);
+
+ TransitionTable table(s, maxErrors);
while (!states.empty())
{
- auto const p = states.front();
+ auto const curr = states.front();
states.pop();
-
- auto const & curr = p.first;
- auto const id = p.second;
-
ASSERT(IsValid(curr), (curr));
+ ASSERT_GREATER(visited.count(curr), 0, (curr));
+ auto const id = visited[curr];
+ ASSERT_LESS(id, m_transitions.size(), ());
+
if (IsAccepting(curr))
m_accepting.insert(id);
- if (IsRejecting(curr))
- {
- ASSERT(m_rejecting == 0 || m_rejecting == id,
- ("Rejecting state must be set once:", m_rejecting, id));
- m_rejecting = id;
- }
-
- ASSERT_LESS(id, m_transitions.size(), ());
for (size_t i = 0; i < m_alphabet.size(); ++i)
{
@@ -213,15 +201,12 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
table.Move(curr, m_alphabet[i], next);
size_t nid;
+
auto const it = visited.find(next);
if (it == visited.end())
{
nid = visited.size();
- visited[next] = nid;
- m_transitions.emplace_back(m_alphabet.size());
- states.emplace(next, nid);
-
- ASSERT_EQUAL(visited.size(), m_transitions.size(), ());
+ pushState(next, nid);
}
else
{
@@ -231,8 +216,6 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
m_transitions[id][i] = nid;
}
}
-
- ASSERT(m_rejecting != 0, ("Rejecting state is not set"));
}
LevenshteinDFA::LevenshteinDFA(string const & s, uint8_t maxErrors)
diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp
index 2632a73135..cef354c458 100644
--- a/base/levenshtein_dfa.hpp
+++ b/base/levenshtein_dfa.hpp
@@ -7,22 +7,36 @@
namespace strings
{
-// This class represends a DFA recognizing a language consisting of
+// This class represents a DFA recognizing a language consisting of
// all words that are close enough to a given string, in terms of
-// Levenshtein distance. The code is based on the work "Fast String
-// Correction with Levenshtein-Automata" by Klaus U. Schulz and Stoyan
-// Mihov.
+// Levenshtein distance. Levenshtein distance treats deletions,
+// insertions and replacements as errors, transpositions are not
+// handled now. The code is based on the work "Fast String Correction
+// with Levenshtein-Automata" by Klaus U. Schulz and Stoyan Mihov.
+// For a fixed number of allowed errors and fixed alphabet the
+// construction time and size of automata is O(length of the pattern),
+// but the size grows exponentially with the number of errors, so be
+// reasonable and don't use this class when the number of errors is
+// too high.
//
// *NOTE* The class *IS* thread-safe.
+//
+// TODO (@y): add support for transpositions.
+//
+// TODO (@y): consider to implement a factory of automata, that will
+// be able to construct them quickly for a fixed number of errors.
class LevenshteinDFA
{
public:
+ static size_t const kStartingState;
+ static size_t const kRejectingState;
+
struct Position
{
Position() = default;
Position(size_t offset, uint8_t numErrors);
- bool SumsumedBy(Position const & rhs) const;
+ bool SubsumedBy(Position const & rhs) const;
bool operator<(Position const & rhs) const;
bool operator==(Position const & rhs) const;
@@ -34,6 +48,7 @@ public:
struct State
{
static State MakeStart();
+ static State MakeRejecting();
void Normalize();
inline void Clear() { m_positions.clear(); }
@@ -78,9 +93,9 @@ public:
private:
friend class LevenshteinDFA;
- Iterator(LevenshteinDFA const & dfa) : m_s(0), m_dfa(dfa) {}
+ Iterator(LevenshteinDFA const & dfa) : m_s(kStartingState), m_dfa(dfa) {}
- size_t m_s = 0;
+ size_t m_s;
LevenshteinDFA const & m_dfa;
};
@@ -105,7 +120,7 @@ private:
inline bool IsAccepting(size_t s) const { return m_accepting.count(s) != 0; }
inline bool IsRejecting(State const & s) const { return s.m_positions.empty(); }
- inline bool IsRejecting(size_t s) const { return s == m_rejecting; }
+ inline bool IsRejecting(size_t s) const { return s == kRejectingState; }
size_t Move(size_t s, UniChar c) const;
@@ -116,7 +131,6 @@ private:
vector<vector<size_t>> m_transitions;
unordered_set<size_t> m_accepting;
- size_t m_rejecting;
};
string DebugPrint(LevenshteinDFA::Position const & p);