[base] Levenshtein DFA.

author: Yuri Gorshenin <y@maps.me> 2016-11-03 15:26:10 +0300
committer: Yuri Gorshenin <y@maps.me> 2016-11-07 14:36:19 +0300
commit: 59fdb701e0b6afd4e44be1ee8d319a93224c1688 (patch)
tree: 3751387bdb005414e7a25323896ead875a2a3bde /base
parent: df752eb5e31499c7cb7377b7ef0f20e8c154ef63 (diff)
5 files changed, 524 insertions, 0 deletions
diff --git a/base/base.pro b/base/base.pro
index 803d1b544a..9f3fe820b7 100644
--- a/base/base.pro
+++ b/base/base.pro
@@ -14,6 +14,7 @@ SOURCES += \
     exception.cpp \
     gmtime.cpp \
     internal/message.cpp \
+    levenshtein_dfa.cpp \
     logging.cpp \
     lower_case.cpp \
     normalize_unicode.cpp \
@@ -48,6 +49,7 @@ HEADERS += \
     exception.hpp \
     gmtime.hpp \
     internal/messagex.hpp \
+    levenshtein_dfa.hpp \
     limited_priority_queue.hpp \
     logging.hpp \
     macros.hpp \
diff --git a/base/base_tests/base_tests.pro b/base/base_tests/base_tests.pro
index 7fa3e5a31b..1a4f2bb9ec 100644
--- a/base/base_tests/base_tests.pro
+++ b/base/base_tests/base_tests.pro
@@ -22,6 +22,7 @@ SOURCES += \
   condition_test.cpp \
   const_helper.cpp \
   containers_test.cpp \
+  levenshtein_dfa_test.cpp \
   logging_test.cpp \
   newtype_test.cpp \
   math_test.cpp \
diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp
new file mode 100644
index 0000000000..c671b1aa1e
--- /dev/null
+++ b/base/base_tests/levenshtein_dfa_test.cpp
@@ -0,0 +1,90 @@
+#include "testing/testing.hpp"
+
+#include "base/levenshtein_dfa.hpp"
+
+#include "std/string.hpp"
+
+using namespace strings;
+
+namespace
+{
+enum class Status
+{
+  Accepts,
+  Rejects,
+  Intermediate
+};
+
+Status GetStatus(LevenshteinDFA const & dfa, string const & s)
+{
+  auto it = dfa.Begin();
+  it.Move(s);
+  if (it.Accepts())
+    return Status::Accepts;
+  if (it.Rejects())
+    return Status::Rejects;
+  return Status::Intermediate;
+}
+
+bool Accepts(LevenshteinDFA const & dfa, string const & s)
+{
+  return GetStatus(dfa, s) == Status::Accepts;
+}
+
+bool Rejects(LevenshteinDFA const & dfa, string const & s)
+{
+  return GetStatus(dfa, s) == Status::Rejects;
+}
+
+bool Intermediate(LevenshteinDFA const & dfa, string const & s)
+{
+  return GetStatus(dfa, s) == Status::Intermediate;
+}
+
+UNIT_TEST(LevenshteinDFA_Smoke)
+{
+  {
+    LevenshteinDFA dfa("", 0 /* maxErrors */);
+
+    auto it = dfa.Begin();
+    TEST(it.Accepts(), ());
+    TEST(!it.Rejects(), ());
+
+    it.Move('a');
+    TEST(!it.Accepts(), ());
+    TEST(it.Rejects(), ());
+
+    it.Move('b');
+    TEST(!it.Accepts(), ());
+    TEST(it.Rejects(), ());
+  }
+
+  {
+    LevenshteinDFA dfa("abc", 1 /* maxErrors */);
+
+    TEST(Accepts(dfa, "ab"), ());
+    TEST(Accepts(dfa, "abd"), ());
+    TEST(Accepts(dfa, "abcd"), ());
+    TEST(Accepts(dfa, "bbc"), ());
+
+    TEST(Rejects(dfa, "cba"), ());
+    TEST(Rejects(dfa, "abcde"), ());
+
+    TEST(Accepts(dfa, "ac"), ());
+    TEST(Intermediate(dfa, "acb"), ());  // transpositions are not supported yet
+    TEST(Accepts(dfa, "acbc"), ());
+    TEST(Rejects(dfa, "acbd"), ());
+
+    TEST(Intermediate(dfa, "a"), ());
+  }
+
+  {
+    LevenshteinDFA dfa("ленинградский", 2 /* maxErrors */);
+
+    TEST(Accepts(dfa, "ленинградский"), ());
+    TEST(Accepts(dfa, "ленингадский"), ());
+    TEST(Accepts(dfa, "ленигнрадский"), ());
+    TEST(Rejects(dfa, "ленинский"), ());
+  }
+}
+}  // namespace
diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp
new file mode 100644
index 0000000000..c0741c7aa8
--- /dev/null
+++ b/base/levenshtein_dfa.cpp
@@ -0,0 +1,307 @@
+#include "base/levenshtein_dfa.hpp"
+
+#include "base/assert.hpp"
+#include "base/logging.hpp"
+#include "base/stl_helpers.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/queue.hpp"
+#include "std/set.hpp"
+#include "std/sstream.hpp"
+#include "std/vector.hpp"
+
+namespace strings
+{
+namespace
+{
+class TransitionTable
+{
+public:
+  TransitionTable(UniString const & s, uint8_t maxErrors)
+    : m_s(s), m_size(s.size()), m_maxErrors(maxErrors)
+  {
+  }
+
+  void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t)
+  {
+    t.Clear();
+    for (auto const & p : s.m_positions)
+      GetMoves(p, c, t);
+    t.Normalize();
+  }
+
+private:
+  void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t)
+  {
+    auto & ps = t.m_positions;
+
+    if (p.m_numErrors < m_maxErrors)
+    {
+      if (p.m_offset + 1 < m_size)
+      {
+        size_t i;
+        if (FindRelevant(p, c, i))
+        {
+          if (i == 0)
+          {
+            ps.emplace_back(p.m_offset + 1, p.m_numErrors);
+          }
+          else
+          {
+            ps.emplace_back(p.m_offset, p.m_numErrors + 1);
+            ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
+            ps.emplace_back(p.m_offset + i + 1, p.m_numErrors + i);
+          }
+        }
+        else
+        {
+          ps.emplace_back(p.m_offset, p.m_numErrors + 1);
+          ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
+        }
+      }
+      else if (p.m_offset + 1 == m_size)
+      {
+        if (m_s[p.m_offset] == c)
+        {
+          ps.emplace_back(p.m_offset + 1, p.m_numErrors);
+        }
+        else
+        {
+          ps.emplace_back(p.m_offset, p.m_numErrors + 1);
+          ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1);
+        }
+      }
+      else
+      {
+        ps.emplace_back(p.m_offset, p.m_numErrors + 1);
+      }
+    }
+    else if (p.m_offset < m_size && m_s[p.m_offset] == c)
+    {
+      ps.emplace_back(p.m_offset + 1, p.m_numErrors);
+    }
+  }
+
+  bool FindRelevant(LevenshteinDFA::Position const & p, UniChar c, size_t & i) const
+  {
+    size_t const limit =
+        min(m_size - p.m_offset, static_cast<size_t>(m_maxErrors - p.m_numErrors) + 1);
+
+    for (i = 0; i < limit; ++i)
+    {
+      if (m_s[p.m_offset + i] == c)
+        return true;
+    }
+    return false;
+  }
+
+  UniString const & m_s;
+  size_t const m_size;
+  uint8_t const m_maxErrors;
+};
+}  // namespace
+
+// LevenshteinDFA::Position ------------------------------------------------------------------------
+LevenshteinDFA::Position::Position(size_t offset, uint8_t numErrors)
+  : m_offset(offset), m_numErrors(numErrors)
+{
+}
+
+bool LevenshteinDFA::Position::SumsumedBy(Position const & rhs) const
+{
+  if (m_numErrors <= rhs.m_numErrors)
+    return false;
+
+  size_t const delta = m_numErrors - rhs.m_numErrors;
+  if (m_offset + delta < delta)
+    return true;
+  if (rhs.m_offset + delta < delta)
+    return true;
+  return m_offset + delta >= rhs.m_offset || m_offset <= rhs.m_offset + delta;
+}
+
+bool LevenshteinDFA::Position::operator<(Position const & rhs) const
+{
+  if (m_offset != rhs.m_offset)
+    return m_offset < rhs.m_offset;
+  return m_numErrors < rhs.m_numErrors;
+}
+
+bool LevenshteinDFA::Position::operator==(Position const & rhs) const
+{
+  return m_offset == rhs.m_offset && m_numErrors == rhs.m_numErrors;
+}
+
+// LevenshteinDFA::State ---------------------------------------------------------------------------
+// static
+LevenshteinDFA::State LevenshteinDFA::State::MakeStart()
+{
+  State state;
+  state.m_positions.emplace_back(0 /* offset */, 0 /* numErrors */);
+  return state;
+}
+
+void LevenshteinDFA::State::Normalize()
+{
+  size_t j = m_positions.size();
+  for (size_t i = 0; i < j; ++i)
+  {
+    auto const & cur = m_positions[i];
+
+    auto it = find_if(m_positions.begin(), m_positions.begin() + j,
+                      [&](Position const & rhs) { return cur.SumsumedBy(rhs); });
+    if (it != m_positions.begin() + j)
+    {
+      ASSERT_GREATER(j, 0, ());
+      --j;
+      swap(m_positions[i], m_positions[j]);
+    }
+  }
+
+  m_positions.erase(m_positions.begin() + j, m_positions.end());
+  my::SortUnique(m_positions);
+}
+
+// LevenshteinDFA ----------------------------------------------------------------------------------
+LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
+  : m_size(s.size()), m_maxErrors(maxErrors), m_rejecting(0)
+{
+  m_alphabet.assign(s.begin(), s.end());
+  my::SortUnique(m_alphabet);
+
+  UniChar missed = 0;
+  for (size_t i = 0; i < m_alphabet.size() && missed >= m_alphabet[i]; ++i)
+  {
+    if (missed == m_alphabet[i])
+      ++missed;
+  }
+  m_alphabet.push_back(missed);
+
+  TransitionTable table(s, maxErrors);
+  queue<pair<State, size_t>> states;
+  states.emplace(State::MakeStart(), 0);
+
+  map<State, size_t> visited;
+  visited[states.front().first] = states.front().second;
+
+  m_transitions.emplace_back(m_alphabet.size());
+
+  while (!states.empty())
+  {
+    auto const p = states.front();
+    states.pop();
+
+    auto const & curr = p.first;
+    auto const id = p.second;
+
+    ASSERT(IsValid(curr), (curr));
+
+    if (IsAccepting(curr))
+      m_accepting.insert(id);
+    if (IsRejecting(curr))
+    {
+      ASSERT(m_rejecting == 0 || m_rejecting == id,
+             ("Rejecting state must be set once:", m_rejecting, id));
+      m_rejecting = id;
+    }
+
+    ASSERT_LESS(id, m_transitions.size(), ());
+
+    for (size_t i = 0; i < m_alphabet.size(); ++i)
+    {
+      State next;
+      table.Move(curr, m_alphabet[i], next);
+
+      size_t nid;
+      auto const it = visited.find(next);
+      if (it == visited.end())
+      {
+        nid = visited.size();
+        visited[next] = nid;
+        m_transitions.emplace_back(m_alphabet.size());
+        states.emplace(next, nid);
+
+        ASSERT_EQUAL(visited.size(), m_transitions.size(), ());
+      }
+      else
+      {
+        nid = it->second;
+      }
+
+      m_transitions[id][i] = nid;
+    }
+  }
+
+  ASSERT(m_rejecting != 0, ("Rejecting state is not set"));
+}
+
+LevenshteinDFA::LevenshteinDFA(string const & s, uint8_t maxErrors)
+  : LevenshteinDFA(MakeUniString(s), maxErrors)
+{
+}
+
+bool LevenshteinDFA::IsValid(Position const & p) const
+{
+  return p.m_offset <= m_size && p.m_numErrors <= m_maxErrors;
+}
+
+bool LevenshteinDFA::IsValid(State const & s) const
+{
+  for (auto const & p : s.m_positions)
+  {
+    if (!IsValid(p))
+      return false;
+  }
+  return true;
+}
+
+bool LevenshteinDFA::IsAccepting(Position const & p) const
+{
+  return m_size - p.m_offset <= m_maxErrors - p.m_numErrors;
+}
+
+bool LevenshteinDFA::IsAccepting(State const & s) const
+{
+  for (auto const & p : s.m_positions)
+  {
+    if (IsAccepting(p))
+      return true;
+  }
+  return false;
+}
+
+size_t LevenshteinDFA::Move(size_t s, UniChar c) const
+{
+  ASSERT_GREATER(m_alphabet.size(), 0, ());
+  ASSERT(is_sorted(m_alphabet.begin(), m_alphabet.end() - 1), ());
+
+  size_t i;
+  auto const it = lower_bound(m_alphabet.begin(), m_alphabet.end() - 1, c);
+  if (it == m_alphabet.end() - 1 || *it != c)
+    i = m_alphabet.size() - 1;
+  else
+    i = distance(m_alphabet.begin(), it);
+
+  return m_transitions[s][i];
+}
+
+string DebugPrint(LevenshteinDFA::Position const & p)
+{
+  ostringstream os;
+  os << "Position [" << p.m_offset << ", " << static_cast<uint32_t>(p.m_numErrors) << "]";
+  return os.str();
+}
+
+string DebugPrint(LevenshteinDFA::State const & s)
+{
+  ostringstream os;
+  os << "State [";
+  for (size_t i = 0; i < s.m_positions.size(); ++i)
+  {
+    os << DebugPrint(s.m_positions[i]);
+    if (i + 1 != s.m_positions.size())
+      os << ", ";
+  }
+  return os.str();
+}
+}  // namespace strings
diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp
new file mode 100644
index 0000000000..2632a73135
--- /dev/null
+++ b/base/levenshtein_dfa.hpp
@@ -0,0 +1,124 @@
+#pragma once
+
+#include "base/string_utils.hpp"
+
+#include "std/cstdint.hpp"
+#include "std/unordered_set.hpp"
+
+namespace strings
+{
+// This class represends a DFA recognizing a language consisting of
+// all words that are close enough to a given string, in terms of
+// Levenshtein distance. The code is based on the work "Fast String
+// Correction with Levenshtein-Automata" by Klaus U. Schulz and Stoyan
+// Mihov.
+//
+// *NOTE* The class *IS* thread-safe.
+class LevenshteinDFA
+{
+public:
+  struct Position
+  {
+    Position() = default;
+    Position(size_t offset, uint8_t numErrors);
+
+    bool SumsumedBy(Position const & rhs) const;
+
+    bool operator<(Position const & rhs) const;
+    bool operator==(Position const & rhs) const;
+
+    size_t m_offset = 0;
+    uint8_t m_numErrors = 0;
+  };
+
+  struct State
+  {
+    static State MakeStart();
+
+    void Normalize();
+    inline void Clear() { m_positions.clear(); }
+
+    inline bool operator<(State const & rhs) const { return m_positions < rhs.m_positions; }
+
+    vector<Position> m_positions;
+  };
+
+  // An iterator to the current state in the DFA.
+  //
+  // *NOTE* The class *IS NOT* thread safe. Moreover, it should not be
+  // used after destruction of the corresponding DFA.
+  class Iterator
+  {
+  public:
+    Iterator & Move(UniChar c)
+    {
+      m_s = m_dfa.Move(m_s, c);
+      return *this;
+    }
+
+    inline Iterator & Move(UniString const & s) { return Move(s.begin(), s.end()); }
+
+    inline Iterator & Move(string const & s)
+    {
+      UniString us = MakeUniString(s);
+      return Move(us);
+    }
+
+    template <typename It>
+    Iterator & Move(It begin, It end)
+    {
+      for (; begin != end; ++begin)
+        Move(*begin);
+      return *this;
+    }
+
+    bool Accepts() const { return m_dfa.IsAccepting(m_s); }
+    bool Rejects() const { return m_dfa.IsRejecting(m_s); }
+
+  private:
+    friend class LevenshteinDFA;
+
+    Iterator(LevenshteinDFA const & dfa) : m_s(0), m_dfa(dfa) {}
+
+    size_t m_s = 0;
+    LevenshteinDFA const & m_dfa;
+  };
+
+  LevenshteinDFA(UniString const & s, uint8_t maxErrors);
+  LevenshteinDFA(string const & s, uint8_t maxErrors);
+
+  inline Iterator Begin() const { return Iterator(*this); }
+
+  size_t GetNumStates() const { return m_transitions.size(); }
+  size_t GetAlphabetSize() const { return m_alphabet.size(); }
+
+private:
+  friend class Iterator;
+
+  size_t TransformChar(UniChar c) const;
+
+  bool IsValid(Position const & p) const;
+  bool IsValid(State const & s) const;
+
+  bool IsAccepting(Position const & p) const;
+  bool IsAccepting(State const & s) const;
+  inline bool IsAccepting(size_t s) const { return m_accepting.count(s) != 0; }
+
+  inline bool IsRejecting(State const & s) const { return s.m_positions.empty(); }
+  inline bool IsRejecting(size_t s) const { return s == m_rejecting; }
+
+  size_t Move(size_t s, UniChar c) const;
+
+  size_t const m_size;
+  uint8_t const m_maxErrors;
+
+  vector<UniChar> m_alphabet;
+
+  vector<vector<size_t>> m_transitions;
+  unordered_set<size_t> m_accepting;
+  size_t m_rejecting;
+};
+
+string DebugPrint(LevenshteinDFA::Position const & p);
+string DebugPrint(LevenshteinDFA::State const & s);
+}  // namespace strings
author	Yuri Gorshenin <y@maps.me>	2016-11-03 15:26:10 +0300
committer	Yuri Gorshenin <y@maps.me>	2016-11-07 14:36:19 +0300
commit	59fdb701e0b6afd4e44be1ee8d319a93224c1688 (patch)
tree	3751387bdb005414e7a25323896ead875a2a3bde /base
parent	df752eb5e31499c7cb7377b7ef0f20e8c154ef63 (diff)