base/levenshtein_dfa.hpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

#pragma once

#include "base/string_utils.hpp"

#include <cstddef>
#include <string>
#include <vector>

namespace strings
{
// This class represents a DFA recognizing a language consisting of
// all words that are close enough to a given string, in terms of
// Levenshtein distance. Levenshtein distance treats deletions,
// insertions and replacements as errors. Transpositions are handled
// in a quite special way - its assumed that all transformations are
// applied in parallel, i.e. the Levenshtein distance between "ab" and
// "bca" is three, not two.  The code is based on the work "Fast
// String Correction with Levenshtein-Automata" by Klaus U. Schulz and
// Stoyan Mihov.  For a fixed number of allowed errors and fixed
// alphabet the construction time and size of automata is
// O(length of the pattern), but the size grows exponentially with the
// number of errors, so be reasonable and don't use this class when
// the number of errors is too high.
//
// *NOTE* The class *IS* thread-safe.
//
// TODO (@y): consider to implement a factory of automata, that will
// be able to construct them quickly for a fixed number of errors.
class LevenshteinDFA
{
public:
  static size_t const kStartingState;
  static size_t const kRejectingState;
  
  struct Position
  {
    Position() = default;
    Position(size_t offset, size_t errorsLeft, bool transposed);

    // SubsumedBy is a relation on two positions, which allows to
    // efficiently remove unnecessary positions in a state. When the
    // function returns true, it means that |rhs| is more powerful
    // than the current position and it is safe to remove the current
    // position from the state, if the state contains |rhs|.
    bool SubsumedBy(Position const & rhs) const;

    bool operator<(Position const & rhs) const;
    bool operator==(Position const & rhs) const;

    inline bool IsStandard() const { return !m_transposed; }
    inline bool IsTransposed() const { return m_transposed; }

    size_t m_offset = 0;
    size_t m_errorsLeft = 0;
    bool m_transposed = false;
  };

  struct State
  {
    void Normalize();
    inline void Clear() { m_positions.clear(); }

    inline bool operator<(State const & rhs) const { return m_positions < rhs.m_positions; }

    std::vector<Position> m_positions;
  };

  // An iterator to the current state in the DFA.
  //
  // *NOTE* The class *IS NOT* thread safe. Moreover, it should not be
  // used after destruction of the corresponding DFA.
  class Iterator
  {
  public:
    Iterator & Move(UniChar c)
    {
      m_s = m_dfa.Move(m_s, c);
      return *this;
    }

    bool Accepts() const { return m_dfa.IsAccepting(m_s); }
    bool Rejects() const { return m_dfa.IsRejecting(m_s); }

    size_t ErrorsMade() const { return m_dfa.ErrorsMade(m_s); }

  private:
    friend class LevenshteinDFA;

    explicit Iterator(LevenshteinDFA const & dfa) : m_s(kStartingState), m_dfa(dfa) {}

    size_t m_s;
    LevenshteinDFA const & m_dfa;
  };

  LevenshteinDFA(LevenshteinDFA const &) = default;
  LevenshteinDFA(LevenshteinDFA &&) = default;

  LevenshteinDFA(UniString const & s, size_t prefixSize,
                 std::vector<UniString> const & prefixMisprints, size_t maxErrors);
  LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors);
  LevenshteinDFA(UniString const & s, size_t maxErrors);
  LevenshteinDFA(std::string const & s, size_t maxErrors);

  inline Iterator Begin() const { return Iterator(*this); }

  size_t GetNumStates() const { return m_transitions.size(); }
  size_t GetAlphabetSize() const { return m_alphabet.size(); }

private:
  friend class Iterator;

  State MakeStart();
  State MakeRejecting();

  bool IsValid(Position const & p) const;
  bool IsValid(State const & s) const;

  bool IsAccepting(Position const & p) const;
  bool IsAccepting(State const & s) const;
  inline bool IsAccepting(size_t s) const { return m_accepting[s]; }

  inline bool IsRejecting(State const & s) const { return s.m_positions.empty(); }
  inline bool IsRejecting(size_t s) const { return s == kRejectingState; }

  // Returns minimum number of made errors among accepting positions in |s|.
  size_t ErrorsMade(State const & s) const;
  size_t ErrorsMade(size_t s) const { return m_errorsMade[s]; }

  size_t Move(size_t s, UniChar c) const;

  size_t const m_size;
  size_t const m_maxErrors;

  std::vector<UniChar> m_alphabet;

  std::vector<std::vector<size_t>> m_transitions;
  std::vector<bool> m_accepting;
  std::vector<size_t> m_errorsMade;
};

std::string DebugPrint(LevenshteinDFA::Position const & p);
std::string DebugPrint(LevenshteinDFA::State const & s);
}  // namespace strings