lm/filter/phrase_table_vocab_main.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165

#include "util/file_stream.hh"
#include "util/file_piece.hh"
#include "util/murmur_hash.hh"
#include "util/pool.hh"
#include "util/string_piece.hh"
#include "util/string_piece_hash.hh"
#include "util/tokenize_piece.hh"

#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include <stdint.h>

namespace {

// Hash-set element wrapping a StringPiece.  The piece is declared mutable so
// that an element already stored in a set (and therefore const) can still be
// re-pointed at different backing memory holding the same characters.
struct MutablePiece {
  mutable StringPiece behind;

  // Two elements are equal when the pieces they wrap compare equal.
  bool operator==(const MutablePiece &rhs) const {
    return behind == rhs.behind;
  }
};

std::size_t hash_value(const MutablePiece &m) {
  return hash_value(m.behind);
}

class InternString {
  public:
    const char *Add(StringPiece str) {
      MutablePiece mut;
      mut.behind = str;
      std::pair<boost::unordered_set<MutablePiece>::iterator, bool> res(strs_.insert(mut));
      if (res.second) {
        void *mem = backing_.Allocate(str.size() + 1);
        memcpy(mem, str.data(), str.size());
        static_cast<char*>(mem)[str.size()] = 0;
        res.first->behind = StringPiece(static_cast<char*>(mem), str.size());
      }
      return res.first->behind.data();
    }

  private:
    util::Pool backing_;
    boost::unordered_set<MutablePiece> strs_;
};

class TargetWords {
  public:
    void Introduce(StringPiece source) {
      vocab_.resize(vocab_.size() + 1);
      std::vector<unsigned int> temp(1, vocab_.size() - 1);
      Add(temp, source);
    }

    void Add(const std::vector<unsigned int> &sentences, StringPiece target) {
      if (sentences.empty()) return;
      interns_.clear();
      for (util::TokenIter<util::SingleCharacter, true> i(target, ' '); i; ++i) {
        interns_.push_back(intern_.Add(*i));
      }
      for (std::vector<unsigned int>::const_iterator i(sentences.begin()); i != sentences.end(); ++i) {
        boost::unordered_set<const char *> &vocab = vocab_[*i];
        for (std::vector<const char *>::const_iterator j = interns_.begin(); j != interns_.end(); ++j) {
          vocab.insert(*j);
        }
      }
    }

    void Print() const {
      util::FileStream out(1);
      for (std::vector<boost::unordered_set<const char *> >::const_iterator i = vocab_.begin(); i != vocab_.end(); ++i) {
        for (boost::unordered_set<const char *>::const_iterator j = i->begin(); j != i->end(); ++j) {
          out << *j << ' ';
        }
        out << '\n';
      }
    }

  private:
    InternString intern_;

    std::vector<boost::unordered_set<const char *> > vocab_;

    // Temporary in Add.
    std::vector<const char *> interns_;
};

// Index from source phrases to the sentences that contain them.  Every
// contiguous run of 1..max_length words of every added sentence is indexed by
// the 64-bit MurmurHash of its text (words joined by single spaces).  Only the
// hash is stored, so two different phrases with the same hash share an entry.
class Input {
  public:
    // max_length: maximum number of words in an indexed phrase.
    explicit Input(std::size_t max_length)
      : max_length_(max_length), sentence_id_(0), empty_() {}

    // Indexes all phrases of sentence under the next sentence id and registers
    // the sentence with targets via TargetWords::Introduce.  Words are
    // delimited by NUL, space or tab; empty tokens are skipped.
    void AddSentence(StringPiece sentence, TargetWords &targets) {
      // Rebuild the sentence with exactly one space after every word, and
      // remember where each word starts.  starts_ gets one extra trailing
      // entry equal to canonical_.size().
      canonical_.clear();
      starts_.clear();
      starts_.push_back(0);
      for (util::TokenIter<util::AnyCharacter, true> i(sentence, StringPiece("\0 \t", 3)); i; ++i) {
        canonical_.append(i->data(), i->size());
        canonical_ += ' ';
        starts_.push_back(canonical_.size());
      }
      targets.Introduce(canonical_);

      // The index stores ids as unsigned int; make the narrowing explicit.
      const unsigned int id = static_cast<unsigned int>(sentence_id_);
      const std::size_t words = starts_.size() - 1;
      for (std::size_t i = 0; i < words; ++i) {
        const std::size_t begin = starts_[i];
        const std::size_t limit = std::min(starts_.size(), i + max_length_ + 1);
        for (std::size_t j = i + 1; j < limit; ++j) {
          // Phrase covers words i..j-1; the -1 drops the trailing space.
          const std::size_t length = starts_[j] - begin - 1;
          std::vector<unsigned int> &ids = map_[util::MurmurHash64A(canonical_.data() + begin, length)];
          // Ids are appended in non-decreasing order, so looking at the last
          // element is enough to avoid recording this sentence twice when a
          // phrase repeats within it.
          if (ids.empty() || ids.back() != id) ids.push_back(id);
        }
      }
      ++sentence_id_;
    }

    // Returns the ids of the sentences whose index entry matches the hash of
    // phrase, or an empty vector when there is none.
    // Assumes single space-delimited phrase with no space at the beginning or end.
    const std::vector<unsigned int> &Matches(StringPiece phrase) const {
      Map::const_iterator i = map_.find(util::MurmurHash64A(phrase.data(), phrase.size()));
      return i == map_.end() ? empty_ : i->second;
    }

  private:
    const std::size_t max_length_;

    // hash of phrase is the key, array of sentences is the value.
    typedef boost::unordered_map<uint64_t, std::vector<unsigned int> > Map;
    Map map_;

    // Id that the next call to AddSentence will assign.
    std::size_t sentence_id_;

    // Temporaries in AddSentence.
    std::string canonical_;
    std::vector<std::size_t> starts_;

    // Returned by Matches when nothing is found.
    const std::vector<unsigned int> empty_;
};

} // namespace

// Takes the path of the source text as its only argument and reads a phrase
// table from file descriptor 0.  Each table line is split on "|||"; the first
// field is looked up as a source phrase and the words of the second field are
// added to every sentence that matched.  Finally one line of words per source
// sentence is printed.
int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Expected source text on the command line" << std::endl;
    return 1;
  }
  // Index source phrases of up to 7 words.
  Input input(7);
  TargetWords targets;
  try {
    util::FilePiece inputs(argv[1], &std::cerr);
    // ReadLine signals the end of the file by throwing.
    while (true)
      input.AddSentence(inputs.ReadLine(), targets);
  } catch (const util::EndOfFileException &e) {}

  util::FilePiece table(0, NULL, &std::cerr);
  StringPiece line;
  const StringPiece pipes("|||");
  while (true) {
    try {
      line = table.ReadLine();
    } catch (const util::EndOfFileException &e) { break; }
    util::TokenIter<util::MultiCharacter> it(line, pipes);
    if (!it) continue;
    StringPiece source(*it);
    // The field separator is normally written " ||| "; drop the space that
    // precedes it so the phrase matches the form Input::Matches expects.
    if (!source.empty() && source[source.size() - 1] == ' ')
      source.remove_suffix(1);
    ++it;
    // A line without a "|||" separator has no target field: skip it rather
    // than dereferencing an exhausted iterator.
    if (!it) continue;
    targets.Add(input.Matches(source), *it);
  }
  targets.Print();
  return 0;
}