mert/Scorer.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237

#pragma once

#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <limits>
#include "Types.h"
#include "ScoreData.h"

namespace mert
{

class Vocabulary;

} // namespace mert

namespace MosesTuning
{

class PreProcessFilter;
class ScoreStats;

enum ScorerRegularisationStrategy {REG_NONE, REG_AVERAGE, REG_MINIMUM};

/**
 * Superclass of all scorers and dummy implementation.
 *
 * In order to add a new scorer it should be sufficient to override the members
 * prepareStats(), setReferenceFiles() and score() (or calculateScore()).
 */
class Scorer
{
public:
  Scorer(const std::string& name, const std::string& config);
  virtual ~Scorer();

  /**
   * Return the number of statistics needed for the computation of the score.
   */
  virtual std::size_t NumberOfScores() const = 0;

  /**
   * Calculate score based on a vector of sufficient statistics.
   */
  virtual float calculateScore(const std::vector<ScoreStatsType>& totals) const = 0;

  float calculateSentenceLevelBackgroundScore(const std::vector<ScoreStatsType>& totals, const std::vector<ScoreStatsType>& bg) {
    std::vector<ScoreStatsType> stats(totals.size());
    for(size_t i=0; i<stats.size(); i++)
      stats[i] = totals[i]+bg[i];
    // Get score and scale by reference length (as per Chiang et al 08)
    return calculateScore(stats) * getReferenceLength(stats);
  }

  /**
   * Set the reference files. This must be called before prepareStats().
   */
  virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles) {
    // do nothing
  }

  /**
   * Process the given guessed text, corresponding to the given reference sindex
   * and add the appropriate statistics to the entry.
   */
  virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry) {
    // do nothing.
  }

  virtual void prepareStats(const std::string& sindex, const std::string& text, ScoreStats& entry) {
    this->prepareStats(static_cast<std::size_t>(atoi(sindex.c_str())), text, entry);
  }

  /**
   * Score using each of the candidate index, then go through the diffs
   * applying each in turn, and calculating a new score each time.
   */
  virtual void score(const candidates_t& candidates, const diffs_t& diffs,
                     statscores_t& scores) const = 0;
  /*
  {
    //dummy impl
    if (!m_score_data) {
      throw runtime_error("score data not loaded");
    }
    scores.push_back(0);
    for (std::size_t i = 0; i < diffs.size(); ++i) {
      scores.push_back(0);
    }
  }
  */

  /**
   * Calculate the score of the sentences corresponding to the list of candidate
   * indices. Each index indicates the 1-best choice from the n-best list.
   */
  float score(const candidates_t& candidates) const;

  const std::string& getName() const {
    return m_name;
  }

  std::size_t getReferenceSize() const {
    if (m_score_data) {
      return m_score_data->size();
    }
    return 0;
  }

  /**
   * Based on vector of sufficient statistics, return length of reference.
   */
  virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const = 0;

  /**
   * Set the score data, prior to scoring.
   */
  virtual void setScoreData(ScoreData* data) {
    m_score_data = data;
  }

  /**
   * The scorer returns if it uses the reference alignment data
   * for permutation distance scores
   **/
  virtual bool useAlignment() const {
    //cout << "Scorer::useAlignment returning false " << endl;
    return false;
  };

  /**
   * Set the factors, which should be used for this metric
   */
  virtual void setFactors(const std::string& factors);

  mert::Vocabulary* GetVocab() const {
    return m_vocab;
  }

  /**
   * Set unix filter, which will be used to preprocess the sentences
   */
  virtual void setFilter(const std::string& filterCommand);

private:
  void InitConfig(const std::string& config);

  /**
   * Take the factored sentence and return the desired factors
   */
  std::string applyFactors(const std::string& sentece) const;

  /**
   * Preprocess the sentence with the filter (if given)
   */
  std::string applyFilter(const std::string& sentence) const;

  std::string m_name;
  mert::Vocabulary* m_vocab;
  std::map<std::string, std::string> m_config;
  std::vector<int> m_factors;

#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
  PreProcessFilter* m_filter;
#endif

protected:
  ScoreData* m_score_data;
  bool m_enable_preserve_case;

  /**
   * Get value of config variable. If not provided, return default.
   */
  std::string getConfig(const std::string& key, const std::string& def="") const {
    std::map<std::string,std::string>::const_iterator i = m_config.find(key);
    if (i == m_config.end()) {
      return def;
    } else {
      return i->second;
    }
  }

  /**
   * Tokenise line and encode.
   * Note: We assume that all tokens are separated by whitespaces.
   */
  void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded) const;

  /*
   * Tokenize functions for testing only.
   */
  void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded) const;

  /**
   * Every inherited scorer should call this function for each sentence
   */
  std::string preprocessSentence(const std::string& sentence) const {
    return applyFactors(applyFilter(sentence));
  }

};

namespace
{

//regularisation strategies
inline float score_min(const statscores_t& scores, size_t start, size_t end)
{
  float min = std::numeric_limits<float>::max();
  for (size_t i = start; i < end; ++i) {
    if (scores[i] < min) {
      min = scores[i];
    }
  }
  return min;
}

inline float score_average(const statscores_t& scores, size_t start, size_t end)
{
  if ((end - start) < 1) {
    // this shouldn't happen
    return 0;
  }
  float total = 0;
  for (size_t j = start; j < end; ++j) {
    total += scores[j];
  }

  return total / (end - start);
}

} // namespace

}