Welcome to mirror list, hosted at ThFree Co, Russian Federation.

CderScorer.cpp « mert - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 4479e0ad88215266a675dbad0d1097294653a4bf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include "CderScorer.h"

#include <algorithm>
#include <fstream>
#include <stdexcept>

using namespace std;

namespace
{

// Unit edit cost between two encoded tokens: 0 when they are equal,
// 1 otherwise.
inline int CalcDistance(int word1, int word2)
{
  if (word1 != word2) {
    return 1;
  }
  return 0;
}

} // namespace

namespace MosesTuning
{


// Constructs the scorer.
//
// config             - configuration string, forwarded unchanged to the
//                      StatisticsBasedScorer base class.
// allowed_long_jumps - selects the name passed to the base class ("CDER"
//                      when true, "WER" when false) and is stored in
//                      m_allowed_long_jumps, which computeCD() consults to
//                      decide whether long jumps are part of the alignment.
CderScorer::CderScorer(const string& config, bool allowed_long_jumps)
  : StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config),
    m_allowed_long_jumps(allowed_long_jumps) {}

// Empty body: the destructor performs no explicit cleanup of its own.
CderScorer::~CderScorer() {}

void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  //make sure reference data is clear
  m_ref_sentences.clear();

  //load reference data
  for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
    ifstream refin(referenceFiles[rid].c_str());
    if (!refin) {
      throw runtime_error("Unable to open: " + referenceFiles[rid]);
    }
    m_ref_sentences.push_back(vector<sent_t>());
    string line;
    while (getline(refin,line)) {
      line = this->preprocessSentence(line);
      sent_t encoded;
      TokenizeAndEncode(line, encoded);
      m_ref_sentences[rid].push_back(encoded);
    }
  }
}

void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  string sentence = this->preprocessSentence(text);

  vector<ScoreStatsType> stats;
  prepareStatsVector(sid, sentence, stats);
  entry.set(stats);
}

void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<ScoreStatsType>& stats)
{
  sent_t cand;
  TokenizeAndEncode(text, cand);

  float max = -2;
  vector<ScoreStatsType> tmp;
  for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
    const sent_t& ref = m_ref_sentences[rid][sid];
    tmp.clear();
    computeCD(cand, ref, tmp);
    int score = calculateScore(tmp);
    if (rid == 0) {
      stats = tmp;
      max = score;
    } else if (score > max) {
      stats = tmp;
      max = score;
    }
  }
}

// Converts a two-element statistics vector into a score.
//
// comps[0] is divided by comps[1] and the quotient is subtracted from 1;
// when comps[1] is zero the score is defined as 1. (computeCD() fills the
// vector with the edit cost and the reference length, in that order.)
//
// Throws std::runtime_error if `comps` does not have exactly two elements.
float CderScorer::calculateScore(const vector<ScoreStatsType>& comps) const
{
  const bool wellFormed = (comps.size() == 2);
  if (!wellFormed) {
    throw runtime_error("Size of stat vector for CDER is not 2");
  }

  if (comps[1] != 0) {
    return 1.0f - (comps[0] / static_cast<float>(comps[1]));
  }
  // Zero denominator: avoid the division and report a perfect score.
  return 1.0f;
}

void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
                           vector<ScoreStatsType>& stats) const
{
  int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
  int L = ref.size() + 1; // Number of inter-words positions in reference sentence

  int l = 0;
  // row[i] stores cost of cheapest path from (0,0) to (i,l) in CDER aligment grid.
  vector<int>* row = new vector<int>(I);

  // Initialization of first row
  for (int i = 0; i < I; ++i) (*row)[i] = i;

  // For CDER metric, the initialization is different
  if (m_allowed_long_jumps) {
    for (int i = 1; i < I; ++i) (*row)[i] = 1;
  }

  // Calculating costs for next row using costs from the previous row.
  while (++l < L) {
    vector<int>* nextRow = new vector<int>(I);
    for (int i = 0; i < I; ++i) {
      vector<int> possibleCosts;
      if (i > 0) {
        possibleCosts.push_back((*nextRow)[i-1] + 1); // Deletion
        possibleCosts.push_back((*row)[i-1] + CalcDistance(ref[l-1], cand[i-1])); // Substitution/Identity
      }
      possibleCosts.push_back((*row)[i] + 1); // Insertion
      (*nextRow)[i] = *min_element(possibleCosts.begin(), possibleCosts.end());
    }

    if (m_allowed_long_jumps) {
      // Cost of LongJumps is the same for all in the row
      int LJ = 1 + *min_element(nextRow->begin(), nextRow->end());

      for (int i = 0; i < I; ++i) {
        (*nextRow)[i] = min((*nextRow)[i], LJ); // LongJumps
      }
    }

    delete row;
    row = nextRow;
  }

  stats.resize(2);
  stats[0] = *(row->rbegin());  // CD distance is the cost of path from (0,0) to (I,L)
  stats[1] = ref.size();

  delete row;
}

}