// PerScorer.cpp, from the mert module of moses-smt/mosesdecoder
// (github.com/moses-smt/mosesdecoder.git)
#include "PerScorer.h"

#include <algorithm>
#include <fstream>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "ScoreStats.h"
#include "Util.h"

using namespace std;

namespace MosesTuning
{
  

PerScorer::PerScorer(const string& config)
  : StatisticsBasedScorer("PER",config) {}

PerScorer::~PerScorer() {}

void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  // For each line in the reference file, create a multiset of
  // the word ids.
  if (referenceFiles.size() != 1) {
    throw runtime_error("PER only supports a single reference");
  }
  m_ref_tokens.clear();
  m_ref_lengths.clear();
  ifstream in(referenceFiles[0].c_str());
  if (!in) {
    throw runtime_error("Unable to open " + referenceFiles[0]);
  }
  string line;
  int sid = 0;
  while (getline(in,line)) {
    line = this->preprocessSentence(line);
    vector<int> tokens;
    TokenizeAndEncode(line, tokens);
    m_ref_tokens.push_back(multiset<int>());
    for (size_t i = 0; i < tokens.size(); ++i) {
      m_ref_tokens.back().insert(tokens[i]);
    }
    m_ref_lengths.push_back(tokens.size());
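    // Print a progress dot every 100 reference sentences.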
    if (sid > 0 && sid % 100 == 0) {
      TRACE_ERR(".");
    }
    ++sid;
  }
  TRACE_ERR(endl);

}

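// prepareStats() records three whitespace-separated statistics for sentence
// 'sid': the number of hypothesis tokens matched in the reference (with
// counts clipped to the reference frequencies), the hypothesis length, and
// the reference length. calculateScore() below turns them into the score.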
void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= m_ref_lengths.size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }

  string sentence = this->preprocessSentence(text);

  // Calculate correct, output_length and ref_length for
  // the line and store it in entry
  vector<int> testtokens;
  TokenizeAndEncode(sentence, testtokens);
  multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
  set<int> testtokens_unique(testtokens.begin(),testtokens.end());
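  // Count position-independent matches: each distinct hypothesis token is
  // credited with the smaller of its frequency in the reference and in the
  // hypothesis, so repeated tokens are not over-counted.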
  int correct = 0;
  for (set<int>::iterator i = testtokens_unique.begin();
       i != testtokens_unique.end(); ++i) {
    int token = *i;
    correct += min(m_ref_tokens[sid].count(token), testtokens_all.count(token));
  }

  ostringstream stats;
  stats << correct << " " << testtokens.size() << " " << m_ref_lengths[sid] << " " ;
  string stats_str = stats.str();
  entry.set(stats_str);
}

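// The score is (correct - max(0, hyp_len - ref_len)) / ref_len: matched
// tokens, penalised for any length overshoot and normalised by the reference
// length. For example, with hypothesis "a b b c" and reference "a b c d" the
// statistics are correct = 3, hyp_len = 4, ref_len = 4, and calculateScore()
// returns (3 - max(0, 0)) / 4 = 0.75.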
float PerScorer::calculateScore(const vector<int>& comps) const
{
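  // comps[0] = clipped token matches, comps[1] = hypothesis length,
  // comps[2] = reference length, in the order written by prepareStats().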
  float denom = comps[2];
  float num = comps[0] - max(0,comps[1]-comps[2]);
  if (denom == 0) {
    // This shouldn't happen!
    return 0.0;
  } else {
    return num/denom;
  }
}

} // namespace MosesTuning
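
// A minimal usage sketch (illustrative only, not part of the original file).
// The scorer is normally driven by the mert tools; by hand, and assuming a
// hypothetical reference file "ref.txt", it would look roughly like this:
//
//   MosesTuning::PerScorer scorer("");
//   std::vector<std::string> refs(1, "ref.txt");
//   scorer.setReferenceFiles(refs);            // tokenise and index the reference
//   MosesTuning::ScoreStats stats;
//   scorer.prepareStats(0, "some hypothesis text", stats);
//   // stats now holds "correct hyp_len ref_len" for sentence 0.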