github.com/moses-smt/mosesdecoder.git
author     Tetsuo Kiso <tetsuo-s@is.naist.jp>    2012-12-07 00:12:24 +0400
committer  Tetsuo Kiso <tetsuo-s@is.naist.jp>    2012-12-07 00:12:24 +0400
commit     8fdec9bf3059e388b267da06b4d5a0ca67615df7
tree       1a4a8f18824c786dcb1662ee8be3643f606ea4e8  /mert/BleuScorer.cpp
parent     6c04c4ad9c1fa9b184a3ad95c8e7b75bcb5c92c3
Use boost::unordered_map instead of std::map.
The unordered map stores the word vocabulary used in the computation of BLEU scores. This change reduces the running time of the extractor by about 2-3 seconds (a 9% reduction).
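As a rough illustration of the swap described above: the vocabulary amounts to a string-to-id map, and moving it from std::map to boost::unordered_map trades ordered O(log n) lookups for hashed amortized O(1) lookups. The typedef names below are illustrative assumptions, not the actual declarations in mert.

#include <string>
#include <map>
#include <boost/unordered_map.hpp>

// Before: red-black tree; each lookup does O(log n) string comparisons.
typedef std::map<std::string, int> OrderedVocab;             // hypothetical name

// After: hash table; amortized O(1) lookups, which is where the reported
// ~2-3 second (9%) reduction in extractor running time comes from.
typedef boost::unordered_map<std::string, int> HashedVocab;  // hypothetical name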
Diffstat (limited to 'mert/BleuScorer.cpp')
-rw-r--r--  mert/BleuScorer.cpp  16
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index a3ba16b13..8fb814390 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -50,11 +50,21 @@ BleuScorer::BleuScorer(const string& config)
BleuScorer::~BleuScorer() {}
size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
- unsigned int n)
+ unsigned int n, bool is_testing)
{
assert(n > 0);
vector<int> encoded_tokens;
- TokenizeAndEncode(line, encoded_tokens);
+
+ // When performing tokenization of a hypothesis translation, we don't have
+ // to update the Scorer's word vocabulary. However, the tokenization of
+ // reference translations requires modifying the vocabulary, which means
+ this procedure might be slower than the tokenization of the hypothesis
+ // translation.
+ if (is_testing) {
+ TokenizeAndEncodeTesting(line, encoded_tokens);
+ } else {
+ TokenizeAndEncode(line, encoded_tokens);
+ }
for (size_t k = 1; k <= n; ++k) {
//ngram order longer than sentence - no point
if (k > encoded_tokens.size()) {
@@ -147,7 +157,7 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
// stats for this line
vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
string sentence = preprocessSentence(text);
- const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder);
+ const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true);
const int reference_len = CalcReferenceLength(sid, length);
stats.push_back(reference_len);
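The first hunk above hinges on one distinction: encoding a reference translation may add new words to the scorer's vocabulary, while encoding a hypothesis only needs to look existing words up. A minimal, self-contained sketch of that split follows; the function names come from the diff, but the signatures, bodies, and the Vocab type are assumptions for illustration, not the actual mert API.

#include <sstream>
#include <string>
#include <vector>
#include <boost/unordered_map.hpp>

// Hypothetical stand-in for the scorer's shared word vocabulary.
typedef boost::unordered_map<std::string, int> Vocab;

// Reference path: unseen words are inserted, so the vocabulary can grow.
void TokenizeAndEncode(const std::string& line, Vocab& vocab,
                       std::vector<int>& encoded) {
  std::istringstream in(line);
  std::string token;
  while (in >> token) {
    std::pair<Vocab::iterator, bool> result =
        vocab.insert(std::make_pair(token, static_cast<int>(vocab.size())));
    encoded.push_back(result.first->second);
  }
}

// Hypothesis ("testing") path: read-only lookups, no vocabulary mutation.
void TokenizeAndEncodeTesting(const std::string& line, const Vocab& vocab,
                              std::vector<int>& encoded) {
  std::istringstream in(line);
  std::string token;
  while (in >> token) {
    Vocab::const_iterator it = vocab.find(token);
    encoded.push_back(it == vocab.end() ? -1 : it->second);  // -1 = unseen word
  }
}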