Merge branch 'trunk' into miramerge. Still to fix build.

Conflicts: Jamroot mert/Data.cpp mert/Data.h mert/FeatureArray.cpp mert/FeatureArray.h mert/FeatureData.cpp mert/FeatureData.h mert/FeatureStats.cpp mert/FeatureStats.h mert/mert.cpp moses-chart-cmd/src/IOWrapper.h moses-chart-cmd/src/Main.cpp moses-cmd/src/IOWrapper.cpp moses-cmd/src/IOWrapper.h moses-cmd/src/Main.cpp moses/src/GlobalLexicalModel.cpp moses/src/Jamfile moses/src/Parameter.cpp moses/src/PhraseDictionary.cpp moses/src/ScoreIndexManager.h moses/src/TargetPhrase.h regression-testing/tests/phrase.lexicalized-reordering-bin/truth/results.txt regression-testing/tests/phrase.lexicalized-reordering-cn/truth/results.txt regression-testing/tests/phrase.lexicalized-reordering/truth/results.txt regression-testing/tests/phrase.multiple-translation-system-lr/truth/results.txt regression-testing/tests/phrase.show-weights.lex-reorder/truth/results.txt regression-testing/tests/phrase.show-weights/truth/results.txt scripts/ems/experiment.meta scripts/ems/experiment.perl scripts/training/filter-model-given-input.pl scripts/training/mert-moses.pl
author: Barry Haddow <barry.haddow@gmail.com> 2012-05-25 00:11:35 +0400
committer: Barry Haddow <barry.haddow@gmail.com> 2012-05-25 00:11:35 +0400
commit: c397d2068bd65fc1eca6bf49057cfda154fa3ce5 (patch)
tree: e8a8c4e6f60c2b0d47592371d981b9d1a1ac9f85 /mert/Data.cpp
parent: 5a17ef82b3c7449d9fb3686875a7936193e14a89 (diff)
parent: 277fd38bbd345c083b762a0fb36c0a69da2ca7eb (diff)
1 files changed, 182 insertions, 102 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 4f822558e..2a6bd5e92 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -1,13 +1,12 @@
 /*
  *  Data.cpp
- *  met - Minimum Error Training
+ *  mert - Minimum Error Rate Training
  *
  *  Created by Nicola Bertoldi on 13/05/08.
  *
  */
 
 #include <algorithm>
-#include "util/check.hh"
 #include <cmath>
 #include <fstream>
 
@@ -16,148 +15,229 @@
 #include "Scorer.h"
 #include "ScorerFactory.h"
 #include "Util.h"
+#include "util/check.hh"
+
+using namespace std;
+
 
-Data::Data(Scorer& ptr, const std::string& sparseweightsfile)
-    : theScorer(&ptr),
-      score_type(theScorer->getName()),
-      number_of_scores(0),
-      scoredata(new ScoreData(*theScorer)),
-      featdata(new FeatureData)
+Data::Data(Scorer* scorer, const string& sparse_weights_file)
+    : m_scorer(scorer),
+      m_score_type(m_scorer->getName()),
+      m_num_scores(0),
+      m_score_data(new ScoreData(m_scorer)),
+      m_feature_data(new FeatureData)
 {
-  TRACE_ERR("Data::score_type " << score_type << std::endl);
-  TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl);
-  if (sparseweightsfile.size()) {
-    sparse_weights.load(sparseweightsfile);
+  TRACE_ERR("Data::m_score_type " << m_score_type << endl);
+  TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
+  if (sparse_weights_file.size()) {
+    m_sparse_weights.load(sparse_weights_file);
     ostringstream msg;
     msg << "Data::sparse_weights {";
-    sparse_weights.write(msg,"=");
+    m_sparse_weights.write(msg,"=");
     msg << "}";
     TRACE_ERR(msg.str() << std::endl);
   }
 }
 
-Data::~Data() {
-  if (featdata) {
-    delete featdata;
-    featdata = NULL;
-  }
-  if (scoredata) {
-    delete scoredata;
-    scoredata = NULL;
-  }
-}
+//ADDED BY TS
+// TODO: This is too long; consider creating additional functions to
+// reduce the lines of this function.
+void Data::removeDuplicates() {
+  size_t nSentences = m_feature_data->size();
+  assert(m_score_data->size() == nSentences);
 
-void Data::loadnbest(const std::string &file)
-{
-  TRACE_ERR("loading nbest from " << file << std::endl);
+  for (size_t s = 0; s < nSentences; s++) {
+    FeatureArray& feat_array =  m_feature_data->get(s);
+    ScoreArray& score_array =  m_score_data->get(s);
 
-  FeatureStats featentry;
-  ScoreStats scoreentry;
-  std::string sentence_index;
+    assert(feat_array.size() == score_array.size());
 
-  inputfilestream inp(file); // matches a stream with a file. Opens the file
+    // serves as a hash map: sum of feature values -> indices of entries with that sum
+    map<double, vector<size_t> > lookup;
 
-  if (!inp.good())
-    throw runtime_error("Unable to open: " + file);
+    size_t end_pos = feat_array.size() - 1;
+
+    size_t nRemoved = 0;
+
+    for (size_t k = 0; k <= end_pos; k++) {
+      const FeatureStats& cur_feats = feat_array.get(k);
+      double sum = 0.0;
+      for (size_t l = 0; l < cur_feats.size(); l++)
+        sum += cur_feats.get(l);
 
-  std::string substring, subsubstring, stringBuf;
-  std::string theSentence;
-  std::string::size_type loc;
+      if (lookup.find(sum) != lookup.end()) {
 
-  while (getline(inp,stringBuf,'\n')) {
-    if (stringBuf.empty()) continue;
+        //cerr << "hit" << endl;
+        vector<size_t>& cur_list = lookup[sum];
 
-//              TRACE_ERR("stringBuf: " << stringBuf << std::endl);
+        // TODO: 'l' was already used as the index of the summation loop above.
+        // That variable went out of scope with its loop, so this is a distinct
+        // variable; consider renaming it to avoid confusion.
+        size_t l = 0;
+        for (l = 0; l < cur_list.size(); l++) {
+          size_t j = cur_list[l];
+
+          if (cur_feats == feat_array.get(j)
+              && score_array.get(k) == score_array.get(j)) {
+            if (k < end_pos) {
+              feat_array.swap(k,end_pos);
+              score_array.swap(k,end_pos);
+              k--;
+            }
+            end_pos--;
+            nRemoved++;
+            break;
+          }
+        }
+        if (l == lookup[sum].size())
+          cur_list.push_back(k);
+      } else {
+        lookup[sum].push_back(k);
+      }
+      // for (size_t j=0; j < k; j++) {
+
+      // 	if (feat_array.get(k) == feat_array.get(j)
+      // 	    && score_array.get(k) == score_array.get(j)) {
+
+      // 	  if (k < end_pos) {
+
+      // 	    feat_array.swap(k,end_pos);
+      // 	    score_array.swap(k,end_pos);
+
+      // 	    k--;
+      // 	  }
+
+      //          end_pos--;
+      // 	  nRemoved++;
+      //          break;
+      // 	}
+      // }
+    } // end for k
+
+    if (nRemoved > 0) {
+      feat_array.resize(end_pos+1);
+      score_array.resize(end_pos+1);
+    }
+  }
+}
+//END_ADDED
+
+void Data::load(const std::string &featfile, const std::string &scorefile) {
+  m_feature_data->load(featfile, m_sparse_weights);
+  m_score_data->load(scorefile);
+}
 
-    getNextPound(stringBuf, substring, "|||"); //first field
-    sentence_index = substring;
+void Data::loadNBest(const string &file)
+{
+  TRACE_ERR("loading nbest from " << file << endl);
+  inputfilestream inp(file); // matches a stream with a file. Opens the file
+  if (!inp.good())
+    throw runtime_error("Unable to open: " + file);
 
-    getNextPound(stringBuf, substring, "|||"); //second field
-    theSentence = substring;
+  ScoreStats scoreentry;
+  string line, sentence_index, sentence, feature_str;
 
+  while (getline(inp, line, '\n')) {
+    if (line.empty()) continue;
     // adding statistics for error measures
-    featentry.reset();
     scoreentry.clear();
 
-    theScorer->prepareStats(sentence_index, theSentence, scoreentry);
-
-    scoredata->add(scoreentry, sentence_index);
+    getNextPound(line, sentence_index, "|||"); // first field
+    getNextPound(line, sentence, "|||");       // second field
+    getNextPound(line, feature_str, "|||");    // third field
 
-    getNextPound(stringBuf, substring, "|||"); //third field
+    m_scorer->prepareStats(sentence_index, sentence, scoreentry);
+    m_score_data->add(scoreentry, sentence_index);
 
     // examine first line for name of features
     if (!existsFeatureNames()) {
-      std::string stringsupport=substring;
-      std::string features="";
-      std::string tmpname="";
-
-      size_t tmpidx=0;
-      while (!stringsupport.empty()) {
-        //                      TRACE_ERR("Decompounding: " << substring << std::endl);
-        getNextPound(stringsupport, subsubstring);
-
-        // string ending with ":" are skipped, because they are the names of the features
-        if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
-          features+=tmpname+"_"+stringify(tmpidx)+" ";
-          tmpidx++;
-        }
-        // ignore sparse feature name
-        else if (subsubstring.find("_") != string::npos) {
-          // also ignore its value
-          getNextPound(stringsupport, subsubstring);
-        }
-        // update current feature name
-        else {
-          tmpidx=0;
-          tmpname=subsubstring.substr(0,subsubstring.size() - 1);
-        }
-      }
+      InitFeatureMap(feature_str);
+    }
+    AddFeatures(feature_str, sentence_index);
+  }
+  inp.close();
+}
 
-      featdata->setFeatureMap(features);
+void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
+  if (bin)
+    cerr << "Binary write mode is selected" << endl;
+  else
+    cerr << "Binary write mode is NOT selected" << endl;
+
+  m_feature_data->save(featfile, bin);
+  m_score_data->save(scorefile, bin);
+}
+
+void Data::InitFeatureMap(const string& str) {
+  string buf = str;
+  string substr;
+  string features = "";
+  string tmp_name = "";
+  size_t tmp_index = 0;
+
+  while (!buf.empty()) {
+    getNextPound(buf, substr);
+
+    // strings ending with ":" are skipped here, because they are feature names rather than values
+    if (!EndsWith(substr, ":")) {
+      stringstream ss;
+      ss << tmp_name << "_" << tmp_index << " ";
+      features.append(ss.str());
+
+      tmp_index++;
+    } else if (substr.find("_") != string::npos) {
+      // ignore sparse feature name and its value
+      getNextPound(buf, substr);
+    } else {                              // update current feature name
+      tmp_index = 0;
+      tmp_name = substr.substr(0, substr.size() - 1);
     }
+  }
+  m_feature_data->setFeatureMap(features);
+}
 
-    // adding features
-    while (!substring.empty()) {
-//                      TRACE_ERR("Decompounding: " << substring << std::endl);
-      getNextPound(substring, subsubstring);
+void Data::AddFeatures(const string& str,
+                       const string& sentence_index) {
+  string buf = str;
+  string substr;
+  FeatureStats feature_entry;
+  feature_entry.reset();
 
-      // no ':' -> feature value that needs to be stored
-      if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
-        featentry.add(ConvertStringToFeatureStatsType(subsubstring));
-      }
+  while (!buf.empty()) {
+    getNextPound(buf, substr);
+
+    // no ':' -> feature value that needs to be stored
+    if (!EndsWith(substr, ":")) {
+      feature_entry.add(ConvertStringToFeatureStatsType(substr));
+    } else if (substr.find("_") != string::npos) {
       // sparse feature name? store as well
-      else if (subsubstring.find("_") != string::npos) {
-        std::string name = subsubstring;
-        getNextPound(substring, subsubstring);
-        featentry.addSparse( name, atof(subsubstring.c_str()) );
-      }
+      string name = substr;
+      getNextPound(buf, substr);
+      feature_entry.addSparse(name, atof(substr.c_str()));
     }
-    //cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
-    featdata->add(featentry,sentence_index);
   }
-
-  inp.close();
+  m_feature_data->add(feature_entry, sentence_index);
 }
 
-
 void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
-                        std::vector<Data>& shards)
+                        vector<Data>& shards)
 {
   CHECK(shard_count);
   CHECK(shard_size >= 0);
   CHECK(shard_size <= 1);
 
-  size_t data_size = scoredata->size();
-  CHECK(data_size == featdata->size());
+  size_t data_size = m_score_data->size();
+  CHECK(data_size == m_feature_data->size());
 
   shard_size *= data_size;
+  const float coeff = static_cast<float>(data_size) / shard_count;
 
   for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
     vector<size_t> shard_contents;
     if (shard_size == 0) {
       //split into roughly equal size shards
-      size_t shard_start = floor(0.5 + shard_id * (float)data_size / shard_count);
-      size_t shard_end = floor(0.5 + (shard_id+1) * (float)data_size / shard_count);
+      const size_t shard_start = floor(0.5 + shard_id * coeff);
+      const size_t shard_end = floor(0.5 + (shard_id + 1) * coeff);
       for (size_t i = shard_start; i < shard_end; ++i) {
         shard_contents.push_back(i);
       }
@@ -168,14 +248,14 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
       }
     }
 
-    Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig);
+    Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig);
 
-    shards.push_back(Data(*scorer));
-    shards.back().score_type = score_type;
-    shards.back().number_of_scores = number_of_scores;
+    shards.push_back(Data(scorer));
+    shards.back().m_score_type = m_score_type;
+    shards.back().m_num_scores = m_num_scores;
     for (size_t i = 0; i < shard_contents.size(); ++i) {
-      shards.back().featdata->add(featdata->get(shard_contents[i]));
-      shards.back().scoredata->add(scoredata->get(shard_contents[i]));
+      shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i]));
+      shards.back().m_score_data->add(m_score_data->get(shard_contents[i]));
     }
     //cerr << endl;
   }
author	Barry Haddow <barry.haddow@gmail.com>	2012-05-25 00:11:35 +0400
committer	Barry Haddow <barry.haddow@gmail.com>	2012-05-25 00:11:35 +0400
commit	c397d2068bd65fc1eca6bf49057cfda154fa3ce5 (patch)
tree	e8a8c4e6f60c2b0d47592371d981b9d1a1ac9f85 /mert/Data.cpp
parent	5a17ef82b3c7449d9fb3686875a7936193e14a89 (diff)
parent	277fd38bbd345c083b762a0fb36c0a69da2ca7eb (diff)