Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBarry Haddow <barry.haddow@gmail.com>2012-05-25 00:11:35 +0400
committerBarry Haddow <barry.haddow@gmail.com>2012-05-25 00:11:35 +0400
commitc397d2068bd65fc1eca6bf49057cfda154fa3ce5 (patch)
treee8a8c4e6f60c2b0d47592371d981b9d1a1ac9f85 /mert/Data.cpp
parent5a17ef82b3c7449d9fb3686875a7936193e14a89 (diff)
parent277fd38bbd345c083b762a0fb36c0a69da2ca7eb (diff)
Merge branch 'trunk' into miramerge. Still to fix build.
Conflicts: Jamroot mert/Data.cpp mert/Data.h mert/FeatureArray.cpp mert/FeatureArray.h mert/FeatureData.cpp mert/FeatureData.h mert/FeatureStats.cpp mert/FeatureStats.h mert/mert.cpp moses-chart-cmd/src/IOWrapper.h moses-chart-cmd/src/Main.cpp moses-cmd/src/IOWrapper.cpp moses-cmd/src/IOWrapper.h moses-cmd/src/Main.cpp moses/src/GlobalLexicalModel.cpp moses/src/Jamfile moses/src/Parameter.cpp moses/src/PhraseDictionary.cpp moses/src/ScoreIndexManager.h moses/src/TargetPhrase.h regression-testing/tests/phrase.lexicalized-reordering-bin/truth/results.txt regression-testing/tests/phrase.lexicalized-reordering-cn/truth/results.txt regression-testing/tests/phrase.lexicalized-reordering/truth/results.txt regression-testing/tests/phrase.multiple-translation-system-lr/truth/results.txt regression-testing/tests/phrase.show-weights.lex-reorder/truth/results.txt regression-testing/tests/phrase.show-weights/truth/results.txt scripts/ems/experiment.meta scripts/ems/experiment.perl scripts/training/filter-model-given-input.pl scripts/training/mert-moses.pl
Diffstat (limited to 'mert/Data.cpp')
-rw-r--r--mert/Data.cpp284
1 files changed, 182 insertions, 102 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 4f822558e..2a6bd5e92 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -1,13 +1,12 @@
/*
* Data.cpp
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
#include <algorithm>
-#include "util/check.hh"
#include <cmath>
#include <fstream>
@@ -16,148 +15,229 @@
#include "Scorer.h"
#include "ScorerFactory.h"
#include "Util.h"
+#include "util/check.hh"
+
+using namespace std;
+
-Data::Data(Scorer& ptr, const std::string& sparseweightsfile)
- : theScorer(&ptr),
- score_type(theScorer->getName()),
- number_of_scores(0),
- scoredata(new ScoreData(*theScorer)),
- featdata(new FeatureData)
+Data::Data(Scorer* scorer, const string& sparse_weights_file)
+ : m_scorer(scorer),
+ m_score_type(m_scorer->getName()),
+ m_num_scores(0),
+ m_score_data(new ScoreData(m_scorer)),
+ m_feature_data(new FeatureData)
{
- TRACE_ERR("Data::score_type " << score_type << std::endl);
- TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl);
- if (sparseweightsfile.size()) {
- sparse_weights.load(sparseweightsfile);
+ TRACE_ERR("Data::m_score_type " << m_score_type << endl);
+ TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
+ if (sparse_weights_file.size()) {
+ m_sparse_weights.load(sparse_weights_file);
ostringstream msg;
msg << "Data::sparse_weights {";
- sparse_weights.write(msg,"=");
+ m_sparse_weights.write(msg,"=");
msg << "}";
TRACE_ERR(msg.str() << std::endl);
}
}
-Data::~Data() {
- if (featdata) {
- delete featdata;
- featdata = NULL;
- }
- if (scoredata) {
- delete scoredata;
- scoredata = NULL;
- }
-}
+//ADDED BY TS
+// TODO: This is too long; consider creating additional functions to
+// reduce the lines of this function.
+void Data::removeDuplicates() {
+ size_t nSentences = m_feature_data->size();
+ assert(m_score_data->size() == nSentences);
-void Data::loadnbest(const std::string &file)
-{
- TRACE_ERR("loading nbest from " << file << std::endl);
+ for (size_t s = 0; s < nSentences; s++) {
+ FeatureArray& feat_array = m_feature_data->get(s);
+ ScoreArray& score_array = m_score_data->get(s);
- FeatureStats featentry;
- ScoreStats scoreentry;
- std::string sentence_index;
+ assert(feat_array.size() == score_array.size());
- inputfilestream inp(file); // matches a stream with a file. Opens the file
+ //serves as a hash-map:
+ map<double, vector<size_t> > lookup;
- if (!inp.good())
- throw runtime_error("Unable to open: " + file);
+ size_t end_pos = feat_array.size() - 1;
+
+ size_t nRemoved = 0;
+
+ for (size_t k = 0; k <= end_pos; k++) {
+ const FeatureStats& cur_feats = feat_array.get(k);
+ double sum = 0.0;
+ for (size_t l = 0; l < cur_feats.size(); l++)
+ sum += cur_feats.get(l);
- std::string substring, subsubstring, stringBuf;
- std::string theSentence;
- std::string::size_type loc;
+ if (lookup.find(sum) != lookup.end()) {
- while (getline(inp,stringBuf,'\n')) {
- if (stringBuf.empty()) continue;
+ //cerr << "hit" << endl;
+ vector<size_t>& cur_list = lookup[sum];
-// TRACE_ERR("stringBuf: " << stringBuf << std::endl);
+      // TODO: Make sure this is correct, because the name 'l' is already used above.
+      // If reusing it does not affect the duplicate removal, it is still better
+      // to rename this variable for clarity.
+ size_t l = 0;
+ for (l = 0; l < cur_list.size(); l++) {
+ size_t j = cur_list[l];
+
+ if (cur_feats == feat_array.get(j)
+ && score_array.get(k) == score_array.get(j)) {
+ if (k < end_pos) {
+ feat_array.swap(k,end_pos);
+ score_array.swap(k,end_pos);
+ k--;
+ }
+ end_pos--;
+ nRemoved++;
+ break;
+ }
+ }
+ if (l == lookup[sum].size())
+ cur_list.push_back(k);
+ } else {
+ lookup[sum].push_back(k);
+ }
+ // for (size_t j=0; j < k; j++) {
+
+ // if (feat_array.get(k) == feat_array.get(j)
+ // && score_array.get(k) == score_array.get(j)) {
+
+ // if (k < end_pos) {
+
+ // feat_array.swap(k,end_pos);
+ // score_array.swap(k,end_pos);
+
+ // k--;
+ // }
+
+ // end_pos--;
+ // nRemoved++;
+ // break;
+ // }
+ // }
+ } // end for k
+
+ if (nRemoved > 0) {
+ feat_array.resize(end_pos+1);
+ score_array.resize(end_pos+1);
+ }
+ }
+}
+//END_ADDED
+
+void Data::load(const std::string &featfile, const std::string &scorefile) {
+ m_feature_data->load(featfile, m_sparse_weights);
+ m_score_data->load(scorefile);
+}
- getNextPound(stringBuf, substring, "|||"); //first field
- sentence_index = substring;
+void Data::loadNBest(const string &file)
+{
+ TRACE_ERR("loading nbest from " << file << endl);
+ inputfilestream inp(file); // matches a stream with a file. Opens the file
+ if (!inp.good())
+ throw runtime_error("Unable to open: " + file);
- getNextPound(stringBuf, substring, "|||"); //second field
- theSentence = substring;
+ ScoreStats scoreentry;
+ string line, sentence_index, sentence, feature_str;
+ while (getline(inp, line, '\n')) {
+ if (line.empty()) continue;
// adding statistics for error measures
- featentry.reset();
scoreentry.clear();
- theScorer->prepareStats(sentence_index, theSentence, scoreentry);
-
- scoredata->add(scoreentry, sentence_index);
+ getNextPound(line, sentence_index, "|||"); // first field
+ getNextPound(line, sentence, "|||"); // second field
+ getNextPound(line, feature_str, "|||"); // third field
- getNextPound(stringBuf, substring, "|||"); //third field
+ m_scorer->prepareStats(sentence_index, sentence, scoreentry);
+ m_score_data->add(scoreentry, sentence_index);
// examine first line for name of features
if (!existsFeatureNames()) {
- std::string stringsupport=substring;
- std::string features="";
- std::string tmpname="";
-
- size_t tmpidx=0;
- while (!stringsupport.empty()) {
- // TRACE_ERR("Decompounding: " << substring << std::endl);
- getNextPound(stringsupport, subsubstring);
-
- // string ending with ":" are skipped, because they are the names of the features
- if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
- features+=tmpname+"_"+stringify(tmpidx)+" ";
- tmpidx++;
- }
- // ignore sparse feature name
- else if (subsubstring.find("_") != string::npos) {
- // also ignore its value
- getNextPound(stringsupport, subsubstring);
- }
- // update current feature name
- else {
- tmpidx=0;
- tmpname=subsubstring.substr(0,subsubstring.size() - 1);
- }
- }
+ InitFeatureMap(feature_str);
+ }
+ AddFeatures(feature_str, sentence_index);
+ }
+ inp.close();
+}
- featdata->setFeatureMap(features);
+void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
+ if (bin)
+ cerr << "Binary write mode is selected" << endl;
+ else
+ cerr << "Binary write mode is NOT selected" << endl;
+
+ m_feature_data->save(featfile, bin);
+ m_score_data->save(scorefile, bin);
+}
+
+void Data::InitFeatureMap(const string& str) {
+ string buf = str;
+ string substr;
+ string features = "";
+ string tmp_name = "";
+ size_t tmp_index = 0;
+
+ while (!buf.empty()) {
+ getNextPound(buf, substr);
+
+    // Strings ending with ":" are skipped, because they are the names of the features.
+ if (!EndsWith(substr, ":")) {
+ stringstream ss;
+ ss << tmp_name << "_" << tmp_index << " ";
+ features.append(ss.str());
+
+ tmp_index++;
+ } else if (substr.find("_") != string::npos) {
+ // ignore sparse feature name and its value
+ getNextPound(buf, substr);
+ } else { // update current feature name
+ tmp_index = 0;
+ tmp_name = substr.substr(0, substr.size() - 1);
}
+ }
+ m_feature_data->setFeatureMap(features);
+}
- // adding features
- while (!substring.empty()) {
-// TRACE_ERR("Decompounding: " << substring << std::endl);
- getNextPound(substring, subsubstring);
+void Data::AddFeatures(const string& str,
+ const string& sentence_index) {
+ string buf = str;
+ string substr;
+ FeatureStats feature_entry;
+ feature_entry.reset();
- // no ':' -> feature value that needs to be stored
- if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
- featentry.add(ConvertStringToFeatureStatsType(subsubstring));
- }
+ while (!buf.empty()) {
+ getNextPound(buf, substr);
+
+ // no ':' -> feature value that needs to be stored
+ if (!EndsWith(substr, ":")) {
+ feature_entry.add(ConvertStringToFeatureStatsType(substr));
+ } else if (substr.find("_") != string::npos) {
// sparse feature name? store as well
- else if (subsubstring.find("_") != string::npos) {
- std::string name = subsubstring;
- getNextPound(substring, subsubstring);
- featentry.addSparse( name, atof(subsubstring.c_str()) );
- }
+ string name = substr;
+ getNextPound(buf, substr);
+ feature_entry.addSparse(name, atof(substr.c_str()));
}
- //cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
- featdata->add(featentry,sentence_index);
}
-
- inp.close();
+ m_feature_data->add(feature_entry, sentence_index);
}
-
void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
- std::vector<Data>& shards)
+ vector<Data>& shards)
{
CHECK(shard_count);
CHECK(shard_size >= 0);
CHECK(shard_size <= 1);
- size_t data_size = scoredata->size();
- CHECK(data_size == featdata->size());
+ size_t data_size = m_score_data->size();
+ CHECK(data_size == m_feature_data->size());
shard_size *= data_size;
+ const float coeff = static_cast<float>(data_size) / shard_count;
for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
vector<size_t> shard_contents;
if (shard_size == 0) {
//split into roughly equal size shards
- size_t shard_start = floor(0.5 + shard_id * (float)data_size / shard_count);
- size_t shard_end = floor(0.5 + (shard_id+1) * (float)data_size / shard_count);
+ const size_t shard_start = floor(0.5 + shard_id * coeff);
+ const size_t shard_end = floor(0.5 + (shard_id + 1) * coeff);
for (size_t i = shard_start; i < shard_end; ++i) {
shard_contents.push_back(i);
}
@@ -168,14 +248,14 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
}
}
- Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig);
+ Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig);
- shards.push_back(Data(*scorer));
- shards.back().score_type = score_type;
- shards.back().number_of_scores = number_of_scores;
+ shards.push_back(Data(scorer));
+ shards.back().m_score_type = m_score_type;
+ shards.back().m_num_scores = m_num_scores;
for (size_t i = 0; i < shard_contents.size(); ++i) {
- shards.back().featdata->add(featdata->get(shard_contents[i]));
- shards.back().scoredata->add(scoredata->get(shard_contents[i]));
+ shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i]));
+ shards.back().m_score_data->add(m_score_data->get(shard_contents[i]));
}
//cerr << endl;
}