diff options
author | Barry Haddow <barry.haddow@gmail.com> | 2012-05-25 00:11:35 +0400 |
---|---|---|
committer | Barry Haddow <barry.haddow@gmail.com> | 2012-05-25 00:11:35 +0400 |
commit | c397d2068bd65fc1eca6bf49057cfda154fa3ce5 (patch) | |
tree | e8a8c4e6f60c2b0d47592371d981b9d1a1ac9f85 /mert/Data.cpp | |
parent | 5a17ef82b3c7449d9fb3686875a7936193e14a89 (diff) | |
parent | 277fd38bbd345c083b762a0fb36c0a69da2ca7eb (diff) |
Merge branch 'trunk' into miramerge. Still to fix build.
Conflicts:
Jamroot
mert/Data.cpp
mert/Data.h
mert/FeatureArray.cpp
mert/FeatureArray.h
mert/FeatureData.cpp
mert/FeatureData.h
mert/FeatureStats.cpp
mert/FeatureStats.h
mert/mert.cpp
moses-chart-cmd/src/IOWrapper.h
moses-chart-cmd/src/Main.cpp
moses-cmd/src/IOWrapper.cpp
moses-cmd/src/IOWrapper.h
moses-cmd/src/Main.cpp
moses/src/GlobalLexicalModel.cpp
moses/src/Jamfile
moses/src/Parameter.cpp
moses/src/PhraseDictionary.cpp
moses/src/ScoreIndexManager.h
moses/src/TargetPhrase.h
regression-testing/tests/phrase.lexicalized-reordering-bin/truth/results.txt
regression-testing/tests/phrase.lexicalized-reordering-cn/truth/results.txt
regression-testing/tests/phrase.lexicalized-reordering/truth/results.txt
regression-testing/tests/phrase.multiple-translation-system-lr/truth/results.txt
regression-testing/tests/phrase.show-weights.lex-reorder/truth/results.txt
regression-testing/tests/phrase.show-weights/truth/results.txt
scripts/ems/experiment.meta
scripts/ems/experiment.perl
scripts/training/filter-model-given-input.pl
scripts/training/mert-moses.pl
Diffstat (limited to 'mert/Data.cpp')
-rw-r--r-- | mert/Data.cpp | 284 |
1 files changed, 182 insertions, 102 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp index 4f822558e..2a6bd5e92 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -1,13 +1,12 @@ /* * Data.cpp - * met - Minimum Error Training + * mert - Minimum Error Rate Training * * Created by Nicola Bertoldi on 13/05/08. * */ #include <algorithm> -#include "util/check.hh" #include <cmath> #include <fstream> @@ -16,148 +15,229 @@ #include "Scorer.h" #include "ScorerFactory.h" #include "Util.h" +#include "util/check.hh" + +using namespace std; + -Data::Data(Scorer& ptr, const std::string& sparseweightsfile) - : theScorer(&ptr), - score_type(theScorer->getName()), - number_of_scores(0), - scoredata(new ScoreData(*theScorer)), - featdata(new FeatureData) +Data::Data(Scorer* scorer, const string& sparse_weights_file) + : m_scorer(scorer), + m_score_type(m_scorer->getName()), + m_num_scores(0), + m_score_data(new ScoreData(m_scorer)), + m_feature_data(new FeatureData) { - TRACE_ERR("Data::score_type " << score_type << std::endl); - TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl); - if (sparseweightsfile.size()) { - sparse_weights.load(sparseweightsfile); + TRACE_ERR("Data::m_score_type " << m_score_type << endl); + TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl); + if (sparse_weights_file.size()) { + m_sparse_weights.load(sparse_weights_file); ostringstream msg; msg << "Data::sparse_weights {"; - sparse_weights.write(msg,"="); + m_sparse_weights.write(msg,"="); msg << "}"; TRACE_ERR(msg.str() << std::endl); } } -Data::~Data() { - if (featdata) { - delete featdata; - featdata = NULL; - } - if (scoredata) { - delete scoredata; - scoredata = NULL; - } -} +//ADDED BY TS +// TODO: This is too long; consider creating additional functions to +// reduce the lines of this function. +void Data::removeDuplicates() { + size_t nSentences = m_feature_data->size(); + assert(m_score_data->size() == nSentences); -void Data::loadnbest(const std::string &file) -{ - TRACE_ERR("loading nbest from " << file << std::endl); + for (size_t s = 0; s < nSentences; s++) { + FeatureArray& feat_array = m_feature_data->get(s); + ScoreArray& score_array = m_score_data->get(s); - FeatureStats featentry; - ScoreStats scoreentry; - std::string sentence_index; + assert(feat_array.size() == score_array.size()); - inputfilestream inp(file); // matches a stream with a file. Opens the file + //serves as a hash-map: + map<double, vector<size_t> > lookup; - if (!inp.good()) - throw runtime_error("Unable to open: " + file); + size_t end_pos = feat_array.size() - 1; + + size_t nRemoved = 0; + + for (size_t k = 0; k <= end_pos; k++) { + const FeatureStats& cur_feats = feat_array.get(k); + double sum = 0.0; + for (size_t l = 0; l < cur_feats.size(); l++) + sum += cur_feats.get(l); - std::string substring, subsubstring, stringBuf; - std::string theSentence; - std::string::size_type loc; + if (lookup.find(sum) != lookup.end()) { - while (getline(inp,stringBuf,'\n')) { - if (stringBuf.empty()) continue; + //cerr << "hit" << endl; + vector<size_t>& cur_list = lookup[sum]; -// TRACE_ERR("stringBuf: " << stringBuf << std::endl); + // TODO: Make sure this is correct because we have already used 'l'. + // If this does not impact on the removing duplicates, it is better + // to change + size_t l = 0; + for (l = 0; l < cur_list.size(); l++) { + size_t j = cur_list[l]; + + if (cur_feats == feat_array.get(j) + && score_array.get(k) == score_array.get(j)) { + if (k < end_pos) { + feat_array.swap(k,end_pos); + score_array.swap(k,end_pos); + k--; + } + end_pos--; + nRemoved++; + break; + } + } + if (l == lookup[sum].size()) + cur_list.push_back(k); + } else { + lookup[sum].push_back(k); + } + // for (size_t j=0; j < k; j++) { + + // if (feat_array.get(k) == feat_array.get(j) + // && score_array.get(k) == score_array.get(j)) { + + // if (k < end_pos) { + + // feat_array.swap(k,end_pos); + // score_array.swap(k,end_pos); + + // k--; + // } + + // end_pos--; + // nRemoved++; + // break; + // } + // } + } // end for k + + if (nRemoved > 0) { + feat_array.resize(end_pos+1); + score_array.resize(end_pos+1); + } + } +} +//END_ADDED + +void Data::load(const std::string &featfile, const std::string &scorefile) { + m_feature_data->load(featfile, m_sparse_weights); + m_score_data->load(scorefile); +} - getNextPound(stringBuf, substring, "|||"); //first field - sentence_index = substring; +void Data::loadNBest(const string &file) +{ + TRACE_ERR("loading nbest from " << file << endl); + inputfilestream inp(file); // matches a stream with a file. Opens the file + if (!inp.good()) + throw runtime_error("Unable to open: " + file); - getNextPound(stringBuf, substring, "|||"); //second field - theSentence = substring; + ScoreStats scoreentry; + string line, sentence_index, sentence, feature_str; + while (getline(inp, line, '\n')) { + if (line.empty()) continue; // adding statistics for error measures - featentry.reset(); scoreentry.clear(); - theScorer->prepareStats(sentence_index, theSentence, scoreentry); - - scoredata->add(scoreentry, sentence_index); + getNextPound(line, sentence_index, "|||"); // first field + getNextPound(line, sentence, "|||"); // second field + getNextPound(line, feature_str, "|||"); // third field - getNextPound(stringBuf, substring, "|||"); //third field + m_scorer->prepareStats(sentence_index, sentence, scoreentry); + m_score_data->add(scoreentry, sentence_index); // examine first line for name of features if (!existsFeatureNames()) { - std::string stringsupport=substring; - std::string features=""; - std::string tmpname=""; - - size_t tmpidx=0; - while (!stringsupport.empty()) { - // TRACE_ERR("Decompounding: " << substring << std::endl); - getNextPound(stringsupport, subsubstring); - - // string ending with ":" are skipped, because they are the names of the features - if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) { - features+=tmpname+"_"+stringify(tmpidx)+" "; - tmpidx++; - } - // ignore sparse feature name - else if (subsubstring.find("_") != string::npos) { - // also ignore its value - getNextPound(stringsupport, subsubstring); - } - // update current feature name - else { - tmpidx=0; - tmpname=subsubstring.substr(0,subsubstring.size() - 1); - } - } + InitFeatureMap(feature_str); + } + AddFeatures(feature_str, sentence_index); + } + inp.close(); +} - featdata->setFeatureMap(features); +void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) { + if (bin) + cerr << "Binary write mode is selected" << endl; + else + cerr << "Binary write mode is NOT selected" << endl; + + m_feature_data->save(featfile, bin); + m_score_data->save(scorefile, bin); +} + +void Data::InitFeatureMap(const string& str) { + string buf = str; + string substr; + string features = ""; + string tmp_name = ""; + size_t tmp_index = 0; + + while (!buf.empty()) { + getNextPound(buf, substr); + + // string ending with ":" are skipped, because they are the names of the features + if (!EndsWith(substr, ":")) { + stringstream ss; + ss << tmp_name << "_" << tmp_index << " "; + features.append(ss.str()); + + tmp_index++; + } else if (substr.find("_") != string::npos) { + // ignore sparse feature name and its value + getNextPound(buf, substr); + } else { // update current feature name + tmp_index = 0; + tmp_name = substr.substr(0, substr.size() - 1); } + } + m_feature_data->setFeatureMap(features); +} - // adding features - while (!substring.empty()) { -// TRACE_ERR("Decompounding: " << substring << std::endl); - getNextPound(substring, subsubstring); +void Data::AddFeatures(const string& str, + const string& sentence_index) { + string buf = str; + string substr; + FeatureStats feature_entry; + feature_entry.reset(); - // no ':' -> feature value that needs to be stored - if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) { - featentry.add(ConvertStringToFeatureStatsType(subsubstring)); - } + while (!buf.empty()) { + getNextPound(buf, substr); + + // no ':' -> feature value that needs to be stored + if (!EndsWith(substr, ":")) { + feature_entry.add(ConvertStringToFeatureStatsType(substr)); + } else if (substr.find("_") != string::npos) { // sparse feature name? store as well - else if (subsubstring.find("_") != string::npos) { - std::string name = subsubstring; - getNextPound(substring, subsubstring); - featentry.addSparse( name, atof(subsubstring.c_str()) ); - } + string name = substr; + getNextPound(buf, substr); + feature_entry.addSparse(name, atof(substr.c_str())); } - //cerr << "number of sparse features: " << featentry.getSparse().size() << endl; - featdata->add(featentry,sentence_index); } - - inp.close(); + m_feature_data->add(feature_entry, sentence_index); } - void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig, - std::vector<Data>& shards) + vector<Data>& shards) { CHECK(shard_count); CHECK(shard_size >= 0); CHECK(shard_size <= 1); - size_t data_size = scoredata->size(); - CHECK(data_size == featdata->size()); + size_t data_size = m_score_data->size(); + CHECK(data_size == m_feature_data->size()); shard_size *= data_size; + const float coeff = static_cast<float>(data_size) / shard_count; for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) { vector<size_t> shard_contents; if (shard_size == 0) { //split into roughly equal size shards - size_t shard_start = floor(0.5 + shard_id * (float)data_size / shard_count); - size_t shard_end = floor(0.5 + (shard_id+1) * (float)data_size / shard_count); + const size_t shard_start = floor(0.5 + shard_id * coeff); + const size_t shard_end = floor(0.5 + (shard_id + 1) * coeff); for (size_t i = shard_start; i < shard_end; ++i) { shard_contents.push_back(i); } @@ -168,14 +248,14 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor } } - Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig); + Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig); - shards.push_back(Data(*scorer)); - shards.back().score_type = score_type; - shards.back().number_of_scores = number_of_scores; + shards.push_back(Data(scorer)); + shards.back().m_score_type = m_score_type; + shards.back().m_num_scores = m_num_scores; for (size_t i = 0; i < shard_contents.size(); ++i) { - shards.back().featdata->add(featdata->get(shard_contents[i])); - shards.back().scoredata->add(scoredata->get(shard_contents[i])); + shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i])); + shards.back().m_score_data->add(m_score_data->get(shard_contents[i])); } //cerr << endl; } |