From 21009b5d1e44b70a7e5b1f0d039e83a961a41776 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 12 Dec 2011 20:48:42 +0700 Subject: revert --- mert/Data.cpp | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 23fdc6d82..94f5287a8 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -47,6 +47,99 @@ Data::~Data() { } } +//ADDED BY TS +void Data::remove_duplicates() { + + uint nSentences = featdata->size(); + assert(scoredata->size() == nSentences); + + for (uint s=0; s < nSentences; s++) { + + FeatureArray& feat_array = featdata->get(s); + ScoreArray& score_array = scoredata->get(s); + + assert(feat_array.size() == score_array.size()); + + //serves as a hash-map: + std::map > lookup; + + uint end_pos = feat_array.size() - 1; + + uint nRemoved = 0; + for (uint k=0; k <= end_pos; k++) { + + const FeatureStats& cur_feats = feat_array.get(k); + + double sum = 0.0; + for (uint l=0; l < cur_feats.size(); l++) + sum += cur_feats.get(l); + + if (lookup.find(sum) != lookup.end()) { + + //std::cerr << "hit" << std::endl; + + std::vector& cur_list = lookup[sum]; + + uint l=0; + for (l=0; l < cur_list.size(); l++) { + + uint j=cur_list[l]; + + if (cur_feats == feat_array.get(j) + && score_array.get(k) == score_array.get(j)) { + + if (k < end_pos) { + + feat_array.swap(k,end_pos); + score_array.swap(k,end_pos); + + k--; + } + + end_pos--; + nRemoved++; + break; + } + } + + if (l == lookup[sum].size()) + cur_list.push_back(k); + } + else + lookup[sum].push_back(k); + + // for (uint j=0; j < k; j++) { + + // if (feat_array.get(k) == feat_array.get(j) + // && score_array.get(k) == score_array.get(j)) { + + // if (k < end_pos) { + + // feat_array.swap(k,end_pos); + // score_array.swap(k,end_pos); + + // k--; + // } + + // end_pos--; + // nRemoved++; + // break; + // } + // } + } + + std::cerr << "removed " << nRemoved << "/" << feat_array.size() << std::endl; + + if (nRemoved > 0) { + + feat_array.resize(end_pos+1); + score_array.resize(end_pos+1); + } + } +} +//END_ADDED + + void Data::loadnbest(const std::string &file) { TRACE_ERR("loading nbest from " << file << std::endl); -- cgit v1.2.3 From ca0a3ea87018449c5a8fcb10645dcd8433285c43 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 12 Dec 2011 23:27:27 +0700 Subject: uint -> size_t --- mert/Data.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 94f5287a8..a3c6fc314 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -50,10 +50,10 @@ Data::~Data() { //ADDED BY TS void Data::remove_duplicates() { - uint nSentences = featdata->size(); + size_t nSentences = featdata->size(); assert(scoredata->size() == nSentences); - for (uint s=0; s < nSentences; s++) { + for (size_t s=0; s < nSentences; s++) { FeatureArray& feat_array = featdata->get(s); ScoreArray& score_array = scoredata->get(s); @@ -61,29 +61,29 @@ void Data::remove_duplicates() { assert(feat_array.size() == score_array.size()); //serves as a hash-map: - std::map > lookup; + std::map > lookup; - uint end_pos = feat_array.size() - 1; + size_t end_pos = feat_array.size() - 1; - uint nRemoved = 0; - for (uint k=0; k <= end_pos; k++) { + size_t nRemoved = 0; + for (size_t k=0; k <= end_pos; k++) { const FeatureStats& cur_feats = feat_array.get(k); double sum = 0.0; - for (uint l=0; l < cur_feats.size(); l++) + for (size_t l=0; l < cur_feats.size(); l++) sum += cur_feats.get(l); if (lookup.find(sum) != lookup.end()) { //std::cerr << "hit" << std::endl; - std::vector& cur_list = lookup[sum]; + std::vector& cur_list = lookup[sum]; - uint l=0; + size_t l=0; for (l=0; l < cur_list.size(); l++) { - uint j=cur_list[l]; + size_t j=cur_list[l]; if (cur_feats == feat_array.get(j) && score_array.get(k) == score_array.get(j)) { @@ -108,7 +108,7 @@ void Data::remove_duplicates() { else lookup[sum].push_back(k); - // for (uint j=0; j < k; j++) { + // for (size_t j=0; j < k; j++) { // if (feat_array.get(k) == feat_array.get(j) // && score_array.get(k) == score_array.get(j)) { -- cgit v1.2.3 From 194e24115adde476df107f5f5ef366eca3b86523 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 1 Feb 2012 17:17:58 +0900 Subject: Change casts to C++ style casts, and delete unnecessary casts. --- mert/Data.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index a3c6fc314..3e723f8be 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -255,8 +255,8 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor vector shard_contents; if (shard_size == 0) { //split into roughly equal size shards - size_t shard_start = floor(0.5 + shard_id * (float)data_size / shard_count); - size_t shard_end = floor(0.5 + (shard_id+1) * (float)data_size / shard_count); + const size_t shard_start = floor(0.5 + shard_id * static_cast(data_size) / shard_count); + const size_t shard_end = floor(0.5 + (shard_id + 1) * static_cast(data_size) / shard_count); for (size_t i = shard_start; i < shard_end; ++i) { shard_contents.push_back(i); } -- cgit v1.2.3 From 752724594eb22327f446f91b0c9c41d0e631bd75 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Wed, 8 Feb 2012 17:11:56 +0000 Subject: Fix sharding bug --- mert/Data.cpp | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 3e723f8be..a4e6c2b24 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -21,8 +21,8 @@ Data::Data() : theScorer(NULL), number_of_scores(0), _sparse_flag(false), - scoredata(NULL), - featdata(NULL) {} + scoredata(), + featdata() {} Data::Data(Scorer& ptr) : theScorer(&ptr), @@ -36,17 +36,6 @@ Data::Data(Scorer& ptr) TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl); } -Data::~Data() { - if (featdata) { - delete featdata; - featdata = NULL; - } - if (scoredata) { - delete scoredata; - scoredata = NULL; - } -} - //ADDED BY TS void Data::remove_duplicates() { @@ -128,7 +117,6 @@ void Data::remove_duplicates() { // } } - std::cerr << "removed " << nRemoved << "/" << feat_array.size() << std::endl; if (nRemoved > 0) { -- cgit v1.2.3 From 94888b258dcfd4ca68f4fc5ecad8b0410829e63b Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Mon, 20 Feb 2012 08:29:53 +0900 Subject: Fix typo. --- mert/Data.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index a4e6c2b24..627e53c8e 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -1,6 +1,6 @@ /* * Data.cpp - * met - Minimum Error Training + * mert - Minimum Error Rate Training * * Created by Nicola Bertoldi on 13/05/08. * -- cgit v1.2.3 From 2bdeee9caab9bc8adc8754b8a9b9f9e4f7b8563b Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 7 Mar 2012 07:01:28 +0900 Subject: Clean up Data::loadnbest(). Add helper functions. --- mert/Data.cpp | 129 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 63 insertions(+), 66 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 627e53c8e..4ab97a4dd 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -71,20 +71,20 @@ void Data::remove_duplicates() { size_t l=0; for (l=0; l < cur_list.size(); l++) { - + size_t j=cur_list[l]; if (cur_feats == feat_array.get(j) && score_array.get(k) == score_array.get(j)) { if (k < end_pos) { - + feat_array.swap(k,end_pos); score_array.swap(k,end_pos); - + k--; } - + end_pos--; nRemoved++; break; @@ -132,93 +132,90 @@ void Data::loadnbest(const std::string &file) { TRACE_ERR("loading nbest from " << file << std::endl); - FeatureStats featentry; ScoreStats scoreentry; - std::string sentence_index; inputfilestream inp(file); // matches a stream with a file. Opens the file if (!inp.good()) throw runtime_error("Unable to open: " + file); - std::string substring, subsubstring, stringBuf; - std::string theSentence; + std::string subsubstring, stringBuf; + std::string sentence_index, sentence, feature_str; std::string::size_type loc; while (getline(inp,stringBuf,'\n')) { if (stringBuf.empty()) continue; - -// TRACE_ERR("stringBuf: " << stringBuf << std::endl); - - getNextPound(stringBuf, substring, "|||"); //first field - sentence_index = substring; - - getNextPound(stringBuf, substring, "|||"); //second field - theSentence = substring; - // adding statistics for error measures - featentry.reset(); scoreentry.clear(); - theScorer->prepareStats(sentence_index, theSentence, scoreentry); + getNextPound(stringBuf, sentence_index, "|||"); // first field + getNextPound(stringBuf, sentence, "|||"); // second field + getNextPound(stringBuf, feature_str, "|||"); // third field + theScorer->prepareStats(sentence_index, sentence, scoreentry); scoredata->add(scoreentry, sentence_index); - getNextPound(stringBuf, substring, "|||"); //third field - // examine first line for name of features if (!existsFeatureNames()) { - std::string stringsupport=substring; - std::string features=""; - std::string tmpname=""; - - size_t tmpidx=0; - while (!stringsupport.empty()) { - // TRACE_ERR("Decompounding: " << substring << std::endl); - getNextPound(stringsupport, subsubstring); - - // string ending with ":" are skipped, because they are the names of the features - if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) { - features+=tmpname+"_"+stringify(tmpidx)+" "; - tmpidx++; - } - // ignore sparse feature name - else if (subsubstring.find("_") != string::npos) { - // also ignore its value - getNextPound(stringsupport, subsubstring); - } - // update current feature name - else { - tmpidx=0; - tmpname=subsubstring.substr(0,subsubstring.size() - 1); - } - } - - featdata->setFeatureMap(features); + InitFeatureMap(feature_str); } + AddFeatures(feature_str, sentence_index); + } + inp.close(); +} - // adding features - while (!substring.empty()) { -// TRACE_ERR("Decompounding: " << substring << std::endl); - getNextPound(substring, subsubstring); +void Data::InitFeatureMap(const string& str) { + string buf = str; + string substr; + string features = ""; + string tmp_name = ""; + size_t tmp_index = 0; + string::size_type loc; + char tmp[64]; // for snprintf(); + + while (!buf.empty()) { + getNextPound(buf, substr); + + // string ending with ":" are skipped, because they are the names of the features + if ((loc = substr.find_last_of(":")) != substr.length()-1) { + snprintf(tmp, sizeof(tmp), "%s_%lu ", tmp_name.c_str(), tmp_index); + features.append(tmp); + + tmp_index++; + } else if (substr.find("_") != string::npos) { + // ignore sparse feature name and its value + getNextPound(buf, substr); + } else { // update current feature name + tmp_index = 0; + tmp_name = substr.substr(0, substr.size() - 1); + } + } + featdata->setFeatureMap(features); +} - // no ':' -> feature value that needs to be stored - if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) { - featentry.add(ConvertStringToFeatureStatsType(subsubstring)); - } +void Data::AddFeatures(const string& str, + const string& sentence_index) { + string::size_type loc; + string buf = str; + string substr; + FeatureStats feature_entry; + feature_entry.reset(); + + while (!buf.empty()) { + getNextPound(buf, substr); + + // no ':' -> feature value that needs to be stored + if ((loc = substr.find_last_of(":")) != substr.length()-1) { + feature_entry.add(ConvertStringToFeatureStatsType(substr)); + } else if (substr.find("_") != string::npos) { // sparse feature name? store as well - else if (subsubstring.find("_") != string::npos) { - std::string name = subsubstring; - getNextPound(substring, subsubstring); - featentry.addSparse( name, atof(subsubstring.c_str()) ); - _sparse_flag = true; - } + std::string name = substr; + getNextPound(buf, substr); + feature_entry.addSparse(name, atof(substr.c_str())); + _sparse_flag = true; } - //cerr << "number of sparse features: " << featentry.getSparse().size() << endl; - featdata->add(featentry,sentence_index); } - - inp.close(); + featdata->add(feature_entry, sentence_index); } // TODO -- cgit v1.2.3 From 6ada41576cb9dd8e172505995305de18b9c68292 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 7 Mar 2012 07:07:29 +0900 Subject: Remove an unused variable. --- mert/Data.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 4ab97a4dd..93b193774 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -131,26 +131,23 @@ void Data::remove_duplicates() { void Data::loadnbest(const std::string &file) { TRACE_ERR("loading nbest from " << file << std::endl); - - ScoreStats scoreentry; - inputfilestream inp(file); // matches a stream with a file. Opens the file - if (!inp.good()) throw runtime_error("Unable to open: " + file); - std::string subsubstring, stringBuf; + ScoreStats scoreentry; + std::string line; std::string sentence_index, sentence, feature_str; std::string::size_type loc; - while (getline(inp,stringBuf,'\n')) { - if (stringBuf.empty()) continue; + while (getline(inp, line, '\n')) { + if (line.empty()) continue; // adding statistics for error measures scoreentry.clear(); - getNextPound(stringBuf, sentence_index, "|||"); // first field - getNextPound(stringBuf, sentence, "|||"); // second field - getNextPound(stringBuf, feature_str, "|||"); // third field + getNextPound(line, sentence_index, "|||"); // first field + getNextPound(line, sentence, "|||"); // second field + getNextPound(line, feature_str, "|||"); // third field theScorer->prepareStats(sentence_index, sentence, scoreentry); scoredata->add(scoreentry, sentence_index); -- cgit v1.2.3 From 851a1835b68deb958956e1d0965db002574d0241 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 7 Mar 2012 07:19:24 +0900 Subject: Remove an unused variable and unnecessary 'std::'. --- mert/Data.cpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 93b193774..c4a35b9b2 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -32,7 +32,7 @@ Data::Data(Scorer& ptr) scoredata(new ScoreData(*theScorer)), featdata(new FeatureData) { - TRACE_ERR("Data::score_type " << score_type << std::endl); + TRACE_ERR("Data::score_type " << score_type << endl); TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl); } @@ -50,7 +50,7 @@ void Data::remove_duplicates() { assert(feat_array.size() == score_array.size()); //serves as a hash-map: - std::map > lookup; + map > lookup; size_t end_pos = feat_array.size() - 1; @@ -65,9 +65,9 @@ void Data::remove_duplicates() { if (lookup.find(sum) != lookup.end()) { - //std::cerr << "hit" << std::endl; + //cerr << "hit" << endl; - std::vector& cur_list = lookup[sum]; + vector& cur_list = lookup[sum]; size_t l=0; for (l=0; l < cur_list.size(); l++) { @@ -128,17 +128,15 @@ void Data::remove_duplicates() { //END_ADDED -void Data::loadnbest(const std::string &file) +void Data::loadnbest(const string &file) { - TRACE_ERR("loading nbest from " << file << std::endl); + TRACE_ERR("loading nbest from " << file << endl); inputfilestream inp(file); // matches a stream with a file. Opens the file if (!inp.good()) throw runtime_error("Unable to open: " + file); ScoreStats scoreentry; - std::string line; - std::string sentence_index, sentence, feature_str; - std::string::size_type loc; + string line, sentence_index, sentence, feature_str; while (getline(inp, line, '\n')) { if (line.empty()) continue; @@ -206,7 +204,7 @@ void Data::AddFeatures(const string& str, feature_entry.add(ConvertStringToFeatureStatsType(substr)); } else if (substr.find("_") != string::npos) { // sparse feature name? store as well - std::string name = substr; + string name = substr; getNextPound(buf, substr); feature_entry.addSparse(name, atof(substr.c_str())); _sparse_flag = true; @@ -217,12 +215,12 @@ void Data::AddFeatures(const string& str, // TODO void Data::mergeSparseFeatures() { - std::cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n"; + cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n"; exit(1); } void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig, - std::vector& shards) + vector& shards) { CHECK(shard_count); CHECK(shard_size >= 0); @@ -232,13 +230,14 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor CHECK(data_size == featdata->size()); shard_size *= data_size; + const float coeff = static_cast(data_size) / shard_count; for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) { vector shard_contents; if (shard_size == 0) { //split into roughly equal size shards - const size_t shard_start = floor(0.5 + shard_id * static_cast(data_size) / shard_count); - const size_t shard_end = floor(0.5 + (shard_id + 1) * static_cast(data_size) / shard_count); + const size_t shard_start = floor(0.5 + shard_id * coeff); + const size_t shard_end = floor(0.5 + (shard_id + 1) * coeff); for (size_t i = shard_start; i < shard_end; ++i) { shard_contents.push_back(i); } -- cgit v1.2.3 From eb2c9ee5e3e4ed76dd9b155c1b509a22d5ab7f4f Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sat, 10 Mar 2012 17:12:34 +0900 Subject: mert: Prefix private members with "m_" except TER. Squashed commit of the following: - Clean up PRO. - Clean up ScoreStats. - Clean up ScoreData. - Clean up ScoreArray. - Remove unnecessary headers. - Clean up ScopedVector. - Clean up Point. - Clean up PerScorer. - Clean up Optimizer. - Clean up MergeScorer. - Clean up InterpolatedScorer. - Clean up FileStream. - Clean up FeatureStats. - Remove inefficient string concatenation. - Clean up FeatureData. - Clean up FeatureArray. - Clean up Data. --- mert/Data.cpp | 60 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index c4a35b9b2..3a50a4550 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -18,34 +18,34 @@ #include "Util.h" Data::Data() - : theScorer(NULL), - number_of_scores(0), - _sparse_flag(false), - scoredata(), - featdata() {} + : m_scorer(NULL), + m_num_scores(0), + m_sparse_flag(false), + m_score_data(), + m_feature_data() {} Data::Data(Scorer& ptr) - : theScorer(&ptr), - score_type(theScorer->getName()), - number_of_scores(0), - _sparse_flag(false), - scoredata(new ScoreData(*theScorer)), - featdata(new FeatureData) + : m_scorer(&ptr), + m_score_type(m_scorer->getName()), + m_num_scores(0), + m_sparse_flag(false), + m_score_data(new ScoreData(*m_scorer)), + m_feature_data(new FeatureData) { - TRACE_ERR("Data::score_type " << score_type << endl); - TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl); + TRACE_ERR("Data::m_score_type " << m_score_type << endl); + TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl); } //ADDED BY TS void Data::remove_duplicates() { - size_t nSentences = featdata->size(); - assert(scoredata->size() == nSentences); + size_t nSentences = m_feature_data->size(); + assert(m_score_data->size() == nSentences); for (size_t s=0; s < nSentences; s++) { - FeatureArray& feat_array = featdata->get(s); - ScoreArray& score_array = scoredata->get(s); + FeatureArray& feat_array = m_feature_data->get(s); + ScoreArray& score_array = m_score_data->get(s); assert(feat_array.size() == score_array.size()); @@ -147,8 +147,8 @@ void Data::loadnbest(const string &file) getNextPound(line, sentence, "|||"); // second field getNextPound(line, feature_str, "|||"); // third field - theScorer->prepareStats(sentence_index, sentence, scoreentry); - scoredata->add(scoreentry, sentence_index); + m_scorer->prepareStats(sentence_index, sentence, scoreentry); + m_score_data->add(scoreentry, sentence_index); // examine first line for name of features if (!existsFeatureNames()) { @@ -185,7 +185,7 @@ void Data::InitFeatureMap(const string& str) { tmp_name = substr.substr(0, substr.size() - 1); } } - featdata->setFeatureMap(features); + m_feature_data->setFeatureMap(features); } void Data::AddFeatures(const string& str, @@ -207,10 +207,10 @@ void Data::AddFeatures(const string& str, string name = substr; getNextPound(buf, substr); feature_entry.addSparse(name, atof(substr.c_str())); - _sparse_flag = true; + m_sparse_flag = true; } } - featdata->add(feature_entry, sentence_index); + m_feature_data->add(feature_entry, sentence_index); } // TODO @@ -226,8 +226,8 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor CHECK(shard_size >= 0); CHECK(shard_size <= 1); - size_t data_size = scoredata->size(); - CHECK(data_size == featdata->size()); + size_t data_size = m_score_data->size(); + CHECK(data_size == m_feature_data->size()); shard_size *= data_size; const float coeff = static_cast(data_size) / shard_count; @@ -248,15 +248,15 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor } } - Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig); + Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig); shards.push_back(Data(*scorer)); - shards.back().score_type = score_type; - shards.back().number_of_scores = number_of_scores; - shards.back()._sparse_flag = _sparse_flag; + shards.back().m_score_type = m_score_type; + shards.back().m_num_scores = m_num_scores; + shards.back().m_sparse_flag = m_sparse_flag; for (size_t i = 0; i < shard_contents.size(); ++i) { - shards.back().featdata->add(featdata->get(shard_contents[i])); - shards.back().scoredata->add(scoredata->get(shard_contents[i])); + shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i])); + shards.back().m_score_data->add(m_score_data->get(shard_contents[i])); } //cerr << endl; } -- cgit v1.2.3 From a1ab79c7fce9079b05affaf26427846254b8e909 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sat, 10 Mar 2012 17:28:38 +0900 Subject: Pass by pointers to Scorer instead of references. --- mert/Data.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 3a50a4550..33c259658 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -24,12 +24,12 @@ Data::Data() m_score_data(), m_feature_data() {} -Data::Data(Scorer& ptr) - : m_scorer(&ptr), +Data::Data(Scorer* scorer) + : m_scorer(scorer), m_score_type(m_scorer->getName()), m_num_scores(0), m_sparse_flag(false), - m_score_data(new ScoreData(*m_scorer)), + m_score_data(new ScoreData(m_scorer)), m_feature_data(new FeatureData) { TRACE_ERR("Data::m_score_type " << m_score_type << endl); @@ -250,7 +250,7 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig); - shards.push_back(Data(*scorer)); + shards.push_back(Data(scorer)); shards.back().m_score_type = m_score_type; shards.back().m_num_scores = m_num_scores; shards.back().m_sparse_flag = m_sparse_flag; -- cgit v1.2.3 From 81309bdb2d2dd6be3e75165b6464497fbb4bcb19 Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sat, 10 Mar 2012 17:47:01 +0900 Subject: Clean up Data; add TODOs. --- mert/Data.cpp | 96 ++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 52 insertions(+), 44 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 33c259658..b1950ea4e 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -7,7 +7,6 @@ */ #include -#include "util/check.hh" #include #include @@ -16,6 +15,7 @@ #include "Scorer.h" #include "ScorerFactory.h" #include "Util.h" +#include "util/check.hh" Data::Data() : m_scorer(NULL), @@ -37,13 +37,13 @@ Data::Data(Scorer* scorer) } //ADDED BY TS -void Data::remove_duplicates() { - +// TODO: This is too long; consider creating additional functions to +// reduce the lines of this function. +void Data::removeDuplicates() { size_t nSentences = m_feature_data->size(); assert(m_score_data->size() == nSentences); - for (size_t s=0; s < nSentences; s++) { - + for (size_t s = 0; s < nSentences; s++) { FeatureArray& feat_array = m_feature_data->get(s); ScoreArray& score_array = m_score_data->get(s); @@ -55,48 +55,42 @@ void Data::remove_duplicates() { size_t end_pos = feat_array.size() - 1; size_t nRemoved = 0; - for (size_t k=0; k <= end_pos; k++) { + for (size_t k = 0; k <= end_pos; k++) { const FeatureStats& cur_feats = feat_array.get(k); - double sum = 0.0; - for (size_t l=0; l < cur_feats.size(); l++) - sum += cur_feats.get(l); + for (size_t l = 0; l < cur_feats.size(); l++) + sum += cur_feats.get(l); if (lookup.find(sum) != lookup.end()) { - //cerr << "hit" << endl; - - vector& cur_list = lookup[sum]; - - size_t l=0; - for (l=0; l < cur_list.size(); l++) { - - size_t j=cur_list[l]; - - if (cur_feats == feat_array.get(j) - && score_array.get(k) == score_array.get(j)) { - - if (k < end_pos) { - - feat_array.swap(k,end_pos); - score_array.swap(k,end_pos); - - k--; - } - - end_pos--; - nRemoved++; - break; - } - } - - if (l == lookup[sum].size()) - cur_list.push_back(k); + //cerr << "hit" << endl; + vector& cur_list = lookup[sum]; + + // TODO: Make sure this is correct because we have already used 'l'. + // If this does not impact on the removing duplicates, it is better + // to change + size_t l = 0; + for (l = 0; l < cur_list.size(); l++) { + size_t j = cur_list[l]; + + if (cur_feats == feat_array.get(j) + && score_array.get(k) == score_array.get(j)) { + if (k < end_pos) { + feat_array.swap(k,end_pos); + score_array.swap(k,end_pos); + k--; + } + end_pos--; + nRemoved++; + break; + } + } + if (l == lookup[sum].size()) + cur_list.push_back(k); + } else { + lookup[sum].push_back(k); } - else - lookup[sum].push_back(k); - // for (size_t j=0; j < k; j++) { // if (feat_array.get(k) == feat_array.get(j) @@ -115,11 +109,9 @@ void Data::remove_duplicates() { // break; // } // } - } - + } // end for k if (nRemoved > 0) { - feat_array.resize(end_pos+1); score_array.resize(end_pos+1); } @@ -127,8 +119,14 @@ void Data::remove_duplicates() { } //END_ADDED +void Data::load(const std::string &featfile, const std::string &scorefile) { + m_feature_data->load(featfile); + m_score_data->load(scorefile); + if (m_feature_data->hasSparseFeatures()) + m_sparse_flag = true; +} -void Data::loadnbest(const string &file) +void Data::loadNBest(const string &file) { TRACE_ERR("loading nbest from " << file << endl); inputfilestream inp(file); // matches a stream with a file. Opens the file @@ -159,6 +157,16 @@ void Data::loadnbest(const string &file) inp.close(); } +void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) { + if (bin) + cerr << "Binary write mode is selected" << endl; + else + cerr << "Binary write mode is NOT selected" << endl; + + m_feature_data->save(featfile, bin); + m_score_data->save(scorefile, bin); +} + void Data::InitFeatureMap(const string& str) { string buf = str; string substr; -- cgit v1.2.3 From 1ade69a546120ab3f8444f9aaa83e49fa8a98fbb Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 4 Apr 2012 22:04:51 +0900 Subject: Add a function to check whether a string ends with a suffix. - Use the function in Data::InitFeatureMap(). - Add an unit test for InitFeatureMap(). - Move helper functions for Data::loadnbest() to public for unit testing. --- mert/Data.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index b1950ea4e..b0b8f12be 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -173,14 +173,13 @@ void Data::InitFeatureMap(const string& str) { string features = ""; string tmp_name = ""; size_t tmp_index = 0; - string::size_type loc; char tmp[64]; // for snprintf(); while (!buf.empty()) { getNextPound(buf, substr); // string ending with ":" are skipped, because they are the names of the features - if ((loc = substr.find_last_of(":")) != substr.length()-1) { + if (!EndsWith(substr, ":")) { snprintf(tmp, sizeof(tmp), "%s_%lu ", tmp_name.c_str(), tmp_index); features.append(tmp); -- cgit v1.2.3 From 8a2495c96695146b1a74c5e6f71a8f6885c8c67c Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Thu, 5 Apr 2012 00:03:13 +0900 Subject: Use EndsWith(). --- mert/Data.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index b0b8f12be..5405b0cb9 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -197,7 +197,6 @@ void Data::InitFeatureMap(const string& str) { void Data::AddFeatures(const string& str, const string& sentence_index) { - string::size_type loc; string buf = str; string substr; FeatureStats feature_entry; @@ -207,7 +206,7 @@ void Data::AddFeatures(const string& str, getNextPound(buf, substr); // no ':' -> feature value that needs to be stored - if ((loc = substr.find_last_of(":")) != substr.length()-1) { + if (!EndsWith(substr, ":")) { feature_entry.add(ConvertStringToFeatureStatsType(substr)); } else if (substr.find("_") != string::npos) { // sparse feature name? store as well -- cgit v1.2.3 From bd79fc2c131f05abe5ef52329896803d2d2a255b Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Wed, 18 Apr 2012 23:47:48 +0900 Subject: Use std::stringstream instead of using snprintf() for Windows. This commit fixes compilation problems related to snprintf() for Windows users. Thanks to Raka Prasetya for reporting the errors. Thanks also to Kenneth Heafield and Barry Haddow for suggestions. --- mert/Data.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 5405b0cb9..be4c65fb2 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -173,15 +173,15 @@ void Data::InitFeatureMap(const string& str) { string features = ""; string tmp_name = ""; size_t tmp_index = 0; - char tmp[64]; // for snprintf(); while (!buf.empty()) { getNextPound(buf, substr); // string ending with ":" are skipped, because they are the names of the features if (!EndsWith(substr, ":")) { - snprintf(tmp, sizeof(tmp), "%s_%lu ", tmp_name.c_str(), tmp_index); - features.append(tmp); + stringstream ss; + ss << tmp_name << "_" << tmp_index << " "; + features.append(ss.str()); tmp_index++; } else if (substr.find("_") != string::npos) { -- cgit v1.2.3 From df4586740d8319739a196bfc4df2bdeb7862b29a Mon Sep 17 00:00:00 2001 From: Tetsuo Kiso Date: Sun, 6 May 2012 05:27:04 +0900 Subject: Fix using directive refers to implicitly-defined namespace 'std'. --- mert/Data.cpp | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index be4c65fb2..19a89f754 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -17,6 +17,8 @@ #include "Util.h" #include "util/check.hh" +using namespace std; + Data::Data() : m_scorer(NULL), m_num_scores(0), -- cgit v1.2.3