diff options
author | Hieu Hoang <hieuhoang@Hieus-MacBook.local> | 2011-12-12 17:48:42 +0400 |
---|---|---|
committer | Hieu Hoang <hieuhoang@Hieus-MacBook.local> | 2011-12-12 17:48:42 +0400 |
commit | 753eebd959673ffba4d4d44301955334f40156ab (patch) | |
tree | 14933af527fa33b63be235cc6c0c99b02e5bcd28 | |
parent | 8327cce73b48826accee4354f5da926b3dd80074 (diff) |
revert
-rw-r--r-- | mert/Data.cpp | 93 | ||||
-rw-r--r-- | mert/Data.h | 6 | ||||
-rw-r--r-- | mert/FeatureArray.h | 10 | ||||
-rw-r--r-- | mert/FeatureStats.cpp | 16 | ||||
-rw-r--r-- | mert/FeatureStats.h | 4 | ||||
-rw-r--r-- | mert/ScoreArray.h | 10 | ||||
-rw-r--r-- | mert/ScoreStats.cpp | 16 | ||||
-rw-r--r-- | mert/ScoreStats.h | 4 | ||||
-rw-r--r-- | mert/extractor.cpp | 4 | ||||
-rwxr-xr-x | mert/mert.cpp | 7 | ||||
-rwxr-xr-x | scripts/generic/trainlm-irst.perl | 2 |
11 files changed, 169 insertions, 3 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp index 23fdc6d82..94f5287a8 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -47,6 +47,99 @@ Data::~Data() { } } +//ADDED BY TS +void Data::remove_duplicates() { + + uint nSentences = featdata->size(); + assert(scoredata->size() == nSentences); + + for (uint s=0; s < nSentences; s++) { + + FeatureArray& feat_array = featdata->get(s); + ScoreArray& score_array = scoredata->get(s); + + assert(feat_array.size() == score_array.size()); + + //serves as a hash-map: + std::map<double, std::vector<uint> > lookup; + + uint end_pos = feat_array.size() - 1; + + uint nRemoved = 0; + for (uint k=0; k <= end_pos; k++) { + + const FeatureStats& cur_feats = feat_array.get(k); + + double sum = 0.0; + for (uint l=0; l < cur_feats.size(); l++) + sum += cur_feats.get(l); + + if (lookup.find(sum) != lookup.end()) { + + //std::cerr << "hit" << std::endl; + + std::vector<uint>& cur_list = lookup[sum]; + + uint l=0; + for (l=0; l < cur_list.size(); l++) { + + uint j=cur_list[l]; + + if (cur_feats == feat_array.get(j) + && score_array.get(k) == score_array.get(j)) { + + if (k < end_pos) { + + feat_array.swap(k,end_pos); + score_array.swap(k,end_pos); + + k--; + } + + end_pos--; + nRemoved++; + break; + } + } + + if (l == lookup[sum].size()) + cur_list.push_back(k); + } + else + lookup[sum].push_back(k); + + // for (uint j=0; j < k; j++) { + + // if (feat_array.get(k) == feat_array.get(j) + // && score_array.get(k) == score_array.get(j)) { + + // if (k < end_pos) { + + // feat_array.swap(k,end_pos); + // score_array.swap(k,end_pos); + + // k--; + // } + + // end_pos--; + // nRemoved++; + // break; + // } + // } + } + + std::cerr << "removed " << nRemoved << "/" << feat_array.size() << std::endl; + + if (nRemoved > 0) { + + feat_array.resize(end_pos+1); + score_array.resize(end_pos+1); + } + } +} +//END_ADDED + + void Data::loadnbest(const std::string &file) { TRACE_ERR("loading nbest from " << file << std::endl); diff --git a/mert/Data.h b/mert/Data.h index db858fc18..56d7ac47d 100644 --- a/mert/Data.h +++ b/mert/Data.h @@ -73,7 +73,7 @@ public: void mergeSparseFeatures(); void loadnbest(const std::string &file); - + void load(const std::string &featfile,const std::string &scorefile) { featdata->load(featfile); scoredata->load(scorefile); @@ -81,6 +81,10 @@ public: _sparse_flag = true; } + //ADDED BY TS + void remove_duplicates(); + //END_ADDED + void save(const std::string &featfile,const std::string &scorefile, bool bin=false) { if (bin) cerr << "Binary write mode is selected" << endl; diff --git a/mert/FeatureArray.h b/mert/FeatureArray.h index 1fa3c4151..ee8ee1354 100644 --- a/mert/FeatureArray.h +++ b/mert/FeatureArray.h @@ -63,6 +63,16 @@ public: array_.push_back(e); } + //ADDED BY TS + void swap(size_t i, size_t j) { + std::swap(array_[i],array_[j]); + } + + void resize(size_t new_size) { + array_.resize(std::min(new_size,array_.size())); + } + //END_ADDED + void merge(FeatureArray& e); inline size_t size() const { diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp index e7682518c..a8f0f478b 100644 --- a/mert/FeatureStats.cpp +++ b/mert/FeatureStats.cpp @@ -218,3 +218,19 @@ ostream& operator<<(ostream& o, const FeatureStats& e) return o; } + +//ADEED_BY_TS +bool operator==(const FeatureStats& f1, const FeatureStats& f2) { + size_t size = f1.size(); + + if (size != f2.size()) + return false; + + for (size_t k=0; k < size; k++) { + if (f1.get(k) != f2.get(k)) + return false; + } + + return true; +} +//END_ADDED diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h index 44858a5a3..10ff31992 100644 --- a/mert/FeatureStats.h +++ b/mert/FeatureStats.h @@ -134,4 +134,8 @@ public: friend ostream& operator<<(ostream& o, const FeatureStats& e); }; +//ADEED_BY_TS +bool operator==(const FeatureStats& f1, const FeatureStats& f2); +//END_ADDED + #endif // FEATURE_STATS_H diff --git a/mert/ScoreArray.h b/mert/ScoreArray.h index 1240a704a..0a0ddbdc0 100644 --- a/mert/ScoreArray.h +++ b/mert/ScoreArray.h @@ -62,6 +62,16 @@ public: array_.push_back(e); } + //ADDED BY TS + void swap(size_t i, size_t j) { + std::swap(array_[i],array_[j]); + } + + void resize(size_t new_size) { + array_.resize(std::min(new_size,array_.size())); + } + //END_ADDED + void merge(ScoreArray& e); inline std::string name() const { diff --git a/mert/ScoreStats.cpp b/mert/ScoreStats.cpp index eb1750983..7efea99a9 100644 --- a/mert/ScoreStats.cpp +++ b/mert/ScoreStats.cpp @@ -132,3 +132,19 @@ ostream& operator<<(ostream& o, const ScoreStats& e) o << e.get(i) << " "; return o; } + +//ADDED_BY_TS +bool operator==(const ScoreStats& s1, const ScoreStats& s2) { + size_t size = s1.size(); + + if (size != s2.size()) + return false; + + for (size_t k=0; k < size; k++) { + if (s1.get(k) != s2.get(k)) + return false; + } + + return true; +} +//END_ADDED diff --git a/mert/ScoreStats.h b/mert/ScoreStats.h index 43a6f1f23..68df91195 100644 --- a/mert/ScoreStats.h +++ b/mert/ScoreStats.h @@ -100,4 +100,8 @@ public: friend ostream& operator<<(ostream& o, const ScoreStats& e); }; +//ADDED_BY_TS +bool operator==(const ScoreStats& s1, const ScoreStats& s2); +//END_ADDED + #endif // SCORE_STATS_H diff --git a/mert/extractor.cpp b/mert/extractor.cpp index 37c46d2dd..b9c602f8f 100644 --- a/mert/extractor.cpp +++ b/mert/extractor.cpp @@ -182,6 +182,10 @@ int main(int argc, char** argv) PrintUserTime("Nbest entries loaded and scored"); + //ADDED_BY_TS + data.remove_duplicates(); + //END_ADDED + if (binmode) cerr << "Binary write mode is selected" << endl; else diff --git a/mert/mert.cpp b/mert/mert.cpp index 98f2e986c..7609ae302 100755 --- a/mert/mert.cpp +++ b/mert/mert.cpp @@ -1,5 +1,5 @@ /** - * \description The is the main for the new version of the mert algorithm developed during the 2nd MT marathon + * \description This is the main for the new version of the mert algorithm developed during the 2nd MT marathon */ #include <limits> @@ -260,6 +260,7 @@ int main (int argc, char **argv) if(j<pdim) { cerr<<initfile<<":Too few minimum weights." << endl; cerr<<"error could not initialize start point with " << initfile << endl; + std::cerr << "j: " << j << ", pdim: " << pdim << std::endl; exit(3); } max.resize(pdim); @@ -297,6 +298,10 @@ int main (int argc, char **argv) D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i)); } + //ADDED_BY_TS + D.remove_duplicates(); + //END_ADDED + PrintUserTime("Data loaded"); // starting point score over latest n-best, accumulative n-best diff --git a/scripts/generic/trainlm-irst.perl b/scripts/generic/trainlm-irst.perl index 166551bc5..15e8d7ee9 100755 --- a/scripts/generic/trainlm-irst.perl +++ b/scripts/generic/trainlm-irst.perl @@ -3,7 +3,7 @@ # Compatible with sri LM-creating script, eg. # ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt # To use it in the EMS, add this to the [LM] section -# lm-training = "$moses-script-dir/generic/trainlm.irst.perl -cores $cores -irst-dir $irst-dir" +# lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irst-dir" # settings = "" # Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section. # It should point to the root of the LM toolkit, eg |