From 753eebd959673ffba4d4d44301955334f40156ab Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 12 Dec 2011 20:48:42 +0700 Subject: revert --- mert/Data.cpp | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'mert/Data.cpp') diff --git a/mert/Data.cpp b/mert/Data.cpp index 23fdc6d82..94f5287a8 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -47,6 +47,99 @@ Data::~Data() { } } +//ADDED BY TS +void Data::remove_duplicates() { + + uint nSentences = featdata->size(); + assert(scoredata->size() == nSentences); + + for (uint s=0; s < nSentences; s++) { + + FeatureArray& feat_array = featdata->get(s); + ScoreArray& score_array = scoredata->get(s); + + assert(feat_array.size() == score_array.size()); + + //serves as a hash-map: + std::map > lookup; + + uint end_pos = feat_array.size() - 1; + + uint nRemoved = 0; + for (uint k=0; k <= end_pos; k++) { + + const FeatureStats& cur_feats = feat_array.get(k); + + double sum = 0.0; + for (uint l=0; l < cur_feats.size(); l++) + sum += cur_feats.get(l); + + if (lookup.find(sum) != lookup.end()) { + + //std::cerr << "hit" << std::endl; + + std::vector& cur_list = lookup[sum]; + + uint l=0; + for (l=0; l < cur_list.size(); l++) { + + uint j=cur_list[l]; + + if (cur_feats == feat_array.get(j) + && score_array.get(k) == score_array.get(j)) { + + if (k < end_pos) { + + feat_array.swap(k,end_pos); + score_array.swap(k,end_pos); + + k--; + } + + end_pos--; + nRemoved++; + break; + } + } + + if (l == lookup[sum].size()) + cur_list.push_back(k); + } + else + lookup[sum].push_back(k); + + // for (uint j=0; j < k; j++) { + + // if (feat_array.get(k) == feat_array.get(j) + // && score_array.get(k) == score_array.get(j)) { + + // if (k < end_pos) { + + // feat_array.swap(k,end_pos); + // score_array.swap(k,end_pos); + + // k--; + // } + + // end_pos--; + // nRemoved++; + // break; + // } + // } + } + + std::cerr << "removed " << nRemoved << "/" << feat_array.size() << std::endl; + + if (nRemoved > 0) { + + feat_array.resize(end_pos+1); + score_array.resize(end_pos+1); + } + } +} +//END_ADDED + + void Data::loadnbest(const std::string &file) { TRACE_ERR("loading nbest from " << file << std::endl); -- cgit v1.2.3