Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/mert
diff options
context:
space:
mode:
authorHieu Hoang <hieuhoang@Hieus-MacBook.local>2011-12-12 17:48:42 +0400
committerHieu Hoang <hieuhoang@Hieus-MacBook.local>2011-12-12 17:48:42 +0400
commit753eebd959673ffba4d4d44301955334f40156ab (patch)
tree14933af527fa33b63be235cc6c0c99b02e5bcd28 /mert
parent8327cce73b48826accee4354f5da926b3dd80074 (diff)
revert
Diffstat (limited to 'mert')
-rw-r--r--mert/Data.cpp93
-rw-r--r--mert/Data.h6
-rw-r--r--mert/FeatureArray.h10
-rw-r--r--mert/FeatureStats.cpp16
-rw-r--r--mert/FeatureStats.h4
-rw-r--r--mert/ScoreArray.h10
-rw-r--r--mert/ScoreStats.cpp16
-rw-r--r--mert/ScoreStats.h4
-rw-r--r--mert/extractor.cpp4
-rwxr-xr-xmert/mert.cpp7
10 files changed, 168 insertions, 2 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 23fdc6d82..94f5287a8 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -47,6 +47,99 @@ Data::~Data() {
}
}
+//ADDED BY TS
+/**
+ * Removes duplicate n-best entries, sentence by sentence.
+ *
+ * Two entries of the same sentence are duplicates when both their feature
+ * statistics and their score statistics compare equal (operator==). The
+ * first occurrence is kept. Every later occurrence is swapped to the tail of
+ * the arrays and the tail is cut off by a final resize, so the relative
+ * order of the surviving entries is NOT preserved.
+ *
+ * One "removed <n>/<total>" line per sentence is written to std::cerr.
+ */
+void Data::remove_duplicates() {
+
+  const size_t nSentences = featdata->size();
+  assert(scoredata->size() == nSentences);
+
+  for (size_t s = 0; s < nSentences; s++) {
+
+    FeatureArray& feat_array = featdata->get(s);
+    ScoreArray& score_array = scoredata->get(s);
+
+    assert(feat_array.size() == score_array.size());
+
+    const size_t nEntries = feat_array.size();
+
+    // Entries [0, nKept) are still alive; everything from nKept onwards has
+    // been identified as a duplicate. Tracking a count instead of a "last
+    // valid index" keeps the arithmetic safe for a sentence with zero
+    // entries (size() - 1 would wrap around for an unsigned type).
+    size_t nKept = nEntries;
+
+    // Serves as a hash-map: entries are bucketed by the sum of their feature
+    // values. Identical feature vectors always produce the same sum, so a
+    // full comparison is only needed against entries in the same bucket.
+    std::map<double, std::vector<size_t> > lookup;
+
+    size_t k = 0;
+    while (k < nKept) {
+
+      const FeatureStats& cur_feats = feat_array.get(k);
+
+      double sum = 0.0;
+      for (size_t l = 0; l < cur_feats.size(); l++)
+        sum += cur_feats.get(l);
+
+      // Single lookup; creates an empty bucket the first time a sum is seen.
+      std::vector<size_t>& bucket = lookup[sum];
+
+      bool is_duplicate = false;
+      for (size_t l = 0; l < bucket.size(); l++) {
+
+        const size_t j = bucket[l];
+
+        if (cur_feats == feat_array.get(j)
+            && score_array.get(k) == score_array.get(j)) {
+          is_duplicate = true;
+          break;
+        }
+      }
+
+      if (is_duplicate) {
+
+        // Move the duplicate behind the live range. The entry that takes
+        // its place at position k has not been examined yet, so k is not
+        // advanced.
+        --nKept;
+        if (k < nKept) {
+          feat_array.swap(k, nKept);
+          score_array.swap(k, nKept);
+        }
+      }
+      else {
+        bucket.push_back(k);
+        ++k;
+      }
+    }
+
+    const size_t nRemoved = nEntries - nKept;
+
+    std::cerr << "removed " << nRemoved << "/" << nEntries << std::endl;
+
+    if (nRemoved > 0) {
+
+      feat_array.resize(nKept);
+      score_array.resize(nKept);
+    }
+  }
+}
+//END_ADDED
+
+
void Data::loadnbest(const std::string &file)
{
TRACE_ERR("loading nbest from " << file << std::endl);
diff --git a/mert/Data.h b/mert/Data.h
index db858fc18..56d7ac47d 100644
--- a/mert/Data.h
+++ b/mert/Data.h
@@ -73,7 +73,7 @@ public:
void mergeSparseFeatures();
void loadnbest(const std::string &file);
-
+
void load(const std::string &featfile,const std::string &scorefile) {
featdata->load(featfile);
scoredata->load(scorefile);
@@ -81,6 +81,10 @@ public:
_sparse_flag = true;
}
+ //ADDED BY TS
+ void remove_duplicates();
+ //END_ADDED
+
void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
if (bin) cerr << "Binary write mode is selected" << endl;
diff --git a/mert/FeatureArray.h b/mert/FeatureArray.h
index 1fa3c4151..ee8ee1354 100644
--- a/mert/FeatureArray.h
+++ b/mert/FeatureArray.h
@@ -63,6 +63,16 @@ public:
array_.push_back(e);
}
+ //ADDED BY TS
+ void swap(size_t i, size_t j) { // Exchanges the entries stored at positions i and j of array_.
+ std::swap(array_[i],array_[j]); // No range check is done here; callers must pass valid indices.
+ }
+
+ void resize(size_t new_size) { // Truncates array_ to at most new_size entries.
+ array_.resize(std::min(new_size,array_.size())); // std::min clamp: a new_size above the current size is a no-op, so this never grows the array.
+ }
+ //END_ADDED
+
void merge(FeatureArray& e);
inline size_t size() const {
diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp
index e7682518c..a8f0f478b 100644
--- a/mert/FeatureStats.cpp
+++ b/mert/FeatureStats.cpp
@@ -218,3 +218,19 @@ ostream& operator<<(ostream& o, const FeatureStats& e)
return o;
}
+
+//ADDED_BY_TS
+bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
+  // Equal means: same number of components, and every component pair
+  // compares equal under the element type's own operator!= (no tolerance).
+  const size_t n = f1.size();
+  bool same = (n == f2.size());
+
+  size_t idx = 0;
+  while (same && idx < n) {
+    same = !(f1.get(idx) != f2.get(idx));
+    ++idx;
+  }
+  return same;
+}
+//END_ADDED
diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h
index 44858a5a3..10ff31992 100644
--- a/mert/FeatureStats.h
+++ b/mert/FeatureStats.h
@@ -134,4 +134,8 @@ public:
friend ostream& operator<<(ostream& o, const FeatureStats& e);
};
+//ADDED_BY_TS
+bool operator==(const FeatureStats& f1, const FeatureStats& f2);
+//END_ADDED
+
#endif // FEATURE_STATS_H
diff --git a/mert/ScoreArray.h b/mert/ScoreArray.h
index 1240a704a..0a0ddbdc0 100644
--- a/mert/ScoreArray.h
+++ b/mert/ScoreArray.h
@@ -62,6 +62,16 @@ public:
array_.push_back(e);
}
+ //ADDED BY TS
+ void swap(size_t i, size_t j) { // Exchanges the entries stored at positions i and j of array_.
+ std::swap(array_[i],array_[j]); // No range check is done here; callers must pass valid indices.
+ }
+
+ void resize(size_t new_size) { // Truncates array_ to at most new_size entries.
+ array_.resize(std::min(new_size,array_.size())); // std::min clamp: a new_size above the current size is a no-op, so this never grows the array.
+ }
+ //END_ADDED
+
void merge(ScoreArray& e);
inline std::string name() const {
diff --git a/mert/ScoreStats.cpp b/mert/ScoreStats.cpp
index eb1750983..7efea99a9 100644
--- a/mert/ScoreStats.cpp
+++ b/mert/ScoreStats.cpp
@@ -132,3 +132,19 @@ ostream& operator<<(ostream& o, const ScoreStats& e)
o << e.get(i) << " ";
return o;
}
+
+//ADDED_BY_TS
+bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
+  // Statistics of different length can never match; otherwise compare
+  // position by position with the element type's own operator!=.
+  const size_t count = s1.size();
+  if (s2.size() != count) return false;
+
+  size_t pos = 0;
+  while (pos < count && !(s1.get(pos) != s2.get(pos))) {
+    ++pos;
+  }
+  // Reaching the end means no differing position was found.
+  return pos == count;
+}
+//END_ADDED
diff --git a/mert/ScoreStats.h b/mert/ScoreStats.h
index 43a6f1f23..68df91195 100644
--- a/mert/ScoreStats.h
+++ b/mert/ScoreStats.h
@@ -100,4 +100,8 @@ public:
friend ostream& operator<<(ostream& o, const ScoreStats& e);
};
+//ADDED_BY_TS
+bool operator==(const ScoreStats& s1, const ScoreStats& s2);
+//END_ADDED
+
#endif // SCORE_STATS_H
diff --git a/mert/extractor.cpp b/mert/extractor.cpp
index 37c46d2dd..b9c602f8f 100644
--- a/mert/extractor.cpp
+++ b/mert/extractor.cpp
@@ -182,6 +182,10 @@ int main(int argc, char** argv)
PrintUserTime("Nbest entries loaded and scored");
+ //ADDED_BY_TS
+ data.remove_duplicates();
+ //END_ADDED
+
if (binmode)
cerr << "Binary write mode is selected" << endl;
else
diff --git a/mert/mert.cpp b/mert/mert.cpp
index 98f2e986c..7609ae302 100755
--- a/mert/mert.cpp
+++ b/mert/mert.cpp
@@ -1,5 +1,5 @@
/**
- * \description The is the main for the new version of the mert algorithm developed during the 2nd MT marathon
+ * \description This is the main for the new version of the mert algorithm developed during the 2nd MT marathon
*/
#include <limits>
@@ -260,6 +260,7 @@ int main (int argc, char **argv)
if(j<pdim) {
cerr<<initfile<<":Too few minimum weights." << endl;
cerr<<"error could not initialize start point with " << initfile << endl;
+ std::cerr << "j: " << j << ", pdim: " << pdim << std::endl;
exit(3);
}
max.resize(pdim);
@@ -297,6 +298,10 @@ int main (int argc, char **argv)
D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
}
+ //ADDED_BY_TS
+ D.remove_duplicates();
+ //END_ADDED
+
PrintUserTime("Data loaded");
// starting point score over latest n-best, accumulative n-best