Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/mert
diff options
context:
space:
mode:
author: Tetsuo Kiso <tetsuo-s@is.naist.jp> 2012-03-10 12:47:01 +0400
committer: Tetsuo Kiso <tetsuo-s@is.naist.jp> 2012-03-10 12:47:01 +0400
commit: 3ce46da4cd4c9c7779401548accf235f0c331059 (patch)
tree: 70c5e14d55924245e6dcb2cc943bf5930dad5e75 /mert
parent: b5bcf48b1755f1d1ae11fee8808bc6329a8363a4 (diff)
Clean up Data; add TODOs.
Diffstat (limited to 'mert')
-rw-r--r-- mert/Data.cpp 96
-rw-r--r-- mert/Data.h 64
-rw-r--r-- mert/extractor.cpp 4
-rwxr-xr-x mert/mert.cpp 2
4 files changed, 74 insertions, 92 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 33c259658..b1950ea4e 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -7,7 +7,6 @@
*/
#include <algorithm>
-#include "util/check.hh"
#include <cmath>
#include <fstream>
@@ -16,6 +15,7 @@
#include "Scorer.h"
#include "ScorerFactory.h"
#include "Util.h"
+#include "util/check.hh"
Data::Data()
: m_scorer(NULL),
@@ -37,13 +37,13 @@ Data::Data(Scorer* scorer)
}
//ADDED BY TS
-void Data::remove_duplicates() {
-
+// TODO: This is too long; consider creating additional functions to
+// reduce the lines of this function.
+void Data::removeDuplicates() {
size_t nSentences = m_feature_data->size();
assert(m_score_data->size() == nSentences);
- for (size_t s=0; s < nSentences; s++) {
-
+ for (size_t s = 0; s < nSentences; s++) {
FeatureArray& feat_array = m_feature_data->get(s);
ScoreArray& score_array = m_score_data->get(s);
@@ -55,48 +55,42 @@ void Data::remove_duplicates() {
size_t end_pos = feat_array.size() - 1;
size_t nRemoved = 0;
- for (size_t k=0; k <= end_pos; k++) {
+ for (size_t k = 0; k <= end_pos; k++) {
const FeatureStats& cur_feats = feat_array.get(k);
-
double sum = 0.0;
- for (size_t l=0; l < cur_feats.size(); l++)
- sum += cur_feats.get(l);
+ for (size_t l = 0; l < cur_feats.size(); l++)
+ sum += cur_feats.get(l);
if (lookup.find(sum) != lookup.end()) {
- //cerr << "hit" << endl;
-
- vector<size_t>& cur_list = lookup[sum];
-
- size_t l=0;
- for (l=0; l < cur_list.size(); l++) {
-
- size_t j=cur_list[l];
-
- if (cur_feats == feat_array.get(j)
- && score_array.get(k) == score_array.get(j)) {
-
- if (k < end_pos) {
-
- feat_array.swap(k,end_pos);
- score_array.swap(k,end_pos);
-
- k--;
- }
-
- end_pos--;
- nRemoved++;
- break;
- }
- }
-
- if (l == lookup[sum].size())
- cur_list.push_back(k);
+ //cerr << "hit" << endl;
+ vector<size_t>& cur_list = lookup[sum];
+
+ // TODO: Make sure this is correct because we have already used 'l'.
+ // If this does not impact on the removing duplicates, it is better
+ // to change
+ size_t l = 0;
+ for (l = 0; l < cur_list.size(); l++) {
+ size_t j = cur_list[l];
+
+ if (cur_feats == feat_array.get(j)
+ && score_array.get(k) == score_array.get(j)) {
+ if (k < end_pos) {
+ feat_array.swap(k,end_pos);
+ score_array.swap(k,end_pos);
+ k--;
+ }
+ end_pos--;
+ nRemoved++;
+ break;
+ }
+ }
+ if (l == lookup[sum].size())
+ cur_list.push_back(k);
+ } else {
+ lookup[sum].push_back(k);
}
- else
- lookup[sum].push_back(k);
-
// for (size_t j=0; j < k; j++) {
// if (feat_array.get(k) == feat_array.get(j)
@@ -115,11 +109,9 @@ void Data::remove_duplicates() {
// break;
// }
// }
- }
-
+ } // end for k
if (nRemoved > 0) {
-
feat_array.resize(end_pos+1);
score_array.resize(end_pos+1);
}
@@ -127,8 +119,14 @@ void Data::remove_duplicates() {
}
//END_ADDED
+void Data::load(const std::string &featfile, const std::string &scorefile) {
+ m_feature_data->load(featfile);
+ m_score_data->load(scorefile);
+ if (m_feature_data->hasSparseFeatures())
+ m_sparse_flag = true;
+}
-void Data::loadnbest(const string &file)
+void Data::loadNBest(const string &file)
{
TRACE_ERR("loading nbest from " << file << endl);
inputfilestream inp(file); // matches a stream with a file. Opens the file
@@ -159,6 +157,16 @@ void Data::loadnbest(const string &file)
inp.close();
}
+void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
+ if (bin)
+ cerr << "Binary write mode is selected" << endl;
+ else
+ cerr << "Binary write mode is NOT selected" << endl;
+
+ m_feature_data->save(featfile, bin);
+ m_score_data->save(scorefile, bin);
+}
+
void Data::InitFeatureMap(const string& str) {
string buf = str;
string substr;
diff --git a/mert/Data.h b/mert/Data.h
index c18d0d9bd..376367d4c 100644
--- a/mert/Data.h
+++ b/mert/Data.h
@@ -11,11 +11,8 @@
using namespace std;
-#include <limits>
#include <vector>
-#include <iostream>
-
-#include<boost/shared_ptr.hpp>
+#include <boost/shared_ptr.hpp>
#include "Util.h"
#include "FeatureData.h"
@@ -26,6 +23,8 @@ class Scorer;
typedef boost::shared_ptr<ScoreData> ScoreDataHandle;
typedef boost::shared_ptr<FeatureData> FeatureDataHandle;
+// NOTE: there is no copy constructor implemented, so only the
+// compiler synthesised shallow copy is available.
class Data
{
private:
@@ -45,63 +44,38 @@ public:
explicit Data(Scorer* scorer);
Data();
- //Note that there is no copy constructor implemented, so only the
- //compiler synthesised shallow copy is available
-
- inline void clear() {
+ void clear() {
m_score_data->clear();
m_feature_data->clear();
}
- ScoreDataHandle getScoreData() {
- return m_score_data;
- }
+ ScoreDataHandle getScoreData() { return m_score_data; }
- FeatureDataHandle getFeatureData() {
- return m_feature_data;
- }
+ FeatureDataHandle getFeatureData() { return m_feature_data; }
- Scorer* getScorer() {
- return m_scorer;
- }
+ Scorer* getScorer() { return m_scorer; }
- inline size_t NumberOfFeatures() const {
+ size_t NumberOfFeatures() const {
return m_feature_data->NumberOfFeatures();
}
- inline void NumberOfFeatures(size_t v) {
- m_feature_data->NumberOfFeatures(v);
- }
- inline std::string Features() const {
- return m_feature_data->Features();
- }
- inline void Features(const std::string &f) {
- m_feature_data->Features(f);
- }
- inline bool hasSparseFeatures() const { return m_sparse_flag; }
- void mergeSparseFeatures();
+ void NumberOfFeatures(size_t v) { m_feature_data->NumberOfFeatures(v); }
- void loadnbest(const std::string &file);
+ std::string Features() const { return m_feature_data->Features(); }
+ void Features(const std::string &f) { m_feature_data->Features(f); }
- void load(const std::string &featfile,const std::string &scorefile) {
- m_feature_data->load(featfile);
- m_score_data->load(scorefile);
- if (m_feature_data->hasSparseFeatures())
- m_sparse_flag = true;
- }
+ bool hasSparseFeatures() const { return m_sparse_flag; }
+ void mergeSparseFeatures();
- //ADDED BY TS
- void remove_duplicates();
- //END_ADDED
+ void loadNBest(const std::string &file);
- void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
+ void load(const std::string &featfile, const std::string &scorefile);
- if (bin) cerr << "Binary write mode is selected" << endl;
- else cerr << "Binary write mode is NOT selected" << endl;
+ void save(const std::string &featfile, const std::string &scorefile, bool bin=false);
- m_feature_data->save(featfile, bin);
- m_score_data->save(scorefile, bin);
- }
+ //ADDED BY TS
+ void removeDuplicates();
+ //END_ADDED
inline bool existsFeatureNames() const {
return m_feature_data->existsFeatureNames();
diff --git a/mert/extractor.cpp b/mert/extractor.cpp
index 1e1cebeaa..3442ed36b 100644
--- a/mert/extractor.cpp
+++ b/mert/extractor.cpp
@@ -208,13 +208,13 @@ int main(int argc, char** argv)
// computing score statistics of each nbest file
for (size_t i = 0; i < nbestFiles.size(); i++) {
- data.loadnbest(nbestFiles.at(i));
+ data.loadNBest(nbestFiles.at(i));
}
PrintUserTime("Nbest entries loaded and scored");
//ADDED_BY_TS
- data.remove_duplicates();
+ data.removeDuplicates();
//END_ADDED
data.save(option.featureDataFile, option.scoreDataFile, option.binmode);
diff --git a/mert/mert.cpp b/mert/mert.cpp
index bc0252277..2455aa39f 100755
--- a/mert/mert.cpp
+++ b/mert/mert.cpp
@@ -348,7 +348,7 @@ int main(int argc, char **argv)
scorer->setScoreData(data.getScoreData().get());
//ADDED_BY_TS
- data.remove_duplicates();
+ data.removeDuplicates();
//END_ADDED
PrintUserTime("Data loaded");