Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/mert
diff options
context:
space:
mode:
author nicolabertoldi <nicolabertoldi@1f5c12ca-751b-0410-a591-d2e778427230> 2008-06-05 11:23:34 +0400
committer nicolabertoldi <nicolabertoldi@1f5c12ca-751b-0410-a591-d2e778427230> 2008-06-05 11:23:34 +0400
commit 281bf610b87a02c762cf2bece152e99c184069fa (patch)
tree e11bb70d149d87574f2c04f1bbd2521afb081d85 /mert
parent 1b44c7c445ead80485186c1c92b18cfbd3b7992e (diff)
added binary read/load facility for feature data
added names of features in the header
added methods to access the features by name

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1819 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'mert')
-rw-r--r-- mert/Data.cpp 12
-rw-r--r-- mert/Data.h 6
-rw-r--r-- mert/FeatureArray.cpp 100
-rw-r--r-- mert/FeatureArray.h 19
-rw-r--r-- mert/FeatureData.cpp 4
-rw-r--r-- mert/FeatureData.h 6
-rw-r--r-- mert/FeatureStats.cpp 80
-rw-r--r-- mert/FeatureStats.h 43
-rw-r--r-- mert/Types.h 3
-rw-r--r-- mert/Util.cpp 11
-rw-r--r-- mert/example/README 7
11 files changed, 179 insertions, 112 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp
index a7c41f29c..f1346de33 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -53,7 +53,7 @@ void Data::loadnbest(const std::string &file)
theSentence = substring;
// adding statistics for error measures
- featentry.clear();
+ featentry.reset();
scoreentry.clear();
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
@@ -81,8 +81,14 @@ void Data::loadnbest(const std::string &file)
tmpname=subsubstring.substr(0,subsubstring.size() - 1);
}
}
- number_of_features=idx2featname_.size();
- TRACE_ERR("number_of_features: " << number_of_features << std::endl);
+ std::string features="";
+ for (size_t i=0; i<idx2featname_.size(); i++)
+ features+=idx2featname_[i]+" ";
+
+ NumberOfFeatures(idx2featname_.size());
+ Features(features);
+ TRACE_ERR("number_of_features: " << NumberOfFeatures() << std::endl);
+ TRACE_ERR("features: " << Features() << std::endl);
}
// adding features
diff --git a/mert/Data.h b/mert/Data.h
index 1d0f7a5f4..4bf6b9fb4 100644
--- a/mert/Data.h
+++ b/mert/Data.h
@@ -32,7 +32,6 @@ private:
std::string score_type;
map<std::string, size_t> featname2idx_; //map from name to index of features
map<size_t, std::string> idx2featname_; //map from index to name of features
- size_t number_of_features; //number of features
size_t number_of_scores; //number of scores
public:
@@ -44,6 +43,11 @@ public:
ScoreData* getScoreData() { return scoredata; };
FeatureData* getFeatureData() { return featdata; };
+
+ inline size_t NumberOfFeatures() const{ return featdata->NumberOfFeatures(); }
+ inline void NumberOfFeatures(size_t v){ featdata->NumberOfFeatures(v); }
+ inline std::string Features() const{ return featdata->Features(); }
+ inline void Features(const std::string f){ featdata->Features(f); }
void loadnbest(const std::string &file);
diff --git a/mert/FeatureArray.cpp b/mert/FeatureArray.cpp
index 8509660e9..4645e1aef 100644
--- a/mert/FeatureArray.cpp
+++ b/mert/FeatureArray.cpp
@@ -16,8 +16,8 @@ FeatureArray::FeatureArray(): idx("")
void FeatureArray::savetxt(std::ofstream& outFile)
{
-
- outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size() << std::endl;
+ outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size()
+ << " " << number_of_features << " " << features << std::endl;
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++){
i->savetxt(outFile);
outFile << std::endl;
@@ -27,30 +27,23 @@ void FeatureArray::savetxt(std::ofstream& outFile)
void FeatureArray::savebin(std::ofstream& outFile)
{
- TRACE_ERR("binary saving is not yet implemented!" << std::endl);
-
-/*
-NOT YET IMPLEMENTED
-*/
- outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size() << std::endl;
- for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++){
- i->savebin(outFile);
- outFile << std::endl;
- }
+ outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size()
+ << " " << number_of_features << " " << features << std::endl;
+ for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++)
+ i->savebin(outFile);
outFile << FEATURES_BIN_END << std::endl;
-
}
void FeatureArray::save(std::ofstream& inFile, bool bin)
{
- (bin)?savebin(inFile):savetxt(inFile);
+ if (size()>0)
+ (bin)?savebin(inFile):savetxt(inFile);
}
void FeatureArray::save(const std::string &file, bool bin)
{
- TRACE_ERR("saving the array into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
@@ -59,75 +52,76 @@ void FeatureArray::save(const std::string &file, bool bin)
outFile.close();
}
-void FeatureArray::loadtxt(ifstream& inFile)
+void FeatureArray::loadbin(ifstream& inFile, size_t n)
{
- FeatureStats entry;
+ FeatureStats entry(number_of_features);
+
+ for (size_t i=0 ; i < n; i++){
+ entry.loadbin(inFile);
+ add(entry);
+ }
+}
- int number_of_entries=0;
- int nextPound;
+void FeatureArray::loadtxt(ifstream& inFile, size_t n)
+{
+ FeatureStats entry(number_of_features);
+
+ for (size_t i=0 ; i < n; i++){
+ entry.loadtxt(inFile);
+ add(entry);
+ }
+}
+void FeatureArray::load(ifstream& inFile)
+{
+ size_t number_of_entries=0;
+ bool binmode=false;
+
std::string substring, stringBuf;
std::string::size_type loc;
-
std::getline(inFile, stringBuf);
if (!inFile.good()){
return;
}
if (!stringBuf.empty()){
- TRACE_ERR("Reading: " << stringBuf << std::endl);
- if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) != 0){
- TRACE_ERR("ERROR: FeatureArray::loadtxt(): Wrong header");
+ if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) == 0){
+ binmode=false;
+ }else if ((loc = stringBuf.find(FEATURES_BIN_BEGIN)) == 0){
+ binmode=true;
+ }else{
+ TRACE_ERR("ERROR: FeatureArray::load(): Wrong header");
return;
}
- nextPound = getNextPound(stringBuf, substring);
- nextPound = getNextPound(stringBuf, substring);
+ getNextPound(stringBuf, substring);
+ getNextPound(stringBuf, substring);
idx = substring;
- nextPound = getNextPound(stringBuf, substring);
+ getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str());
- // TRACE_ERR("idx: " << idx " nbest: " << number_of_entries << std::endl);
+ getNextPound(stringBuf, substring);
+ number_of_features = atoi(substring.c_str());
+ features = stringBuf;
}
- for (int i=0 ; i < number_of_entries; i++)
- {
- entry.clear();
- std::getline(inFile, stringBuf);
- entry.set(stringBuf);
- add(entry);
- }
+ (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
std::getline(inFile, stringBuf);
if (!stringBuf.empty()){
-// TRACE_ERR("Reading: " << stringBuf << std::endl);
- if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0){
- TRACE_ERR("ERROR: FeatureArray::loadtxt(): Wrong footer");
+ if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && (loc = stringBuf.find(FEATURES_BIN_END)) != 0){
+ TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer");
return;
}
}
}
-void FeatureArray::loadbin(ifstream& inFile)
-{
- TRACE_ERR("binary saving is not yet implemented!" << std::endl);
-
-/*
-NOT YET IMPLEMENTED
-*/
-}
-
-void FeatureArray::load(ifstream& inFile, bool bin)
-{
- (bin)?loadbin(inFile):loadtxt(inFile);
-}
-
-void FeatureArray::load(const std::string &file, bool bin)
+void FeatureArray::load(const std::string &file)
{
TRACE_ERR("loading data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file
- load((ifstream&) inFile, bin);
+ load((ifstream&) inFile);
inFile.close();
diff --git a/mert/FeatureArray.h b/mert/FeatureArray.h
index 943056bab..d08bf54db 100644
--- a/mert/FeatureArray.h
+++ b/mert/FeatureArray.h
@@ -28,6 +28,8 @@ class FeatureArray
{
protected:
featarray_t array_;
+ size_t number_of_features;
+ std::string features;
private:
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
@@ -49,19 +51,22 @@ public:
void merge(FeatureArray& e);
inline size_t size(){ return array_.size(); }
- inline size_t NumberOfFeatures(){ return (array_.size()>0)?array_.at(0).size():0; }
-
+ inline size_t NumberOfFeatures() const{ return number_of_features; }
+ inline void NumberOfFeatures(size_t v){ number_of_features = v; }
+ inline std::string Features() const{ return features; }
+ inline void Features(const std::string f){ features = f; }
+
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
void save(ofstream& outFile, bool bin=false);
void save(const std::string &file, bool bin=false);
inline void save(bool bin=false){ save("/dev/stdout",bin); }
- void loadtxt(ifstream& inFile);
- void loadbin(ifstream& inFile);
- void load(ifstream& inFile, bool bin=false);
- void load(const std::string &file, bool bin=false);
-
+ void loadtxt(ifstream& inFile, size_t n);
+ void loadbin(ifstream& inFile, size_t n);
+ void load(ifstream& inFile);
+ void load(const std::string &file);
+
bool check_consistency();
};
diff --git a/mert/FeatureData.cpp b/mert/FeatureData.cpp
index 7218ad4d0..66ee8e331 100644
--- a/mert/FeatureData.cpp
+++ b/mert/FeatureData.cpp
@@ -98,8 +98,10 @@ void FeatureData::add(FeatureStats& e, const std::string & sent_idx){
else{
// TRACE_ERR("Creating a new entry in the array and inserting " << e << std::endl);
FeatureArray a;
- a.add(e);
+ a.NumberOfFeatures(number_of_features);
+ a.Features(features);
a.setIndex(sent_idx);
+ a.add(e);
add(a);
}
}
diff --git a/mert/FeatureData.h b/mert/FeatureData.h
index 7ac64b7e2..63592889e 100644
--- a/mert/FeatureData.h
+++ b/mert/FeatureData.h
@@ -28,6 +28,7 @@ protected:
private:
size_t number_of_features;
+ std::string features;
public:
FeatureData();
@@ -50,7 +51,10 @@ public:
void add(FeatureStats& e, const std::string& sent_idx);
inline size_t size(){ return array_.size(); }
- inline size_t NumberOfFeatures(){ return number_of_features; }
+ inline size_t NumberOfFeatures() const{ return number_of_features; }
+ inline void NumberOfFeatures(size_t v){ number_of_features = v; }
+ inline std::string Features() const{ return features; }
+ inline void Features(const std::string f){ features = f; }
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp
index 62e59974a..bbceff135 100644
--- a/mert/FeatureStats.cpp
+++ b/mert/FeatureStats.cpp
@@ -9,18 +9,35 @@
#include <fstream>
#include "FeatureStats.h"
+#define AVAILABLE_ 8;
+
FeatureStats::FeatureStats()
-{};
+{
+ available_ = AVAILABLE_;
+ entries_ = 0;
+ array2_ = new FeatureStatsType[available_];
+};
+
+FeatureStats::~FeatureStats()
+{
+ delete array2_;
+};
-FeatureStats::FeatureStats(const FeatureStats &stats):
-array_(stats.array_)
-{};
+FeatureStats::FeatureStats(const FeatureStats &stats)
+{
+ available_ = stats.available();
+ entries_ = stats.size();
+ array2_ = new FeatureStatsType[available_];
+ memcpy(array2_,stats.getArray(),bytes_);
+};
FeatureStats::FeatureStats(const size_t size)
{
- for(unsigned int i = 0; i < size; i++)
- array_.push_back(0);
+ available_ = size;
+ entries_ = size;
+ array2_ = new FeatureStatsType[available_];
+ memset(array2_,0,bytes_);
};
@@ -29,30 +46,48 @@ FeatureStats::FeatureStats(std::string &theString)
set(theString);
}
+void FeatureStats::expand()
+{
+ available_*=2;
+ featstats_t t_ = new FeatureStatsType[available_];
+ memcpy(t_,array2_,bytes_);
+ delete array2_;
+ array2_=t_;
+}
+
+void FeatureStats::add(FeatureStatsType v)
+{
+ if (isfull()) expand();
+ array2_[entries_++]=v;
+}
+
void FeatureStats::set(std::string &theString)
{
std::string substring, stringBuf;
-
- int nextPound;
- FeatureStatsType sc;
-// TRACE_ERR("Decompounding string: " << theString << std::endl);
+ reset();
+
while (!theString.empty()){
- nextPound = getNextPound(theString, substring);
- sc = ATOFST(substring.c_str());
- array_.push_back(sc);
+ getNextPound(theString, substring);
+ add(ATOFST(substring.c_str()));
}
}
+
+void FeatureStats::loadbin(std::ifstream& inFile)
+{
+ inFile.read((char*) array2_, bytes_);
+}
+
void FeatureStats::loadtxt(std::ifstream& inFile)
{
- std::string theString;
+ std::string theString;
std::getline(inFile, theString);
set(theString);
}
void FeatureStats::loadtxt(const std::string &file)
{
-// TRACE_ERR("loading the stats from " << file << std::endl);
+ // TRACE_ERR("loading the stats from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
@@ -72,19 +107,22 @@ void FeatureStats::savetxt(const std::string &file)
void FeatureStats::savetxt(std::ofstream& outFile)
{
- TRACE_ERR("saving the stats" << std::endl);
+// TRACE_ERR("saving the stats" << std::endl);
outFile << *this;
}
void FeatureStats::savebin(std::ofstream& outFile)
{
- outFile << "S|";
- outFile << "|E";
+ outFile.write((char*) array2_, bytes_);
}
FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
{
- array_ = stats.array_;
+ delete array2_;
+ available_ = stats.available();
+ entries_ = stats.size();
+ array2_ = new FeatureStatsType[available_];
+ memcpy(array2_,stats.getArray(),bytes_);
return *this;
}
@@ -92,7 +130,7 @@ FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
/**write the whole object to a stream*/
ostream& operator<<(ostream& o, const FeatureStats& e){
- for (featstats_t::iterator i = e.getArray().begin(); i != e.getArray().end(); i++)
- o << *i << " ";
+ for (size_t i=0; i< e.size(); i++)
+ o << e.get(i) << " ";
return o;
}
diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h
index 005471afe..804b0b828 100644
--- a/mert/FeatureStats.h
+++ b/mert/FeatureStats.h
@@ -20,13 +20,15 @@ using namespace std;
#define FEATURE_STATS_MIN (numeric_limits<FeatureStatsType>::min())
#define ATOFST(str) ((FeatureStatsType) atof(str))
+#define bytes_ (entries_*sizeof(FeatureStatsType))
+
class FeatureStats
{
-protected:
- featstats_t array_;
-
private:
-
+ featstats_t array2_;
+ size_t entries_;
+ size_t available_;
+
public:
FeatureStats();
FeatureStats(const size_t size);
@@ -34,38 +36,37 @@ public:
FeatureStats(std::string &theString);
FeatureStats& operator=(const FeatureStats &stats);
- ~FeatureStats(){};
-
- inline void clear() { array_.clear(); }
+ ~FeatureStats();
- inline FeatureStatsType get(size_t i){ return array_.at(i); }
- inline FeatureStatsType get(size_t i)const{ return array_.at(i); }
- inline featstats_t getArray() const { return array_; }
+ bool isfull(){return (entries_ < available_)?0:1; }
+ void expand();
+ void add(FeatureStatsType v);
+
+ inline void clear() { memset((void*) array2_,0,bytes_); }
- void set(std::string &theString);
+ inline FeatureStatsType get(size_t i){ return array2_[i]; }
+ inline FeatureStatsType get(size_t i)const{ return array2_[i]; }
+ inline featstats_t getArray() const { return array2_; }
- void add(FeatureStatsType e){ array_.push_back(e); }
+ void set(std::string &theString);
- inline size_t size(){ return array_.size(); }
+ inline size_t bytes() const{ return bytes_; }
+ inline size_t size() const{ return entries_; }
+ inline size_t available() const{ return available_; }
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
inline void savetxt(){ savetxt("/dev/stdout"); }
- void loadtxt(ifstream& inFile);
void loadtxt(const std::string &file);
+ void loadtxt(ifstream& inFile);
+ void loadbin(ifstream& inFile);
- inline void reset()
- {
- for (featstats_t::iterator i = array_.begin(); i != array_.end(); i++)
- *i = 0;
- }
-
+ inline void reset(){ entries_ = 0; clear(); }
/**write the whole object to a stream*/
friend ostream& operator<<(ostream& o, const FeatureStats& e);
-
};
#endif
diff --git a/mert/Types.h b/mert/Types.h
index eb043f5e9..826813bae 100644
--- a/mert/Types.h
+++ b/mert/Types.h
@@ -24,7 +24,8 @@ typedef vector<statscore_t> statscores_t;
typedef float FeatureStatsType;
-typedef vector<FeatureStatsType> featstats_t;
+typedef FeatureStatsType* featstats_t;
+//typedef vector<FeatureStatsType> featstats_t;
typedef vector<FeatureStats> featarray_t;
typedef vector<FeatureArray> featdata_t;
diff --git a/mert/Util.cpp b/mert/Util.cpp
index 4f113ebe0..e7f263fdd 100644
--- a/mert/Util.cpp
+++ b/mert/Util.cpp
@@ -92,3 +92,14 @@ outputfilestream::~outputfilestream()
void outputfilestream::close()
{
}
+
+int swapbytes(char *p, int sz, int n)
+{
+ char c, *l, *h;
+
+ if((n<1) || (sz<2)) return 0;
+ for(; n--; p+=sz) for(h=(l=p)+sz; --h>l; l++) { c=*h; *h=*l; *l=c; }
+ return 0;
+
+};
+
diff --git a/mert/example/README b/mert/example/README
index 2e10fda5a..ed9345f3d 100644
--- a/mert/example/README
+++ b/mert/example/README
@@ -1,15 +1,16 @@
extractor=../extractor
+#extractor="../extractor --binary"
mert=../mert
size=15
#to read an nbest file; output is in text format
#$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT --scfile SCORESTAT --sctype BLEU
-#$extractor --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.2 --scfile SCORESTAT.2 --sctype BLEU --prev-ffile FEATSTAT --prev-scfile SCORESTAT
-$extractor --ffile FEATSTAT.2 --scfile SCORESTAT.2 --sctype BLEU --prev-ffile FEATSTAT,FEATSTAT --prev-scfile SCORESTAT,SCORESTAT
#$extractor --ffile FEATSTAT.2 --scfile SCORESTAT.2 --sctype BLEU --prev-ffile FEATSTAT --prev-scfile SCORESTAT
+#$extractor --binary --ffile FEATSTAT.3 --scfile SCORESTAT.3 --sctype BLEU --prev-ffile FEATSTAT,FEATSTAT.2 --prev-scfile SCORESTAT,SCORESTAT.2
+#$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.4 --scfile SCORESTAT.4 --sctype BLEU --prev-ffile FEATSTAT,FEATSTAT.3 --prev-scfile SCORESTAT,SCORESTAT.3
-#$mert --ifile init.opt --scfile SCORESTAT --ffile FEATSTAT -d $size -verbose 4
+$mert --ifile init.opt --scfile SCORESTAT --ffile FEATSTAT -d $size -verbose 4
exit