Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/mert
diff options
context:
space:
mode:
author nicolabertoldi <nicolabertoldi@1f5c12ca-751b-0410-a591-d2e778427230> 2008-05-20 18:15:30 +0400
committer nicolabertoldi <nicolabertoldi@1f5c12ca-751b-0410-a591-d2e778427230> 2008-05-20 18:15:30 +0400
commit 8a594fc254c520b62700b56fb14534be36394e06 (patch)
tree ba2a8fa7eddbdf08a5b9cefff5b247094fc909cb /mert
parent f30000b87575f969e1ef9ac87ef1eaccecb1ec4e (diff)
reading from textual gzipped file is now possible
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1786 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'mert')
-rw-r--r-- mert/Data.cpp 33
-rw-r--r-- mert/FeatureArray.cpp 11
-rw-r--r-- mert/FeatureData.cpp 56
-rw-r--r-- mert/FeatureData.h 4
-rwxr-xr-x mert/Makefile 12
-rw-r--r-- mert/ScoreArray.cpp 12
-rw-r--r-- mert/ScoreData.cpp 52
-rw-r--r-- mert/ScoreData.h 5
-rw-r--r-- mert/Util.cpp 55
-rw-r--r-- mert/Util.h 66
-rw-r--r-- mert/example/README 9
-rw-r--r-- mert/example/README.oldmert 5
12 files changed, 145 insertions, 175 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 3c7678b1d..34373394a 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -29,23 +29,18 @@ void Data::loadnbest(const std::string &file)
int sentence_index;
int nextPound;
- std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
+ inputfilestream inp(file); // matches a stream with a file. Opens the file
- if (!inFile) {
- throw runtime_error("Unable to open: " + file);
- }
+ if (!inp.good())
+ throw runtime_error("Unable to open: " + file);
- while (!inFile.eof()){
+ std::string substring, subsubstring, stringBuf;
+ std::string theSentence;
+ std::string::size_type loc;
- std::string substring, subsubstring, stringBuf;
- std::string theSentence;
- std::string::size_type loc;
-
- std::getline(inFile, stringBuf);
+ while (getline(inp,stringBuf,'\n')){
if (stringBuf.empty()) continue;
-// TRACE_ERR("Reading: " << stringBuf << std::endl);
-
nextPound = getNextPound(stringBuf, substring, "|||"); //first field
sentence_index = atoi(substring.c_str());
@@ -53,19 +48,19 @@ void Data::loadnbest(const std::string &file)
theSentence = substring;
// adding statistics for error measures
- scoreentry.clear();
- theScorer->prepareStats(sentence_index, theSentence,scoreentry);
- scoredata->add(scoreentry,sentence_index);
-
+ featentry.clear();
+ scoreentry.clear();
+ theScorer->prepareStats(sentence_index, theSentence, scoreentry);
+ scoredata->add(scoreentry, sentence_index);
nextPound = getNextPound(stringBuf, substring, "|||"); //third field
// adding features
- featentry.clear();
- scoreentry.clear();
while (!substring.empty()){
// TRACE_ERR("Decompounding: " << substring << std::endl);
nextPound = getNextPound(substring, subsubstring);
+
+// string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1){
featentry.add(ATOFST(subsubstring.c_str()));
}
@@ -73,5 +68,5 @@ void Data::loadnbest(const std::string &file)
featdata->add(featentry,sentence_index);
}
- inFile.close();
+ inp.close();
}
diff --git a/mert/FeatureArray.cpp b/mert/FeatureArray.cpp
index 3e7fa7b01..d9b90654b 100644
--- a/mert/FeatureArray.cpp
+++ b/mert/FeatureArray.cpp
@@ -66,13 +66,12 @@ void FeatureArray::loadtxt(ifstream& inFile)
std::getline(inFile, stringBuf);
- if (stringBuf.empty()){
- TRACE_ERR("ERROR: Empty string" << std::endl);
+ if (!inFile.good()){
return;
- }
+ }
if (!stringBuf.empty()){
-// TRACE_ERR("Reading: " << stringBuf << std::endl);
+ TRACE_ERR("Reading: " << stringBuf << std::endl);
if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) != 0){
TRACE_ERR("ERROR: FeatureArray::loadtxt(): Wrong header");
return;
@@ -121,9 +120,9 @@ void FeatureArray::load(const std::string &file, bool bin)
{
TRACE_ERR("loading data from " << file << std::endl);
- std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
+ inputfilestream inFile(file); // matches a stream with a file. Opens the file
- load(inFile, bin);
+ load((ifstream&) inFile, bin);
inFile.close();
diff --git a/mert/FeatureData.cpp b/mert/FeatureData.cpp
index fea7cf602..d88b7ec05 100644
--- a/mert/FeatureData.cpp
+++ b/mert/FeatureData.cpp
@@ -39,12 +39,16 @@ void FeatureData::load(ifstream& inFile)
int iter=0;
while (!inFile.eof()){
+
+ if (!inFile.good()){
+ std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl;
+ }
+
entry.clear();
entry.load(inFile);
if (entry.size() == 0){
- TRACE_ERR("no more data" << std::endl);
- continue;
+ return;
}
add(entry);
iter++;
@@ -56,54 +60,16 @@ void FeatureData::load(const std::string &file)
{
TRACE_ERR("loading data from " << file << std::endl);
- std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
-
- load(inFile);
-
- inFile.close();
-}
-
-void FeatureData::loadnbest(const std::string &file)
-{
- TRACE_ERR("loading nbest from " << file << std::endl);
-
- FeatureStats entry;
- int sentence_index;
- int nextPound;
-
- std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
+ inputfilestream inFile(file); // matches a stream with a file. Opens the file
-
- while (!inFile.eof()){
-
- std::string substring, subsubstring, stringBuf;
- std::string::size_type loc;
-
- std::getline(inFile, stringBuf);
- if (stringBuf.empty()) continue;
-
-// TRACE_ERR("Reading: " << stringBuf << std::endl);
-
- nextPound = getNextPound(stringBuf, substring, "|||"); //first field
- sentence_index = atoi(substring.c_str());
- nextPound = getNextPound(stringBuf, substring, "|||"); //second field
- nextPound = getNextPound(stringBuf, substring, "|||"); //third field
-
- entry.clear();
- while (!substring.empty()){
-// TRACE_ERR("Decompounding: " << substring << std::endl);
- nextPound = getNextPound(substring, subsubstring);
- if ((loc = subsubstring.find(":")) != subsubstring.length()-1){
- entry.add(ATOFST(subsubstring.c_str()));
- }
- }
-// entry.save();
- add(entry,sentence_index);
+ if (!inFile) {
+ throw runtime_error("Unable to open feature file: " + file);
}
+ load((ifstream&) inFile);
+
inFile.close();
}
-
void FeatureData::add(FeatureArray& e){
if (e.getIndex() < size()){ // array at poistion e.getIndex() already exists
//enlarge array at position e.getIndex()
diff --git a/mert/FeatureData.h b/mert/FeatureData.h
index 356fbf4df..aacf81255 100644
--- a/mert/FeatureData.h
+++ b/mert/FeatureData.h
@@ -23,6 +23,7 @@ class FeatureData
protected:
vector<FeatureArray> array_;
vector<int> idxmap_;
+ size_t number_of_feature;
private:
@@ -57,6 +58,7 @@ public:
void add(FeatureArray& e);
void add(FeatureStats e, int sent_idx);
+ inline size_t FeatureSize(){ return number_of_feature; }
inline size_t size(){ return array_.size(); }
void save(const std::string &file, bool bin=false);
@@ -65,8 +67,6 @@ public:
void load(ifstream& inFile);
void load(const std::string &file);
-
- void loadnbest(const std::string &file);
};
diff --git a/mert/Makefile b/mert/Makefile
index 321017952..8768060b3 100755
--- a/mert/Makefile
+++ b/mert/Makefile
@@ -4,9 +4,9 @@ FeatureStats.o FeatureArray.o FeatureData.o \
Data.o \
BleuScorer.o \
Point.o \
-Optimizer.o \
PerScorer.o \
Scorer.o
+#Optimizer.o \
ifndef DEBUG
CFLAGS=-O3 -DTRACE_ENABLE
@@ -17,12 +17,13 @@ endif
GCC=g++
LDFLAGS=
-LDLIBS=-lm
+LDLIBS=-lm -lz
all: \
-mert \
extractor \
-test_scorer
+prova-gz
+#test_scorer \
+#mert \
clean:
rm -f *.o
@@ -39,3 +40,6 @@ mert: $(OBJS) mert.cpp
test_scorer: $(OBJS) test_scorer.cpp
$(GCC) $(CFLAGS) $(OBJS) $(LDLIBS) -o $@ $@.cpp
+prova-gz: $(OBJS) prova-gz.cpp
+ $(GCC) $(CFLAGS) $(OBJS) $(LDLIBS) -o $@ $@.cpp
+
diff --git a/mert/ScoreArray.cpp b/mert/ScoreArray.cpp
index 6c57adf7f..2eb5361e4 100644
--- a/mert/ScoreArray.cpp
+++ b/mert/ScoreArray.cpp
@@ -67,10 +67,9 @@ void ScoreArray::loadtxt(ifstream& inFile)
TRACE_ERR("starting loadtxt..." << std::endl);
std::getline(inFile, stringBuf);
- if (stringBuf.empty()){
- TRACE_ERR("ERROR: Empty string" << std::endl);
+ if (!inFile.good()){
return;
- }
+ }
if (!stringBuf.empty()){
// TRACE_ERR("Reading: " << stringBuf << std::endl);
@@ -116,10 +115,11 @@ void ScoreArray::load(ifstream& inFile, bool bin)
void ScoreArray::load(const std::string &file , bool bin)
{
- TRACE_ERR("loading data from " << file << std::endl);
+ TRACE_ERR("loading data from " << file << std::endl);
+
+ inputfilestream inFile(file); // matches a stream with a file. Opens the file
- std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
+ load((ifstream&) inFile, bin);
- load(inFile, bin);
inFile.close();
}
diff --git a/mert/ScoreData.cpp b/mert/ScoreData.cpp
index fb1d4b9fb..236784016 100644
--- a/mert/ScoreData.cpp
+++ b/mert/ScoreData.cpp
@@ -53,7 +53,6 @@ void ScoreData::load(ifstream& inFile)
entry.loadtxt(inFile);
if (entry.size() == 0){
- TRACE_ERR("no more data" << std::endl);
continue;
}
add(entry);
@@ -64,58 +63,19 @@ void ScoreData::load(ifstream& inFile)
void ScoreData::load(const std::string &file)
{
- TRACE_ERR("loading score data from " << file << std::endl);
+ TRACE_ERR("loading score data from " << file << std::endl);
- std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
+ inputfilestream inFile(file); // matches a stream with a file. Opens the file
- if (!inFile) {
- throw runtime_error("Unable to open score file: " + file);
- }
- load(inFile);
-
- inFile.close();
-}
-
-void ScoreData::loadnbest(const std::string &file)
-{
- TRACE_ERR("loading nbest from " << file << std::endl);
-
- ScoreStats entry;
- int sentence_index;
- int nextPound;
-
- std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
-
-
- while (!inFile.eof()){
-
- std::string substring, subsubstring, stringBuf;
- std::string theSentence;
- std::string::size_type loc;
-
- std::getline(inFile, stringBuf);
- if (stringBuf.empty()) continue;
-
-// TRACE_ERR("Reading: " << stringBuf << std::endl);
-
- nextPound = getNextPound(stringBuf, substring, "|||"); //first field
- sentence_index = atoi(substring.c_str());
- nextPound = getNextPound(stringBuf, substring, "|||"); //second field
- theSentence = substring;
-
-
- entry.clear();
-
- theScorer->prepareStats(sentence_index, theSentence,entry);
-
- add(entry,sentence_index);
+ if (!inFile) {
+ throw runtime_error("Unable to open score file: " + file);
}
+ load((ifstream&) inFile);
+
inFile.close();
}
-
-
void ScoreData::add(const ScoreStats& e, int sent_idx){
if (exists(sent_idx)){
array_.at(sent_idx).add(e);
diff --git a/mert/ScoreData.h b/mert/ScoreData.h
index 2718de065..c2c046c45 100644
--- a/mert/ScoreData.h
+++ b/mert/ScoreData.h
@@ -28,6 +28,7 @@ protected:
private:
Scorer* theScorer;
std::string score_type;
+ size_t number_of_scores;
public:
ScoreData(Scorer& sc);
@@ -46,6 +47,7 @@ public:
void add(const ScoreArray& e){ array_.push_back(e); }
void add(const ScoreStats& e, int sent_idx);
+ inline size_t ScoreSize(){ return number_of_scores; }
inline size_t size(){ return array_.size(); }
void save(const std::string &file, bool bin=false);
@@ -54,9 +56,6 @@ public:
void load(ifstream& inFile);
void load(const std::string &file);
-
- void loadnbest(const std::string &file);
-
};
diff --git a/mert/Util.cpp b/mert/Util.cpp
index 67c1053d5..4531a8645 100644
--- a/mert/Util.cpp
+++ b/mert/Util.cpp
@@ -6,6 +6,7 @@
*
*/
+#include <stdexcept>
#include "Util.h"
int verbose=0;
@@ -37,3 +38,57 @@ int getNextPound(std::string &theString, std::string &substring, const std::stri
}
return (pos);
};
+
+inputfilestream::inputfilestream(const std::string &filePath)
+: std::istream(0),
+m_streambuf(0)
+{
+ //check if file is readable
+ std::filebuf* fb = new std::filebuf();
+ _good=(fb->open(filePath.c_str(), std::ios::in)!=NULL);
+
+ if (filePath.size() > 3 &&
+ filePath.substr(filePath.size() - 3, 3) == ".gz")
+ {
+ fb->close(); delete fb;
+ m_streambuf = new gzfilebuf(filePath.c_str());
+ } else {
+ m_streambuf = fb;
+ }
+ this->init(m_streambuf);
+}
+
+inputfilestream::~inputfilestream()
+{
+ delete m_streambuf; m_streambuf = 0;
+}
+
+void inputfilestream::close()
+{
+}
+
+outputfilestream::outputfilestream(const std::string &filePath)
+: std::ostream(0),
+m_streambuf(0)
+{
+ //check if file is readable
+ std::filebuf* fb = new std::filebuf();
+ _good=(fb->open(filePath.c_str(), std::ios::out)!=NULL);
+
+ if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz")
+ {
+ throw runtime_error("Output to a zipped file not supported!");
+ } else {
+ m_streambuf = fb;
+ }
+ this->init(m_streambuf);
+}
+
+outputfilestream::~outputfilestream()
+{
+ delete m_streambuf; m_streambuf = 0;
+}
+
+void outputfilestream::close()
+{
+}
diff --git a/mert/Util.h b/mert/Util.h
index 672917f8e..8b61a25ee 100644
--- a/mert/Util.h
+++ b/mert/Util.h
@@ -11,15 +11,21 @@
using namespace std;
+#include <stdexcept>
#include <limits>
#define US_NOSET (numeric_limits<unsigned short>::max())
+#define MAX_LINE 1024
+
#include <vector>
#include <map>
#include <iostream>
#include <sstream>
#include <string>
+#include <fstream>
+#include "gzfilebuf.h"
+
#include "ScoreStats.h"
#include "FeatureStats.h"
@@ -48,50 +54,30 @@ inline T Scan(const std::string &input)
return ret;
};
-template<typename T>
-int packVariable(char *buffer, size_t &bufferlen, T theVariable)
-{
- size_t variable_size = sizeof(T);
- memcpy(buffer + bufferlen, (char*) &theVariable, variable_size);
- bufferlen += variable_size;
- return variable_size;
-};
-
-template<typename T>
-int unpackVariable(char *buffer, size_t &bufferlen, T &theVariable)
+class inputfilestream : public std::istream
{
- size_t variable_size = sizeof(T);
- theVariable = *((T*)(buffer + bufferlen));
- bufferlen += variable_size;
- return variable_size;
+protected:
+ std::streambuf *m_streambuf;
+ bool _good;
+public:
+
+ inputfilestream(const std::string &filePath);
+ ~inputfilestream();
+ bool good(){return _good;}
+ void close();
};
-template<typename T>
-int packVector(char *buffer, size_t &bufferlen, vector<T> theVector)
-{
- int vector_size = packVariable(buffer, bufferlen, theVector.size());
-
- for (int i = 0; i < theVector.size(); i++)
- vector_size += packVariable(buffer, bufferlen, theVector.at(i));
-
- return vector_size;
-};
-
-template<typename T>
-int unpackVector(char *buffer, size_t &bufferlen, vector<T> &theVector)
+class outputfilestream : public std::ostream
{
- int vector_size;
- int vector_memsize = unpackVariable(buffer, bufferlen, vector_size);
-
- theVector.clear();
- T theVariable;
- for (int i = 0; i < vector_size; i++)
- {
- vector_memsize += unpackVariable(buffer, bufferlen, theVariable);
- theVector.push_back(theVariable);
- }
-
- return vector_memsize;
+protected:
+ std::streambuf *m_streambuf;
+ bool _good;
+public:
+
+ outputfilestream(const std::string &filePath);
+ ~outputfilestream();
+ bool good(){return _good;}
+ void close();
};
#endif
diff --git a/mert/example/README b/mert/example/README
index addb7334e..fd12f43ca 100644
--- a/mert/example/README
+++ b/mert/example/README
@@ -1,6 +1,13 @@
cmd=../extractor
#$cmd -NbestFile NBEST -Reference REF -OutputFeatureStatistics FEATSTAT.out -OutputScoreStatistics SCORESTAT.out
-$cmd --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.out --scfile SCORESTAT.out
+
+#to read an nbest file; output is in text format
+$cmd --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.out --scfile SCORESTAT.out --sctype BLEU4
+
+#to read a gzipped nbest file; output is in text format
+$cmd --nbest NBEST.gz --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.out --scfile SCORESTAT.out --sctype BLEU4
+
+exit
cp FEATSTAT.out FEATSTAT.in
cp SCORESTAT.out SCORESTAT.in
diff --git a/mert/example/README.oldmert b/mert/example/README.oldmert
index 39f6b18cf..cd323ca02 100644
--- a/mert/example/README.oldmert
+++ b/mert/example/README.oldmert
@@ -6,15 +6,14 @@ for normtype in '' '-n' ; do
for reflentype in '' '-a' '-s' '-e' ; do
basename=OLDMERT${casetype}${normtype}${reflentype}
-#cat NBEST | $scorecmd ${casetype} ${normtype} ${reflentype} REF.0 REF.1 REF.2 ./$basename.
-cat NBEST NBEST | sort -mnk 1,1 | $scorecmd ${casetype} ${normtype} ${reflentype} REF.0 REF.1 REF.2 ./$basename.
+cat NBEST | $scorecmd ${casetype} ${normtype} ${reflentype} REF.0 REF.1 REF.2 ./$basename.
+#cat NBEST NBEST | sort -mnk 1,1 | $scorecmd ${casetype} ${normtype} ${reflentype} REF.0 REF.1 REF.2 ./$basename.
cat ./$basename.feats.opt | cut -d' ' -f 16- > ./$basename.SCORESTAT.out
cat ./$basename.feats.opt | cut -d' ' -f 1-15 > ./$basename.FEATSTAT.out
echo comparing SCORESTAT.out and $basename.SCORESTAT.out
cat SCORESTAT.out | sort | grep -v "^SCORE"> AAA$$
cat $basename.SCORESTAT.out | sort >BBB$$
-#head -3 AAA$$ BBB$$
cmp AAA$$ BBB$$
echo comparing FEATSTAT.out and $basename.FEATSTAT.out