Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/mert
diff options
context:
space:
mode:
author: Barry Haddow <barry.haddow@gmail.com>, 2011-11-15 17:12:14 +0400
committer: Barry Haddow <barry.haddow@gmail.com>, 2011-11-15 17:12:14 +0400
commit: 0a2e0f44a6d5fa2755b6f3894a55aa608272987d (patch)
tree: 2408a5149de4070a9ea57a1696630cca50ab34c5 /mert
parent: 3a6c0e0680e656a9a05da24c1b54e54caf651f48 (diff)
Finish and test feature and score data iterators.
Diffstat (limited to 'mert')
-rw-r--r-- mert/FeatureDataIterator.cpp 54
-rw-r--r-- mert/FeatureDataIterator.h 31
-rw-r--r-- mert/Makefile.am 3
-rw-r--r-- mert/ScoreDataIterator.cpp 90
-rw-r--r-- mert/ScoreDataIterator.h 67
-rw-r--r-- mert/pro.cpp 24
6 files changed, 242 insertions, 27 deletions
diff --git a/mert/FeatureDataIterator.cpp b/mert/FeatureDataIterator.cpp
index 700398cbb..00b59bc38 100644
--- a/mert/FeatureDataIterator.cpp
+++ b/mert/FeatureDataIterator.cpp
@@ -1,6 +1,3 @@
-// $Id$
-// vim:tabstop=2
-
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh
@@ -22,7 +19,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <iostream>
#include <sstream>
-#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include "FeatureArray.h"
@@ -32,16 +28,36 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
using namespace util;
+/**
+ * Parse a base-10 integer from a single token.
+ *
+ * strtol() needs a terminating character, so the byte that follows the token
+ * in the underlying buffer must be a delimiter; callers should only pass
+ * tokens, never arbitrary sub-strings.
+ *
+ * Throws util::ParseNumberException when the token is empty, when no digits
+ * could be parsed, when the parse ran past the end of the token, or when the
+ * parsed value does not fit in an int.
+ */
+int ParseInt(const StringPiece& str ) {
+  // An empty piece may carry a null data pointer, which strtol() must not see.
+  if (str.empty()) {
+    throw util::ParseNumberException(str);
+  }
+  char* errIndex;
+  const long parsed = strtol(str.data(), &errIndex, 10);
+  // errIndex == data(): nothing was parsed.
+  // errIndex beyond the token: digits outside the token were consumed, so the
+  // result does not describe this token.
+  if (errIndex == str.data() || errIndex > str.data() + str.size()) {
+    throw util::ParseNumberException(str);
+  }
+  // Reject values the narrowing cast would silently wrap.
+  // NOTE(review): where long and int have the same width, strtol() saturation
+  // is not detected here; that would need an errno check.
+  const int value = static_cast<int>(parsed);
+  if (static_cast<long>(value) != parsed) {
+    throw util::ParseNumberException(str);
+  }
+  return value;
+}
+
+/**
+ * Parse a floating point number from a single token.
+ *
+ * strtod() needs a terminating character, so the byte that follows the token
+ * in the underlying buffer must be a delimiter; callers should only pass
+ * tokens, never arbitrary sub-strings.
+ *
+ * Throws util::ParseNumberException when the token is empty, when nothing
+ * could be parsed, or when the parse ran past the end of the token.
+ */
+float ParseFloat(const StringPiece& str) {
+  // An empty piece may carry a null data pointer, which strtod() must not see.
+  if (str.empty()) {
+    throw util::ParseNumberException(str);
+  }
+  char* errIndex;
+  const double parsed = strtod(str.data(), &errIndex);
+  // errIndex == data(): nothing was parsed.
+  // errIndex beyond the token: characters outside the token were consumed, so
+  // the result does not describe this token.
+  if (errIndex == str.data() || errIndex > str.data() + str.size()) {
+    throw util::ParseNumberException(str);
+  }
+  return static_cast<float>(parsed);
+}
+
FeatureDataIterator::FeatureDataIterator() {}
-FeatureDataIterator::FeatureDataIterator(const string filename) {
+FeatureDataIterator::FeatureDataIterator(const string& filename) {
m_in.reset(new FilePiece(filename.c_str()));
readNext();
}
void FeatureDataIterator::readNext() {
+ m_next.clear();
try {
StringPiece marker = m_in->ReadDelimited();
if (marker != StringPiece(FEATURES_TXT_BEGIN)) {
@@ -49,20 +65,30 @@ void FeatureDataIterator::readNext() {
}
size_t sentenceId = m_in->ReadULong();
size_t count = m_in->ReadULong();
- cerr << "Expecting " << count << endl;
+ size_t length = m_in->ReadULong();
m_in->ReadLine(); //discard rest of line
for (size_t i = 0; i < count; ++i) {
StringPiece line = m_in->ReadLine();
- for (util::TokenIter<util::AnyCharacter, true> token(line, util::AnyCharacter(" \t")); token; ++token) {
- //TODO: Create FeatureDataItem
- char* err_ind;
- float value = static_cast<float>(strtod(token->data(), &err_ind));
- if (err_ind == token->data()) {
- throw FileFormatException(m_in->FileName(), line.as_string());
+ m_next.push_back(FeatureDataItem());
+ for (TokenIter<AnyCharacter, true> token(line, AnyCharacter(" \t")); token; ++token) {
+ TokenIter<AnyCharacter,false> value(*token,AnyCharacter(":"));
+ if (!value) throw FileFormatException(m_in->FileName(), line.as_string());
+ StringPiece first = *value;
+ ++value;
+ if (!value) {
+ //regular feature
+ float floatValue = ParseFloat(first);
+ m_next.back().dense.push_back(floatValue);
+ } else {
+ //sparse feature
+ StringPiece second = *value;
+ float floatValue = ParseFloat(second);
+ m_next.back().sparse.set(first.as_string(),floatValue);
}
- cerr << value << ",";
}
- cerr << "\n";
+ if (length != m_next.back().dense.size()) {
+ throw FileFormatException(m_in->FileName(), line.as_string());
+ }
}
StringPiece line = m_in->ReadLine();
if (line != StringPiece(FEATURES_TXT_END)) {
diff --git a/mert/FeatureDataIterator.h b/mert/FeatureDataIterator.h
index 49d77f77f..6df249822 100644
--- a/mert/FeatureDataIterator.h
+++ b/mert/FeatureDataIterator.h
@@ -1,6 +1,3 @@
-// $Id$
-// vim:tabstop=2
-
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh
@@ -29,7 +26,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <fstream>
#include <map>
-#include <memory>
#include <stdexcept>
#include <vector>
@@ -37,23 +33,34 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <boost/shared_ptr.hpp>
#include "util/file_piece.hh"
+#include "util/string_piece.hh"
#include "FeatureStats.h"
-class FeatureDataItem {
- public:
- std::vector<float> dense;
- SparseVector sparse;
-};
-
-class FileFormatException : public util::Exception {
+class FileFormatException : public util::Exception
+{
public:
explicit FileFormatException(const std::string filename, const std::string& line) {
*this << "Error in line \"" << line << "\" of " << filename;
}
};
+
+/** Assumes a delimiter, so only apply to tokens */
+int ParseInt(const StringPiece& str );
+
+/** Assumes a delimiter, so only apply to tokens */
+float ParseFloat(const StringPiece& str);
+
+
+/**
+ * Feature values parsed from one line of a feature data file.
+ */
+class FeatureDataItem
+{
+ public:
+ /** Values of the plain tokens on the line (those without a ':'), in order. */
+ std::vector<float> dense;
+ /** Values of the "name:value" tokens on the line, stored under their name. */
+ SparseVector sparse;
+};
+
class FeatureDataIterator :
public boost::iterator_facade<FeatureDataIterator,
const std::vector<FeatureDataItem>,
@@ -61,7 +68,7 @@ class FeatureDataIterator :
{
public:
FeatureDataIterator();
- FeatureDataIterator(const std::string filename);
+ FeatureDataIterator(const std::string& filename);
static FeatureDataIterator end() {
return FeatureDataIterator();
diff --git a/mert/Makefile.am b/mert/Makefile.am
index 3454a0205..58baa5053 100644
--- a/mert/Makefile.am
+++ b/mert/Makefile.am
@@ -6,6 +6,7 @@ libmert_la_SOURCES = \
Util.cpp \
Timer.cpp \
ScoreStats.cpp ScoreArray.cpp ScoreData.cpp \
+ScoreDataIterator.cpp \
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp \
FeatureDataIterator.cpp \
Data.cpp \
@@ -38,5 +39,5 @@ extractor_LDADD = libmert.la -lm -lz
mert_LDADD = libmert.la -lm -lz $(BOOST_THREAD_LDFLAGS) $(BOOST_THREAD_LIBS)
evaluator_LDADD = libmert.la -lm -lz
pro_LDADD = libmert.la @KENLM_LDFLAGS@ $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIBS)
-pro_DEPENDENCIES = $(top_srcdir)/kenlm/libkenlm.la
+pro_DEPENDENCIES = $(top_srcdir)/kenlm/libkenlm.la libmert.la
diff --git a/mert/ScoreDataIterator.cpp b/mert/ScoreDataIterator.cpp
new file mode 100644
index 000000000..c062cc52d
--- /dev/null
+++ b/mert/ScoreDataIterator.cpp
@@ -0,0 +1,90 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2011- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+#include <iostream>
+
+#include "util/tokenize_piece.hh"
+
+#include "ScoreArray.h"
+#include "ScoreDataIterator.h"
+
+using namespace std;
+using namespace util;
+
+/** Build an iterator with no open file; it compares equal to end(). */
+ScoreDataIterator::ScoreDataIterator() {}
+
+/**
+ * Open the score file and load its first block, so the iterator can be
+ * dereferenced straight away.
+ */
+ScoreDataIterator::ScoreDataIterator(const string& filename)
+  : m_in(new FilePiece(filename.c_str()))
+{
+  readNext();
+}
+
+/**
+ * Replace m_next with the next block read from the score file.
+ *
+ * A block is a SCORES_TXT_BEGIN header line carrying three unsigned numbers
+ * (sentence id, number of lines, number of scores per line), then that many
+ * lines of whitespace-separated floats, then a SCORES_TXT_END line.
+ *
+ * Throws FileFormatException when a marker is not the expected one or a line
+ * does not hold the announced number of scores. Reaching end of file releases
+ * m_in, which turns this iterator into the end iterator.
+ */
+void ScoreDataIterator::readNext() {
+  m_next.clear();
+  try {
+    StringPiece marker = m_in->ReadDelimited();
+    if (marker != StringPiece(SCORES_TXT_BEGIN)) {
+      throw FileFormatException(m_in->FileName(), marker.as_string());
+    }
+    // The sentence id has to be consumed to advance the stream, but its value
+    // is not needed here.
+    m_in->ReadULong();
+    const size_t count = m_in->ReadULong();
+    const size_t length = m_in->ReadULong();
+    m_in->ReadLine(); //ignore rest of line
+    for (size_t i = 0; i < count; ++i) {
+      StringPiece line = m_in->ReadLine();
+      m_next.push_back(ScoreDataItem());
+      for (TokenIter<AnyCharacter, true> token(line,AnyCharacter(" \t")); token; ++token) {
+        m_next.back().push_back(ParseFloat(*token));
+      }
+      // Every line must carry exactly the number of scores given in the header.
+      if (length != m_next.back().size()) {
+        throw FileFormatException(m_in->FileName(), line.as_string());
+      }
+    }
+    StringPiece line = m_in->ReadLine();
+    if (line != StringPiece(SCORES_TXT_END)) {
+      throw FileFormatException(m_in->FileName(), line.as_string());
+    }
+  } catch (const EndOfFileException&) {
+    // No more data: dropping the file makes this iterator equal to end().
+    m_in.reset();
+  }
+}
+
+/** iterator_facade hook: advancing the iterator loads the next block. */
+void ScoreDataIterator::increment()
+{
+  this->readNext();
+}
+
+
+/**
+ * iterator_facade hook for comparison. Two iterators without an open file are
+ * equal; an iterator with a file never equals one without; otherwise they are
+ * equal when they have the same file name and the same offset.
+ */
+bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
+  const bool lhsClosed = !m_in;
+  const bool rhsClosed = !rhs.m_in;
+  if (lhsClosed || rhsClosed) {
+    // Equal only when both sides have no file.
+    return lhsClosed && rhsClosed;
+  }
+  if (m_in->FileName() != rhs.m_in->FileName()) {
+    return false;
+  }
+  return m_in->Offset() == rhs.m_in->Offset();
+}
+
+
+/** iterator_facade hook: expose the block loaded by the last readNext(). */
+const std::vector<ScoreDataItem>& ScoreDataIterator::dereference() const
+{
+  return this->m_next;
+}
+
diff --git a/mert/ScoreDataIterator.h b/mert/ScoreDataIterator.h
new file mode 100644
index 000000000..3248a7a78
--- /dev/null
+++ b/mert/ScoreDataIterator.h
@@ -0,0 +1,67 @@
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2011- University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#ifndef _SCORE_DATA_ITERATOR_
+#define _SCORE_DATA_ITERATOR_
+
+/*
+ * For loading from the score data file.
+**/
+#include <vector>
+
+
+#include <boost/iterator/iterator_facade.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+
+#include "FeatureDataIterator.h"
+
+typedef std::vector<float> ScoreDataItem;
+
+/**
+ * Forward iterator over a score data file. Dereferencing yields a
+ * std::vector<ScoreDataItem>, where each ScoreDataItem is a vector of floats.
+ */
+class ScoreDataIterator :
+ public boost::iterator_facade<ScoreDataIterator,
+ const std::vector<ScoreDataItem>,
+ boost::forward_traversal_tag>
+{
+ public:
+ /** Creates an iterator with no file attached; this is what end() returns. */
+ ScoreDataIterator();
+ /** Creates an iterator that reads from the named file. */
+ ScoreDataIterator(const std::string& filename);
+
+ /** Past-the-end value to compare against while iterating. */
+ static ScoreDataIterator end() {
+ return ScoreDataIterator();
+ }
+
+ private:
+ // Grants boost::iterator_facade access to the private hooks below.
+ friend class boost::iterator_core_access;
+
+ void increment();
+ bool equal(const ScoreDataIterator& rhs) const;
+ const std::vector<ScoreDataItem>& dereference() const;
+
+ // Refills m_next from m_in.
+ void readNext();
+
+ // Input file; default-constructed (empty) for the end iterator.
+ boost::shared_ptr<util::FilePiece> m_in;
+ // Value returned by dereference().
+ std::vector<ScoreDataItem> m_next;
+};
+
+
+#endif
+
diff --git a/mert/pro.cpp b/mert/pro.cpp
index 3272b3209..0a716abd8 100644
--- a/mert/pro.cpp
+++ b/mert/pro.cpp
@@ -36,6 +36,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <boost/program_options.hpp>
#include "FeatureDataIterator.h"
+#include "ScoreDataIterator.h"
using namespace std;
@@ -80,6 +81,29 @@ int main(int argc, char** argv)
//cerr << featureFiles[0] << endl;
for (; fi != FeatureDataIterator::end(); ++fi) {
const vector<FeatureDataItem>& featureData = *fi;
+ cerr << "Read " << featureData.size() << " items " << endl;
+ for (size_t i = 0; i < featureData.size(); ++i) {
+ cerr << "Dense: ";
+ for (size_t j = 0; j < featureData[i].dense.size(); ++j) {
+ cerr << featureData[i].dense[j] << " ";
+ }
+ cerr << "\n";
+ }
+ cerr << "\n";
+ }
+
+ ScoreDataIterator si(scoreFiles[0]);
+ for (; si != ScoreDataIterator::end(); ++si) {
+ const vector<ScoreDataItem>& scoreData = *si;
+ cerr << "Read " << scoreData.size() << " items " << endl;
+ for (size_t i = 0; i < scoreData.size(); ++i) {
+ cerr << "SD: ";
+ for (size_t j = 0; j < scoreData[i].size(); ++j) {
+ cerr << scoreData[i][j] << " ";
+ }
+ cerr << "\n";
+ }
+ cerr << "\n";
}
}