Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <fishandfrolick@gmail.com>2012-05-31 20:58:10 +0400
committerHieu Hoang <fishandfrolick@gmail.com>2012-05-31 20:58:10 +0400
commita5ca652a766ddb687891adac8e7ef252fa2f430d (patch)
tree7cac031a4a7d688369e0fd4538a65d855b6c390e /phrase-extract
parent4eef94b1217a82eb979242dd3e06d8a4b6255e6e (diff)
move c++ code out of /script/ to /
Diffstat (limited to 'phrase-extract')
-rw-r--r--phrase-extract/extract-rules.cpp4
-rwxr-xr-xphrase-extract/lexical-reordering/InputFileStream.cpp67
-rwxr-xr-xphrase-extract/lexical-reordering/InputFileStream.h49
-rw-r--r--phrase-extract/lexical-reordering/Jamfile2
-rwxr-xr-xphrase-extract/lexical-reordering/gzfilebuf.h85
-rw-r--r--phrase-extract/lexical-reordering/reordering_classes.cpp437
-rw-r--r--phrase-extract/lexical-reordering/reordering_classes.h146
-rw-r--r--phrase-extract/lexical-reordering/score.cpp225
8 files changed, 1013 insertions, 2 deletions
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index 997038224..762327681 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -46,8 +46,8 @@
#include "XmlTree.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
-#include "../../../moses/src/ThreadPool.h"
-#include "../../../moses/src/OutputCollector.h"
+#include "../moses/src/ThreadPool.h"
+#include "../moses/src/OutputCollector.h"
#define LINE_MAX_LENGTH 500000
diff --git a/phrase-extract/lexical-reordering/InputFileStream.cpp b/phrase-extract/lexical-reordering/InputFileStream.cpp
new file mode 100755
index 000000000..013781c36
--- /dev/null
+++ b/phrase-extract/lexical-reordering/InputFileStream.cpp
@@ -0,0 +1,67 @@
+// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "InputFileStream.h"
+#include "gzfilebuf.h"
+#include <iostream>
+
+using namespace std;
+
+namespace Moses
+{
+/** Creates the stream with no buffer attached and opens filePath right away.
+ *  See Open() for how the reader is chosen and how failures are handled.
+ */
+InputFileStream::InputFileStream(const std::string &filePath)
+  : std::istream(NULL)
+  , m_streambuf(NULL)
+{
+  Open(filePath);
+}
+
+/** Frees the stream buffer owned by this object. */
+InputFileStream::~InputFileStream()
+{
+  // The stream state no longer matters during destruction, so the buffer is
+  // released directly instead of going through Close().
+  delete m_streambuf;
+  m_streambuf = NULL;
+}
+
+/** Attaches a newly allocated stream buffer that reads from filePath.
+ *
+ *  A path longer than three characters that ends in ".gz" is read through
+ *  gzfilebuf; every other path is read through std::filebuf. Any buffer from
+ *  an earlier Open() is released first. If the file cannot be opened for
+ *  reading, "Can't read <path>" is written to stderr and the process exits
+ *  with status 1.
+ */
+void InputFileStream::Open(const std::string &filePath)
+{
+  // Release a buffer left over from an earlier Open() instead of leaking it.
+  Close();
+
+  // Open the path as a plain file first. For a ".gz" path this only proves
+  // that the file is readable; without it an unreadable gzip file would be
+  // treated as an empty input because nothing else here checks the open.
+  std::filebuf* fb = new std::filebuf();
+  if (fb->open(filePath.c_str(), std::ios::in) == NULL) {
+    delete fb;
+    cerr << "Can't read " << filePath.c_str() << endl;
+    exit(1);
+  }
+
+  const bool gzipped = filePath.size() > 3 &&
+                       filePath.compare(filePath.size() - 3, 3, ".gz") == 0;
+  if (gzipped) {
+    delete fb;  // the filebuf destructor closes the probe handle
+    m_streambuf = new gzfilebuf(filePath.c_str());
+  } else {
+    m_streambuf = fb;
+  }
+
+  // rdbuf() installs the new buffer and clears any eof/fail state left from
+  // reading a previous file to its end.
+  this->rdbuf(m_streambuf);
+}
+
+/** Detaches and frees the current stream buffer, if any.
+ *
+ *  After Close() the stream has no buffer, so reads fail until Open() is
+ *  called again.
+ */
+void InputFileStream::Close()
+{
+  // Detach first so the istream never holds a pointer to a deleted buffer.
+  this->rdbuf(static_cast<std::streambuf*>(NULL));
+  delete m_streambuf;
+  m_streambuf = NULL;
+}
+
+
+}
+
diff --git a/phrase-extract/lexical-reordering/InputFileStream.h b/phrase-extract/lexical-reordering/InputFileStream.h
new file mode 100755
index 000000000..1f37715fd
--- /dev/null
+++ b/phrase-extract/lexical-reordering/InputFileStream.h
@@ -0,0 +1,49 @@
+// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_InputFileStream_h
+#define moses_InputFileStream_h
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+
+namespace Moses
+{
+
+/** Input stream used in place of std::istream for reading a file by path.
+ *  The stream reads through a std::streambuf that it allocates in Open() and
+ *  frees in Close(); the implementation picks a gzip-reading buffer for paths
+ *  ending in ".gz" and a plain file buffer otherwise.
+ */
+class InputFileStream : public std::istream
+{
+protected:
+ // Buffer currently attached to the stream; NULL when nothing is open.
+ std::streambuf *m_streambuf;
+public:
+
+ // Opens filePath immediately (see Open()).
+ InputFileStream(const std::string &filePath);
+ // Releases the buffer via Close().
+ ~InputFileStream();
+
+ // Attaches a new buffer for filePath to this stream.
+ void Open(const std::string &filePath);
+ // Deletes the current buffer and resets m_streambuf to NULL.
+ void Close();
+};
+
+}
+
+#endif
diff --git a/phrase-extract/lexical-reordering/Jamfile b/phrase-extract/lexical-reordering/Jamfile
new file mode 100644
index 000000000..a53465577
--- /dev/null
+++ b/phrase-extract/lexical-reordering/Jamfile
@@ -0,0 +1,2 @@
+exe lexical-reordering-score : InputFileStream.cpp reordering_classes.cpp score.cpp ../..//z ;
+
diff --git a/phrase-extract/lexical-reordering/gzfilebuf.h b/phrase-extract/lexical-reordering/gzfilebuf.h
new file mode 100755
index 000000000..b5b0ce87f
--- /dev/null
+++ b/phrase-extract/lexical-reordering/gzfilebuf.h
@@ -0,0 +1,85 @@
+#ifndef moses_gzfile_buf_h
+#define moses_gzfile_buf_h
+
+#include <streambuf>
+#include <zlib.h>
+#include <cstring>
+
+/** Read-only std::streambuf that decompresses a gzip file through zlib.
+ *
+ *  Data is pulled from gzread() into an internal buffer whose first
+ *  sizeof(int) bytes are reserved as a putback area. Writing and seeking are
+ *  not supported and report failure to the caller.
+ */
+class gzfilebuf : public std::streambuf
+{
+public:
+  /** Opens filename for reading. When gzopen() fails the object is still
+   *  valid, but every read reports end-of-file; is_open() tells the cases
+   *  apart.
+   */
+  gzfilebuf(const char *filename) {
+    _gzf = gzopen(filename, "rb");
+    setg (_buff+sizeof(int),     // beginning of putback area
+          _buff+sizeof(int),     // read position
+          _buff+sizeof(int));    // end position
+  }
+  ~gzfilebuf() {
+    // gzopen() may have failed, in which case there is nothing to close.
+    if (_gzf != NULL) {
+      gzclose(_gzf);
+    }
+  }
+
+  /** True when the gzip file was opened successfully. */
+  bool is_open() const {
+    return _gzf != NULL;
+  }
+protected:
+  // Writing is unsupported: report failure. A bare `throw;` with no active
+  // exception would terminate the process instead.
+  virtual int_type overflow (int_type /*c*/) {
+    return traits_type::eof();
+  }
+
+  // write multiple characters: unsupported, nothing is ever written
+  virtual
+  std::streamsize xsputn (const char* /*s*/,
+                          std::streamsize /*num*/) {
+    return 0;
+  }
+
+  // Seeking is unsupported: return the invalid position.
+  virtual std::streampos seekpos ( std::streampos /*sp*/, std::ios_base::openmode /*which*/ = std::ios_base::in | std::ios_base::out ) {
+    return std::streampos(std::streamoff(-1));
+  }
+
+  //read one character
+  virtual int_type underflow () {
+    // is read position before end of _buff?
+    if (gptr() < egptr()) {
+      return traits_type::to_int_type(*gptr());
+    }
+
+    // nothing can be read from a file that failed to open
+    if (_gzf == NULL) {
+      return traits_type::eof();
+    }
+
+    /* process size of putback area
+     * - use number of characters read
+     * - but at most sizeof(int)
+     */
+    unsigned int numPutback = gptr() - eback();
+    if (numPutback > sizeof(int)) {
+      numPutback = sizeof(int);
+    }
+
+    /* copy the characters previously read into the putback area
+     * (the first sizeof(int) characters of _buff)
+     */
+    std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
+                  numPutback);
+
+    // read new characters
+    int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
+    if (num <= 0) {
+      // ERROR or EOF
+      return traits_type::eof();
+    }
+
+    // reset _buff pointers
+    setg (_buff+(sizeof(int)-numPutback),   // beginning of putback area
+          _buff+sizeof(int),                // read position
+          _buff+sizeof(int)+num);           // end of buffer
+
+    // return next character
+    return traits_type::to_int_type(*gptr());
+  }
+
+  // Bulk reads go through the buffered path of the base class. Calling
+  // gzread() directly here would skip characters still waiting in the get
+  // area and could hand gzread()'s -1 error value back as a character count.
+  std::streamsize xsgetn (char* s,
+                          std::streamsize num) {
+    return std::streambuf::xsgetn(s, num);
+  }
+
+private:
+  // Not copyable: two copies would both call gzclose() on the same handle.
+  gzfilebuf(const gzfilebuf&);
+  gzfilebuf& operator=(const gzfilebuf&);
+
+  gzFile _gzf;
+  static const unsigned int _buffsize = 1024;
+  char _buff[_buffsize];
+};
+
+#endif
diff --git a/phrase-extract/lexical-reordering/reordering_classes.cpp b/phrase-extract/lexical-reordering/reordering_classes.cpp
new file mode 100644
index 000000000..2f159e4fa
--- /dev/null
+++ b/phrase-extract/lexical-reordering/reordering_classes.cpp
@@ -0,0 +1,437 @@
+
+#include <vector>
+#include <iostream>
+#include <cstdlib>
+#include <numeric>
+#include <cstdio>
+#include <sstream>
+#include <string>
+#include "zlib.h"
+
+#include "reordering_classes.h"
+
+using namespace std;
+
+// Gives each of the four counters one zeroed slot per ORIENTATION value in
+// the range MONO..NOMONO.
+ModelScore::ModelScore()
+{
+  const size_t slots = NOMONO - MONO + 1;
+  count_fe_prev.assign(slots, 0.0);
+  count_fe_next.assign(slots, 0.0);
+  count_f_prev.assign(slots, 0.0);
+  count_f_next.assign(slots, 0.0);
+}
+
+// Nothing to release; the counters clean up after themselves.
+ModelScore::~ModelScore()
+{
+}
+
+// Factory: returns a newly allocated ModelScore subclass for the model type
+// name ("mslr", "msd", "monotonicity" or "leftright"). Any other name prints
+// an error to stderr and exits with status 1.
+ModelScore* ModelScore::createModelScore(const string& modeltype)
+{
+  if (modeltype == "mslr") {
+    return new ModelScoreMSLR();
+  }
+  if (modeltype == "msd") {
+    return new ModelScoreMSD();
+  }
+  if (modeltype == "monotonicity") {
+    return new ModelScoreMonotonicity();
+  }
+  if (modeltype == "leftright") {
+    return new ModelScoreLR();
+  }
+
+  cerr << "Illegal model type given for lexical reordering model scoring: " << modeltype << ". The allowed types are: mslr, msd, monotonicity, leftright" << endl;
+  exit(1);
+}
+
+// Zeroes the counts kept per (f,e) phrase pair for every orientation.
+void ModelScore::reset_fe()
+{
+  for (size_t slot = MONO; slot <= static_cast<size_t>(NOMONO); ++slot) {
+    count_fe_prev[slot] = 0;
+    count_fe_next[slot] = 0;
+  }
+}
+
+// Zeroes the counts kept per source phrase f for every orientation.
+void ModelScore::reset_f()
+{
+  for (size_t slot = MONO; slot <= static_cast<size_t>(NOMONO); ++slot) {
+    count_f_prev[slot] = 0;
+    count_f_next[slot] = 0;
+  }
+}
+
+// Counts one training example: the orientation names for the previous and the
+// next phrase are mapped through getType() and the matching slot is bumped in
+// both the fe and the f counters.
+void ModelScore::add_example(const string& previous, string& next)
+{
+  const ORIENTATION prevSlot = getType(previous);
+  const ORIENTATION nextSlot = getType(next);
+
+  ++count_fe_prev[prevSlot];
+  ++count_f_prev[prevSlot];
+  ++count_fe_next[nextSlot];
+  ++count_f_next[nextSlot];
+}
+
+// Read-only access to the raw orientation counts (one slot per ORIENTATION).
+// The *_fe_* counters are the ones zeroed by reset_fe(), the *_f_* counters
+// the ones zeroed by reset_f().
+
+// Counts for the previous-phrase orientation, fe counter.
+const vector<double>& ModelScore::get_scores_fe_prev() const
+{
+ return count_fe_prev;
+}
+
+// Counts for the next-phrase orientation, fe counter.
+const vector<double>& ModelScore::get_scores_fe_next() const
+{
+ return count_fe_next;
+}
+
+// Counts for the previous-phrase orientation, f counter.
+const vector<double>& ModelScore::get_scores_f_prev() const
+{
+ return count_f_prev;
+}
+
+// Counts for the next-phrase orientation, f counter.
+const vector<double>& ModelScore::get_scores_f_next() const
+{
+ return count_f_next;
+}
+
+
+// Maps an orientation name to its ORIENTATION value, accepting all six names.
+// An unknown name prints an error to stderr and exits with status 1.
+ORIENTATION ModelScore::getType(const string& s)
+{
+  if (s == "mono") return MONO;
+  if (s == "swap") return SWAP;
+  if (s == "dright") return DRIGHT;
+  if (s == "dleft") return DLEFT;
+  if (s == "other") return OTHER;
+  if (s == "nomono") return NOMONO;
+
+  cerr << "Illegal reordering type used: " << s << endl;
+  exit(1);
+}
+
+
+// mslr mapping: mono, swap, dright and dleft map to themselves. Every other
+// name is fatal; "other" and "nomono" get an extended message because they
+// are valid names that this model type cannot use.
+ORIENTATION ModelScoreMSLR::getType(const string& s)
+{
+  if (s == "mono") return MONO;
+  if (s == "swap") return SWAP;
+  if (s == "dright") return DRIGHT;
+  if (s == "dleft") return DLEFT;
+
+  cerr << "Illegal reordering type used: " << s;
+  if (s == "other" || s == "nomono") {
+    cerr << " for model type mslr. You have to re-run step 5 in order to train such a model.";
+  }
+  cerr << endl;
+  exit(1);
+}
+
+
+// leftright mapping: mono and dright count as DRIGHT, swap and dleft count as
+// DLEFT. Every other name is fatal; "other" and "nomono" get an extended
+// message because they are valid names that this model type cannot use.
+ORIENTATION ModelScoreLR::getType(const string& s)
+{
+  if (s == "mono" || s == "dright") return DRIGHT;
+  if (s == "swap" || s == "dleft") return DLEFT;
+
+  cerr << "Illegal reordering type used: " << s;
+  if (s == "other" || s == "nomono") {
+    cerr << " for model type LeftRight. You have to re-run step 5 in order to train such a model.";
+  }
+  cerr << endl;
+  exit(1);
+}
+
+
+// msd mapping: mono and swap map to themselves; dleft, dright and other are
+// all counted as OTHER. Every other name is fatal; "nomono" gets an extended
+// message because it is a valid name that this model type cannot use.
+ORIENTATION ModelScoreMSD::getType(const string& s)
+{
+  if (s == "mono") return MONO;
+  if (s == "swap") return SWAP;
+  if (s == "dleft" || s == "dright" || s == "other") return OTHER;
+
+  cerr << "Illegal reordering type used: " << s;
+  if (s == "nomono") {
+    cerr << " for model type msd. You have to re-run step 5 in order to train such a model.";
+  }
+  cerr << endl;
+  exit(1);
+}
+
+// monotonicity mapping: mono maps to MONO, each of the five other known names
+// maps to NOMONO. An unknown name prints an error and exits with status 1.
+ORIENTATION ModelScoreMonotonicity::getType(const string& s)
+{
+  if (s == "mono") return MONO;
+
+  const bool knownNonMono = s == "swap" || s == "dleft" || s == "dright" ||
+                            s == "other" || s == "nomono";
+  if (knownNonMono) return NOMONO;
+
+  cerr << "Illegal reordering type used: " << s << endl;
+  exit(1);
+}
+
+
+
+// Appends the four mslr class counts to scores in the order
+// mono, swap, dleft, dright.
+void ScorerMSLR::score(const vector<double>& all_scores, vector<double>& scores) const
+{
+  const ORIENTATION order[] = {MONO, SWAP, DLEFT, DRIGHT};
+  for (size_t k = 0; k < sizeof(order)/sizeof(order[0]); ++k) {
+    scores.push_back(all_scores[order[k]]);
+  }
+}
+
+// Appends the three msd class counts to scores: mono, swap, and the sum of
+// dright, dleft and other.
+void ScorerMSD::score(const vector<double>& all_scores, vector<double>& scores) const
+{
+  const double discontinuous = all_scores[DRIGHT]+all_scores[DLEFT]+all_scores[OTHER];
+
+  scores.push_back(all_scores[MONO]);
+  scores.push_back(all_scores[SWAP]);
+  scores.push_back(discontinuous);
+}
+
+// Appends the two monotonicity class counts to scores: mono, then the sum of
+// every other orientation.
+void ScorerMonotonicity::score(const vector<double>& all_scores, vector<double>& scores) const
+{
+  const double nonMonotone = all_scores[SWAP]+all_scores[DRIGHT]+all_scores[DLEFT]+all_scores[OTHER]+all_scores[NOMONO];
+
+  scores.push_back(all_scores[MONO]);
+  scores.push_back(nonMonotone);
+}
+
+
+// Appends the two leftright class counts to scores: mono+dright first, then
+// swap+dleft.
+void ScorerLR::score(const vector<double>& all_scores, vector<double>& scores) const
+{
+  const double right = all_scores[MONO]+all_scores[DRIGHT];
+  const double left = all_scores[SWAP]+all_scores[DLEFT];
+
+  scores.push_back(right);
+  scores.push_back(left);
+}
+
+
+// Appends one count-based smoothing value per mslr class (mono, swap, dleft,
+// dright) to smoothing. Each value is weight * (class count + 0.1) divided by
+// the sum of all counts in scores.
+void ScorerMSLR::createSmoothing(const vector<double>& scores, double weight, vector<double>& smoothing) const
+{
+  // accumulate() sums in the type of its initial value; 0.0 keeps the sum in
+  // double, whereas a literal 0 would run it in int.
+  const double total = accumulate(scores.begin(), scores.end(), 0.0);
+  smoothing.push_back(weight*(scores[MONO]+0.1)/total);
+  smoothing.push_back(weight*(scores[SWAP]+0.1)/total);
+  smoothing.push_back(weight*(scores[DLEFT]+0.1)/total);
+  smoothing.push_back(weight*(scores[DRIGHT]+0.1)/total);
+}
+
+// Appends the constant weight once for each of the four mslr classes.
+void ScorerMSLR::createConstSmoothing(double weight, vector<double>& smoothing) const
+{
+  const vector<double>::size_type classes = 4;
+  smoothing.insert(smoothing.end(), classes, weight);
+}
+
+
+// Appends one count-based smoothing value per msd class (mono, swap, and
+// dleft+dright+other) to smoothing. Each value is weight * (class count + 0.1)
+// divided by the sum of all counts in scores.
+void ScorerMSD::createSmoothing(const vector<double>& scores, double weight, vector<double>& smoothing) const
+{
+  // accumulate() sums in the type of its initial value; 0.0 keeps the sum in
+  // double, whereas a literal 0 would run it in int.
+  const double total = accumulate(scores.begin(), scores.end(), 0.0);
+  smoothing.push_back(weight*(scores[MONO]+0.1)/total);
+  smoothing.push_back(weight*(scores[SWAP]+0.1)/total);
+  smoothing.push_back(weight*(scores[DLEFT]+scores[DRIGHT]+scores[OTHER]+0.1)/total);
+}
+
+// Appends the constant weight once for each of the three msd classes.
+void ScorerMSD::createConstSmoothing(double weight, vector<double>& smoothing) const
+{
+  const vector<double>::size_type classes = 3;
+  smoothing.insert(smoothing.end(), classes, weight);
+}
+
+// Appends one count-based smoothing value per monotonicity class (mono, and
+// everything else) to smoothing. Each value is weight * (class count + 0.1)
+// divided by the sum of all counts in scores.
+void ScorerMonotonicity::createSmoothing(const vector<double>& scores, double weight, vector<double>& smoothing) const
+{
+  // accumulate() sums in the type of its initial value; 0.0 keeps the sum in
+  // double, whereas a literal 0 would run it in int.
+  const double total = accumulate(scores.begin(), scores.end(), 0.0);
+  smoothing.push_back(weight*(scores[MONO]+0.1)/total);
+  smoothing.push_back(weight*(scores[SWAP]+scores[DLEFT]+scores[DRIGHT]+scores[OTHER]+scores[NOMONO]+0.1)/total);
+}
+
+// Appends the constant weight once for each of the two monotonicity classes.
+void ScorerMonotonicity::createConstSmoothing(double weight, vector<double>& smoothing) const
+{
+  const vector<double>::size_type classes = 2;
+  smoothing.insert(smoothing.end(), classes, weight);
+}
+
+
+// Appends one count-based smoothing value per leftright class (mono+dright,
+// then swap+dleft) to smoothing, each scaled by weight and divided by the sum
+// of all counts in scores.
+void ScorerLR::createSmoothing(const vector<double>& scores, double weight, vector<double>& smoothing) const
+{
+  // accumulate() sums in the type of its initial value; 0.0 keeps the sum in
+  // double, whereas a literal 0 would run it in int.
+  const double total = accumulate(scores.begin(), scores.end(), 0.0);
+  smoothing.push_back(weight*(scores[MONO]+scores[DRIGHT]+0.1)/total);
+  // NOTE(review): unlike every other class in the sibling scorers, this one
+  // gets no +0.1 pseudo-count. Left unchanged because it alters model output;
+  // confirm whether the omission is intentional.
+  smoothing.push_back(weight*(scores[SWAP]+scores[DLEFT])/total);
+}
+
+// Appends the constant weight once for each of the two leftright classes.
+void ScorerLR::createConstSmoothing(double weight, vector<double>& smoothing) const
+{
+  const vector<double>::size_type classes = 2;
+  smoothing.insert(smoothing.end(), classes, weight);
+}
+
+// Adds the per-class smoothing values to scores, normalises the result by its
+// sum, and writes every normalised value to out as "%f ".
+// smoothing must hold at least scores.size() entries.
+static void write_smoothed_distribution(FILE* out, vector<double>& scores, const vector<double>& smoothing)
+{
+  double sum = 0;
+  for(size_t i=0; i<scores.size(); ++i) {
+    scores[i] += smoothing[i];
+    sum += scores[i];
+  }
+  for(size_t i=0; i<scores.size(); ++i) {
+    fprintf(out, "%f ", scores[i]/sum);
+  }
+}
+
+// Writes one table line "f ||| e ||| <scores>" for a model conditioned on both
+// languages. The scores are the smoothed, normalised orientation
+// distributions, first for the previous phrase and then for the next phrase,
+// depending on which directions this model covers.
+void Model::score_fe(const string& f, const string& e)
+{
+  if (!fe)    //Make sure we do not do anything if it is not a fe model
+    return;
+  fprintf(file,"%s ||| %s ||| ",f.c_str(),e.c_str());
+  //condition on the previous phrase
+  if (previous) {
+    vector<double> scores;
+    scorer->score(modelscore->get_scores_fe_prev(), scores);
+    write_smoothed_distribution(file, scores, smoothing_prev);
+  }
+  //condition on the next phrase
+  if (next) {
+    vector<double> scores;
+    scorer->score(modelscore->get_scores_fe_next(), scores);
+    write_smoothed_distribution(file, scores, smoothing_next);
+  }
+  fprintf(file,"\n");
+}
+
+// Writes one table line "f ||| <scores>" for a model conditioned on the source
+// language only; the counterpart of score_fe() using the f counters.
+void Model::score_f(const string& f)
+{
+  if (fe)      //Make sure we do not do anything if it is not a f model
+    return;
+  fprintf(file, "%s ||| ", f.c_str());
+  //condition on the previous phrase
+  if (previous) {
+    vector<double> scores;
+    scorer->score(modelscore->get_scores_f_prev(), scores);
+    write_smoothed_distribution(file, scores, smoothing_prev);
+  }
+  //condition on the next phrase
+  if (next) {
+    vector<double> scores;
+    scorer->score(modelscore->get_scores_f_next(), scores);
+    write_smoothed_distribution(file, scores, smoothing_next);
+  }
+  fprintf(file, "\n");
+}
+
+// Opens the output file fn for writing and records what the model conditions
+// on: lang is "fe" (both languages) or "f" (source only); dir "backward"
+// disables the next-phrase scores, "forward" disables the previous-phrase
+// scores, and any other value keeps both. A file that cannot be opened or an
+// unknown lang is fatal (message on stderr, exit status 1).
+Model::Model(ModelScore* ms, Scorer* sc, const string& dir, const string& lang, const string& fn)
+  : modelscore(ms), scorer(sc), filename(fn)
+{
+  file = fopen(filename.c_str(),"w");
+  if (file == NULL) {
+    cerr << "Could not open the model output file: " << filename << endl;
+    exit(1);
+  }
+
+  const bool onBothLanguages = (lang == "fe");
+  if (!onBothLanguages && lang != "f") {
+    cerr << "You have given an illegal language to condition on: " << lang
+         << "\nLegal types: fe (on both languages), f (only on source language)\n";
+    exit(1);
+  }
+  fe = onBothLanguages;
+
+  previous = (dir != "forward");
+  next = (dir != "backward");
+}
+
+// Closes the output file if it is still open and frees the score counter and
+// the scorer.
+// NOTE(review): score.cpp hands the same ModelScore to several Model objects,
+// so destroying more than one of them would delete it twice - confirm the
+// intended ownership before deleting models.
+Model::~Model()
+{
+  // zipFile() may already have closed the file and cleared the handle.
+  if (file != NULL) {
+    fclose(file);
+  }
+  delete modelscore;
+  delete scorer;
+}
+
+// Compresses the finished output file to "<filename>.gz" and removes the
+// uncompressed file. No score may be written through this model afterwards.
+// If the file cannot be reopened, the .gz cannot be created, or a write to it
+// fails, an error is printed to stderr and the process exits with status 1,
+// leaving the uncompressed file in place.
+void Model::zipFile()
+{
+  // Clear the handle once closed so the destructor does not close it again.
+  if (file != NULL) {
+    fclose(file);
+    file = NULL;
+  }
+
+  FILE* plain = fopen(filename.c_str(), "rb");
+  if (plain == NULL) {
+    cerr << "Could not reopen the model output file: " << filename << endl;
+    exit(1);
+  }
+
+  const string gzname = filename + ".gz";
+  gzFile gzfile = gzopen(gzname.c_str(),"wb");
+  if (gzfile == NULL) {
+    fclose(plain);
+    cerr << "Could not open the zipped model output file: " << gzname << endl;
+    exit(1);
+  }
+
+  char inbuffer[128];
+  size_t num_read;
+  while ((num_read = fread(inbuffer, 1, sizeof(inbuffer), plain)) > 0) {
+    // gzwrite() returns 0 on error; stop before the source file is removed.
+    if (gzwrite(gzfile, inbuffer, static_cast<unsigned>(num_read)) <= 0) {
+      fclose(plain);
+      gzclose(gzfile);
+      cerr << "Could not write the zipped model output file: " << gzname << endl;
+      exit(1);
+    }
+  }
+  fclose(plain);
+  gzclose(gzfile);
+
+  //Remove the unzipped file
+  remove(filename.c_str());
+}
+
+// Splits a '-'-separated model configuration string. The first field is read
+// and discarded; the following three are stored in orient, dir and lang, in
+// that order.
+void Model::split_config(const string& config, string& dir, string& lang, string& orient)
+{
+  istringstream fields(config);
+  string ignoredType;
+  getline(fields, ignoredType, '-');
+  getline(fields, orient, '-');
+  getline(fields, dir, '-');
+  getline(fields, lang, '-');
+}
+
+// Factory: parses config, picks the Scorer that matches its orientation field
+// and returns a new Model that writes to the file named filepath + config.
+// An unknown orientation prints an error to stderr and exits with status 1.
+Model* Model::createModel(ModelScore* modelscore, const string& config, const string& filepath)
+{
+  string dir, lang, orient;
+  split_config(config,dir,lang,orient);
+
+  Scorer* chosen = NULL;
+  if (orient == "mslr") {
+    chosen = new ScorerMSLR();
+  } else if (orient == "msd") {
+    chosen = new ScorerMSD();
+  } else if (orient == "monotonicity") {
+    chosen = new ScorerMonotonicity();
+  } else if (orient == "leftright") {
+    chosen = new ScorerLR();
+  } else {
+    cerr << "Illegal orientation type of reordering model: " << orient
+         << "\n allowed types: mslr, msd, monotonicity, leftright\n";
+    exit(1);
+  }
+
+  const string filename = filepath + config;
+  return new Model(modelscore, chosen, dir, lang, filename);
+}
+
+
+
+// Appends count-based smoothing values for both directions, computed by the
+// scorer from the current fe counts and the weight w.
+void Model::createSmoothing(double w)
+{
+  const vector<double>& prevCounts = modelscore->get_scores_fe_prev();
+  scorer->createSmoothing(prevCounts, w, smoothing_prev);
+
+  const vector<double>& nextCounts = modelscore->get_scores_fe_next();
+  scorer->createSmoothing(nextCounts, w, smoothing_next);
+}
+
+// Appends constant smoothing values for both directions: the scorer adds the
+// weight w once per class it produces.
+void Model::createConstSmoothing(double w)
+{
+  const double weight = w;
+  scorer->createConstSmoothing(weight, smoothing_prev);
+  scorer->createConstSmoothing(weight, smoothing_next);
+}
diff --git a/phrase-extract/lexical-reordering/reordering_classes.h b/phrase-extract/lexical-reordering/reordering_classes.h
new file mode 100644
index 000000000..4d0b56240
--- /dev/null
+++ b/phrase-extract/lexical-reordering/reordering_classes.h
@@ -0,0 +1,146 @@
+/*
+ * reordering_classes.h
+ * Utility classes for lexical reordering table scoring
+ *
+ * Created by: Sara Stymne - Linköping University
+ * Machine Translation Marathon 2010, Dublin
+ */
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <fstream>
+
+
+// Orientation classes. The values are used as indices into the count and score
+// vectors, and loops run from MONO to NOMONO, so the order of the enumerators
+// must not change.
+enum ORIENTATION {MONO, SWAP, DRIGHT, DLEFT, OTHER, NOMONO};
+
+
+//Keeps the counts for the different reordering types
+//(Instantiated in 1-3 instances, one for each type of model (hier, phrase, wbe))
+//Each counter vector has one slot per ORIENTATION value.
+class ModelScore
+{
+private:
+ // Counts cleared by reset_fe(): orientation w.r.t. the previous / next phrase
+ std::vector<double> count_fe_prev;
+ std::vector<double> count_fe_next;
+ // Counts cleared by reset_f(): orientation w.r.t. the previous / next phrase
+ std::vector<double> count_f_prev;
+ std::vector<double> count_f_next;
+
+protected:
+ // Maps an orientation name to the slot it is counted in; subclasses
+ // override this to merge or reject orientations for their model type.
+ virtual ORIENTATION getType(const std::string& s);
+
+public:
+ ModelScore();
+ virtual ~ModelScore();
+ // Increments the fe and f counters for the two given orientation names.
+ void add_example(const std::string& previous, std::string& next);
+ // Zero the fe counters / the f counters.
+ void reset_fe();
+ void reset_f();
+ // Read-only access to the four counter vectors.
+ const std::vector<double>& get_scores_fe_prev() const;
+ const std::vector<double>& get_scores_fe_next() const;
+ const std::vector<double>& get_scores_f_prev() const;
+ const std::vector<double>& get_scores_f_next() const;
+
+ // Factory: returns a new subclass instance for the given model type name.
+ static ModelScore* createModelScore(const std::string& modeltype);
+};
+
+// Counter for the mslr model type; only differs from ModelScore in how
+// orientation names are mapped to slots.
+class ModelScoreMSLR : public ModelScore
+{
+protected:
+ virtual ORIENTATION getType(const std::string& s);
+};
+
+// Counter for the leftright model type; only differs from ModelScore in how
+// orientation names are mapped to slots.
+class ModelScoreLR : public ModelScore
+{
+protected:
+ virtual ORIENTATION getType(const std::string& s);
+};
+
+// Counter for the msd model type; only differs from ModelScore in how
+// orientation names are mapped to slots.
+class ModelScoreMSD : public ModelScore
+{
+protected:
+ virtual ORIENTATION getType(const std::string& s);
+};
+
+// Counter for the monotonicity model type; only differs from ModelScore in
+// how orientation names are mapped to slots.
+class ModelScoreMonotonicity : public ModelScore
+{
+protected:
+ virtual ORIENTATION getType(const std::string& s);
+};
+
+//Class for calculating total counts, and to calculate smoothing
+//Abstract interface; one subclass exists per model type.
+class Scorer
+{
+public:
+ virtual ~Scorer() {}
+ // Appends the per-class counts derived from the raw orientation counts
+ // (first argument) to the output vector (second argument).
+ virtual void score(const std::vector<double>&, std::vector<double>&) const = 0;
+ // Appends count-based smoothing values computed from the raw counts and a
+ // weight to the output vector.
+ virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const = 0;
+ // Appends the given constant weight once per class to the output vector.
+ virtual void createConstSmoothing(double, std::vector<double>&) const = 0;
+};
+
+// Scorer for the mslr model type (four classes).
+class ScorerMSLR : public Scorer
+{
+public:
+ virtual void score(const std::vector<double>&, std::vector<double>&) const;
+ virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const;
+ virtual void createConstSmoothing(double, std::vector<double>&) const;
+};
+
+// Scorer for the msd model type (three classes).
+class ScorerMSD : public Scorer
+{
+public:
+ virtual void score(const std::vector<double>&, std::vector<double>&) const;
+ virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const;
+ virtual void createConstSmoothing(double, std::vector<double>&) const;
+};
+
+// Scorer for the monotonicity model type (two classes).
+class ScorerMonotonicity : public Scorer
+{
+public:
+ virtual void score(const std::vector<double>&, std::vector<double>&) const;
+ virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const;
+ virtual void createConstSmoothing(double, std::vector<double>&) const;
+};
+
+// Scorer for the leftright model type (two classes).
+class ScorerLR : public Scorer
+{
+public:
+ virtual void score(const std::vector<double>&, std::vector<double>&) const;
+ virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const;
+ virtual void createConstSmoothing(double, std::vector<double>&) const;
+};
+
+
+//Class for representing each model
+//Contains a modelscore and scorer (which can be of different model types (mslr, msd...)),
+//and file handling.
+//This class also keeps track of bidirectionality, and which language to condition on
+class Model
+{
+private:
+ // Both pointers are deleted by the destructor.
+ ModelScore* modelscore;
+ Scorer* scorer;
+
+ // Output table file and its path.
+ std::FILE* file;
+ std::string filename;
+
+ // true: condition on both languages (fe); false: on the source only (f)
+ bool fe;
+ // Whether scores for the previous / next phrase are written.
+ bool previous;
+ bool next;
+
+ // Smoothing values added to the class counts before normalisation.
+ std::vector<double> smoothing_prev;
+ std::vector<double> smoothing_next;
+
+ // Splits a '-'-separated configuration string into its fields.
+ static void split_config(const std::string& config, std::string& dir,
+ std::string& lang, std::string& orient);
+public:
+ Model(ModelScore* ms, Scorer* sc, const std::string& dir,
+ const std::string& lang, const std::string& fn);
+ ~Model();
+ // Factory: arguments are the score counter, the configuration string and
+ // the path prefix of the output file.
+ static Model* createModel(ModelScore*, const std::string&, const std::string&);
+ // Fill the smoothing vectors from counts, or with the constant w.
+ void createSmoothing(double w);
+ void createConstSmoothing(double w);
+ // Write one table line; each is a no-op for the other kind of model.
+ void score_fe(const std::string& f, const std::string& e);
+ void score_f(const std::string& f);
+ // Compresses the output file to <filename>.gz and removes the original.
+ void zipFile();
+};
+
diff --git a/phrase-extract/lexical-reordering/score.cpp b/phrase-extract/lexical-reordering/score.cpp
new file mode 100644
index 000000000..7f14b9fc8
--- /dev/null
+++ b/phrase-extract/lexical-reordering/score.cpp
@@ -0,0 +1,225 @@
+/*
+ * score_reordering.cpp
+ *
+ * Created by: Sara Stymne - Linköping University
+ * Machine Translation Marathon 2010, Dublin
+ */
+
+#include <string>
+#include <vector>
+#include <map>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <cstdlib>
+#include <cstring>
+#include "InputFileStream.h"
+
+#include "reordering_classes.h"
+
+using namespace std;
+
+void split_line(const string& line, string& foreign, string& english, string& wbe, string& phrase, string& hier);
+void get_orientations(const string& pair, string& previous, string& next);
+
+
+// Entry point of the lexical reordering scorer.
+//
+// Usage: score_reordering extractFile smoothingValue filepath
+//          [--SmoothWithCounts] (--model "type max-orientation (specification-strings)")+
+//
+// Reads the extract file, accumulates orientation counts per phrase pair and
+// per source phrase, and writes one table per configured model, which is
+// gzip-compressed at the end. Consecutive lines with the same phrases are
+// treated as one group. With --SmoothWithCounts the file is read twice: once
+// to derive smoothing values from the overall counts, once to score.
+int main(int argc, char* argv[])
+{
+
+  cerr << "Lexical Reordering Scorer\n"
+       << "scores lexical reordering models of several types (hierarchical, phrase-based and word-based-extraction\n";
+
+  // extractFile, smoothingValue and filepath are all mandatory: argv[3] is
+  // read below, so at least four entries are required.
+  if (argc < 4) {
+    cerr << "syntax: score_reordering extractFile smoothingValue filepath (--model \"type max-orientation (specification-strings)\" )+\n";
+    exit(1);
+  }
+
+  char* extractFileName = argv[1];
+  double smoothingValue = atof(argv[2]);
+  string filepath = argv[3];
+
+  Moses::InputFileStream eFile(extractFileName);
+  if (!eFile) {
+    cerr << "Could not open the extract file " << extractFileName << " for scoring of lexical reordering models\n";
+    exit(1);
+  }
+
+  bool smoothWithCounts = false;
+  map<string,ModelScore*> modelScores;
+  vector<Model*> models;
+  bool hier = false;
+  bool phrase = false;
+  bool wbe = false;
+
+  string e,f,w,p,h;
+  string prev, next;
+
+  int i = 4;
+  while (i<argc) {
+    if (strcmp(argv[i],"--SmoothWithCounts") == 0) {
+      smoothWithCounts = true;
+    } else if (strcmp(argv[i],"--model") == 0) {
+      if (i+1 >= argc) {
+        cerr << "score: syntax error, no model information provided to the option " << argv[i] << endl;
+        exit(1);
+      }
+      // The option value is "<model> <type> <config>...".
+      istringstream is(argv[++i]);
+      string m,t;
+      is >> m >> t;
+      modelScores[m] = ModelScore::createModelScore(t);
+      if (m.compare("hier") == 0) {
+        hier = true;
+      } else if (m.compare("phrase") == 0) {
+        phrase = true;
+      }
+      if (m.compare("wbe") == 0) {
+        wbe = true;
+      }
+
+      if (!hier && !phrase && !wbe) {
+        cerr << "WARNING: No models specified for lexical reordering. No lexical reordering table will be trained.\n";
+        return 0;
+      }
+
+      string config;
+      //Store all models
+      while (is >> config) {
+        models.push_back(Model::createModel(modelScores[m],config,filepath));
+      }
+    } else {
+      cerr << "illegal option given to lexical reordering model score\n";
+      exit(1);
+    }
+    i++;
+  }
+
+  ////////////////////////////////////
+  //calculate smoothing
+  if (smoothWithCounts) {
+    string line;
+    while (getline(eFile,line)) {
+      split_line(line,f,e,w,p,h);
+      if (hier) {
+        get_orientations(h, prev, next);
+        modelScores["hier"]->add_example(prev,next);
+      }
+      if (phrase) {
+        get_orientations(p, prev, next);
+        modelScores["phrase"]->add_example(prev,next);
+      }
+      if (wbe) {
+        get_orientations(w, prev, next);
+        modelScores["wbe"]->add_example(prev,next);
+      }
+    }
+
+    // calculate smoothing for each model
+    for (size_t m=0; m<models.size(); ++m) {
+      models[m]->createSmoothing(smoothingValue);
+    }
+
+    // The counters now hold the totals of the whole file. Clear them, or the
+    // first phrase pair and first source phrase scored below would have the
+    // whole-file counts added to their own.
+    for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
+      it->second->reset_fe();
+      it->second->reset_f();
+    }
+
+    //reopen eFile
+    eFile.Close();
+    eFile.Open(extractFileName);
+  } else {
+    //constant smoothing
+    for (size_t m=0; m<models.size(); ++m) {
+      models[m]->createConstSmoothing(smoothingValue);
+    }
+  }
+
+  ////////////////////////////////////
+  //calculate scores for reordering table
+  string line,f_current,e_current;
+  bool first = true;
+  while (getline(eFile, line)) {
+    split_line(line,f,e,w,p,h);
+
+    if (first) {
+      f_current = f;
+      e_current = e;
+      first = false;
+    } else if (f.compare(f_current) != 0 || e.compare(e_current) != 0) {
+      //fe - score
+      for (size_t m=0; m<models.size(); ++m) {
+        models[m]->score_fe(f_current,e_current);
+      }
+      //reset
+      for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
+        it->second->reset_fe();
+      }
+
+      if (f.compare(f_current) != 0) {
+        //f - score
+        for (size_t m=0; m<models.size(); ++m) {
+          models[m]->score_f(f_current);
+        }
+        //reset
+        for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
+          it->second->reset_f();
+        }
+      }
+      f_current = f;
+      e_current = e;
+    }
+
+    // update counts
+    if (hier) {
+      get_orientations(h, prev, next);
+      modelScores["hier"]->add_example(prev,next);
+    }
+    if (phrase) {
+      get_orientations(p, prev, next);
+      modelScores["phrase"]->add_example(prev,next);
+    }
+    if (wbe) {
+      get_orientations(w, prev, next);
+      modelScores["wbe"]->add_example(prev,next);
+    }
+  }
+  //Score the last phrases; an empty extract file has no group to flush and
+  //must not produce a table line with empty phrases.
+  if (!first) {
+    for (size_t m=0; m<models.size(); ++m) {
+      models[m]->score_fe(f_current,e_current);
+    }
+    for (size_t m=0; m<models.size(); ++m) {
+      models[m]->score_f(f_current);
+    }
+  }
+
+  //Zip all files
+  for (size_t m=0; m<models.size(); ++m) {
+    models[m]->zipFile();
+  }
+
+  return 0;
+}
+
+
+
+// Returns the part of line between pos and the next occurrence of delim and
+// moves pos just past that delimiter. When delim does not occur, the rest of
+// the line is returned and pos becomes npos; once pos is npos every further
+// call returns an empty string.
+static std::string next_field(const std::string& line, std::string::size_type& pos, const std::string& delim)
+{
+  if (pos == std::string::npos) {
+    return std::string();
+  }
+  const std::string::size_type end = line.find(delim, pos);
+  if (end == std::string::npos) {
+    const std::string rest = line.substr(pos);
+    pos = std::string::npos;
+    return rest;
+  }
+  const std::string field = line.substr(pos, end - pos);
+  pos = end + delim.size();
+  return field;
+}
+
+// Splits one extract-file line of the form
+//   foreign ||| english ||| wbe | phrase | hier
+// into its five fields. Fields that are missing because the line is short or
+// malformed are set to the empty string instead of causing out-of-range
+// string accesses.
+void split_line(const std::string& line, std::string& foreign, std::string& english, std::string& wbe, std::string& phrase, std::string& hier)
+{
+  std::string::size_type pos = 0;
+  foreign = next_field(line, pos, " ||| ");
+  english = next_field(line, pos, " ||| ");
+  wbe = next_field(line, pos, " | ");
+  phrase = next_field(line, pos, " | ");
+  // The last field runs to the end of the line.
+  hier = (pos == std::string::npos) ? std::string() : line.substr(pos);
+}
+
+// Extracts the first two whitespace-separated tokens of pair into previous
+// and next, in that order.
+void get_orientations(const std::string& pair, std::string& previous, std::string& next)
+{
+  std::istringstream tokens(pair);
+  tokens >> previous;
+  tokens >> next;
+}