Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorskyload <skyload@1f5c12ca-751b-0410-a591-d2e778427230>2010-04-21 15:56:51 +0400
committerskyload <skyload@1f5c12ca-751b-0410-a591-d2e778427230>2010-04-21 15:56:51 +0400
commit96dd3ecb50b509dd2f4d5373d3c956cfc0ab8480 (patch)
treedbe48e813422cc0cb8fb032c696e9ce3fed04375 /moses-cmd/src
parent4ec8af8e42363d17b37b6615434cc40a5ad8448d (diff)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/DPR_MOSES@3160 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'moses-cmd/src')
-rw-r--r--moses-cmd/src/IOWrapper.cpp508
-rw-r--r--moses-cmd/src/IOWrapper.h115
-rw-r--r--moses-cmd/src/LatticeMBR.cpp507
-rw-r--r--moses-cmd/src/LatticeMBR.h117
-rw-r--r--moses-cmd/src/LatticeMBRGrid.cpp204
-rw-r--r--moses-cmd/src/Main.cpp233
-rw-r--r--moses-cmd/src/Main.h43
-rw-r--r--moses-cmd/src/MainMT.cpp268
-rw-r--r--moses-cmd/src/Makefile.am16
-rw-r--r--moses-cmd/src/ThreadPool.cpp97
-rw-r--r--moses-cmd/src/ThreadPool.h118
-rw-r--r--moses-cmd/src/TranslationAnalysis.cpp114
-rw-r--r--moses-cmd/src/TranslationAnalysis.h24
-rw-r--r--moses-cmd/src/mbr.cpp185
-rw-r--r--moses-cmd/src/mbr.h28
15 files changed, 2577 insertions, 0 deletions
diff --git a/moses-cmd/src/IOWrapper.cpp b/moses-cmd/src/IOWrapper.cpp
new file mode 100644
index 000000000..96573d88d
--- /dev/null
+++ b/moses-cmd/src/IOWrapper.cpp
@@ -0,0 +1,508 @@
+// $Id: IOWrapper.cpp 2953 2010-03-07 07:57:48Z abarun $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2006 University of Edinburgh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of the University of Edinburgh nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+// example file on how to use moses library
+
+#include <iostream>
+#include "TypeDef.h"
+#include "Util.h"
+#include "IOWrapper.h"
+#include "Hypothesis.h"
+#include "WordsRange.h"
+#include "TrellisPathList.h"
+#include "StaticData.h"
+#include "DummyScoreProducers.h"
+#include "InputFileStream.h"
+
+using namespace std;
+using namespace Moses;
+
+// Construct an IOWrapper that reads source sentences from std::cin.
+// Output streams (n-best, word graph, search graph) are opened by Initialization().
+IOWrapper::IOWrapper(
+ const vector<FactorType> &inputFactorOrder
+ , const vector<FactorType> &outputFactorOrder
+ , const FactorMask &inputFactorUsed
+ , size_t nBestSize
+ , const string &nBestFilePath)
+:m_inputFactorOrder(inputFactorOrder)
+,m_outputFactorOrder(outputFactorOrder)
+,m_inputFactorUsed(inputFactorUsed)
+,m_inputFile(NULL)
+,m_inputStream(&std::cin)
+,m_nBestStream(NULL)
+,m_outputWordGraphStream(NULL)
+,m_outputSearchGraphStream(NULL)
+{
+ // Shared stream set-up (n-best / word-graph / search-graph files).
+ Initialization(inputFactorOrder, outputFactorOrder
+ , inputFactorUsed
+ , nBestSize, nBestFilePath);
+}
+
+// Construct an IOWrapper that reads source sentences from the file at
+// inputFilePath (via an InputFileStream owned by this object).
+IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
+ , const std::vector<FactorType> &outputFactorOrder
+ , const FactorMask &inputFactorUsed
+ , size_t nBestSize
+ , const std::string &nBestFilePath
+ , const std::string &inputFilePath)
+:m_inputFactorOrder(inputFactorOrder)
+,m_outputFactorOrder(outputFactorOrder)
+,m_inputFactorUsed(inputFactorUsed)
+,m_inputFilePath(inputFilePath)
+,m_inputFile(new InputFileStream(inputFilePath))
+,m_nBestStream(NULL)
+,m_outputWordGraphStream(NULL)
+,m_outputSearchGraphStream(NULL)
+{
+ Initialization(inputFactorOrder, outputFactorOrder
+ , inputFactorUsed
+ , nBestSize, nBestFilePath);
+
+ // Read input from the owned file stream instead of std::cin.
+ m_inputStream = m_inputFile;
+}
+
+// Release the owned streams. Note: when m_surpressSingleBestOutput is true,
+// Initialization() pointed m_nBestStream at std::cout, which must NOT be
+// deleted -- hence the extra guard below.
+IOWrapper::~IOWrapper()
+{
+ if (m_inputFile != NULL)
+ delete m_inputFile;
+ if (m_nBestStream != NULL && !m_surpressSingleBestOutput)
+ { // outputting n-best to file, rather than stdout. need to close file and delete obj
+ delete m_nBestStream;
+ }
+ if (m_outputWordGraphStream != NULL)
+ {
+ delete m_outputWordGraphStream;
+ }
+ if (m_outputSearchGraphStream != NULL)
+ {
+ delete m_outputSearchGraphStream;
+ }
+}
+
+// Open the configured output streams:
+//  - n-best list: to stdout ("-" or "/dev/stdout", which also suppresses the
+//    separate single-best line) or to the file at nBestFilePath;
+//  - word graph / search graph: to the file names taken from StaticData params.
+// NOTE(review): "surpress" is a misspelling of "suppress" carried through the
+// member name; renaming would touch the header too.
+void IOWrapper::Initialization(const std::vector<FactorType> &inputFactorOrder
+ , const std::vector<FactorType> &outputFactorOrder
+ , const FactorMask &inputFactorUsed
+ , size_t nBestSize
+ , const std::string &nBestFilePath)
+{
+ const StaticData &staticData = StaticData::Instance();
+
+ // n-best
+ m_surpressSingleBestOutput = false;
+
+ if (nBestSize > 0)
+ {
+ if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout")
+ {
+ // n-best goes to stdout; do not also print the single best translation.
+ m_nBestStream = &std::cout;
+ m_surpressSingleBestOutput = true;
+ }
+ else
+ {
+ // NOTE(review): open() success is not checked; a bad path fails silently.
+ std::ofstream *file = new std::ofstream;
+ m_nBestStream = file;
+ file->open(nBestFilePath.c_str());
+ }
+ }
+
+ // wordgraph output
+ if (staticData.GetOutputWordGraph())
+ {
+ string fileName = staticData.GetParam("output-word-graph")[0];
+ std::ofstream *file = new std::ofstream;
+ m_outputWordGraphStream = file;
+ file->open(fileName.c_str());
+ }
+
+ // search graph output
+ if (staticData.GetOutputSearchGraph())
+ {
+ string fileName;
+ if (staticData.GetOutputSearchGraphExtended())
+ fileName = staticData.GetParam("output-search-graph-extended")[0];
+ else
+ fileName = staticData.GetParam("output-search-graph")[0];
+ std::ofstream *file = new std::ofstream;
+ m_outputSearchGraphStream = file;
+ file->open(fileName.c_str());
+ }
+}
+
+// Read the next sentence into inputType from the current input stream.
+// On success, assign/advance the translation id and return inputType;
+// on end-of-input / read failure, delete inputType and return NULL
+// (ownership of inputType transfers to this function).
+// NOTE(review): an input-supplied id of 0 is indistinguishable from "no id"
+// here -- the else-branch overwrites it with the running counter; confirm
+// that ids are expected to start at 1 when supplied externally.
+InputType*IOWrapper::GetInput(InputType* inputType)
+{
+ if(inputType->Read(*m_inputStream, m_inputFactorOrder))
+ {
+ if (long x = inputType->GetTranslationId()) { if (x>=m_translationId) m_translationId = x+1; }
+ else inputType->SetTranslationId(m_translationId++);
+
+ return inputType;
+ }
+ else
+ {
+ delete inputType;
+ return NULL;
+ }
+}
+
+/***
+ * print surface factor only for the given phrase
+ * If reportAllFactors is set the whole phrase (all factors) is streamed out;
+ * otherwise each word is printed as factor0|factor1|... restricted to
+ * outputFactorOrder, followed by a space.
+ * NOTE(review): GetFactor() results are dereferenced unchecked -- a word
+ * missing one of the requested factors would crash; confirm factors are
+ * guaranteed present for all entries of outputFactorOrder.
+ */
+void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<FactorType> &outputFactorOrder, bool reportAllFactors)
+{
+ assert(outputFactorOrder.size() > 0);
+ if (reportAllFactors == true)
+ {
+ out << phrase;
+ }
+ else
+ {
+ size_t size = phrase.GetSize();
+ for (size_t pos = 0 ; pos < size ; pos++)
+ {
+ // First factor is printed bare; remaining factors are '|'-separated.
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
+ out << *factor;
+
+ for (size_t i = 1 ; i < outputFactorOrder.size() ; i++)
+ {
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
+ out << "|" << *factor;
+ }
+ out << " ";
+ }
+ }
+}
+
+// Print the surface form of a full hypothesis by walking the back-pointer
+// chain recursively (earliest hypothesis first), emitting each hypothesis's
+// target phrase. With reportSegmentation, each non-empty phrase is followed
+// by its source span as "|start-end| ".
+void OutputSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder
+ ,bool reportSegmentation, bool reportAllFactors)
+{
+ if ( hypo != NULL)
+ {
+ // Recurse first so predecessors are printed before this hypothesis.
+ OutputSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors);
+ OutputSurface(out, hypo->GetCurrTargetPhrase(), outputFactorOrder, reportAllFactors);
+
+ if (reportSegmentation == true
+ && hypo->GetCurrTargetPhrase().GetSize() > 0) {
+ out << "|" << hypo->GetCurrSourceWordsRange().GetStartPos()
+ << "-" << hypo->GetCurrSourceWordsRange().GetEndPos() << "| ";
+ }
+ }
+}
+
+
+
+
+
+// Trace the hypothesis back-pointer chain for debugging, logging each
+// hypothesis id at verbosity level 3 (most recent first).
+void IOWrapper::Backtrack(const Hypothesis *hypo){
+
+ if (hypo->GetPrevHypo() != NULL) {
+ VERBOSE(3,hypo->GetId() << " <= ");
+ Backtrack(hypo->GetPrevHypo());
+ }
+}
+
+// Print an MBR best hypothesis given as a flat factor sequence, one token per
+// factor, space-separated, newline-terminated. translationId,
+// reportSegmentation and reportAllFactors are accepted for signature
+// uniformity but unused here.
+void OutputBestHypo(const std::vector<const Factor*>& mbrBestHypo, long /*translationId*/, bool reportSegmentation, bool reportAllFactors, ostream& out)
+{
+ for (size_t i = 0 ; i < mbrBestHypo.size() ; i++)
+ {
+ const Factor *factor = mbrBestHypo[i];
+ if (i>0) out << " ";
+ out << factor->GetString();
+ }
+ out << endl;
+}
+
+// Print an MBR best hypothesis given as a Word sequence; only the first
+// configured output factor of each word is emitted, space-separated,
+// newline-terminated.
+void OutputBestHypo(const std::vector<Word>& mbrBestHypo, long /*translationId*/, bool reportSegmentation, bool reportAllFactors, ostream& out)
+{
+
+ for (size_t i = 0 ; i < mbrBestHypo.size() ; i++)
+ {
+ const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
+ if (i>0) out << " ";
+ out << *factor;
+ }
+ out << endl;
+}
+
+
+// Walk the back-pointer chain and record, for each hypothesis, its source
+// phrase at the index of its source-span start position. 'map' must be sized
+// to the input length by the caller (see the ostream overload below).
+void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
+{
+ if (hypo->GetPrevHypo())
+ {
+ OutputInput(map, hypo->GetPrevHypo());
+ map[hypo->GetCurrSourceWordsRange().GetStartPos()] = hypo->GetSourcePhrase();
+ }
+}
+
+// Print the source phrases consumed by 'hypo', in source order: collect them
+// indexed by start position, then stream the non-null entries.
+void OutputInput(std::ostream& os, const Hypothesis* hypo)
+{
+ size_t len = hypo->GetInput().GetSize();
+ std::vector<const Phrase*> inp_phrases(len, 0);
+ OutputInput(inp_phrases, hypo);
+ for (size_t i=0; i<len; ++i)
+ if (inp_phrases[i]) os << *inp_phrases[i];
+}
+
+// Print the single best translation to stdout (unless suppressed because the
+// n-best list already goes there). A NULL hypo means decoding found no
+// translation: log it and emit an empty line to keep output line-aligned
+// with the input.
+void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long /*translationId*/, bool reportSegmentation, bool reportAllFactors)
+{
+ if (hypo != NULL)
+ {
+ VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
+ VERBOSE(3,"Best path: ");
+ Backtrack(hypo);
+ VERBOSE(3,"0" << std::endl);
+ if (!m_surpressSingleBestOutput)
+ {
+ // Optionally prefix the recovered source segmentation: "<source> ||| ".
+ if (StaticData::Instance().IsPathRecoveryEnabled()) {
+ OutputInput(cout, hypo);
+ cout << "||| ";
+ }
+ OutputSurface(cout, hypo, m_outputFactorOrder, reportSegmentation, reportAllFactors);
+ cout << endl;
+ }
+ }
+ else
+ {
+ VERBOSE(1, "NO BEST TRANSLATION" << endl);
+ if (!m_surpressSingleBestOutput)
+ {
+ cout << endl;
+ }
+ }
+}
+
+
+
+
+// Write an n-best list in the standard Moses format, one line per path:
+//   id ||| surface ||| feature scores ||| total score [||| alignment] [||| source]
+// Feature scores are grouped per producer; with labeledOutput each group is
+// prefixed by its short weight name (e.g. "tm:", "lm:", "d:").
+void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector<Moses::FactorType>& outputFactorOrder,long translationId)
+{
+ const StaticData &staticData = StaticData::Instance();
+ bool labeledOutput = staticData.IsLabeledNBestList();
+ bool reportAllFactors = staticData.GetReportAllFactorsNBest();
+ bool includeAlignment = staticData.NBestIncludesAlignment();
+ bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
+
+ TrellisPathList::const_iterator iter;
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter)
+ {
+ const TrellisPath &path = **iter;
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
+
+ // print the surface factor of the translation
+ // (edges are stored last-to-first, hence the reverse iteration)
+ out << translationId << " ||| ";
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--)
+ {
+ const Hypothesis &edge = *edges[currEdge];
+ OutputSurface(out, edge.GetCurrTargetPhrase(), outputFactorOrder, reportAllFactors);
+ }
+ out << " |||";
+
+ // Stateful feature scores (LM, distortion, ...). lastName tracks the
+ // current label so producers sharing a short name share one label.
+ std::string lastName = "";
+ const vector<const StatefulFeatureFunction*>& sff =
+ staticData.GetScoreIndexManager().GetStatefulFeatureFunctions();
+ for( size_t i=0; i<sff.size(); i++ )
+ {
+ if( labeledOutput && lastName != sff[i]->GetScoreProducerWeightShortName() )
+ {
+ lastName = sff[i]->GetScoreProducerWeightShortName();
+ out << " " << lastName << ":";
+ }
+ vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( sff[i] );
+ for (size_t j = 0; j<scores.size(); ++j)
+ {
+ out << " " << scores[j];
+ }
+ }
+
+ // Stateless feature scores.
+ const vector<const StatelessFeatureFunction*>& slf =
+ staticData.GetScoreIndexManager().GetStatelessFeatureFunctions();
+ for( size_t i=0; i<slf.size(); i++ )
+ {
+ if( labeledOutput && lastName != slf[i]->GetScoreProducerWeightShortName() )
+ {
+ lastName = slf[i]->GetScoreProducerWeightShortName();
+ out << " " << lastName << ":";
+ }
+ vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( slf[i] );
+ for (size_t j = 0; j<scores.size(); ++j)
+ {
+ out << " " << scores[j];
+ }
+ }
+
+ // translation components
+ if (StaticData::Instance().GetInputType()==SentenceInput){
+ // translation components for text input
+ vector<PhraseDictionaryFeature*> pds = StaticData::Instance().GetPhraseDictionaries();
+ if (pds.size() > 0) {
+ if (labeledOutput)
+ out << " tm:";
+ vector<PhraseDictionaryFeature*>::iterator iter;
+ for (iter = pds.begin(); iter != pds.end(); ++iter) {
+ vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
+ for (size_t j = 0; j<scores.size(); ++j)
+ out << " " << scores[j];
+ }
+ }
+ }
+ else{
+ // translation components for Confusion Network input
+ // first translation component has GetNumInputScores() scores from the input Confusion Network
+ // at the beginning of the vector
+ vector<PhraseDictionaryFeature*> pds = StaticData::Instance().GetPhraseDictionaries();
+ if (pds.size() > 0) {
+ vector<PhraseDictionaryFeature*>::iterator iter;
+
+ iter = pds.begin();
+ vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
+
+ size_t pd_numinputscore = (*iter)->GetNumInputScores();
+
+ // Input scores ("I:") come only from the first dictionary.
+ if (pd_numinputscore){
+
+ if (labeledOutput)
+ out << " I:";
+
+ for (size_t j = 0; j < pd_numinputscore; ++j)
+ out << " " << scores[j];
+ }
+
+
+ // Remaining scores of every dictionary are the actual tm scores.
+ for (iter = pds.begin() ; iter != pds.end(); ++iter) {
+ vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
+
+ size_t pd_numinputscore = (*iter)->GetNumInputScores();
+
+ if (iter == pds.begin() && labeledOutput)
+ out << " tm:";
+ for (size_t j = pd_numinputscore; j < scores.size() ; ++j)
+ out << " " << scores[j];
+ }
+ }
+ }
+
+ // generation
+ vector<GenerationDictionary*> gds = StaticData::Instance().GetGenerationDictionaries();
+ if (gds.size() > 0) {
+ if (labeledOutput)
+ out << " g: ";
+ vector<GenerationDictionary*>::iterator iter;
+ for (iter = gds.begin(); iter != gds.end(); ++iter) {
+ vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer(*iter);
+ for (size_t j = 0; j<scores.size(); j++) {
+ out << scores[j] << " ";
+ }
+ }
+ }
+
+ // total
+ out << " ||| " << path.GetTotalScore();
+
+ //phrase-to-phrase alignment
+ // NOTE(review): includeWordAlignment is read above but never used in this
+ // function -- word-level alignment output appears to be missing or handled
+ // elsewhere; confirm.
+ if (includeAlignment) {
+ out << " |||";
+ // starts at size()-2: the initial (empty) hypothesis is skipped
+ for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--)
+ {
+ const Hypothesis &edge = *edges[currEdge];
+ const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
+ WordsRange targetRange = path.GetTargetWordsRange(edge);
+ // Single-word spans print as "s=t", multi-word as "s-e=s'-e'".
+ out << " " << sourceRange.GetStartPos();
+ if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
+ out << "-" << sourceRange.GetEndPos();
+ }
+ out<< "=" << targetRange.GetStartPos();
+ if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
+ out<< "-" << targetRange.GetEndPos();
+ }
+ }
+ }
+
+ if (StaticData::Instance().IsPathRecoveryEnabled()) {
+ out << "|||";
+ OutputInput(out, edges[0]);
+ }
+
+ out << endl;
+ }
+
+
+ out <<std::flush;
+}
+
+// Convenience wrapper: write the n-best list to the stream configured in
+// Initialization() using this wrapper's output factor order.
+void IOWrapper::OutputNBestList(const TrellisPathList &nBestList, long translationId) {
+ OutputNBest(*m_nBestStream, nBestList,m_outputFactorOrder, translationId);
+}
+
+// Read the next input of the given type into 'source', deleting the previous
+// one first. Returns true while input remains. On an unknown input type,
+// 'source' keeps its (deleted) dangling value only if it was non-NULL before;
+// callers pass NULL initially, so the unknown-type case yields false.
+bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
+{
+ delete source;
+ switch(inputType)
+ {
+ case SentenceInput: source = ioWrapper.GetInput(new Sentence(Input)); break;
+ case ConfusionNetworkInput: source = ioWrapper.GetInput(new ConfusionNet); break;
+ case WordLatticeInput: source = ioWrapper.GetInput(new WordLattice); break;
+ default: TRACE_ERR("Unknown input type: " << inputType << "\n");
+ }
+ return (source ? true : false);
+}
+
+
+
+// Factory: build an IOWrapper from the decoder configuration, reading from
+// the "input-file" parameter if exactly one is given, otherwise from stdin.
+// The caller owns the returned object.
+IOWrapper *GetIODevice(const StaticData &staticData)
+{
+ IOWrapper *ioWrapper;
+ const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
+ ,&outputFactorOrder = staticData.GetOutputFactorOrder();
+ FactorMask inputFactorUsed(inputFactorOrder);
+
+ // io
+ if (staticData.GetParam("input-file").size() == 1)
+ {
+ VERBOSE(2,"IO from File" << endl);
+ string filePath = staticData.GetParam("input-file")[0];
+
+ ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
+ , staticData.GetNBestSize()
+ , staticData.GetNBestFilePath()
+ , filePath);
+ }
+ else
+ {
+ VERBOSE(1,"IO from STDOUT/STDIN" << endl);
+ ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
+ , staticData.GetNBestSize()
+ , staticData.GetNBestFilePath());
+ }
+ // Translation ids start from 0 for each run.
+ ioWrapper->ResetTranslationId();
+
+ IFVERBOSE(1)
+ PrintUserTime("Created input-output object");
+
+ return ioWrapper;
+}
diff --git a/moses-cmd/src/IOWrapper.h b/moses-cmd/src/IOWrapper.h
new file mode 100644
index 000000000..f09df7e7c
--- /dev/null
+++ b/moses-cmd/src/IOWrapper.h
@@ -0,0 +1,115 @@
+// $Id: IOWrapper.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2006 University of Edinburgh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of the University of Edinburgh nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+// example file on how to use moses library
+
+#ifndef moses_cmd_IOWrapper_h
+#define moses_cmd_IOWrapper_h
+
+#include <fstream>
+#include <ostream>
+#include <vector>
+
+#include "TypeDef.h"
+#include "Sentence.h"
+#include "FactorTypeSet.h"
+#include "FactorCollection.h"
+#include "Hypothesis.h"
+#include "TrellisPathList.h"
+#include "InputFileStream.h"
+#include "InputType.h"
+#include "WordLattice.h"
+
+// Owns the decoder's input source (stdin or a file) and its output sinks
+// (single-best to stdout, optional n-best / word-graph / search-graph
+// streams), and hands out sequential translation ids.
+class IOWrapper
+{
+protected:
+ // Next translation id to assign (see GetInput / ResetTranslationId).
+ long m_translationId;
+
+ const std::vector<Moses::FactorType> &m_inputFactorOrder;
+ const std::vector<Moses::FactorType> &m_outputFactorOrder;
+ const Moses::FactorMask &m_inputFactorUsed;
+ // Owned output streams; any of them may be NULL, and m_nBestStream may
+ // alias std::cout (in which case the destructor must not delete it).
+ std::ostream *m_nBestStream
+ ,*m_outputWordGraphStream,*m_outputSearchGraphStream;
+ std::string m_inputFilePath;
+ // m_inputStream points at std::cin or at m_inputFile (owned, may be NULL).
+ std::istream *m_inputStream;
+ Moses::InputFileStream *m_inputFile;
+ // True when the n-best list goes to stdout, so the separate single-best
+ // line is suppressed. (Name is a historical misspelling of "suppress".)
+ bool m_surpressSingleBestOutput;
+
+ // Shared constructor helper: opens the n-best / graph output streams.
+ void Initialization(const std::vector<Moses::FactorType> &inputFactorOrder
+ , const std::vector<Moses::FactorType> &outputFactorOrder
+ , const Moses::FactorMask &inputFactorUsed
+ , size_t nBestSize
+ , const std::string &nBestFilePath);
+
+public:
+ // Read input from stdin.
+ IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
+ , const std::vector<Moses::FactorType> &outputFactorOrder
+ , const Moses::FactorMask &inputFactorUsed
+ , size_t nBestSize
+ , const std::string &nBestFilePath);
+
+ // Read input from the file at infilePath.
+ IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
+ , const std::vector<Moses::FactorType> &outputFactorOrder
+ , const Moses::FactorMask &inputFactorUsed
+ , size_t nBestSize
+ , const std::string &nBestFilePath
+ , const std::string &infilePath);
+ ~IOWrapper();
+
+ // Returns inputType on success (taking care of the translation id),
+ // or deletes it and returns NULL at end of input.
+ Moses::InputType* GetInput(Moses::InputType *inputType);
+
+ void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, bool reportSegmentation, bool reportAllFactors);
+ void OutputNBestList(const Moses::TrellisPathList &nBestList, long translationId);
+ // Log the back-pointer chain at verbosity level 3.
+ void Backtrack(const Moses::Hypothesis *hypo);
+
+ void ResetTranslationId() { m_translationId = 0; }
+
+ // Callers must only use these when the corresponding output was enabled.
+ std::ostream &GetOutputWordGraphStream()
+ {
+ return *m_outputWordGraphStream;
+ }
+ std::ostream &GetOutputSearchGraphStream()
+ {
+ return *m_outputSearchGraphStream;
+ }
+};
+
+IOWrapper *GetIODevice(const Moses::StaticData &staticData);
+bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source);
+void OutputSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder ,bool reportSegmentation, bool reportAllFactors);
+void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector<Moses::FactorType>&, long translationId);
+void OutputBestHypo(const std::vector<const Moses::Factor*>& mbrBestHypo, long translationId,
+ bool reportSegmentation, bool reportAllFactors, std::ostream& out);
+void OutputBestHypo(const std::vector<Moses::Word>& mbrBestHypo, long /*translationId*/,
+ bool reportSegmentation, bool reportAllFactors, std::ostream& out);
+
+#endif
diff --git a/moses-cmd/src/LatticeMBR.cpp b/moses-cmd/src/LatticeMBR.cpp
new file mode 100644
index 000000000..e8c5cf484
--- /dev/null
+++ b/moses-cmd/src/LatticeMBR.cpp
@@ -0,0 +1,507 @@
+/*
+ * LatticeMBR.cpp
+ * moses-cmd
+ *
+ * Created by Abhishek Arun on 26/01/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#include "LatticeMBR.h"
+#include "StaticData.h"
+#include <algorithm>
+#include <set>
+
+// Maximum n-gram length used for BLEU-style statistics.
+size_t bleu_order = 4;
+// Log-probability assigned to n-grams absent from the lattice posteriors.
+float UNKNGRAMLOGPROB = -20;
+// Append the target words of 'path' to 'translation', in sentence order
+// (the path stores its edges last-to-first, hence the reverse loop).
+void GetOutputWords(const TrellisPath &path, vector <Word> &translation){
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
+
+ // print the surface factor of the translation
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--)
+ {
+ const Hypothesis &edge = *edges[currEdge];
+ const Phrase &phrase = edge.GetCurrTargetPhrase();
+ size_t size = phrase.GetSize();
+ for (size_t pos = 0 ; pos < size ; pos++)
+ {
+ translation.push_back(phrase.GetWord(pos));
+ }
+ }
+}
+
+
+// Count every n-gram of length 1..bleu_order occurring in 'sentence',
+// accumulating occurrence counts into 'allngrams' (k is length-1; the inner
+// bound max(size-k, 0) guards sentences shorter than the n-gram length).
+void extract_ngrams(const vector<Word >& sentence, map < Phrase, int > & allngrams)
+{
+ for (int k = 0; k < (int)bleu_order; k++)
+ {
+ for(int i =0; i < max((int)sentence.size()-k,0); i++)
+ {
+ Phrase ngram(Output);
+ for ( int j = i; j<= i+k; j++)
+ {
+ ngram.AddWord(sentence[j]);
+ }
+ ++allngrams[ngram];
+ }
+ }
+}
+
+
+
+// Accumulate (in log space, via log_sum) 'score' for 'ngram' at 'node'.
+// The ngram is interned in m_ngrams so per-node maps key on a stable
+// Phrase* rather than copying the Phrase.
+void NgramScores::addScore(const Hypothesis* node, const Phrase& ngram, float score) {
+ set<Phrase>::const_iterator ngramIter = m_ngrams.find(ngram);
+ if (ngramIter == m_ngrams.end()) {
+ ngramIter = m_ngrams.insert(ngram).first;
+ }
+ map<const Phrase*,float>& ngramScores = m_scores[node];
+ map<const Phrase*,float>::iterator scoreIter = ngramScores.find(&(*ngramIter));
+ if (scoreIter == ngramScores.end()) {
+ ngramScores[&(*ngramIter)] = score;
+ } else {
+ ngramScores[&(*ngramIter)] = log_sum(score,scoreIter->second);
+ }
+}
+
+// Begin iterator over the (ngram, score) entries stored for 'node'.
+// Note: operator[] creates an empty entry for unseen nodes.
+NgramScores::NodeScoreIterator NgramScores::nodeBegin(const Hypothesis* node) {
+ return m_scores[node].begin();
+}
+
+
+// End iterator matching nodeBegin() for the same node.
+NgramScores::NodeScoreIterator NgramScores::nodeEnd(const Hypothesis* node) {
+ return m_scores[node].end();
+}
+
+
+// Prune the search lattice to roughly (edgeDensity * best-hypo length) edges.
+// Hypotheses are visited in decreasing estimated score; each survivor
+// contributes edges to/from already-surviving neighbours (best-predecessor
+// links and recombination arcs). On return, connectedHyp holds the surviving
+// hypotheses and incomingEdges the pruned edge lists.
+void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set <const Hypothesis* > > & outgoingHyps, map<const Hypothesis*, vector<Edge> >& incomingEdges,
+ const vector< float> & estimatedScores, const Hypothesis* bestHypo, size_t edgeDensity) {
+
+ //Need hyp 0 in connectedHyp - Find empty hypothesis
+ VERBOSE(2,"Pruning lattice to edge density " << edgeDensity << endl);
+ const Hypothesis* emptyHyp = connectedHyp.at(0);
+ while (emptyHyp->GetId() != 0) {
+ emptyHyp = emptyHyp->GetPrevHypo();
+ }
+ connectedHyp.push_back(emptyHyp); //Add it to list of hyps
+
+ //Need hyp 0's outgoing Hyps
+ for (size_t i = 0; i < connectedHyp.size(); ++i) {
+ if (connectedHyp[i]->GetId() > 0 && connectedHyp[i]->GetPrevHypo()->GetId() == 0)
+ outgoingHyps[emptyHyp].insert(connectedHyp[i]);
+ }
+
+ //sort hyps based on estimated scores - do so by copying to multimap
+ // NOTE(review): relies on estimatedScores[i] matching connectedHyp[i];
+ // must hold before emptyHyp was appended above.
+ multimap<float, const Hypothesis*> sortHypsByVal;
+ for (size_t i =0; i < estimatedScores.size(); ++i) {
+ sortHypsByVal.insert(make_pair<float, const Hypothesis*>(estimatedScores[i], connectedHyp[i]));
+ }
+
+ multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end();
+ float bestScore = it->first;
+ //store best score as score of hyp 0
+ sortHypsByVal.insert(make_pair<float, const Hypothesis*>(bestScore, emptyHyp));
+
+
+ IFVERBOSE(3) {
+ for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
+ const Hypothesis* currHyp = it->second;
+ cerr << "Hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl;
+ }
+ }
+
+
+ set <const Hypothesis*> survivingHyps; //store hyps that make the cut in this
+
+ VERBOSE(2, "BEST HYPO TARGET LENGTH : " << bestHypo->GetSize() << endl)
+ size_t numEdgesTotal = edgeDensity * bestHypo->GetSize(); //as per Shankar, aim for (density * target length of MAP solution) arcs
+ size_t numEdgesCreated = 0;
+ VERBOSE(2, "Target edge count: " << numEdgesTotal << endl);
+
+ // Sentinel lower than any real score; tracks the previous hypothesis's
+ // estimated score so score ties are kept together across the cut-off.
+ float prevScore = -999999;
+
+ //now iterate over multimap
+ for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
+ float currEstimatedScore = it->first;
+ const Hypothesis* currHyp = it->second;
+
+ if (numEdgesCreated >= numEdgesTotal && prevScore > currEstimatedScore) //if this hyp has equal estimated score to previous, include its edges too
+ break;
+
+ prevScore = currEstimatedScore;
+ VERBOSE(3, "Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
+ VERBOSE(3, "Considering hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl)
+
+ survivingHyps.insert(currHyp); //CurrHyp made the cut
+
+ // is its best predecessor already included ?
+ if (survivingHyps.find(currHyp->GetPrevHypo()) != survivingHyps.end()) { //yes, then add an edge
+ vector <Edge>& edges = incomingEdges[currHyp];
+ Edge winningEdge(currHyp->GetPrevHypo(),currHyp,currHyp->GetScore() - currHyp->GetPrevHypo()->GetScore(),currHyp->GetTargetPhrase());
+ edges.push_back(winningEdge);
+ ++numEdgesCreated;
+ }
+
+ //let's try the arcs too
+ const ArcList *arcList = currHyp->GetArcList();
+ if (arcList != NULL) {
+ ArcList::const_iterator iterArcList;
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
+ const Hypothesis *loserHypo = *iterArcList;
+ const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
+ if (survivingHyps.find(loserPrevHypo) != survivingHyps.end()) { //found it, add edge
+ double arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore();
+ Edge losingEdge(loserPrevHypo, currHyp, arcScore, loserHypo->GetTargetPhrase());
+ vector <Edge>& edges = incomingEdges[currHyp];
+ edges.push_back(losingEdge);
+ ++numEdgesCreated;
+ }
+ }
+ }
+
+ //Now if a successor node has already been visited, add an edge connecting the two
+ map < const Hypothesis*, set < const Hypothesis* > >::const_iterator outgoingIt = outgoingHyps.find(currHyp);
+
+ if (outgoingIt != outgoingHyps.end()) {//currHyp does have successors
+ const set<const Hypothesis*> & outHyps = outgoingIt->second; //the successors
+ for (set<const Hypothesis*>::const_iterator outHypIts = outHyps.begin(); outHypIts != outHyps.end(); ++outHypIts) {
+ const Hypothesis* succHyp = *outHypIts;
+
+ if (survivingHyps.find(succHyp) == survivingHyps.end()) //Have we encountered the successor yet?
+ continue; //No, move on to next
+
+ //Curr Hyp can be : a) the best predecessor of succ b) or an arc attached to succ
+ if (succHyp->GetPrevHypo() == currHyp) { //best predecessor
+ vector <Edge>& succEdges = incomingEdges[succHyp];
+ Edge succWinningEdge(currHyp, succHyp, succHyp->GetScore() - currHyp->GetScore(), succHyp->GetTargetPhrase());
+ succEdges.push_back(succWinningEdge);
+ survivingHyps.insert(succHyp);
+ ++numEdgesCreated;
+ }
+
+ //now, let's find an arc
+ const ArcList *arcList = succHyp->GetArcList();
+ if (arcList != NULL) {
+ ArcList::const_iterator iterArcList;
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
+ const Hypothesis *loserHypo = *iterArcList;
+ const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
+ if (loserPrevHypo == currHyp) { //found it
+ vector <Edge>& succEdges = incomingEdges[succHyp];
+ double arcScore = loserHypo->GetScore() - currHyp->GetScore();
+ Edge losingEdge(currHyp, succHyp, arcScore, loserHypo->GetTargetPhrase());
+ succEdges.push_back(losingEdge);
+ ++numEdgesCreated;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Replace the lattice contents with the surviving hypotheses.
+ connectedHyp.clear();
+ for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
+ connectedHyp.push_back(*it);
+ }
+
+ VERBOSE(2, "Done! Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
+
+ IFVERBOSE(3) {
+ cerr << "Surviving hyps: " ;
+ for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
+ cerr << (*it)->GetId() << " ";
+ }
+ cerr << endl;
+ }
+}
+
+// Forward pass over the (pruned) lattice computing, for every n-gram, its
+// scaled log posterior: log P(ngram | lattice) = scale*score - scale*Z,
+// where Z is the lattice's total (log) score accumulated over all complete
+// hypotheses. Results are written into finalNgramScores.
+void calcNgramPosteriors(Lattice & connectedHyp, map<const Hypothesis*, vector<Edge> >& incomingEdges, float scale, map<Phrase, float>& finalNgramScores) {
+
+ sort(connectedHyp.begin(),connectedHyp.end(),ascendingCoverageCmp); //sort by increasing source word cov
+
+ map<const Hypothesis*, float> forwardScore;
+ forwardScore[connectedHyp[0]] = 0.0f; //forward score of hyp 0 is 1 (or 0 in logprob space)
+ set< const Hypothesis *> finalHyps; //store completed hyps
+
+ NgramScores ngramScores;//ngram scores for each hyp
+
+ for (size_t i = 1; i < connectedHyp.size(); ++i) {
+ const Hypothesis* currHyp = connectedHyp[i];
+ if (currHyp->GetWordsBitmap().IsComplete()) {
+ finalHyps.insert(currHyp);
+ }
+
+ VERBOSE(3, "Processing hyp: " << currHyp->GetId() << ", num words cov= " << currHyp->GetWordsBitmap().GetNumWordsCovered() << endl)
+
+ // Forward score = log-sum over incoming edges of (tail fwd score + edge score).
+ vector <Edge> & edges = incomingEdges[currHyp];
+ for (size_t e = 0; e < edges.size(); ++e) {
+ const Edge& edge = edges[e];
+ if (forwardScore.find(currHyp) == forwardScore.end()) {
+ forwardScore[currHyp] = forwardScore[edge.GetTailNode()] + edge.GetScore();
+ VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] = fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
+ }
+ else {
+ forwardScore[currHyp] = log_sum(forwardScore[currHyp], forwardScore[edge.GetTailNode()] + edge.GetScore());
+ VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] += fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
+ }
+ }
+
+ //Process ngrams now
+ for (size_t j =0 ; j < edges.size(); ++j) {
+ Edge& edge = edges[j];
+ const NgramHistory & incomingPhrases = edge.GetNgrams(incomingEdges);
+
+ //let's first score ngrams introduced by this edge
+ for (NgramHistory::const_iterator it = incomingPhrases.begin(); it != incomingPhrases.end(); ++it) {
+ const Phrase& ngram = it->first;
+ const PathCounts& pathCounts = it->second;
+ VERBOSE(4, "Calculating score for: " << it->first << endl)
+
+ for (PathCounts::const_iterator pathCountIt = pathCounts.begin(); pathCountIt != pathCounts.end(); ++pathCountIt) {
+ //Score of an n-gram is forward score of head node of leftmost edge + all edge scores
+ const Path& path = pathCountIt->first;
+ float score = forwardScore[path[0]->GetTailNode()];
+ for (size_t i = 0; i < path.size(); ++i) {
+ score += path[i]->GetScore();
+ }
+ ngramScores.addScore(currHyp,ngram,score);
+ }
+ }
+
+ //Now score ngrams that are just being propagated from the history
+ for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(edge.GetTailNode());
+ it != ngramScores.nodeEnd(edge.GetTailNode()); ++it) {
+ const Phrase & currNgram = *(it->first);
+ float currNgramScore = it->second;
+ VERBOSE(4, "Calculating score for: " << currNgram << endl)
+
+ if (incomingPhrases.find(currNgram) == incomingPhrases.end()) {
+ float score = edge.GetScore() + currNgramScore;
+ ngramScores.addScore(currHyp,currNgram,score);
+ }
+ }
+
+ }
+ }
+
+ // NOTE(review): 9999999 doubles as an "unset" sentinel for Z below; a real
+ // forward score equal to it would be misinterpreted (practically unreachable
+ // for log scores, but a named constant/optional would be safer).
+ float Z = 9999999; //the total score of the lattice
+
+ //Done - Print out ngram posteriors for final hyps
+ for (set< const Hypothesis *>::iterator finalHyp = finalHyps.begin(); finalHyp != finalHyps.end(); ++finalHyp) {
+ const Hypothesis* hyp = *finalHyp;
+
+ for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(hyp); it != ngramScores.nodeEnd(hyp); ++it) {
+ const Phrase& ngram = *(it->first);
+ if (finalNgramScores.find(ngram) == finalNgramScores.end()) {
+ finalNgramScores[ngram] = it->second;
+ }
+ else {
+ finalNgramScores[ngram] = log_sum(it->second, finalNgramScores[ngram]);
+ }
+ }
+
+ if (Z == 9999999) {
+ Z = forwardScore[hyp];
+ }
+ else {
+ Z = log_sum(Z, forwardScore[hyp]);
+ }
+ }
+
+ Z *= scale; //scale the score
+
+ // Normalize: scaled score minus scaled partition function = log posterior.
+ for (map<Phrase, float>::iterator finalScoresIt = finalNgramScores.begin(); finalScoresIt != finalNgramScores.end(); ++finalScoresIt) {
+ finalScoresIt->second = finalScoresIt->second * scale - Z;
+ IFVERBOSE(2) {
+ VERBOSE(2,finalScoresIt->first << " [" << finalScoresIt->second << "]" << endl);
+ }
+ }
+
+}
+
+// Returns (and lazily builds) the n-gram history of this edge: every n-gram
+// of length < bleu_order that ends inside this edge's target phrase, together
+// with the path(s) of edges each n-gram spans and how often each path yields
+// it. The result is memoised in m_ngrams, so the recursion over the tail
+// node's incoming edges runs at most once per edge.
+const NgramHistory& Edge::GetNgrams(map<const Hypothesis*, vector<Edge> > & incomingEdges) {
+
+ if (m_ngrams.size() > 0)
+ return m_ngrams;
+
+ const Phrase& currPhrase = GetWords();
+ //Extract the n-grams local to this edge
+ for (size_t start = 0; start < currPhrase.GetSize(); ++start) {
+ for (size_t end = start; end < start + bleu_order; ++end) {
+ if (end < currPhrase.GetSize()){
+ Phrase edgeNgram(Output);
+ for (size_t index = start; index <= end; ++index) {
+ edgeNgram.AddWord(currPhrase.GetWord(index));
+ }
+ //cout << "Inserting Phrase : " << edgeNgram << endl;
+ vector<const Edge*> edgeHistory;
+ edgeHistory.push_back(this);
+ storeNgramHistory(edgeNgram, edgeHistory);
+ }
+ else {
+ break;
+ }
+ }
+ }
+
+ // Recurse into the tail node's incoming edges to build the n-grams that
+ // straddle the boundary between a previous edge and this one.
+ map<const Hypothesis*, vector<Edge> >::iterator it = incomingEdges.find(m_tailNode);
+ if (it != incomingEdges.end()) { //node has incoming edges
+ vector<Edge> & inEdges = it->second;
+
+ for (vector<Edge>::iterator edge = inEdges.begin(); edge != inEdges.end(); ++edge) {//add the ngrams straddling prev and curr edge
+ const NgramHistory & edgeIncomingNgrams = edge->GetNgrams(incomingEdges);
+ for (NgramHistory::const_iterator edgeInNgramHist = edgeIncomingNgrams.begin(); edgeInNgramHist != edgeIncomingNgrams.end(); ++edgeInNgramHist) {
+ const Phrase& edgeIncomingNgram = edgeInNgramHist->first;
+ const PathCounts & edgeIncomingNgramPaths = edgeInNgramHist->second;
+ size_t back = min(edgeIncomingNgram.GetSize(), edge->GetWordsSize());
+ const Phrase& edgeWords = edge->GetWords();
+ IFVERBOSE(3) {
+ cerr << "Edge: "<< *edge <<endl;
+ cerr << "edgeWords: " << edgeWords << endl;
+ cerr << "edgeInNgram: " << edgeIncomingNgram << endl;
+ }
+
+ // An incoming n-gram can only be extended by this edge if it is a
+ // suffix of the previous edge, i.e. it reaches the edge boundary.
+ Phrase edgeSuffix(Output);
+ Phrase ngramSuffix(Output);
+ GetPhraseSuffix(edgeWords,back,edgeSuffix);
+ GetPhraseSuffix(edgeIncomingNgram,back,ngramSuffix);
+
+ if (ngramSuffix == edgeSuffix) { //we've got the suffix of previous edge
+ size_t edgeInNgramSize = edgeIncomingNgram.GetSize();
+
+ // Extend the incoming n-gram with the first 1..k words of this edge,
+ // while the combined length stays below bleu_order.
+ for (size_t i = 0; i < GetWordsSize() && i + edgeInNgramSize < bleu_order ; ++i){
+ Phrase newNgram(edgeIncomingNgram);
+ for (size_t j = 0; j <= i ; ++j){
+ newNgram.AddWord(GetWords().GetWord(j));
+ }
+ VERBOSE(3, "Inserting New Phrase : " << newNgram << endl)
+
+ // Record the extended n-gram for every path that produced the
+ // incoming one, with this edge appended to the path.
+ for (PathCounts::const_iterator pathIt = edgeIncomingNgramPaths.begin(); pathIt != edgeIncomingNgramPaths.end(); ++pathIt) {
+ Path newNgramPath = pathIt->first;
+ newNgramPath.push_back(this);
+ storeNgramHistory(newNgram, newNgramPath, pathIt->second);
+ }
+ }
+ }
+ }
+ }
+ }
+ return m_ngrams;
+}
+
+//Add the last lastN words of origPhrase to targetPhrase
+void Edge::GetPhraseSuffix(const Phrase& origPhrase, size_t lastN, Phrase& targetPhrase) const {
+ size_t origSize = origPhrase.GetSize();
+ size_t startIndex = origSize - lastN;
+ for (size_t index = startIndex; index < origPhrase.GetSize(); ++index) {
+ targetPhrase.AddWord(origPhrase.GetWord(index));
+ }
+}
+
+bool Edge::operator< (const Edge& compare ) const {
+ if (m_headNode->GetId() < compare.m_headNode->GetId())
+ return true;
+ if (compare.m_headNode->GetId() < m_headNode->GetId())
+ return false;
+ if (m_tailNode->GetId() < compare.m_tailNode->GetId())
+ return true;
+ if (compare.m_tailNode->GetId() < m_tailNode->GetId())
+ return false;
+ return GetScore() < compare.GetScore();
+}
+
+ostream& operator<< (ostream& out, const Edge& edge) {
+ out << "Head: " << edge.m_headNode->GetId() << ", Tail: " << edge.m_tailNode->GetId() << ", Score: " << edge.m_score << ", Phrase: " << edge.m_targetPhrase << endl;
+ return out;
+}
+
+bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b) {
+ return a->GetWordsBitmap().GetNumWordsCovered() < b->GetWordsBitmap().GetNumWordsCovered();
+}
+
+// Selects, from the n-best list, the translation with the highest expected
+// linearised-BLEU gain under the n-gram posteriors computed from the lattice.
+//
+// nBestList        : candidate translations to rescore.
+// finalNgramScores : n-gram -> log posterior (from calcNgramPosteriors).
+// thetas           : linear-BLEU weights theta_0..theta_bleu_order; if empty,
+//                    derived from precision p and ratio r (Tromble et al. 2008).
+// Returns the word sequence of the argmax translation.
+vector<Word> calcMBRSol(const TrellisPathList& nBestList, map<Phrase, float>& finalNgramScores, const vector<float> & thetas, float p, float r){
+
+ vector<float> mbrThetas = thetas;
+ if (thetas.size() == 0) { //thetas not specified on the command line, use p and r instead
+ mbrThetas.push_back(-1); //Theta 0
+ mbrThetas.push_back(1/(bleu_order*p));
+ // Each higher-order theta is the previous one divided by the ratio r.
+ for (size_t i = 2; i <= bleu_order; ++i){
+ mbrThetas.push_back(mbrThetas[i-1] / r);
+ }
+ }
+ IFVERBOSE(2) {
+ VERBOSE(2,"Thetas: ");
+ for (size_t i = 0; i < mbrThetas.size(); ++i) {
+ VERBOSE(2,mbrThetas[i] << " ");
+ }
+ VERBOSE(2,endl);
+ }
+
+ float argmaxScore = -1e20;
+ TrellisPathList::const_iterator iter;
+ size_t ctr = 0;
+
+ vector<Word> argmaxTranslation;
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter, ++ctr)
+ {
+ const TrellisPath &path = **iter;
+ // get words in translation
+ vector<Word> translation;
+ GetOutputWords(path, translation);
+
+ // collect n-gram counts
+ map < Phrase, int > counts;
+ extract_ngrams(translation,counts);
+
+ //Now score this translation
+ // Length term of the linear BLEU gain: theta_0 * |translation|.
+ float mbrScore = mbrThetas[0] * translation.size();
+
+ float ngramScore = 0;
+
+ // Accumulate, in log space, count(ngram) * posterior(ngram) * theta_|ngram|.
+ for (map < Phrase, int >::iterator ngrams = counts.begin(); ngrams != counts.end(); ++ngrams) {
+ float ngramPosterior = UNKNGRAMLOGPROB;
+ map<Phrase,float>::const_iterator ngramPosteriorIt = finalNgramScores.find(ngrams->first);
+ if (ngramPosteriorIt != finalNgramScores.end()) {
+ ngramPosterior = ngramPosteriorIt->second;
+ }
+
+ // NOTE(review): 0 doubles as the "nothing accumulated yet" sentinel;
+ // a genuine accumulated log score of exactly 0 would be overwritten.
+ if (ngramScore == 0) {
+ ngramScore = log((double) ngrams->second) + ngramPosterior + log(mbrThetas[(ngrams->first).GetSize()]);
+ }
+ else {
+ ngramScore = log_sum(ngramScore, float(log((double) ngrams->second) + ngramPosterior + log(mbrThetas[(ngrams->first).GetSize()])));
+ }
+ //cout << "Ngram: " << ngrams->first << endl;
+ }
+
+ // Leave log space and add the n-gram term to the length term.
+ mbrScore += exp(ngramScore);
+
+ if (mbrScore > argmaxScore){
+ argmaxScore = mbrScore;
+ IFVERBOSE(2) {
+ VERBOSE(2,"HYP " << ctr << " IS NEW BEST: ");
+ for (size_t i = 0; i < translation.size(); ++i)
+ VERBOSE(2,translation[i]);
+ VERBOSE(2,"[" << argmaxScore << "]" << endl);
+ }
+ argmaxTranslation = translation;
+ }
+ }
+ return argmaxTranslation;
+}
+
+vector<Word> doLatticeMBR(Manager& manager, TrellisPathList& nBestList) {
+ const StaticData& staticData = StaticData::Instance();
+ std::map < int, bool > connected;
+ std::vector< const Hypothesis *> connectedList;
+ map<Phrase, float> ngramPosteriors;
+ std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
+ map<const Hypothesis*, vector<Edge> > incomingEdges;
+ vector< float> estimatedScores;
+ manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
+ pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor());
+ calcNgramPosteriors(connectedList, incomingEdges, staticData.GetMBRScale(), ngramPosteriors);
+ vector<Word> mbrBestHypo = calcMBRSol(nBestList, ngramPosteriors, staticData.GetLatticeMBRThetas(),
+ staticData.GetLatticeMBRPrecision(), staticData.GetLatticeMBRPRatio());
+ return mbrBestHypo;
+}
+
diff --git a/moses-cmd/src/LatticeMBR.h b/moses-cmd/src/LatticeMBR.h
new file mode 100644
index 000000000..e6a67cc2b
--- /dev/null
+++ b/moses-cmd/src/LatticeMBR.h
@@ -0,0 +1,117 @@
+/*
+ * LatticeMBR.h
+ * moses-cmd
+ *
+ * Created by Abhishek Arun on 26/01/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#ifndef moses_cmd_LatticeMBR_h
+#define moses_cmd_LatticeMBR_h
+
+#include <map>
+#include <vector>
+#include <set>
+#include "Hypothesis.h"
+#include "Manager.h"
+#include "TrellisPathList.h"
+
+using namespace Moses;
+
/**
 * Numerically stable log(exp(log_a) + exp(log_b)).
 *
 * Always exponentiates the non-positive difference of the two logs, so the
 * exp() cannot overflow. Uses log1p instead of log(1 + x): when the two
 * inputs are far apart, exp(diff) is tiny and log(1 + tiny) would round to
 * 0 in floating point, whereas log1p keeps full precision.
 */
template<class T>
T log_sum (T log_a, T log_b)
{
  if (log_a < log_b) {
    return log_b + std::log1p(std::exp(log_a - log_b));
  } else {
    return log_a + std::log1p(std::exp(log_b - log_a));
  }
}
+
+class Edge;
+
+typedef std::vector< const Hypothesis *> Lattice; // all nodes of the search lattice
+typedef vector<const Edge*> Path; // a sequence of consecutive edges spanned by an n-gram
+typedef map<Path, size_t> PathCounts; // path -> number of times it yields the n-gram
+typedef map<Phrase, PathCounts > NgramHistory; // n-gram -> all paths that produce it
+
+/**
+ * An edge of the translation lattice: the transition from hypothesis
+ * m_tailNode to hypothesis m_headNode, emitting m_targetPhrase with
+ * (log) score m_score. Also lazily caches the edge's n-gram history.
+ */
+class Edge {
+ const Hypothesis* m_tailNode; // predecessor hypothesis (edge source)
+ const Hypothesis* m_headNode; // successor hypothesis (edge destination)
+ float m_score; // edge (log) score
+ TargetPhrase m_targetPhrase; // words emitted by this transition
+ NgramHistory m_ngrams; // lazily-built cache; see GetNgrams()
+
+ public:
+ Edge(const Hypothesis* from, const Hypothesis* to, float score, const TargetPhrase& targetPhrase) : m_tailNode(from), m_headNode(to), m_score(score), m_targetPhrase(targetPhrase) {
+ //cout << "Creating new edge from Node " << from->GetId() << ", to Node : " << to->GetId() << ", score: " << score << " phrase: " << targetPhrase << endl;
+ }
+
+ const Hypothesis* GetHeadNode() const {
+ return m_headNode;
+ }
+
+ const Hypothesis* GetTailNode() const {
+ return m_tailNode;
+ }
+
+ float GetScore() const {
+ return m_score;
+ }
+
+ // Number of words in the target phrase emitted by this edge.
+ size_t GetWordsSize() const {
+ return m_targetPhrase.GetSize();
+ }
+
+ const Phrase& GetWords() const {
+ return m_targetPhrase;
+ }
+
+ friend ostream& operator<< (ostream& out, const Edge& edge);
+
+ // Lazily computes all n-grams ending in this edge (memoised in m_ngrams).
+ const NgramHistory& GetNgrams( map<const Hypothesis*, vector<Edge> > & incomingEdges) ;
+
+ // Lexicographic ordering on (head node id, tail node id, score).
+ bool operator < (const Edge & compare) const;
+
+ // Appends the last lastN words of origPhrase to targetPhrase.
+ void GetPhraseSuffix(const Phrase& origPhrase, size_t lastN, Phrase& targetPhrase) const;
+
+ // Records that 'phrase' is produced by 'path', 'count' additional times.
+ void storeNgramHistory(const Phrase& phrase, Path & path, size_t count = 1){
+ m_ngrams[phrase][path]+= count;
+ }
+
+};
+
+/**
+* Data structure to hold the ngram scores as we traverse the lattice. Maps (hypo,ngram) to score
+*/
+class NgramScores {
+ public:
+ NgramScores() {}
+
+ /** logsum this score to the existing score */
+ void addScore(const Hypothesis* node, const Phrase& ngram, float score);
+
+ /** Iterate through ngrams for selected node */
+ typedef map<const Phrase*, float>::const_iterator NodeScoreIterator;
+ NodeScoreIterator nodeBegin(const Hypothesis* node);
+ NodeScoreIterator nodeEnd(const Hypothesis* node);
+
+ private:
+ // NOTE(review): presumably the Phrase* keys in m_scores point into m_ngrams
+ // so each distinct n-gram is stored once -- confirm in addScore()'s definition.
+ set<Phrase> m_ngrams; // owns the phrase objects
+ map<const Hypothesis*, map<const Phrase*, float> > m_scores; // node -> (ngram -> log score)
+};
+
+// Prunes the lattice around the best hypothesis, keeping roughly edgeDensity
+// edges per source word, and fills the incomingEdges map for the survivors.
+void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set <const Hypothesis* > > & outgoingHyps, map<const Hypothesis*, vector<Edge> >& incomingEdges,
+ const vector< float> & estimatedScores, const Hypothesis*, size_t edgeDensity);
+
+// Pick the minimum-Bayes-risk translation; empty thetas are derived from the
+// precision and ratio arguments.
+vector<Word> calcMBRSol(Lattice & connectedHyp, map<Phrase, float>& finalNgramScores,const vector<float> & thetas, float, float);
+vector<Word> calcMBRSol(const TrellisPathList& nBestList, map<Phrase, float>& finalNgramScores,const vector<float> & thetas, float, float);
+// Computes normalised n-gram log posteriors over the lattice.
+void calcNgramPosteriors(Lattice & connectedHyp, map<const Hypothesis*, vector<Edge> >& incomingEdges, float scale, map<Phrase, float>& finalNgramScores);
+// NOTE(review): declared as GetOutputFactors, but the .cpp above calls
+// GetOutputWords -- verify this declaration matches an actual definition.
+void GetOutputFactors(const TrellisPath &path, vector <Word> &translation);
+void extract_ngrams(const vector<Word >& sentence, map < Phrase, int > & allngrams);
+// Comparator: order hypotheses by number of source words covered, ascending.
+bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b);
+// End-to-end lattice MBR for one decoded sentence.
+vector<Word> doLatticeMBR(Manager& manager, TrellisPathList& nBestList);
+#endif
diff --git a/moses-cmd/src/LatticeMBRGrid.cpp b/moses-cmd/src/LatticeMBRGrid.cpp
new file mode 100644
index 000000000..89b69b36e
--- /dev/null
+++ b/moses-cmd/src/LatticeMBRGrid.cpp
@@ -0,0 +1,204 @@
+// $Id: $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2010 University of Edinburgh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of the University of Edinburgh nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+/**
+* Lattice MBR grid search. Enables a grid search through the four parameters (p,r,scale and prune) used in lattice MBR.
+ See 'Lattice Minimum Bayes-Risk Decoding for Statistical Machine Translation by Tromble, Kumar, Och and Macherey,
+ EMNLP 2008 for details of the parameters.
+
+ The grid search is controlled by specifying comma separated lists for the lmbr parameters (-lmbr-p, -lmbr-r,
+ -lmbr-pruning-factor and -mbr-scale). All other parameters are passed through to moses. If any of the lattice mbr
+ parameters are missing, then they are set to their default values. Output is of the form:
+ sentence-id ||| p r prune scale ||| translation-hypothesis
+**/
+
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <stdexcept>
+#include <set>
+
+#include "IOWrapper.h"
+#include "LatticeMBR.h"
+#include "Manager.h"
+#include "StaticData.h"
+
+
+using namespace std;
+using namespace Moses;
+
+//Keys identifying the four lattice-MBR parameters that can be grid-searched.
+enum gridkey {lmbr_p,lmbr_r,lmbr_prune,lmbr_scale};
+
+/**
+ * Holds the list of values to sweep for each lattice-MBR parameter, and
+ * strips the grid-defining options from the command line so the remaining
+ * arguments can be handed to moses' own parser.
+ */
+class Grid {
+ public:
+ /** Add a parameter with key, command line argument, and default value */
+ void addParam(gridkey key, const string& arg, float defaultValue) {
+ m_args[arg] = key;
+ assert(m_grid.find(key) == m_grid.end());
+ m_grid[key].push_back(defaultValue);
+ }
+
+ /** Parse the arguments, removing those that define the grid and returning a copy of the rest */
+ void parseArgs(int& argc, char**& argv) {
+ char** newargv = new char*[argc+1]; //Space to add mbr parameter
+ int newargc = 0;
+ for (int i = 0; i < argc; ++i) {
+ bool consumed = false;
+ for (map<string,gridkey>::const_iterator argi = m_args.begin(); argi != m_args.end(); ++argi) {
+ if (!strcmp(argv[i], argi->first.c_str())) {
+ ++i; // the next argv entry is this option's value
+ if (i >= argc) {
+ cerr << "Error: missing parameter for " << argi->first << endl;
+ throw runtime_error("Missing parameter");
+ } else {
+ string value = argv[i];
+ gridkey key = argi->second;
+ // Still holding exactly the default => first occurrence of this option.
+ if (m_grid[key].size() != 1) {
+ throw runtime_error("Duplicate grid argument");
+ }
+ m_grid[key].clear(); // drop the default; keep only user-supplied values
+ char delim = ',';
+ // Split the comma-separated list of values.
+ string::size_type lastpos = value.find_first_not_of(delim);
+ string::size_type pos = value.find_first_of(delim,lastpos);
+ while (string::npos != pos || string::npos != lastpos) {
+ float param = atof(value.substr(lastpos, pos-lastpos).c_str());
+ // NOTE(review): atof returns 0 both on parse failure and for a
+ // literal "0", so a legitimate zero grid value is rejected here;
+ // strtof with end-pointer checking would distinguish the two.
+ if (!param) {
+ cerr << "Error: Illegal grid parameter for " << argi->first << endl;
+ throw runtime_error("Illegal grid parameter");
+ }
+ m_grid[key].push_back(param);
+ lastpos = value.find_first_not_of(delim,pos);
+ pos = value.find_first_of(delim,lastpos);
+ }
+ consumed = true;
+ }
+ if (consumed) break;
+ }
+ }
+ if (!consumed) {
+ // Not a grid option: copy it through for moses' own parser.
+ // (newargv and these strings are never freed; they live for the
+ // duration of the program.)
+ newargv[newargc] = new char[strlen(argv[i]) + 1];
+ strcpy(newargv[newargc],argv[i]);
+ ++newargc;
+ }
+ }
+ argc = newargc;
+ argv = newargv;
+ }
+
+ /** Get the grid for a particular key.*/
+ const vector<float>& getGrid(gridkey key) const {
+ map<gridkey,vector<float> >::const_iterator iter = m_grid.find(key);
+ assert (iter != m_grid.end());
+ return iter->second;
+
+ }
+
+ private:
+ map<gridkey,vector<float> > m_grid; // key -> values to sweep
+ map<string,gridkey> m_args; // command-line flag -> key
+};
+
+int main(int argc, char* argv[]) {
+ cerr << "Lattice MBR Grid search" << endl;
+
+ Grid grid;
+ grid.addParam(lmbr_p, "-lmbr-p", 0.5);
+ grid.addParam(lmbr_r, "-lmbr-r", 0.5);
+ grid.addParam(lmbr_prune, "-lmbr-pruning-factor",30.0);
+ grid.addParam(lmbr_scale, "-mbr-scale",1.0);
+
+ grid.parseArgs(argc,argv);
+
+ Parameter* params = new Parameter();
+ if (!params->LoadParam(argc,argv)) {
+ params->Explain();
+ exit(1);
+ }
+ if (!StaticData::LoadDataStatic(params)) {
+ exit(1);
+ }
+
+ StaticData& staticData = const_cast<StaticData&>(StaticData::Instance());
+ staticData.SetUseLatticeMBR(true);
+ IOWrapper* ioWrapper = GetIODevice(staticData);
+
+ if (!ioWrapper) {
+ throw runtime_error("Failed to initialise IOWrapper");
+ }
+ size_t nBestSize = staticData.GetMBRSize();
+
+ if (nBestSize <= 0){
+ throw new runtime_error("Non-positive size specified for n-best list");
+ }
+
+ size_t lineCount = 0;
+ InputType* source = NULL;
+
+ const vector<float>& pgrid = grid.getGrid(lmbr_p);
+ const vector<float>& rgrid = grid.getGrid(lmbr_r);
+ const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
+ const vector<float>& scale_grid = grid.getGrid(lmbr_scale);
+
+ while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+ ++lineCount;
+ Sentence sentence(Input);
+ Manager manager(*source,staticData.GetSearchAlgorithm());
+ manager.ProcessSentence();
+ TrellisPathList nBestList;
+ manager.CalcNBest(nBestSize, nBestList,true);
+ //grid search
+ for (vector<float>::const_iterator pi = pgrid.begin(); pi != pgrid.end(); ++pi) {
+ float p = *pi;
+ staticData.SetLatticeMBRPrecision(p);
+ for (vector<float>::const_iterator ri = rgrid.begin(); ri != rgrid.end(); ++ri) {
+ float r = *ri;
+ staticData.SetLatticeMBRPRatio(r);
+ for (vector<float>::const_iterator prune_i = prune_grid.begin(); prune_i != prune_grid.end(); ++prune_i) {
+ size_t prune = (size_t)(*prune_i);
+ staticData.SetLatticeMBRPruningFactor(prune);
+ for (vector<float>::const_iterator scale_i = scale_grid.begin(); scale_i != scale_grid.end(); ++scale_i) {
+ float scale = *scale_i;
+ staticData.SetMBRScale(scale);
+ cout << lineCount << " ||| " << p << " " << r << " " << prune << " " << scale << " ||| ";
+ vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
+ OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),cout);
+ }
+ }
+
+ }
+ }
+
+
+ }
+
+}
diff --git a/moses-cmd/src/Main.cpp b/moses-cmd/src/Main.cpp
new file mode 100644
index 000000000..8a053ff56
--- /dev/null
+++ b/moses-cmd/src/Main.cpp
@@ -0,0 +1,233 @@
+// $Id: Main.cpp 2954 2010-03-07 08:28:16Z abarun $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2006 University of Edinburgh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of the University of Edinburgh nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+// example file on how to use moses library
+
+#ifdef WIN32
+// Include Visual Leak Detector
+#include <vld.h>
+#endif
+
+#include <fstream>
+#include "Main.h"
+#include "TrellisPath.h"
+#include "FactorCollection.h"
+#include "Manager.h"
+#include "Phrase.h"
+#include "Util.h"
+#include "TrellisPathList.h"
+#include "Timer.h"
+#include "IOWrapper.h"
+#include "Sentence.h"
+#include "ConfusionNet.h"
+#include "WordLattice.h"
+#include "TranslationAnalysis.h"
+#include "mbr.h"
+#include "LatticeMBR.h"
+
+#if HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_PROTOBUF
+#include "hypergraph.pb.h"
+#endif
+
+
+using namespace std;
+using namespace Moses;
+
+/**
+ * Decoder entry point: loads the configuration, then reads input sentences
+ * one at a time and writes the best translation for each, using plain
+ * max-derivation decoding, MBR, or lattice MBR depending on the options.
+ */
+int main(int argc, char* argv[])
+{
+
+
+#ifdef HAVE_PROTOBUF
+ GOOGLE_PROTOBUF_VERIFY_VERSION;
+#endif
+ IFVERBOSE(1)
+ {
+ TRACE_ERR("command: ");
+ for(int i=0;i<argc;++i) TRACE_ERR(argv[i]<<" ");
+ TRACE_ERR(endl);
+ }
+
+ cout.setf(std::ios::fixed);
+ cout.precision(3);
+ cerr.setf(std::ios::fixed);
+ cerr.precision(3);
+
+ // load data structures
+ Parameter parameter;
+ if (!parameter.LoadParam(argc, argv))
+ {
+ parameter.Explain();
+ return EXIT_FAILURE;
+ }
+
+ // NOTE(review): the reference is taken before LoadDataStatic runs; this
+ // works only if Instance() returns the same singleton that LoadDataStatic
+ // populates -- confirm that ordering is intended.
+ const StaticData &staticData = StaticData::Instance();
+ if (!StaticData::LoadDataStatic(&parameter))
+ return EXIT_FAILURE;
+
+ // set up read/writing class
+ IOWrapper *ioWrapper = GetIODevice(staticData);
+
+ // check on weights
+ vector<float> weights = staticData.GetAllWeights();
+ IFVERBOSE(2) {
+ TRACE_ERR("The score component vector looks like this:\n" << staticData.GetScoreIndexManager());
+ TRACE_ERR("The global weight vector looks like this:");
+ for (size_t j=0; j<weights.size(); j++) { TRACE_ERR(" " << weights[j]); }
+ TRACE_ERR("\n");
+ }
+ // every score must have a weight! check that here:
+ if(weights.size() != staticData.GetScoreIndexManager().GetTotalNumberOfScores()) {
+ TRACE_ERR("ERROR: " << staticData.GetScoreIndexManager().GetTotalNumberOfScores() << " score components, but " << weights.size() << " weights defined" << std::endl);
+ return EXIT_FAILURE;
+ }
+
+ if (ioWrapper == NULL)
+ return EXIT_FAILURE;
+
+ // read each sentence & decode
+ InputType *source=0;
+ size_t lineCount = 0;
+ while(ReadInput(*ioWrapper,staticData.GetInputType(),source))
+ {
+ // note: source is only valid within this while loop!
+ IFVERBOSE(1)
+ ResetUserTime();
+
+ VERBOSE(2,"\nTRANSLATING(" << ++lineCount << "): " << *source);
+
+ Manager manager(*source, staticData.GetSearchAlgorithm());
+ manager.ProcessSentence();
+
+ // optional dumps of the word graph / search graph for this sentence
+ if (staticData.GetOutputWordGraph())
+ manager.GetWordGraph(source->GetTranslationId(), ioWrapper->GetOutputWordGraphStream());
+
+ if (staticData.GetOutputSearchGraph())
+ manager.GetSearchGraph(source->GetTranslationId(), ioWrapper->GetOutputSearchGraphStream());
+
+#ifdef HAVE_PROTOBUF
+ // serialise the search graph as a protocol buffer, one file per sentence
+ if (staticData.GetOutputSearchGraphPB()) {
+ ostringstream sfn;
+ sfn << staticData.GetParam("output-search-graph-pb")[0] << '/' << source->GetTranslationId() << ".pb" << ends;
+ string fn = sfn.str();
+ VERBOSE(2, "Writing search graph to " << fn << endl);
+ fstream output(fn.c_str(), ios::trunc | ios::binary | ios::out);
+ manager.SerializeSearchGraphPB(source->GetTranslationId(), output);
+ }
+#endif
+
+ //Print all derivations in search graph
+ if (staticData.PrintAllDerivations()) {
+ manager.PrintAllDerivations(source->GetTranslationId());
+ }
+
+ // pick best translation (maximum a posteriori decoding)
+ if (! staticData.UseMBR() ) {
+ ioWrapper->OutputBestHypo(manager.GetBestHypothesis(), source->GetTranslationId(),
+ staticData.GetReportSegmentation(), staticData.GetReportAllFactors());
+ IFVERBOSE(2) { PrintUserTime("Best Hypothesis Generation Time:"); }
+
+ // n-best
+ size_t nBestSize = staticData.GetNBestSize();
+ if (nBestSize > 0)
+ {
+ VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl);
+ TrellisPathList nBestList;
+ manager.CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest());
+ ioWrapper->OutputNBestList(nBestList, source->GetTranslationId());
+ //RemoveAllInColl(nBestList);
+
+ IFVERBOSE(2) { PrintUserTime("N-Best Hypotheses Generation Time:"); }
+ }
+ }
+ else {
+ // (L)MBR decoding: rescore an n-best list of size mbr-size
+ size_t nBestSize = staticData.GetMBRSize();
+
+ // NOTE(review): nBestSize is unsigned, so "<= 0" can only mean "== 0".
+ if (nBestSize <= 0)
+ {
+ cerr << "ERROR: negative size for number of MBR candidate translations not allowed (option mbr-size)" << endl;
+ return EXIT_FAILURE;
+ }
+ TrellisPathList nBestList;
+ manager.CalcNBest(nBestSize, nBestList,true);
+ VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
+ IFVERBOSE(2) { PrintUserTime("calculated n-best list for (L)MBR decoding"); }
+ if (staticData.UseLatticeMBR()) {
+ vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
+ OutputBestHypo(mbrBestHypo, source->GetTranslationId(), staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),cout);
+ IFVERBOSE(2) { PrintUserTime("finished Lattice MBR decoding"); }
+ } else {
+ std::vector<const Factor*> mbrBestHypo = doMBR(nBestList);
+ OutputBestHypo(mbrBestHypo, source->GetTranslationId(),
+ staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),cout);
+ IFVERBOSE(2) { PrintUserTime("finished MBR decoding"); }
+ }
+
+ if (!staticData.GetNBestFilePath().empty()){
+ //print the full n-best list used for MBR (not just the number requested
+ //via the n-best-list parameter)
+ VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl);
+ ioWrapper->OutputNBestList(nBestList, source->GetTranslationId());
+ IFVERBOSE(2) { PrintUserTime("N-Best Hypotheses Generation Time:"); }
+ }
+ }
+
+
+
+ if (staticData.IsDetailedTranslationReportingEnabled()) {
+ TranslationAnalysis::PrintTranslationAnalysis(std::cerr, manager.GetBestHypothesis());
+ }
+
+ IFVERBOSE(2) { PrintUserTime("Sentence Decoding Time:"); }
+
+ manager.CalcDecoderStatistics();
+
+ }
+
+ delete ioWrapper;
+
+ IFVERBOSE(1)
+ PrintUserTime("End.");
+
+ #ifndef EXIT_RETURN
+ //This avoids calling destructors, which can take a long time
+ exit(EXIT_SUCCESS);
+ #else
+ return EXIT_SUCCESS;
+ #endif
+}
+
+
diff --git a/moses-cmd/src/Main.h b/moses-cmd/src/Main.h
new file mode 100644
index 000000000..67dfc30a0
--- /dev/null
+++ b/moses-cmd/src/Main.h
@@ -0,0 +1,43 @@
+// $Id: Main.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2006 University of Edinburgh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of the University of Edinburgh nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+// example file on how to use moses library
+
+#ifndef moses_cmd_Main_h
+#define moses_cmd_Main_h
+
+#include "StaticData.h"
+
+class IOWrapper;
+
+int main(int argc, char* argv[]);
+#endif
diff --git a/moses-cmd/src/MainMT.cpp b/moses-cmd/src/MainMT.cpp
new file mode 100644
index 000000000..252b165c7
--- /dev/null
+++ b/moses-cmd/src/MainMT.cpp
@@ -0,0 +1,268 @@
+// $Id: $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2009 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+/**
+ * Main for multithreaded moses.
+ **/
+
+#include <sstream>
+#include <vector>
+
+#include <boost/thread/mutex.hpp>
+
+#if defined(BOOST_HAS_PTHREADS)
+#include <pthread.h>
+#endif
+
+
+#include "Hypothesis.h"
+#include "IOWrapper.h"
+#include "LatticeMBR.h"
+#include "Manager.h"
+#include "StaticData.h"
+#include "ThreadPool.h"
+#include "Util.h"
+#include "mbr.h"
+
+using namespace std;
+using namespace Moses;
+
+
+/**
+ * Makes sure output goes in the correct order.
+ **/
+class OutputCollector {
+ public:
+ OutputCollector(std::ostream* outStream= &cout, std::ostream* debugStream=&cerr) :
+ m_nextOutput(0),m_outStream(outStream),m_debugStream(debugStream) {}
+
+
+ /**
+ * Write or cache the output, as appropriate.
+ **/
+ void Write(int sourceId,const string& output,const string& debug="") {
+ boost::mutex::scoped_lock lock(m_mutex);
+ if (sourceId == m_nextOutput) {
+ //This is the one we were expecting
+ *m_outStream << output;
+ *m_debugStream << debug;
+ ++m_nextOutput;
+ //see if there's any more
+ map<int,string>::iterator iter;
+ while ((iter = m_outputs.find(m_nextOutput)) != m_outputs.end()) {
+ *m_outStream << iter->second;
+ m_outputs.erase(iter);
+ ++m_nextOutput;
+ map<int,string>::iterator debugIter = m_debugs.find(iter->first);
+ if (debugIter != m_debugs.end()) {
+ *m_debugStream << debugIter->second;
+ m_debugs.erase(debugIter);
+ }
+ }
+ } else {
+ //save for later
+ m_outputs[sourceId] = output;
+ m_debugs[sourceId] = debug;
+ }
+ }
+
+ private:
+ map<int,string> m_outputs;
+ map<int,string> m_debugs;
+ int m_nextOutput;
+ ostream* m_outStream;
+ ostream* m_debugStream;
+ boost::mutex m_mutex;
+};
+
+/**
+ * Translates a sentence.
+ **/
+class TranslationTask : public Task {
+
+ public:
+
+ TranslationTask(size_t lineNumber,
+ InputType* source, OutputCollector* outputCollector, OutputCollector* nbestCollector) :
+ m_source(source), m_lineNumber(lineNumber),
+ m_outputCollector(outputCollector), m_nbestCollector(nbestCollector) {}
+
+ void Run() {
+#if defined(BOOST_HAS_PTHREADS)
+ TRACE_ERR("Translating line " << m_lineNumber << " in thread id " << (int)pthread_self() << std::endl);
+#endif
+ const StaticData &staticData = StaticData::Instance();
+ Sentence sentence(Input);
+ Manager manager(*m_source,staticData.GetSearchAlgorithm());
+ manager.ProcessSentence();
+
+ if (m_outputCollector) {
+ ostringstream out;
+ ostringstream debug;
+ const Hypothesis* bestHypo = NULL;
+ if (!staticData.UseMBR()) {
+ bestHypo = manager.GetBestHypothesis();
+ if (bestHypo) {
+ OutputSurface(
+ out,
+ bestHypo,
+ staticData.GetOutputFactorOrder(),
+ staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors());
+ IFVERBOSE(1) {
+ debug << "BEST TRANSLATION: " << *bestHypo << endl;
+ }
+ }
+ out << endl;
+ } else {
+ size_t nBestSize = staticData.GetMBRSize();
+ if (nBestSize <= 0) {
+ cerr << "ERROR: negative size for number of MBR candidate translations not allowed (option mbr-size)" << endl;
+ exit(1);
+ }
+ TrellisPathList nBestList;
+ manager.CalcNBest(nBestSize, nBestList,true);
+ VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
+ IFVERBOSE(2) { PrintUserTime("calculated n-best list for (L)MBR decoding"); }
+
+ if (staticData.UseLatticeMBR()) {
+ //Lattice MBR decoding
+ vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
+ OutputBestHypo(mbrBestHypo, m_lineNumber, staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),out);
+ IFVERBOSE(2) { PrintUserTime("finished Lattice MBR decoding"); }
+ } else {
+ //MBR decoding
+ std::vector<const Factor*> mbrBestHypo = doMBR(nBestList);
+ OutputBestHypo(mbrBestHypo, m_lineNumber,
+ staticData.GetReportSegmentation(),
+ staticData.GetReportAllFactors(),out);
+ IFVERBOSE(2) { PrintUserTime("finished MBR decoding"); }
+
+ }
+ }
+ m_outputCollector->Write(m_lineNumber,out.str(),debug.str());
+ }
+ if (m_nbestCollector) {
+ TrellisPathList nBestList;
+ ostringstream out;
+ manager.CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest());
+ OutputNBest(out,nBestList, staticData.GetOutputFactorOrder(), m_lineNumber);
+ m_nbestCollector->Write(m_lineNumber, out.str());
+ }
+ }
+
+ ~TranslationTask() {delete m_source;}
+
+ private:
+ InputType* m_source;
+ size_t m_lineNumber;
+ OutputCollector* m_outputCollector;
+ OutputCollector* m_nbestCollector;
+
+};
+
+int main(int argc, char** argv) {
+ //extract pool-size args, send others to moses
+ char** mosesargv = new char*[argc+2];
+ int mosesargc = 0;
+ int threadcount = 10;
+ for (int i = 0; i < argc; ++i) {
+ if (!strcmp(argv[i], "-threads")) {
+ ++i;
+ if (i >= argc) {
+ cerr << "Error: Missing argument to -threads" << endl;
+ exit(1);
+ } else {
+ threadcount = atoi(argv[i]);
+ }
+ } else {
+ mosesargv[mosesargc] = new char[strlen(argv[i])+1];
+ strcpy(mosesargv[mosesargc],argv[i]);
+ ++mosesargc;
+ }
+ }
+ if (threadcount <= 0) {
+ cerr << "Error: Must specify a positive number of threads" << endl;
+ exit(1);
+ }
+
+ Parameter* params = new Parameter();
+ if (!params->LoadParam(mosesargc,mosesargv)) {
+ params->Explain();
+ exit(1);
+ }
+ if (!StaticData::LoadDataStatic(params)) {
+ exit(1);
+ }
+
+ const StaticData& staticData = StaticData::Instance();
+ IOWrapper* ioWrapper = GetIODevice(staticData);
+
+ if (!ioWrapper) {
+ cerr << "Error; Failed to create IO object" << endl;
+ exit(1);
+ }
+ ThreadPool pool(threadcount);
+ InputType* source = NULL;
+ size_t lineCount = 0;
+ auto_ptr<OutputCollector> outputCollector;//for translations
+ auto_ptr<OutputCollector> nbestCollector;
+ auto_ptr<ofstream> nbestOut;
+ size_t nbestSize = staticData.GetNBestSize();
+ string nbestFile = staticData.GetNBestFilePath();
+ if (nbestSize) {
+ if (nbestFile == "-") {
+ //nbest to stdout, no 1-best
+ //FIXME: Moses doesn't actually let you pass a '-' on the command line.
+ nbestCollector.reset(new OutputCollector());
+ } else {
+ //nbest to file, 1-best to stdout
+ nbestOut.reset(new ofstream(nbestFile.c_str()));
+ assert(nbestOut->good());
+ nbestCollector.reset(new OutputCollector(nbestOut.get()));
+ outputCollector.reset(new OutputCollector());
+ }
+ } else {
+ outputCollector.reset(new OutputCollector());
+ }
+
+ while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+ TranslationTask* task =
+ new TranslationTask(lineCount,source, outputCollector.get(), nbestCollector.get());
+ pool.Submit(task);
+ source = NULL; //make sure it doesn't get deleted
+ ++lineCount;
+ }
+
+ pool.Stop(true); //flush remaining jobs
+
+ #ifndef EXIT_RETURN
+ //This avoids that detructors are called (it can take a long time)
+ exit(EXIT_SUCCESS);
+ #else
+ return EXIT_SUCCESS;
+ #endif
+}
+
+
+
+
diff --git a/moses-cmd/src/Makefile.am b/moses-cmd/src/Makefile.am
new file mode 100644
index 000000000..f5de6d79c
--- /dev/null
+++ b/moses-cmd/src/Makefile.am
@@ -0,0 +1,16 @@
# The multithreaded decoder (mosesmt) requires boost threads, so it is only
# built when threading support was configured in.
if WITH_THREADS
  bin_PROGRAMS = moses mosesmt lmbrgrid
else
  bin_PROGRAMS = moses lmbrgrid
endif

AM_CPPFLAGS = -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES -DUSE_HYPO_POOL -I$(top_srcdir)/moses/src $(BOOST_CPPFLAGS)

# single-threaded decoder
moses_SOURCES = Main.cpp mbr.cpp IOWrapper.cpp TranslationAnalysis.cpp LatticeMBR.cpp
moses_LDADD = $(top_builddir)/moses/src/libmoses.la $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB)

# multithreaded decoder
mosesmt_SOURCES = MainMT.cpp mbr.cpp IOWrapper.cpp TranslationAnalysis.cpp ThreadPool.cpp LatticeMBR.cpp
mosesmt_LDADD = $(top_builddir)/moses/src/libmoses.la $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB)

# lattice-MBR grid search tool
lmbrgrid_SOURCES = LatticeMBRGrid.cpp LatticeMBR.cpp IOWrapper.cpp
lmbrgrid_LDADD = $(top_builddir)/moses/src/libmoses.la $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB)
diff --git a/moses-cmd/src/ThreadPool.cpp b/moses-cmd/src/ThreadPool.cpp
new file mode 100644
index 000000000..cbed7c4d9
--- /dev/null
+++ b/moses-cmd/src/ThreadPool.cpp
@@ -0,0 +1,97 @@
+// $Id: $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2009 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+
+#include "ThreadPool.h"
+
+using namespace std;
+using namespace Moses;
+
+Moses::ThreadPool::ThreadPool( size_t numThreads )
+ : m_stopped(false), m_stopping(false)
+{
+ for (size_t i = 0; i < numThreads; ++i) {
+ m_threads.create_thread(boost::bind(&ThreadPool::Execute,this));
+ }
+}
+
// Worker-thread main loop: repeatedly pop a task from the shared queue and
// run it, until the pool is stopped.
void Moses::ThreadPool::Execute()
{
  do {
    Task* task = NULL;
    { // Find a job to perform
      boost::mutex::scoped_lock lock(m_mutex);
      if (m_tasks.empty() && !m_stopped) {
        // Sleep until Submit() or Stop() signals.  A spurious wakeup is
        // harmless: task stays NULL and the outer do-while re-checks.
        m_threadNeeded.wait(lock);
      }
      if (!m_stopped && !m_tasks.empty()) {
        task = m_tasks.front();
        m_tasks.pop();
      }
    }
    //Execute job outside the lock so other workers can make progress
    if (task) {
      task->Run();
      delete task;  // the pool owns submitted tasks
    }
    // Wake Stop(), which may be waiting for the queue to drain.
    m_threadAvailable.notify_all();
  } while (!m_stopped);
#if defined(BOOST_HAS_PTHREADS)
  TRACE_ERR("Thread " << (int)pthread_self() << " exiting" << endl);
#endif
}
+
+void Moses::ThreadPool::Submit( Task* task )
+{
+ boost::mutex::scoped_lock lock(m_mutex);
+ if (m_stopping) {
+ throw runtime_error("ThreadPool stopping - unable to accept new jobs");
+ }
+ m_tasks.push(task);
+ m_threadNeeded.notify_all();
+
+}
+
+void Moses::ThreadPool::Stop(bool processRemainingJobs)
+{
+ {
+ //prevent more jobs from being added to the queue
+ boost::mutex::scoped_lock lock(m_mutex);
+ if (m_stopped) return;
+ m_stopping = true;
+ }
+ if (processRemainingJobs) {
+ boost::mutex::scoped_lock lock(m_mutex);
+ //wait for queue to drain.
+ while (!m_tasks.empty() && !m_stopped) {
+ m_threadAvailable.wait(lock);
+ }
+ }
+ //tell all threads to stop
+ {
+ boost::mutex::scoped_lock lock(m_mutex);
+ m_stopped = true;
+ }
+ m_threadNeeded.notify_all();
+
+ cerr << m_threads.size() << endl;
+ m_threads.join_all();
+}
diff --git a/moses-cmd/src/ThreadPool.h b/moses-cmd/src/ThreadPool.h
new file mode 100644
index 000000000..eb8ced97f
--- /dev/null
+++ b/moses-cmd/src/ThreadPool.h
@@ -0,0 +1,118 @@
+// $Id: $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2009 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#ifndef moses_cmd_ThreadPool_h
+#define moses_cmd_ThreadPool_h
+
+#include <iostream>
+#include <queue>
+#include <vector>
+
+
+#include <boost/bind.hpp>
+#include <boost/thread.hpp>
+
+#if defined(BOOST_HAS_PTHREADS)
+#include <pthread.h>
+#endif
+
+
+#include "Util.h"
+
+
+/**
+ * Classes to implement a ThreadPool.
+ **/
+
+namespace Moses {
+
+
/**
* A task to be executed by the ThreadPool.  Subclasses implement Run();
* the pool takes ownership of submitted tasks and deletes them after
* Run() returns.
**/
class Task {
  public:
    // Called exactly once, on a pool worker thread.
    virtual void Run() = 0;
    virtual ~Task() {}
};
+
class ThreadPool {
 public:
  /**
   * Construct a thread pool of a fixed size.
   **/
  ThreadPool(size_t numThreads);


  /**
   * Add a job to the threadpool.  The pool takes ownership of the task
   * and deletes it after it has run.  Throws once Stop() has been called.
   **/
  void Submit(Task* task);

  /**
   * Wait until all queued jobs have completed, and shut down
   * the ThreadPool.
   **/
  void Stop(bool processRemainingJobs = false);

  ~ThreadPool() { Stop(); }



 private:
  /**
   * The main loop executed by each thread.
   **/
  void Execute();

  std::queue<Task*> m_tasks;                   // pending jobs, guarded by m_mutex
  boost::thread_group m_threads;               // the worker threads
  boost::mutex m_mutex;                        // guards queue and flags
  boost::condition_variable m_threadNeeded;    // signalled when work arrives or the pool stops
  boost::condition_variable m_threadAvailable; // signalled when a task finishes
  bool m_stopped;                              // workers must exit
  bool m_stopping;                             // no new submissions accepted

};
+
+
+class TestTask : public Task {
+ public:
+ TestTask(int id) : m_id(id) {}
+ virtual void Run() {
+#if defined(BOOST_HAS_PTHREADS)
+ int tid = (int)pthread_self();
+#else
+ int tid = 0;
+#endif
+ std::cerr << "Executing " << m_id << " in thread id " << tid << std::endl;
+ }
+
+ virtual ~TestTask() {}
+
+ private:
+ int m_id;
+};
+
+
+
+}
+#endif
diff --git a/moses-cmd/src/TranslationAnalysis.cpp b/moses-cmd/src/TranslationAnalysis.cpp
new file mode 100644
index 000000000..add50ff3c
--- /dev/null
+++ b/moses-cmd/src/TranslationAnalysis.cpp
@@ -0,0 +1,114 @@
+// $Id: TranslationAnalysis.cpp 2717 2010-01-28 15:32:04Z phkoehn $
+
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <sstream>
#include "StaticData.h"
#include "Hypothesis.h"
#include "TranslationAnalysis.h"
+
+using namespace Moses;
+
+namespace TranslationAnalysis {
+
+void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo)
+{
+ os << std::endl << "TRANSLATION HYPOTHESIS DETAILS:" << std::endl;
+ std::vector<const Hypothesis*> translationPath;
+ while (hypo) {
+ translationPath.push_back(hypo);
+ hypo = hypo->GetPrevHypo();
+ }
+ std::reverse(translationPath.begin(), translationPath.end());
+
+ std::vector<std::string> droppedWords;
+ std::vector<const Hypothesis*>::iterator tpi = translationPath.begin();
+ ++tpi; // skip initial translation state
+ std::vector<std::string> sourceMap;
+ std::vector<std::string> targetMap;
+ std::vector<unsigned int> lmAcc(0);
+ size_t lmCalls = 0;
+ bool doLMStats = ((*tpi)->GetLMStats() != 0);
+ if (doLMStats)
+ lmAcc.resize((*tpi)->GetLMStats()->size(), 0);
+ for (; tpi != translationPath.end(); ++tpi) {
+ std::ostringstream sms;
+ std::ostringstream tms;
+ std::string target = (*tpi)->GetTargetPhraseStringRep();
+ std::string source = (*tpi)->GetSourcePhraseStringRep();
+ WordsRange twr = (*tpi)->GetCurrTargetWordsRange();
+ WordsRange swr = (*tpi)->GetCurrSourceWordsRange();
+
+ // language model backoff stats,
+ if (doLMStats) {
+ std::vector<std::vector<unsigned int> >& lmstats = *(*tpi)->GetLMStats();
+ std::vector<std::vector<unsigned int> >::iterator i = lmstats.begin();
+ std::vector<unsigned int>::iterator acc = lmAcc.begin();
+
+ for (; i != lmstats.end(); ++i, ++acc) {
+ std::vector<unsigned int>::iterator j = i->begin();
+ lmCalls += i->size();
+ for (; j != i->end(); ++j) {
+ (*acc) += *j;
+ }
+ }
+ }
+
+ bool epsilon = false;
+ if (target == "") {
+ target="<EPSILON>";
+ epsilon = true;
+ droppedWords.push_back(source);
+ }
+ os << " SOURCE: " << swr << " " << source << std::endl
+ << " TRANSLATED AS: " << target << std::endl;
+ size_t twr_i = twr.GetStartPos();
+ size_t swr_i = swr.GetStartPos();
+ if (!epsilon) { sms << twr_i; }
+ if (epsilon) { tms << "del(" << swr_i << ")"; } else { tms << swr_i; }
+ swr_i++; twr_i++;
+ for (; twr_i <= twr.GetEndPos() && twr.GetEndPos() != NOT_FOUND; twr_i++) {
+ sms << '-' << twr_i;
+ }
+ for (; swr_i <= swr.GetEndPos() && swr.GetEndPos() != NOT_FOUND; swr_i++) {
+ tms << '-' << swr_i;
+ }
+ if (!epsilon) targetMap.push_back(sms.str());
+ sourceMap.push_back(tms.str());
+ }
+ std::vector<std::string>::iterator si = sourceMap.begin();
+ std::vector<std::string>::iterator ti = targetMap.begin();
+ os << std::endl << "SOURCE/TARGET SPANS:";
+ os << std::endl << " SOURCE:";
+ for (; si != sourceMap.end(); ++si) {
+ os << " " << *si;
+ }
+ os << std::endl << " TARGET:";
+ for (; ti != targetMap.end(); ++ti) {
+ os << " " << *ti;
+ }
+ os << std::endl << std::endl;
+ if (doLMStats && lmCalls > 0) {
+ std::vector<unsigned int>::iterator acc = lmAcc.begin();
+ const LMList& lmlist = StaticData::Instance().GetAllLM();
+ LMList::const_iterator i = lmlist.begin();
+ for (; acc != lmAcc.end(); ++acc, ++i) {
+ char buf[256];
+ sprintf(buf, "%.4f", (float)(*acc)/(float)lmCalls);
+ os << (*i)->GetScoreProducerDescription() <<", AVG N-GRAM LENGTH: " << buf << std::endl;
+ }
+ }
+
+ if (droppedWords.size() > 0) {
+ std::vector<std::string>::iterator dwi = droppedWords.begin();
+ os << std::endl << "WORDS/PHRASES DROPPED:" << std::endl;
+ for (; dwi != droppedWords.end(); ++dwi) {
+ os << "\tdropped=" << *dwi << std::endl;
+ }
+ }
+ os << std::endl << "SCORES (UNWEIGHTED/WEIGHTED): ";
+ StaticData::Instance().GetScoreIndexManager().PrintLabeledWeightedScores(os, translationPath.back()->GetScoreBreakdown(), StaticData::Instance().GetAllWeights());
+ os << std::endl;
+}
+
+}
diff --git a/moses-cmd/src/TranslationAnalysis.h b/moses-cmd/src/TranslationAnalysis.h
new file mode 100644
index 000000000..877ca743d
--- /dev/null
+++ b/moses-cmd/src/TranslationAnalysis.h
@@ -0,0 +1,24 @@
+// $Id: TranslationAnalysis.h 2939 2010-02-24 11:15:44Z jfouet $
+
/*
 * Per-hypothesis translation diagnostics; also see moses/SentenceStats.
 */

#ifndef moses_cmd_TranslationAnalysis_h
#define moses_cmd_TranslationAnalysis_h

#include <iostream>
#include "Hypothesis.h"

namespace TranslationAnalysis
{

/***
 * Print details about the translation represented by hypo to os:
 * phrase alignment, dropped words/phrases, LM back-off statistics
 * and the (un)weighted score breakdown.
 */
 void PrintTranslationAnalysis(std::ostream &os, const Moses::Hypothesis* hypo);

}

#endif
diff --git a/moses-cmd/src/mbr.cpp b/moses-cmd/src/mbr.cpp
new file mode 100644
index 000000000..9a77fbc5c
--- /dev/null
+++ b/moses-cmd/src/mbr.cpp
@@ -0,0 +1,185 @@
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <vector>
+#include <map>
+#include <stdlib.h>
+#include <math.h>
+#include <algorithm>
+#include <stdio.h>
+#include "TrellisPathList.h"
+#include "TrellisPath.h"
+#include "StaticData.h"
+#include "Util.h"
+#include "mbr.h"
+
+using namespace std ;
+using namespace Moses;
+
+
+/* Input :
+ 1. a sorted n-best list, with duplicates filtered out in the following format
+ 0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
+
+ 2. a weight vector
+ 3. bleu order ( default = 4)
+ 4. scaling factor to weigh the weight vector (default = 1.0)
+
+ Output :
+ translations that minimise the Bayes Risk of the n-best list
+
+
+*/
+
+int BLEU_ORDER = 4;
+int SMOOTH = 1;
+float min_interval = 1e-4;
+void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
+{
+ vector< const Factor* > ngram;
+ for (int k = 0; k < BLEU_ORDER; k++)
+ {
+ for(int i =0; i < max((int)sentence.size()-k,0); i++)
+ {
+ for ( int j = i; j<= i+k; j++)
+ {
+ ngram.push_back(sentence[j]);
+ }
+ ++allngrams[ngram];
+ ngram.clear();
+ }
+ }
+}
+
// Smoothed sentence-level BLEU of hypothesis sents[hyp] scored against the
// pseudo-reference sents[ref], using the precomputed n-gram counts in
// ngram_stats.  Returns a value in [0,1].
float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats ) {
  int comps_n = 2*BLEU_ORDER+1;
  // comps[2i] = clipped matches of order i+1, comps[2i+1] = possible
  // n-grams of that order; the final slot holds the reference length.
  vector<int> comps(comps_n);
  float logbleu = 0.0, brevity;

  int hyp_length = sents[hyp].size();

  for (int i =0; i<BLEU_ORDER;i++)
  {
    comps[2*i] = 0;
    comps[2*i+1] = max(hyp_length-i,0);
  }

  map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
  map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;

  // clipped n-gram matches: each hyp n-gram counts at most as often as it
  // occurs in the reference
  for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
       it != hyp_ngrams.end(); it++)
  {
    map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
    if(ref_it != ref_ngrams.end())
    {
      comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
    }
  }
  comps[comps_n-1] = sents[ref].size();

  for (int i=0; i<BLEU_ORDER; i++)
  {
    // no unigram matches means BLEU is zero (NOTE(review): this tests
    // comps[0] on every iteration, matching the historical behaviour)
    if (comps[0] == 0)
      return 0.0;
    if ( i > 0 )
      // additive smoothing for higher orders
      logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
    else
      logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
  }
  logbleu /= BLEU_ORDER;
  brevity = 1.0-(float)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
  if (brevity < 0.0)
    logbleu += brevity;   // brevity penalty for short hypotheses
  return exp(logbleu);
}
+
+vector<const Factor*> doMBR(const TrellisPathList& nBestList){
+ float marginal = 0;
+
+ vector<float> joint_prob_vec;
+ vector< vector<const Factor*> > translations;
+ float joint_prob;
+ vector< map < vector <const Factor *>, int > > ngram_stats;
+
+ TrellisPathList::const_iterator iter;
+
+ // get max score to prevent underflow
+ float maxScore = -1e20;
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter)
+ {
+ const TrellisPath &path = **iter;
+ float score = StaticData::Instance().GetMBRScale()
+ * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights());
+ if (maxScore < score) maxScore = score;
+ }
+
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter)
+ {
+ const TrellisPath &path = **iter;
+ joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights()) - maxScore);
+ marginal += joint_prob;
+ joint_prob_vec.push_back(joint_prob);
+
+ // get words in translation
+ vector<const Factor*> translation;
+ GetOutputFactors(path, translation);
+
+ // collect n-gram counts
+ map < vector < const Factor *>, int > counts;
+ extract_ngrams(translation,counts);
+
+ ngram_stats.push_back(counts);
+ translations.push_back(translation);
+ }
+
+ vector<float> mbr_loss;
+ float bleu, weightedLoss;
+ float weightedLossCumul = 0;
+ float minMBRLoss = 1000000;
+ int minMBRLossIdx = -1;
+
+ /* Main MBR computation done here */
+ iter = nBestList.begin();
+ for (unsigned int i = 0; i < nBestList.GetSize(); i++){
+ weightedLossCumul = 0;
+ for (unsigned int j = 0; j < nBestList.GetSize(); j++){
+ if ( i != j) {
+ bleu = calculate_score(translations, j, i,ngram_stats );
+ weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
+ weightedLossCumul += weightedLoss;
+ if (weightedLossCumul > minMBRLoss)
+ break;
+ }
+ }
+ if (weightedLossCumul < minMBRLoss){
+ minMBRLoss = weightedLossCumul;
+ minMBRLossIdx = i;
+ }
+ iter++;
+ }
+ /* Find sentence that minimises Bayes Risk under 1- BLEU loss */
+ return translations[minMBRLossIdx];
+}
+
+void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation){
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
+ const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+ assert (outputFactorOrder.size() == 1);
+
+ // print the surface factor of the translation
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--)
+ {
+ const Hypothesis &edge = *edges[currEdge];
+ const Phrase &phrase = edge.GetCurrTargetPhrase();
+ size_t size = phrase.GetSize();
+ for (size_t pos = 0 ; pos < size ; pos++)
+ {
+
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
+ translation.push_back(factor);
+ }
+ }
+}
+
diff --git a/moses-cmd/src/mbr.h b/moses-cmd/src/mbr.h
new file mode 100644
index 000000000..467bbfa54
--- /dev/null
+++ b/moses-cmd/src/mbr.h
@@ -0,0 +1,28 @@
+// $Id: mbr.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
#ifndef moses_cmd_mbr_h
#define moses_cmd_mbr_h

// Minimum Bayes Risk decoding: return the n-best entry with minimal
// expected (1 - BLEU) loss against the rest of the list.
std::vector<const Moses::Factor*> doMBR(const Moses::TrellisPathList& nBestList);
// Append the surface factors of path's translation to `translation`.
void GetOutputFactors(const Moses::TrellisPath &path, std::vector <const Moses::Factor*> &translation);
// Smoothed sentence-level BLEU of sents[hyp] against sents[ref].
float calculate_score(const std::vector< std::vector<const Moses::Factor*> > & sents, int ref, int hyp, std::vector < std::map < std::vector < const Moses::Factor *>, int > > & ngram_stats );
#endif