From 96dd3ecb50b509dd2f4d5373d3c956cfc0ab8480 Mon Sep 17 00:00:00 2001 From: skyload Date: Wed, 21 Apr 2010 11:56:51 +0000 Subject: git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/DPR_MOSES@3160 1f5c12ca-751b-0410-a591-d2e778427230 --- moses-cmd/src/IOWrapper.cpp | 508 ++++++++++++++++++++++++++++++++++ moses-cmd/src/IOWrapper.h | 115 ++++++++ moses-cmd/src/LatticeMBR.cpp | 507 +++++++++++++++++++++++++++++++++ moses-cmd/src/LatticeMBR.h | 117 ++++++++ moses-cmd/src/LatticeMBRGrid.cpp | 204 ++++++++++++++ moses-cmd/src/Main.cpp | 233 ++++++++++++++++ moses-cmd/src/Main.h | 43 +++ moses-cmd/src/MainMT.cpp | 268 ++++++++++++++++++ moses-cmd/src/Makefile.am | 16 ++ moses-cmd/src/ThreadPool.cpp | 97 +++++++ moses-cmd/src/ThreadPool.h | 118 ++++++++ moses-cmd/src/TranslationAnalysis.cpp | 114 ++++++++ moses-cmd/src/TranslationAnalysis.h | 24 ++ moses-cmd/src/mbr.cpp | 185 +++++++++++++ moses-cmd/src/mbr.h | 28 ++ 15 files changed, 2577 insertions(+) create mode 100644 moses-cmd/src/IOWrapper.cpp create mode 100644 moses-cmd/src/IOWrapper.h create mode 100644 moses-cmd/src/LatticeMBR.cpp create mode 100644 moses-cmd/src/LatticeMBR.h create mode 100644 moses-cmd/src/LatticeMBRGrid.cpp create mode 100644 moses-cmd/src/Main.cpp create mode 100644 moses-cmd/src/Main.h create mode 100644 moses-cmd/src/MainMT.cpp create mode 100644 moses-cmd/src/Makefile.am create mode 100644 moses-cmd/src/ThreadPool.cpp create mode 100644 moses-cmd/src/ThreadPool.h create mode 100644 moses-cmd/src/TranslationAnalysis.cpp create mode 100644 moses-cmd/src/TranslationAnalysis.h create mode 100644 moses-cmd/src/mbr.cpp create mode 100644 moses-cmd/src/mbr.h (limited to 'moses-cmd/src') diff --git a/moses-cmd/src/IOWrapper.cpp b/moses-cmd/src/IOWrapper.cpp new file mode 100644 index 000000000..96573d88d --- /dev/null +++ b/moses-cmd/src/IOWrapper.cpp @@ -0,0 +1,508 @@ +// $Id: IOWrapper.cpp 2953 2010-03-07 07:57:48Z abarun $ + 
+/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (c) 2006 University of Edinburgh +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the University of Edinburgh nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +// example file on how to use moses library + +#include +#include "TypeDef.h" +#include "Util.h" +#include "IOWrapper.h" +#include "Hypothesis.h" +#include "WordsRange.h" +#include "TrellisPathList.h" +#include "StaticData.h" +#include "DummyScoreProducers.h" +#include "InputFileStream.h" + +using namespace std; +using namespace Moses; + +IOWrapper::IOWrapper( + const vector &inputFactorOrder + , const vector &outputFactorOrder + , const FactorMask &inputFactorUsed + , size_t nBestSize + , const string &nBestFilePath) +:m_inputFactorOrder(inputFactorOrder) +,m_outputFactorOrder(outputFactorOrder) +,m_inputFactorUsed(inputFactorUsed) +,m_inputFile(NULL) +,m_inputStream(&std::cin) +,m_nBestStream(NULL) +,m_outputWordGraphStream(NULL) +,m_outputSearchGraphStream(NULL) +{ + Initialization(inputFactorOrder, outputFactorOrder + , inputFactorUsed + , nBestSize, nBestFilePath); +} + +IOWrapper::IOWrapper(const std::vector &inputFactorOrder + , const std::vector &outputFactorOrder + , const FactorMask &inputFactorUsed + , size_t nBestSize + , const std::string &nBestFilePath + , const std::string &inputFilePath) +:m_inputFactorOrder(inputFactorOrder) +,m_outputFactorOrder(outputFactorOrder) +,m_inputFactorUsed(inputFactorUsed) +,m_inputFilePath(inputFilePath) +,m_inputFile(new InputFileStream(inputFilePath)) +,m_nBestStream(NULL) +,m_outputWordGraphStream(NULL) +,m_outputSearchGraphStream(NULL) +{ + Initialization(inputFactorOrder, outputFactorOrder + , inputFactorUsed + , nBestSize, nBestFilePath); + + m_inputStream = m_inputFile; +} + +IOWrapper::~IOWrapper() +{ + if (m_inputFile != NULL) + delete m_inputFile; + if (m_nBestStream != NULL && !m_surpressSingleBestOutput) + { // outputting n-best to file, rather than stdout. 
need to close file and delete obj + delete m_nBestStream; + } + if (m_outputWordGraphStream != NULL) + { + delete m_outputWordGraphStream; + } + if (m_outputSearchGraphStream != NULL) + { + delete m_outputSearchGraphStream; + } +} + +void IOWrapper::Initialization(const std::vector &inputFactorOrder + , const std::vector &outputFactorOrder + , const FactorMask &inputFactorUsed + , size_t nBestSize + , const std::string &nBestFilePath) +{ + const StaticData &staticData = StaticData::Instance(); + + // n-best + m_surpressSingleBestOutput = false; + + if (nBestSize > 0) + { + if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") + { + m_nBestStream = &std::cout; + m_surpressSingleBestOutput = true; + } + else + { + std::ofstream *file = new std::ofstream; + m_nBestStream = file; + file->open(nBestFilePath.c_str()); + } + } + + // wordgraph output + if (staticData.GetOutputWordGraph()) + { + string fileName = staticData.GetParam("output-word-graph")[0]; + std::ofstream *file = new std::ofstream; + m_outputWordGraphStream = file; + file->open(fileName.c_str()); + } + + // search graph output + if (staticData.GetOutputSearchGraph()) + { + string fileName; + if (staticData.GetOutputSearchGraphExtended()) + fileName = staticData.GetParam("output-search-graph-extended")[0]; + else + fileName = staticData.GetParam("output-search-graph")[0]; + std::ofstream *file = new std::ofstream; + m_outputSearchGraphStream = file; + file->open(fileName.c_str()); + } +} + +InputType*IOWrapper::GetInput(InputType* inputType) +{ + if(inputType->Read(*m_inputStream, m_inputFactorOrder)) + { + if (long x = inputType->GetTranslationId()) { if (x>=m_translationId) m_translationId = x+1; } + else inputType->SetTranslationId(m_translationId++); + + return inputType; + } + else + { + delete inputType; + return NULL; + } +} + +/*** + * print surface factor only for the given phrase + */ +void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector &outputFactorOrder, bool 
reportAllFactors) +{ + assert(outputFactorOrder.size() > 0); + if (reportAllFactors == true) + { + out << phrase; + } + else + { + size_t size = phrase.GetSize(); + for (size_t pos = 0 ; pos < size ; pos++) + { + const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]); + out << *factor; + + for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) + { + const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]); + out << "|" << *factor; + } + out << " "; + } + } +} + +void OutputSurface(std::ostream &out, const Hypothesis *hypo, const std::vector &outputFactorOrder + ,bool reportSegmentation, bool reportAllFactors) +{ + if ( hypo != NULL) + { + OutputSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors); + OutputSurface(out, hypo->GetCurrTargetPhrase(), outputFactorOrder, reportAllFactors); + + if (reportSegmentation == true + && hypo->GetCurrTargetPhrase().GetSize() > 0) { + out << "|" << hypo->GetCurrSourceWordsRange().GetStartPos() + << "-" << hypo->GetCurrSourceWordsRange().GetEndPos() << "| "; + } + } +} + + + + + +void IOWrapper::Backtrack(const Hypothesis *hypo){ + + if (hypo->GetPrevHypo() != NULL) { + VERBOSE(3,hypo->GetId() << " <= "); + Backtrack(hypo->GetPrevHypo()); + } +} + +void OutputBestHypo(const std::vector& mbrBestHypo, long /*translationId*/, bool reportSegmentation, bool reportAllFactors, ostream& out) +{ + for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) + { + const Factor *factor = mbrBestHypo[i]; + if (i>0) out << " "; + out << factor->GetString(); + } + out << endl; +} + +void OutputBestHypo(const std::vector& mbrBestHypo, long /*translationId*/, bool reportSegmentation, bool reportAllFactors, ostream& out) +{ + + for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) + { + const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]); + if (i>0) out << " "; + out << *factor; + } + out << endl; +} + + +void OutputInput(std::vector& map, const Hypothesis* 
hypo) +{ + if (hypo->GetPrevHypo()) + { + OutputInput(map, hypo->GetPrevHypo()); + map[hypo->GetCurrSourceWordsRange().GetStartPos()] = hypo->GetSourcePhrase(); + } +} + +void OutputInput(std::ostream& os, const Hypothesis* hypo) +{ + size_t len = hypo->GetInput().GetSize(); + std::vector inp_phrases(len, 0); + OutputInput(inp_phrases, hypo); + for (size_t i=0; i& outputFactorOrder,long translationId) +{ + const StaticData &staticData = StaticData::Instance(); + bool labeledOutput = staticData.IsLabeledNBestList(); + bool reportAllFactors = staticData.GetReportAllFactorsNBest(); + bool includeAlignment = staticData.NBestIncludesAlignment(); + bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest(); + + TrellisPathList::const_iterator iter; + for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) + { + const TrellisPath &path = **iter; + const std::vector &edges = path.GetEdges(); + + // print the surface factor of the translation + out << translationId << " ||| "; + for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) + { + const Hypothesis &edge = *edges[currEdge]; + OutputSurface(out, edge.GetCurrTargetPhrase(), outputFactorOrder, reportAllFactors); + } + out << " |||"; + + std::string lastName = ""; + const vector& sff = + staticData.GetScoreIndexManager().GetStatefulFeatureFunctions(); + for( size_t i=0; iGetScoreProducerWeightShortName() ) + { + lastName = sff[i]->GetScoreProducerWeightShortName(); + out << " " << lastName << ":"; + } + vector scores = path.GetScoreBreakdown().GetScoresForProducer( sff[i] ); + for (size_t j = 0; j& slf = + staticData.GetScoreIndexManager().GetStatelessFeatureFunctions(); + for( size_t i=0; iGetScoreProducerWeightShortName() ) + { + lastName = slf[i]->GetScoreProducerWeightShortName(); + out << " " << lastName << ":"; + } + vector scores = path.GetScoreBreakdown().GetScoresForProducer( slf[i] ); + for (size_t j = 0; j pds = StaticData::Instance().GetPhraseDictionaries(); + if (pds.size() 
> 0) { + if (labeledOutput) + out << " tm:"; + vector::iterator iter; + for (iter = pds.begin(); iter != pds.end(); ++iter) { + vector scores = path.GetScoreBreakdown().GetScoresForProducer(*iter); + for (size_t j = 0; j pds = StaticData::Instance().GetPhraseDictionaries(); + if (pds.size() > 0) { + vector::iterator iter; + + iter = pds.begin(); + vector scores = path.GetScoreBreakdown().GetScoresForProducer(*iter); + + size_t pd_numinputscore = (*iter)->GetNumInputScores(); + + if (pd_numinputscore){ + + if (labeledOutput) + out << " I:"; + + for (size_t j = 0; j < pd_numinputscore; ++j) + out << " " << scores[j]; + } + + + for (iter = pds.begin() ; iter != pds.end(); ++iter) { + vector scores = path.GetScoreBreakdown().GetScoresForProducer(*iter); + + size_t pd_numinputscore = (*iter)->GetNumInputScores(); + + if (iter == pds.begin() && labeledOutput) + out << " tm:"; + for (size_t j = pd_numinputscore; j < scores.size() ; ++j) + out << " " << scores[j]; + } + } + } + + // generation + vector gds = StaticData::Instance().GetGenerationDictionaries(); + if (gds.size() > 0) { + if (labeledOutput) + out << " g: "; + vector::iterator iter; + for (iter = gds.begin(); iter != gds.end(); ++iter) { + vector scores = path.GetScoreBreakdown().GetScoresForProducer(*iter); + for (size_t j = 0; j= 0 ; currEdge--) + { + const Hypothesis &edge = *edges[currEdge]; + const WordsRange &sourceRange = edge.GetCurrSourceWordsRange(); + WordsRange targetRange = path.GetTargetWordsRange(edge); + out << " " << sourceRange.GetStartPos(); + if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) { + out << "-" << sourceRange.GetEndPos(); + } + out<< "=" << targetRange.GetStartPos(); + if (targetRange.GetStartPos() < targetRange.GetEndPos()) { + out<< "-" << targetRange.GetEndPos(); + } + } + } + + if (StaticData::Instance().IsPathRecoveryEnabled()) { + out << "|||"; + OutputInput(out, edges[0]); + } + + out << endl; + } + + + out < &inputFactorOrder = staticData.GetInputFactorOrder() + 
,&outputFactorOrder = staticData.GetOutputFactorOrder(); + FactorMask inputFactorUsed(inputFactorOrder); + + // io + if (staticData.GetParam("input-file").size() == 1) + { + VERBOSE(2,"IO from File" << endl); + string filePath = staticData.GetParam("input-file")[0]; + + ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed + , staticData.GetNBestSize() + , staticData.GetNBestFilePath() + , filePath); + } + else + { + VERBOSE(1,"IO from STDOUT/STDIN" << endl); + ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed + , staticData.GetNBestSize() + , staticData.GetNBestFilePath()); + } + ioWrapper->ResetTranslationId(); + + IFVERBOSE(1) + PrintUserTime("Created input-output object"); + + return ioWrapper; +} diff --git a/moses-cmd/src/IOWrapper.h b/moses-cmd/src/IOWrapper.h new file mode 100644 index 000000000..f09df7e7c --- /dev/null +++ b/moses-cmd/src/IOWrapper.h @@ -0,0 +1,115 @@ +// $Id: IOWrapper.h 2939 2010-02-24 11:15:44Z jfouet $ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (c) 2006 University of Edinburgh +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the University of Edinburgh nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +// example file on how to use moses library + +#ifndef moses_cmd_IOWrapper_h +#define moses_cmd_IOWrapper_h + +#include +#include +#include + +#include "TypeDef.h" +#include "Sentence.h" +#include "FactorTypeSet.h" +#include "FactorCollection.h" +#include "Hypothesis.h" +#include "TrellisPathList.h" +#include "InputFileStream.h" +#include "InputType.h" +#include "WordLattice.h" + +class IOWrapper +{ +protected: + long m_translationId; + + const std::vector &m_inputFactorOrder; + const std::vector &m_outputFactorOrder; + const Moses::FactorMask &m_inputFactorUsed; + std::ostream *m_nBestStream + ,*m_outputWordGraphStream,*m_outputSearchGraphStream; + std::string m_inputFilePath; + std::istream *m_inputStream; + Moses::InputFileStream *m_inputFile; + bool m_surpressSingleBestOutput; + + void Initialization(const std::vector &inputFactorOrder + , const std::vector &outputFactorOrder + , const Moses::FactorMask &inputFactorUsed + , size_t nBestSize + , const std::string &nBestFilePath); + +public: + IOWrapper(const std::vector &inputFactorOrder + , const std::vector &outputFactorOrder + , const Moses::FactorMask &inputFactorUsed + , size_t 
nBestSize + , const std::string &nBestFilePath); + + IOWrapper(const std::vector &inputFactorOrder + , const std::vector &outputFactorOrder + , const Moses::FactorMask &inputFactorUsed + , size_t nBestSize + , const std::string &nBestFilePath + , const std::string &infilePath); + ~IOWrapper(); + + Moses::InputType* GetInput(Moses::InputType *inputType); + + void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, bool reportSegmentation, bool reportAllFactors); + void OutputNBestList(const Moses::TrellisPathList &nBestList, long translationId); + void Backtrack(const Moses::Hypothesis *hypo); + + void ResetTranslationId() { m_translationId = 0; } + + std::ostream &GetOutputWordGraphStream() + { + return *m_outputWordGraphStream; + } + std::ostream &GetOutputSearchGraphStream() + { + return *m_outputSearchGraphStream; + } +}; + +IOWrapper *GetIODevice(const Moses::StaticData &staticData); +bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source); +void OutputSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector &outputFactorOrder ,bool reportSegmentation, bool reportAllFactors); +void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector&, long translationId); +void OutputBestHypo(const std::vector& mbrBestHypo, long translationId, + bool reportSegmentation, bool reportAllFactors, std::ostream& out); +void OutputBestHypo(const std::vector& mbrBestHypo, long /*translationId*/, + bool reportSegmentation, bool reportAllFactors, std::ostream& out); + +#endif diff --git a/moses-cmd/src/LatticeMBR.cpp b/moses-cmd/src/LatticeMBR.cpp new file mode 100644 index 000000000..e8c5cf484 --- /dev/null +++ b/moses-cmd/src/LatticeMBR.cpp @@ -0,0 +1,507 @@ +/* + * LatticeMBR.cpp + * moses-cmd + * + * Created by Abhishek Arun on 26/01/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. 
+ * + */ + +#include "LatticeMBR.h" +#include "StaticData.h" +#include +#include + +size_t bleu_order = 4; +float UNKNGRAMLOGPROB = -20; +void GetOutputWords(const TrellisPath &path, vector &translation){ + const std::vector &edges = path.GetEdges(); + + // print the surface factor of the translation + for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) + { + const Hypothesis &edge = *edges[currEdge]; + const Phrase &phrase = edge.GetCurrTargetPhrase(); + size_t size = phrase.GetSize(); + for (size_t pos = 0 ; pos < size ; pos++) + { + translation.push_back(phrase.GetWord(pos)); + } + } +} + + +void extract_ngrams(const vector& sentence, map < Phrase, int > & allngrams) +{ + for (int k = 0; k < (int)bleu_order; k++) + { + for(int i =0; i < max((int)sentence.size()-k,0); i++) + { + Phrase ngram(Output); + for ( int j = i; j<= i+k; j++) + { + ngram.AddWord(sentence[j]); + } + ++allngrams[ngram]; + } + } +} + + + +void NgramScores::addScore(const Hypothesis* node, const Phrase& ngram, float score) { + set::const_iterator ngramIter = m_ngrams.find(ngram); + if (ngramIter == m_ngrams.end()) { + ngramIter = m_ngrams.insert(ngram).first; + } + map& ngramScores = m_scores[node]; + map::iterator scoreIter = ngramScores.find(&(*ngramIter)); + if (scoreIter == ngramScores.end()) { + ngramScores[&(*ngramIter)] = score; + } else { + ngramScores[&(*ngramIter)] = log_sum(score,scoreIter->second); + } +} + +NgramScores::NodeScoreIterator NgramScores::nodeBegin(const Hypothesis* node) { + return m_scores[node].begin(); +} + + +NgramScores::NodeScoreIterator NgramScores::nodeEnd(const Hypothesis* node) { + return m_scores[node].end(); +} + + +void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set > & outgoingHyps, map >& incomingEdges, + const vector< float> & estimatedScores, const Hypothesis* bestHypo, size_t edgeDensity) { + + //Need hyp 0 in connectedHyp - Find empty hypothesis + VERBOSE(2,"Pruning lattice to edge density " << edgeDensity << 
endl); + const Hypothesis* emptyHyp = connectedHyp.at(0); + while (emptyHyp->GetId() != 0) { + emptyHyp = emptyHyp->GetPrevHypo(); + } + connectedHyp.push_back(emptyHyp); //Add it to list of hyps + + //Need hyp 0's outgoing Hyps + for (size_t i = 0; i < connectedHyp.size(); ++i) { + if (connectedHyp[i]->GetId() > 0 && connectedHyp[i]->GetPrevHypo()->GetId() == 0) + outgoingHyps[emptyHyp].insert(connectedHyp[i]); + } + + //sort hyps based on estimated scores - do so by copying to multimap + multimap sortHypsByVal; + for (size_t i =0; i < estimatedScores.size(); ++i) { + sortHypsByVal.insert(make_pair(estimatedScores[i], connectedHyp[i])); + } + + multimap::const_iterator it = --sortHypsByVal.end(); + float bestScore = it->first; + //store best score as score of hyp 0 + sortHypsByVal.insert(make_pair(bestScore, emptyHyp)); + + + IFVERBOSE(3) { + for (multimap::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) { + const Hypothesis* currHyp = it->second; + cerr << "Hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl; + } + } + + + set survivingHyps; //store hyps that make the cut in this + + VERBOSE(2, "BEST HYPO TARGET LENGTH : " << bestHypo->GetSize() << endl) + size_t numEdgesTotal = edgeDensity * bestHypo->GetSize(); //as per Shankar, aim for (density * target length of MAP solution) arcs + size_t numEdgesCreated = 0; + VERBOSE(2, "Target edge count: " << numEdgesTotal << endl); + + float prevScore = -999999; + + //now iterate over multimap + for (multimap::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) { + float currEstimatedScore = it->first; + const Hypothesis* currHyp = it->second; + + if (numEdgesCreated >= numEdgesTotal && prevScore > currEstimatedScore) //if this hyp has equal estimated score to previous, include its edges too + break; + + prevScore = currEstimatedScore; + VERBOSE(3, "Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl) + 
VERBOSE(3, "Considering hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl) + + survivingHyps.insert(currHyp); //CurrHyp made the cut + + // is its best predecessor already included ? + if (survivingHyps.find(currHyp->GetPrevHypo()) != survivingHyps.end()) { //yes, then add an edge + vector & edges = incomingEdges[currHyp]; + Edge winningEdge(currHyp->GetPrevHypo(),currHyp,currHyp->GetScore() - currHyp->GetPrevHypo()->GetScore(),currHyp->GetTargetPhrase()); + edges.push_back(winningEdge); + ++numEdgesCreated; + } + + //let's try the arcs too + const ArcList *arcList = currHyp->GetArcList(); + if (arcList != NULL) { + ArcList::const_iterator iterArcList; + for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) { + const Hypothesis *loserHypo = *iterArcList; + const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo(); + if (survivingHyps.find(loserPrevHypo) != survivingHyps.end()) { //found it, add edge + double arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore(); + Edge losingEdge(loserPrevHypo, currHyp, arcScore, loserHypo->GetTargetPhrase()); + vector & edges = incomingEdges[currHyp]; + edges.push_back(losingEdge); + ++numEdgesCreated; + } + } + } + + //Now if a successor node has already been visited, add an edge connecting the two + map < const Hypothesis*, set < const Hypothesis* > >::const_iterator outgoingIt = outgoingHyps.find(currHyp); + + if (outgoingIt != outgoingHyps.end()) {//currHyp does have successors + const set & outHyps = outgoingIt->second; //the successors + for (set::const_iterator outHypIts = outHyps.begin(); outHypIts != outHyps.end(); ++outHypIts) { + const Hypothesis* succHyp = *outHypIts; + + if (survivingHyps.find(succHyp) == survivingHyps.end()) //Have we encountered the successor yet? 
+ continue; //No, move on to next + + //Curr Hyp can be : a) the best predecessor of succ b) or an arc attached to succ + if (succHyp->GetPrevHypo() == currHyp) { //best predecessor + vector & succEdges = incomingEdges[succHyp]; + Edge succWinningEdge(currHyp, succHyp, succHyp->GetScore() - currHyp->GetScore(), succHyp->GetTargetPhrase()); + succEdges.push_back(succWinningEdge); + survivingHyps.insert(succHyp); + ++numEdgesCreated; + } + + //now, let's find an arc + const ArcList *arcList = succHyp->GetArcList(); + if (arcList != NULL) { + ArcList::const_iterator iterArcList; + for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) { + const Hypothesis *loserHypo = *iterArcList; + const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo(); + if (loserPrevHypo == currHyp) { //found it + vector & succEdges = incomingEdges[succHyp]; + double arcScore = loserHypo->GetScore() - currHyp->GetScore(); + Edge losingEdge(currHyp, succHyp, arcScore, loserHypo->GetTargetPhrase()); + succEdges.push_back(losingEdge); + ++numEdgesCreated; + } + } + } + } + } + } + + connectedHyp.clear(); + for (set ::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) { + connectedHyp.push_back(*it); + } + + VERBOSE(2, "Done! 
Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl) + + IFVERBOSE(3) { + cerr << "Surviving hyps: " ; + for (set ::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) { + cerr << (*it)->GetId() << " "; + } + cerr << endl; + } +} + +void calcNgramPosteriors(Lattice & connectedHyp, map >& incomingEdges, float scale, map& finalNgramScores) { + + sort(connectedHyp.begin(),connectedHyp.end(),ascendingCoverageCmp); //sort by increasing source word cov + + map forwardScore; + forwardScore[connectedHyp[0]] = 0.0f; //forward score of hyp 0 is 1 (or 0 in logprob space) + set< const Hypothesis *> finalHyps; //store completed hyps + + NgramScores ngramScores;//ngram scores for each hyp + + for (size_t i = 1; i < connectedHyp.size(); ++i) { + const Hypothesis* currHyp = connectedHyp[i]; + if (currHyp->GetWordsBitmap().IsComplete()) { + finalHyps.insert(currHyp); + } + + VERBOSE(3, "Processing hyp: " << currHyp->GetId() << ", num words cov= " << currHyp->GetWordsBitmap().GetNumWordsCovered() << endl) + + vector & edges = incomingEdges[currHyp]; + for (size_t e = 0; e < edges.size(); ++e) { + const Edge& edge = edges[e]; + if (forwardScore.find(currHyp) == forwardScore.end()) { + forwardScore[currHyp] = forwardScore[edge.GetTailNode()] + edge.GetScore(); + VERBOSE(3, "Fwd score["<GetId()<<"] = fwdScore["<GetId() << "] + edge Score: " << edge.GetScore() << endl) + } + else { + forwardScore[currHyp] = log_sum(forwardScore[currHyp], forwardScore[edge.GetTailNode()] + edge.GetScore()); + VERBOSE(3, "Fwd score["<GetId()<<"] += fwdScore["<GetId() << "] + edge Score: " << edge.GetScore() << endl) + } + } + + //Process ngrams now + for (size_t j =0 ; j < edges.size(); ++j) { + Edge& edge = edges[j]; + const NgramHistory & incomingPhrases = edge.GetNgrams(incomingEdges); + + //let's first score ngrams introduced by this edge + for (NgramHistory::const_iterator it = incomingPhrases.begin(); it != incomingPhrases.end(); ++it) { + const 
Phrase& ngram = it->first; + const PathCounts& pathCounts = it->second; + VERBOSE(4, "Calculating score for: " << it->first << endl) + + for (PathCounts::const_iterator pathCountIt = pathCounts.begin(); pathCountIt != pathCounts.end(); ++pathCountIt) { + //Score of an n-gram is forward score of head node of leftmost edge + all edge scores + const Path& path = pathCountIt->first; + float score = forwardScore[path[0]->GetTailNode()]; + for (size_t i = 0; i < path.size(); ++i) { + score += path[i]->GetScore(); + } + ngramScores.addScore(currHyp,ngram,score); + } + } + + //Now score ngrams that are just being propagated from the history + for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(edge.GetTailNode()); + it != ngramScores.nodeEnd(edge.GetTailNode()); ++it) { + const Phrase & currNgram = *(it->first); + float currNgramScore = it->second; + VERBOSE(4, "Calculating score for: " << currNgram << endl) + + if (incomingPhrases.find(currNgram) == incomingPhrases.end()) { + float score = edge.GetScore() + currNgramScore; + ngramScores.addScore(currHyp,currNgram,score); + } + } + + } + } + + float Z = 9999999; //the total score of the lattice + + //Done - Print out ngram posteriors for final hyps + for (set< const Hypothesis *>::iterator finalHyp = finalHyps.begin(); finalHyp != finalHyps.end(); ++finalHyp) { + const Hypothesis* hyp = *finalHyp; + + for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(hyp); it != ngramScores.nodeEnd(hyp); ++it) { + const Phrase& ngram = *(it->first); + if (finalNgramScores.find(ngram) == finalNgramScores.end()) { + finalNgramScores[ngram] = it->second; + } + else { + finalNgramScores[ngram] = log_sum(it->second, finalNgramScores[ngram]); + } + } + + if (Z == 9999999) { + Z = forwardScore[hyp]; + } + else { + Z = log_sum(Z, forwardScore[hyp]); + } + } + + Z *= scale; //scale the score + + for (map::iterator finalScoresIt = finalNgramScores.begin(); finalScoresIt != finalNgramScores.end(); ++finalScoresIt) { + 
finalScoresIt->second = finalScoresIt->second * scale - Z; + IFVERBOSE(2) { + VERBOSE(2,finalScoresIt->first << " [" << finalScoresIt->second << "]" << endl); + } + } + +} + +const NgramHistory& Edge::GetNgrams(map > & incomingEdges) { + + if (m_ngrams.size() > 0) + return m_ngrams; + + const Phrase& currPhrase = GetWords(); + //Extract the n-grams local to this edge + for (size_t start = 0; start < currPhrase.GetSize(); ++start) { + for (size_t end = start; end < start + bleu_order; ++end) { + if (end < currPhrase.GetSize()){ + Phrase edgeNgram(Output); + for (size_t index = start; index <= end; ++index) { + edgeNgram.AddWord(currPhrase.GetWord(index)); + } + //cout << "Inserting Phrase : " << edgeNgram << endl; + vector edgeHistory; + edgeHistory.push_back(this); + storeNgramHistory(edgeNgram, edgeHistory); + } + else { + break; + } + } + } + + map >::iterator it = incomingEdges.find(m_tailNode); + if (it != incomingEdges.end()) { //node has incoming edges + vector & inEdges = it->second; + + for (vector::iterator edge = inEdges.begin(); edge != inEdges.end(); ++edge) {//add the ngrams straddling prev and curr edge + const NgramHistory & edgeIncomingNgrams = edge->GetNgrams(incomingEdges); + for (NgramHistory::const_iterator edgeInNgramHist = edgeIncomingNgrams.begin(); edgeInNgramHist != edgeIncomingNgrams.end(); ++edgeInNgramHist) { + const Phrase& edgeIncomingNgram = edgeInNgramHist->first; + const PathCounts & edgeIncomingNgramPaths = edgeInNgramHist->second; + size_t back = min(edgeIncomingNgram.GetSize(), edge->GetWordsSize()); + const Phrase& edgeWords = edge->GetWords(); + IFVERBOSE(3) { + cerr << "Edge: "<< *edge <first; + newNgramPath.push_back(this); + storeNgramHistory(newNgram, newNgramPath, pathIt->second); + } + } + } + } + } + } + return m_ngrams; +} + +//Add the last lastN words of origPhrase to targetPhrase +void Edge::GetPhraseSuffix(const Phrase& origPhrase, size_t lastN, Phrase& targetPhrase) const { + size_t origSize = origPhrase.GetSize(); 
+    size_t startIndex = origSize - lastN;
+    for (size_t index = startIndex; index < origPhrase.GetSize(); ++index) {
+        targetPhrase.AddWord(origPhrase.GetWord(index));
+    }
+}
+
+/** Strict ordering on edges: by head node id, then tail node id, then score. */
+bool Edge::operator< (const Edge& compare ) const {
+    if (m_headNode->GetId() < compare.m_headNode->GetId())
+        return true;
+    if (compare.m_headNode->GetId() < m_headNode->GetId())
+        return false;
+    if (m_tailNode->GetId() < compare.m_tailNode->GetId())
+        return true;
+    if (compare.m_tailNode->GetId() < m_tailNode->GetId())
+        return false;
+    return GetScore() < compare.GetScore();
+}
+
+/** Write the edge's head id, tail id, score and target phrase, followed by a newline. */
+ostream& operator<< (ostream& out, const Edge& edge) {
+    out << "Head: " << edge.m_headNode->GetId() << ", Tail: " << edge.m_tailNode->GetId() << ", Score: " << edge.m_score << ", Phrase: " << edge.m_targetPhrase << endl;
+    return out;
+}
+
+/** Sort predicate: the hypothesis with fewer covered words comes first. */
+bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b) {
+    return a->GetWordsBitmap().GetNumWordsCovered() < b->GetWordsBitmap().GetNumWordsCovered();
+}
+
+/**
+ * Pick the n-best entry with the highest lattice-MBR score.
+ *
+ * A candidate scores  thetas[0] * length  plus the sum, over its distinct n-grams, of
+ * count * exp(posterior) * thetas[n-gram length]; the sum is accumulated in log space.
+ * N-grams absent from finalNgramScores use UNKNGRAMLOGPROB as their posterior.
+ *
+ * @param nBestList         candidate translations
+ * @param finalNgramScores  n-gram posteriors, used as log-domain values
+ * @param thetas            per-order weights; when empty they are derived from p and r:
+ *                          theta_0 = -1, theta_1 = 1/(bleu_order*p), theta_n = theta_{n-1}/r
+ * @return words of the best candidate; empty if nBestList is empty
+ *
+ * NOTE(review): the template arguments (Word, Phrase, float) were missing from this copy
+ * of the source and are restored from how the values are used - confirm against LatticeMBR.h.
+ * NOTE(review): mbrThetas is indexed by n-gram length with no bounds check; a user-supplied
+ * thetas list shorter than the longest extracted n-gram would read out of range - confirm
+ * that callers validate its size.
+ */
+vector<Word> calcMBRSol(const TrellisPathList& nBestList, map<Phrase, float>& finalNgramScores, const vector<float> & thetas, float p, float r){
+
+    vector<float> mbrThetas = thetas;
+    if (thetas.size() == 0) { //thetas not specified on the command line, use p and r instead
+        mbrThetas.push_back(-1); //Theta 0
+        mbrThetas.push_back(1/(bleu_order*p));
+        for (size_t i = 2; i <= bleu_order; ++i){
+            mbrThetas.push_back(mbrThetas[i-1] / r);
+        }
+    }
+    IFVERBOSE(2) {
+        VERBOSE(2,"Thetas: ");
+        for (size_t i = 0; i < mbrThetas.size(); ++i) {
+            VERBOSE(2,mbrThetas[i] << " ");
+        }
+        VERBOSE(2,endl);
+    }
+
+    float argmaxScore = -1e20;
+    TrellisPathList::const_iterator iter;
+    size_t ctr = 0;
+
+    vector<Word> argmaxTranslation;
+    for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter, ++ctr)
+    {
+        const TrellisPath &path = **iter;
+        // get words in translation
+        vector<Word> translation;
+        GetOutputWords(path, translation);
+
+        // collect n-gram counts
+        map < Phrase, int > counts;
+        extract_ngrams(translation,counts);
+
+        //Now score this translation
+        float mbrScore = mbrThetas[0] * translation.size();
+
+        // Running log-domain sum of the n-gram terms. An explicit flag marks "nothing
+        // accumulated yet": 0 is a legitimate log value and cannot double as the marker.
+        float ngramScore = 0;
+        bool haveNgramScore = false;
+
+        for (map < Phrase, int >::iterator ngrams = counts.begin(); ngrams != counts.end(); ++ngrams) {
+            float ngramPosterior = UNKNGRAMLOGPROB;
+            map<Phrase, float>::const_iterator ngramPosteriorIt = finalNgramScores.find(ngrams->first);
+            if (ngramPosteriorIt != finalNgramScores.end()) {
+                ngramPosterior = ngramPosteriorIt->second;
+            }
+
+            // log(count * exp(posterior) * theta_order)
+            const float term = float(log((double) ngrams->second) + ngramPosterior + log(mbrThetas[(ngrams->first).GetSize()]));
+            if (!haveNgramScore) {
+                ngramScore = term;
+                haveNgramScore = true;
+            }
+            else {
+                ngramScore = log_sum(ngramScore, term);
+            }
+            //cout << "Ngram: " << ngrams->first << endl;
+        }
+
+        // A candidate without n-grams contributes nothing (previously exp(0) == 1 was added).
+        if (haveNgramScore) {
+            mbrScore += exp(ngramScore);
+        }
+
+        if (mbrScore > argmaxScore){
+            argmaxScore = mbrScore;
+            IFVERBOSE(2) {
+                VERBOSE(2,"HYP " << ctr << " IS NEW BEST: ");
+                for (size_t i = 0; i < translation.size(); ++i)
+                    VERBOSE(2,translation[i]);
+                VERBOSE(2,"[" << argmaxScore << "]" << endl);
+            }
+            argmaxTranslation = translation;
+        }
+    }
+    return argmaxTranslation;
+}
+
+/**
+ * Run lattice MBR for one sentence: fetch the forward-backward search graph from the
+ * manager, prune it, compute n-gram posteriors, then rescore nBestList with calcMBRSol
+ * using the lattice-MBR settings held in StaticData.
+ *
+ * NOTE(review): the element types of outgoingHyps / incomingEdges were missing from this
+ * copy of the source; the ones below are reconstructed - confirm against LatticeMBR.h.
+ */
+vector<Word> doLatticeMBR(Manager& manager, TrellisPathList& nBestList) {
+    const StaticData& staticData = StaticData::Instance();
+    std::map < int, bool > connected;
+    std::vector< const Hypothesis *> connectedList;
+    map<Phrase, float> ngramPosteriors;
+    std::map < const Hypothesis*, set<const Hypothesis*> > outgoingHyps;
+    map<const Hypothesis*, vector<Edge> > incomingEdges;
+    vector< float> estimatedScores;
+    manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
+    pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor());
+    calcNgramPosteriors(connectedList, incomingEdges, staticData.GetMBRScale(), ngramPosteriors);
+    vector<Word> mbrBestHypo = calcMBRSol(nBestList, ngramPosteriors, staticData.GetLatticeMBRThetas(),
+        staticData.GetLatticeMBRPrecision(), staticData.GetLatticeMBRPRatio());
+    return mbrBestHypo;
+}
+
+diff --git 
a/moses-cmd/src/LatticeMBR.h b/moses-cmd/src/LatticeMBR.h new file mode 100644 index 000000000..e6a67cc2b --- /dev/null +++ b/moses-cmd/src/LatticeMBR.h @@ -0,0 +1,117 @@ +/* + * LatticeMBR.h + * moses-cmd + * + * Created by Abhishek Arun on 26/01/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. + * + */ + +#ifndef moses_cmd_LatticeMBR_h +#define moses_cmd_LatticeMBR_h + +#include +#include +#include +#include "Hypothesis.h" +#include "Manager.h" +#include "TrellisPathList.h" + +using namespace Moses; + +template +T log_sum (T log_a, T log_b) +{ + T v; + if (log_a < log_b) { + v = log_b+log ( 1 + exp ( log_a-log_b )); + } else { + v = log_a+log ( 1 + exp ( log_b-log_a )); + } + return ( v ); +} + +class Edge; + +typedef std::vector< const Hypothesis *> Lattice; +typedef vector Path; +typedef map PathCounts; +typedef map NgramHistory; + +class Edge { + const Hypothesis* m_tailNode; + const Hypothesis* m_headNode; + float m_score; + TargetPhrase m_targetPhrase; + NgramHistory m_ngrams; + + public: + Edge(const Hypothesis* from, const Hypothesis* to, float score, const TargetPhrase& targetPhrase) : m_tailNode(from), m_headNode(to), m_score(score), m_targetPhrase(targetPhrase) { + //cout << "Creating new edge from Node " << from->GetId() << ", to Node : " << to->GetId() << ", score: " << score << " phrase: " << targetPhrase << endl; + } + + const Hypothesis* GetHeadNode() const { + return m_headNode; + } + + const Hypothesis* GetTailNode() const { + return m_tailNode; + } + + float GetScore() const { + return m_score; + } + + size_t GetWordsSize() const { + return m_targetPhrase.GetSize(); + } + + const Phrase& GetWords() const { + return m_targetPhrase; + } + + friend ostream& operator<< (ostream& out, const Edge& edge); + + const NgramHistory& GetNgrams( map > & incomingEdges) ; + + bool operator < (const Edge & compare) const; + + void GetPhraseSuffix(const Phrase& origPhrase, size_t lastN, Phrase& targetPhrase) const; + + void 
storeNgramHistory(const Phrase& phrase, Path & path, size_t count = 1){ + m_ngrams[phrase][path]+= count; + } + +}; + +/** +* Data structure to hold the ngram scores as we traverse the lattice. Maps (hypo,ngram) to score +*/ +class NgramScores { + public: + NgramScores() {} + + /** logsum this score to the existing score */ + void addScore(const Hypothesis* node, const Phrase& ngram, float score); + + /** Iterate through ngrams for selected node */ + typedef map::const_iterator NodeScoreIterator; + NodeScoreIterator nodeBegin(const Hypothesis* node); + NodeScoreIterator nodeEnd(const Hypothesis* node); + + private: + set m_ngrams; + map > m_scores; +}; + +void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set > & outgoingHyps, map >& incomingEdges, + const vector< float> & estimatedScores, const Hypothesis*, size_t edgeDensity); + +vector calcMBRSol(Lattice & connectedHyp, map& finalNgramScores,const vector & thetas, float, float); +vector calcMBRSol(const TrellisPathList& nBestList, map& finalNgramScores,const vector & thetas, float, float); +void calcNgramPosteriors(Lattice & connectedHyp, map >& incomingEdges, float scale, map& finalNgramScores); +void GetOutputFactors(const TrellisPath &path, vector &translation); +void extract_ngrams(const vector& sentence, map < Phrase, int > & allngrams); +bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b); +vector doLatticeMBR(Manager& manager, TrellisPathList& nBestList); +#endif diff --git a/moses-cmd/src/LatticeMBRGrid.cpp b/moses-cmd/src/LatticeMBRGrid.cpp new file mode 100644 index 000000000..89b69b36e --- /dev/null +++ b/moses-cmd/src/LatticeMBRGrid.cpp @@ -0,0 +1,204 @@ +// $Id: $ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (c) 2010 University of Edinburgh +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the University of Edinburgh nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ +/** +* Lattice MBR grid search. Enables a grid search through the four parameters (p,r,scale and prune) used in lattice MBR. + See 'Lattice Minimum Bayes-Risk Decoding for Statistical Machine Translation by Tromble, Kumar, Och and Macherey, + EMNLP 2008 for details of the parameters. + + The grid search is controlled by specifying comma separated lists for the lmbr parameters (-lmbr-p, -lmbr-r, + -lmbr-pruning-factor and -mbr-scale). 
All other parameters are passed through to moses. If any of the lattice mbr
+    parameters are missing, then they are set to their default values. Output is of the form:
+    sentence-id ||| p r prune scale ||| translation-hypothesis
+**/
+
+// NOTE(review): the header names were missing from this copy of the source; the five
+// below are the standard headers the visible code needs - confirm against upstream.
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <stdexcept>
+
+#include "IOWrapper.h"
+#include "LatticeMBR.h"
+#include "Manager.h"
+#include "StaticData.h"
+
+
+using namespace std;
+using namespace Moses;
+
+//keys
+enum gridkey {lmbr_p,lmbr_r,lmbr_prune,lmbr_scale};
+
+/**
+ * Holds, for each grid key, the list of values to try, and knows which command line
+ * switch feeds which key.
+ * NOTE(review): template arguments in this class were missing from this copy of the
+ * source and are restored from usage (string switch -> gridkey, gridkey -> float list).
+ */
+class Grid {
+  public:
+    /** Add a parameter with key, command line argument, and default value */
+    void addParam(gridkey key, const string& arg, float defaultValue) {
+      m_args[arg] = key;
+      assert(m_grid.find(key) == m_grid.end());
+      m_grid[key].push_back(defaultValue);
+    }
+
+    /**
+     * Parse the arguments, removing those that define the grid and returning a copy of the rest.
+     * On return argc/argv refer to a freshly allocated, NULL-terminated argument array.
+     * Throws runtime_error when a grid switch has no value, is given twice, or has a
+     * value that atof() turns into 0.
+     */
+    void parseArgs(int& argc, char**& argv) {
+      char** newargv = new char*[argc+1]; //Space to add mbr parameter
+      int newargc = 0;
+      for (int i = 0; i < argc; ++i) {
+        bool consumed = false;
+        for (map<string,gridkey>::const_iterator argi = m_args.begin(); argi != m_args.end(); ++argi) {
+          if (!strcmp(argv[i], argi->first.c_str())) {
+            ++i;
+            if (i >= argc) {
+              cerr << "Error: missing parameter for " << argi->first << endl;
+              throw runtime_error("Missing parameter");
+            } else {
+              string value = argv[i];
+              gridkey key = argi->second;
+              // exactly one entry means only the default is present so far
+              if (m_grid[key].size() != 1) {
+                throw runtime_error("Duplicate grid argument");
+              }
+              m_grid[key].clear();
+              char delim = ',';
+              string::size_type lastpos = value.find_first_not_of(delim);
+              string::size_type pos = value.find_first_of(delim,lastpos);
+              while (string::npos != pos || string::npos != lastpos) {
+                float param = atof(value.substr(lastpos, pos-lastpos).c_str());
+                // atof yields 0 for unparsable text, so 0 serves as the error marker here;
+                // a literal zero grid value is rejected as well.
+                if (!param) {
+                  cerr << "Error: Illegal grid parameter for " << argi->first << endl;
+                  throw runtime_error("Illegal grid parameter");
+                }
+                m_grid[key].push_back(param);
+                lastpos = value.find_first_not_of(delim,pos);
+                pos = value.find_first_of(delim,lastpos);
+              }
+              consumed = true;
+            }
+            if (consumed) break;
+          }
+        }
+        if (!consumed) {
+          newargv[newargc] = new char[strlen(argv[i]) + 1];
+          strcpy(newargv[newargc],argv[i]);
+          ++newargc;
+        }
+      }
+      // newargc <= argc, so the spare slot is always available for the terminator
+      newargv[newargc] = NULL;
+      argc = newargc;
+      argv = newargv;
+    }
+
+    /** Get the grid for a particular key.*/
+    const vector<float>& getGrid(gridkey key) const {
+      map<gridkey,vector<float> >::const_iterator iter = m_grid.find(key);
+      assert (iter != m_grid.end());
+      return iter->second;
+
+    }
+
+  private:
+    map<gridkey,vector<float> > m_grid;
+    map<string,gridkey> m_args;
+};
+
+/**
+ * Decode each input sentence once, then run lattice MBR for every combination of the
+ * p, r, pruning-factor and scale grids, printing one result line per combination.
+ */
+int main(int argc, char* argv[]) {
+  cerr << "Lattice MBR Grid search" << endl;
+
+  Grid grid;
+  grid.addParam(lmbr_p, "-lmbr-p", 0.5);
+  grid.addParam(lmbr_r, "-lmbr-r", 0.5);
+  grid.addParam(lmbr_prune, "-lmbr-pruning-factor",30.0);
+  grid.addParam(lmbr_scale, "-mbr-scale",1.0);
+
+  grid.parseArgs(argc,argv);
+
+  Parameter* params = new Parameter();
+  if (!params->LoadParam(argc,argv)) {
+    params->Explain();
+    exit(1);
+  }
+  if (!StaticData::LoadDataStatic(params)) {
+    exit(1);
+  }
+
+  // the grid loop below overwrites the MBR settings, hence the non-const access
+  StaticData& staticData = const_cast<StaticData&>(StaticData::Instance());
+  staticData.SetUseLatticeMBR(true);
+  IOWrapper* ioWrapper = GetIODevice(staticData);
+
+  if (!ioWrapper) {
+    throw runtime_error("Failed to initialise IOWrapper");
+  }
+  size_t nBestSize = staticData.GetMBRSize();
+
+  // size_t cannot be negative; throw by value (was "throw new", which throws a pointer)
+  if (nBestSize == 0){
+    throw runtime_error("Non-positive size specified for n-best list");
+  }
+
+  size_t lineCount = 0;
+  InputType* source = NULL;
+
+  const vector<float>& pgrid = grid.getGrid(lmbr_p);
+  const vector<float>& rgrid = grid.getGrid(lmbr_r);
+  const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
+  const vector<float>& scale_grid = grid.getGrid(lmbr_scale);
+
+  while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+    ++lineCount;
+    Sentence sentence(Input);
+    Manager manager(*source,staticData.GetSearchAlgorithm());
+    manager.ProcessSentence();
+    TrellisPathList nBestList;
+    manager.CalcNBest(nBestSize, nBestList,true);
+    //grid search
+    for (vector<float>::const_iterator pi = pgrid.begin(); pi != pgrid.end(); ++pi) {
+      float p = *pi;
+      staticData.SetLatticeMBRPrecision(p);
+      for (vector<float>::const_iterator ri = rgrid.begin(); ri != rgrid.end(); ++ri) {
+        float r = *ri;
+        staticData.SetLatticeMBRPRatio(r);
+        for (vector<float>::const_iterator prune_i = prune_grid.begin(); prune_i != prune_grid.end(); ++prune_i) {
+          size_t prune = (size_t)(*prune_i);
+          staticData.SetLatticeMBRPruningFactor(prune);
+          for (vector<float>::const_iterator scale_i = scale_grid.begin(); scale_i != scale_grid.end(); ++scale_i) {
+            float scale = *scale_i;
+            staticData.SetMBRScale(scale);
+            cout << lineCount << " ||| " << p << " " << r << " " << prune << " " << scale << " ||| ";
+            vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
+            OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
+              staticData.GetReportAllFactors(),cout);
+          }
+        }
+
+      }
+    }
+
+
+  }
+
+}
diff --git a/moses-cmd/src/Main.cpp b/moses-cmd/src/Main.cpp
new file mode 100644
index 000000000..8a053ff56
--- /dev/null
+++ b/moses-cmd/src/Main.cpp
@@ -0,0 +1,233 @@
+// $Id: Main.cpp 2954 2010-03-07 08:28:16Z abarun $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2006 University of Edinburgh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+    * Neither the name of the University of Edinburgh nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +// example file on how to use moses library + +#ifdef WIN32 +// Include Visual Leak Detector +#include +#endif + +#include +#include "Main.h" +#include "TrellisPath.h" +#include "FactorCollection.h" +#include "Manager.h" +#include "Phrase.h" +#include "Util.h" +#include "TrellisPathList.h" +#include "Timer.h" +#include "IOWrapper.h" +#include "Sentence.h" +#include "ConfusionNet.h" +#include "WordLattice.h" +#include "TranslationAnalysis.h" +#include "mbr.h" +#include "LatticeMBR.h" + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef HAVE_PROTOBUF +#include "hypergraph.pb.h" +#endif + + +using namespace std; +using namespace Moses; + +int main(int argc, char* argv[]) +{ + + +#ifdef HAVE_PROTOBUF + GOOGLE_PROTOBUF_VERIFY_VERSION; +#endif + IFVERBOSE(1) + { + TRACE_ERR("command: "); + for(int i=0;i weights = staticData.GetAllWeights(); + IFVERBOSE(2) { + TRACE_ERR("The score component vector looks like this:\n" << staticData.GetScoreIndexManager()); + TRACE_ERR("The global weight vector looks like this:"); + for (size_t j=0; jGetTranslationId(), ioWrapper->GetOutputWordGraphStream()); + + if (staticData.GetOutputSearchGraph()) + 
manager.GetSearchGraph(source->GetTranslationId(), ioWrapper->GetOutputSearchGraphStream()); + +#ifdef HAVE_PROTOBUF + if (staticData.GetOutputSearchGraphPB()) { + ostringstream sfn; + sfn << staticData.GetParam("output-search-graph-pb")[0] << '/' << source->GetTranslationId() << ".pb" << ends; + string fn = sfn.str(); + VERBOSE(2, "Writing search graph to " << fn << endl); + fstream output(fn.c_str(), ios::trunc | ios::binary | ios::out); + manager.SerializeSearchGraphPB(source->GetTranslationId(), output); + } +#endif + + //Print all derivations in search graph + if (staticData.PrintAllDerivations()) { + manager.PrintAllDerivations(source->GetTranslationId()); + } + + // pick best translation (maximum a posteriori decoding) + if (! staticData.UseMBR() ) { + ioWrapper->OutputBestHypo(manager.GetBestHypothesis(), source->GetTranslationId(), + staticData.GetReportSegmentation(), staticData.GetReportAllFactors()); + IFVERBOSE(2) { PrintUserTime("Best Hypothesis Generation Time:"); } + + // n-best + size_t nBestSize = staticData.GetNBestSize(); + if (nBestSize > 0) + { + VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl); + TrellisPathList nBestList; + manager.CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest()); + ioWrapper->OutputNBestList(nBestList, source->GetTranslationId()); + //RemoveAllInColl(nBestList); + + IFVERBOSE(2) { PrintUserTime("N-Best Hypotheses Generation Time:"); } + } + } + else { + size_t nBestSize = staticData.GetMBRSize(); + + if (nBestSize <= 0) + { + cerr << "ERROR: negative size for number of MBR candidate translations not allowed (option mbr-size)" << endl; + return EXIT_FAILURE; + } + TrellisPathList nBestList; + manager.CalcNBest(nBestSize, nBestList,true); + VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl); + IFVERBOSE(2) { PrintUserTime("calculated n-best list for (L)MBR decoding"); } + if (staticData.UseLatticeMBR()) { + vector 
mbrBestHypo = doLatticeMBR(manager,nBestList); + OutputBestHypo(mbrBestHypo, source->GetTranslationId(), staticData.GetReportSegmentation(), + staticData.GetReportAllFactors(),cout); + IFVERBOSE(2) { PrintUserTime("finished Lattice MBR decoding"); } + } else { + std::vector mbrBestHypo = doMBR(nBestList); + OutputBestHypo(mbrBestHypo, source->GetTranslationId(), + staticData.GetReportSegmentation(), + staticData.GetReportAllFactors(),cout); + IFVERBOSE(2) { PrintUserTime("finished MBR decoding"); } + } + + if (!staticData.GetNBestFilePath().empty()){ + //print the all nbest used for MBR (and not the amount passed through the parameter + VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl); + ioWrapper->OutputNBestList(nBestList, source->GetTranslationId()); + IFVERBOSE(2) { PrintUserTime("N-Best Hypotheses Generation Time:"); } + } + } + + + + if (staticData.IsDetailedTranslationReportingEnabled()) { + TranslationAnalysis::PrintTranslationAnalysis(std::cerr, manager.GetBestHypothesis()); + } + + IFVERBOSE(2) { PrintUserTime("Sentence Decoding Time:"); } + + manager.CalcDecoderStatistics(); + + } + + delete ioWrapper; + + IFVERBOSE(1) + PrintUserTime("End."); + + #ifndef EXIT_RETURN + //This avoids that detructors are called (it can take a long time) + exit(EXIT_SUCCESS); + #else + return EXIT_SUCCESS; + #endif +} + + diff --git a/moses-cmd/src/Main.h b/moses-cmd/src/Main.h new file mode 100644 index 000000000..67dfc30a0 --- /dev/null +++ b/moses-cmd/src/Main.h @@ -0,0 +1,43 @@ +// $Id: Main.h 2939 2010-02-24 11:15:44Z jfouet $ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (c) 2006 University of Edinburgh +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the University of Edinburgh nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +// example file on how to use moses library + +#ifndef moses_cmd_Main_h +#define moses_cmd_Main_h + +#include "StaticData.h" + +class IOWrapper; + +int main(int argc, char* argv[]); +#endif diff --git a/moses-cmd/src/MainMT.cpp b/moses-cmd/src/MainMT.cpp new file mode 100644 index 000000000..252b165c7 --- /dev/null +++ b/moses-cmd/src/MainMT.cpp @@ -0,0 +1,268 @@ +// $Id: $ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2009 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +/** + * Main for multithreaded moses. + **/ + +#include +#include + +#include + +#if defined(BOOST_HAS_PTHREADS) +#include +#endif + + +#include "Hypothesis.h" +#include "IOWrapper.h" +#include "LatticeMBR.h" +#include "Manager.h" +#include "StaticData.h" +#include "ThreadPool.h" +#include "Util.h" +#include "mbr.h" + +using namespace std; +using namespace Moses; + + +/** + * Makes sure output goes in the correct order. 
+ **/
+// NOTE(review): the map template arguments were missing from this copy of the source;
+// <int,string> is restored from how m_outputs / m_debugs are indexed and assigned.
+class OutputCollector {
+  public:
+    /** Both streams are borrowed, not owned; by default output goes to cout and debug to cerr. */
+    OutputCollector(std::ostream* outStream= &cout, std::ostream* debugStream=&cerr) :
+      m_nextOutput(0),m_outStream(outStream),m_debugStream(debugStream) {}
+
+
+    /**
+      * Write or cache the output, as appropriate.
+      * If sourceId is the id expected next, it is written immediately, followed by any
+      * cached entries that directly follow it; otherwise it is stored until its turn.
+      * The whole call runs under m_mutex.
+      **/
+    void Write(int sourceId,const string& output,const string& debug="") {
+      boost::mutex::scoped_lock lock(m_mutex);
+      if (sourceId == m_nextOutput) {
+        //This is the one we were expecting
+        *m_outStream << output;
+        *m_debugStream << debug;
+        ++m_nextOutput;
+        //see if there's any more
+        map<int,string>::iterator iter;
+        while ((iter = m_outputs.find(m_nextOutput)) != m_outputs.end()) {
+          *m_outStream << iter->second;
+          m_outputs.erase(iter);
+          // iter is invalid once erased: look the debug text up by id, and only
+          // then advance to the next expected id.
+          map<int,string>::iterator debugIter = m_debugs.find(m_nextOutput);
+          if (debugIter != m_debugs.end()) {
+            *m_debugStream << debugIter->second;
+            m_debugs.erase(debugIter);
+          }
+          ++m_nextOutput;
+        }
+      } else {
+        //save for later
+        m_outputs[sourceId] = output;
+        m_debugs[sourceId] = debug;
+      }
+    }
+
+  private:
+    map<int,string> m_outputs;   // cached output, keyed by source id
+    map<int,string> m_debugs;    // cached debug text, keyed by source id
+    int m_nextOutput;            // id of the next entry to be written
+    ostream* m_outStream;
+    ostream* m_debugStream;
+    boost::mutex m_mutex;
+};
+
+/**
+ * Translates a sentence.
+ **/
+class TranslationTask : public Task {
+
+  public:
+
+    /** The task deletes source in its destructor; the collectors are borrowed and may be NULL. */
+    TranslationTask(size_t lineNumber,
+      InputType* source, OutputCollector* outputCollector, OutputCollector* nbestCollector) :
+      m_source(source), m_lineNumber(lineNumber),
+      m_outputCollector(outputCollector), m_nbestCollector(nbestCollector) {}
+
+    /**
+     * Decode m_source, then hand the 1-best (or MBR / lattice MBR) result to the output
+     * collector and the n-best list to the n-best collector, for whichever collector is set.
+     */
+    void Run() {
+#if defined(BOOST_HAS_PTHREADS)
+      TRACE_ERR("Translating line " << m_lineNumber << " in thread id " << (int)pthread_self() << std::endl);
+#endif
+      const StaticData &staticData = StaticData::Instance();
+      Sentence sentence(Input);
+      Manager manager(*m_source,staticData.GetSearchAlgorithm());
+      manager.ProcessSentence();
+
+      if (m_outputCollector) {
+        ostringstream out;
+        ostringstream debug;
+        const Hypothesis* bestHypo = NULL;
+        if (!staticData.UseMBR()) {
+          bestHypo = manager.GetBestHypothesis();
+          if (bestHypo) {
+            OutputSurface(
+              out,
+              bestHypo,
+              staticData.GetOutputFactorOrder(),
+              staticData.GetReportSegmentation(),
+              staticData.GetReportAllFactors());
+            IFVERBOSE(1) {
+              debug << "BEST TRANSLATION: " << *bestHypo << endl;
+            }
+          }
+          out << endl;
+        } else {
+          size_t nBestSize = staticData.GetMBRSize();
+          if (nBestSize <= 0) {
+            cerr << "ERROR: negative size for number of MBR candidate translations not allowed (option mbr-size)" << endl;
+            exit(1);
+          }
+          TrellisPathList nBestList;
+          manager.CalcNBest(nBestSize, nBestList,true);
+          VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
+          IFVERBOSE(2) { PrintUserTime("calculated n-best list for (L)MBR decoding"); }
+
+          if (staticData.UseLatticeMBR()) {
+            //Lattice MBR decoding
+            // NOTE(review): element type restored (missing from this copy) - confirm against LatticeMBR.h
+            vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
+            OutputBestHypo(mbrBestHypo, m_lineNumber, staticData.GetReportSegmentation(),
+              staticData.GetReportAllFactors(),out);
+            IFVERBOSE(2) { PrintUserTime("finished Lattice MBR decoding"); }
+          } else {
+            //MBR decoding
+            // NOTE(review): element type restored (missing from this copy); presumably
+            // doMBR returns std::vector<const Factor*> - confirm against mbr.h
+            std::vector<const Factor*> mbrBestHypo = doMBR(nBestList);
+            OutputBestHypo(mbrBestHypo, m_lineNumber,
+              staticData.GetReportSegmentation(),
+              staticData.GetReportAllFactors(),out);
+            IFVERBOSE(2) { PrintUserTime("finished MBR decoding"); }
+
+          }
+        }
+        m_outputCollector->Write(m_lineNumber,out.str(),debug.str());
+      }
+      if (m_nbestCollector) {
+        TrellisPathList nBestList;
+        ostringstream out;
+        manager.CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest());
+        OutputNBest(out,nBestList, staticData.GetOutputFactorOrder(), m_lineNumber);
+        m_nbestCollector->Write(m_lineNumber, out.str());
+      }
+    }
+
+    ~TranslationTask() {delete m_source;}
+
+  private:
+    InputType* m_source;
+    size_t m_lineNumber;
+    OutputCollector* m_outputCollector;
+    OutputCollector* m_nbestCollector;
+
+};
+
+/**
+ * Entry point of multithreaded moses: strips "-threads N" from the command line
+ * (default 10), loads moses with the remaining arguments, then submits one
+ * TranslationTask per input sentence to a thread pool.
+ */
+int main(int argc, char** argv) {
+  //extract pool-size args, send others to moses
+  char** mosesargv = new char*[argc+2];
+  int mosesargc = 0;
+  int threadcount = 10;
+  for (int i = 0; i < argc; ++i) {
+    if (!strcmp(argv[i], "-threads")) {
+      ++i;
+      if (i >= argc) {
+        cerr << "Error: Missing argument to -threads" << endl;
+        exit(1);
+      } else {
+        threadcount = atoi(argv[i]);
+      }
+    } else {
+      mosesargv[mosesargc] = new char[strlen(argv[i])+1];
+      strcpy(mosesargv[mosesargc],argv[i]);
+      ++mosesargc;
+    }
+  }
+  // keep the argv convention of a terminating NULL (mosesargc <= argc, so the slot exists)
+  mosesargv[mosesargc] = NULL;
+  if (threadcount <= 0) {
+    cerr << "Error: Must specify a positive number of threads" << endl;
+    exit(1);
+  }
+
+  Parameter* params = new Parameter();
+  if (!params->LoadParam(mosesargc,mosesargv)) {
+    params->Explain();
+    exit(1);
+  }
+  if (!StaticData::LoadDataStatic(params)) {
+    exit(1);
+  }
+
+  const StaticData& staticData = StaticData::Instance();
+  IOWrapper* ioWrapper = GetIODevice(staticData);
+
+  if (!ioWrapper) {
+    cerr << "Error; Failed to create IO object" << endl;
+    exit(1);
+  }
+  ThreadPool pool(threadcount);
+  InputType* source = NULL;
+  size_t lineCount = 0;
+  // NOTE(review): auto_ptr template arguments restored from the reset(new ...) calls below
+  auto_ptr<OutputCollector> outputCollector;//for translations
+  auto_ptr<OutputCollector> nbestCollector;
+  auto_ptr<ofstream> nbestOut;
+  size_t nbestSize = staticData.GetNBestSize();
+  string nbestFile = staticData.GetNBestFilePath();
+  if (nbestSize) {
+    if (nbestFile == "-") {
+      //nbest to stdout, no 1-best
+      //FIXME: Moses doesn't actually let you pass a '-' on the command line.
+      nbestCollector.reset(new OutputCollector());
+    } else {
+      //nbest to file, 1-best to stdout
+      nbestOut.reset(new ofstream(nbestFile.c_str()));
+      // checked at run time: an assert would vanish in NDEBUG builds and the
+      // n-best output would then be silently lost
+      if (!nbestOut->good()) {
+        cerr << "Error: Failed to open n-best file " << nbestFile << endl;
+        exit(1);
+      }
+      nbestCollector.reset(new OutputCollector(nbestOut.get()));
+      outputCollector.reset(new OutputCollector());
+    }
+  } else {
+    outputCollector.reset(new OutputCollector());
+  }
+
+  while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+    TranslationTask* task =
+      new TranslationTask(lineCount,source, outputCollector.get(), nbestCollector.get());
+    pool.Submit(task);
+    source = NULL; //make sure it doesn't get deleted
+    ++lineCount;
+  }
+
+  pool.Stop(true); //flush remaining jobs
+
+  #ifndef EXIT_RETURN
+  //Exiting here skips the destructors (running them can take a long time)
+  exit(EXIT_SUCCESS);
+  #else
+  return EXIT_SUCCESS;
+  #endif
+}
+
+
+
+
diff --git a/moses-cmd/src/Makefile.am b/moses-cmd/src/Makefile.am
new file mode 100644
index 000000000..f5de6d79c
--- /dev/null
+++ b/moses-cmd/src/Makefile.am
@@ -0,0 +1,16 @@
+if WITH_THREADS
+  bin_PROGRAMS = moses mosesmt lmbrgrid
+else
+  bin_PROGRAMS = moses lmbrgrid
+endif
+
+AM_CPPFLAGS = -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES -DUSE_HYPO_POOL -I$(top_srcdir)/moses/src $(BOOST_CPPFLAGS)
+
+moses_SOURCES = Main.cpp mbr.cpp IOWrapper.cpp TranslationAnalysis.cpp LatticeMBR.cpp
+moses_LDADD = $(top_builddir)/moses/src/libmoses.la $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB)
+
+mosesmt_SOURCES = MainMT.cpp mbr.cpp IOWrapper.cpp TranslationAnalysis.cpp ThreadPool.cpp LatticeMBR.cpp
+mosesmt_LDADD = $(top_builddir)/moses/src/libmoses.la $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB)
+
+lmbrgrid_SOURCES = LatticeMBRGrid.cpp LatticeMBR.cpp IOWrapper.cpp
+lmbrgrid_LDADD = $(top_builddir)/moses/src/libmoses.la $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB)
diff --git a/moses-cmd/src/ThreadPool.cpp 
b/moses-cmd/src/ThreadPool.cpp
new file mode 100644
index 000000000..cbed7c4d9
--- /dev/null
+++ b/moses-cmd/src/ThreadPool.cpp
@@ -0,0 +1,97 @@
+// $Id: $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2009 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+
+#include "ThreadPool.h"
+
+using namespace std;
+using namespace Moses;
+
+/** Start numThreads worker threads, each running Execute(). */
+Moses::ThreadPool::ThreadPool( size_t numThreads )
+  : m_stopped(false), m_stopping(false)
+{
+  for (size_t i = 0; i < numThreads; ++i) {
+    m_threads.create_thread(boost::bind(&ThreadPool::Execute,this));
+  }
+}
+
+/**
+ * Worker loop: wait for a task, run and delete it, signal m_threadAvailable, and
+ * repeat until the pool has been stopped.
+ */
+void Moses::ThreadPool::Execute()
+{
+  // m_stopped is only read while m_mutex is held; this local carries the value
+  // out of the locked section for the loop condition.
+  bool stopped = false;
+  do {
+    Task* task = NULL;
+    { // Find a job to perform
+      boost::mutex::scoped_lock lock(m_mutex);
+      // loop rather than "if": a wait may return without a task being available
+      while (m_tasks.empty() && !m_stopped) {
+        m_threadNeeded.wait(lock);
+      }
+      if (!m_stopped && !m_tasks.empty()) {
+        task = m_tasks.front();
+        m_tasks.pop();
+      }
+      stopped = m_stopped;
+    }
+    //Execute job
+    if (task) {
+      task->Run();
+      delete task;
+    }
+    m_threadAvailable.notify_all();
+  } while (!stopped);
+#if defined(BOOST_HAS_PTHREADS)
+  TRACE_ERR("Thread " << (int)pthread_self() << " exiting" << endl);
+#endif
+}
+
+/**
+ * Queue a task; the pool deletes it after running it.
+ * Throws runtime_error once Stop() has been called.
+ */
+void Moses::ThreadPool::Submit( Task* task )
+{
+  boost::mutex::scoped_lock lock(m_mutex);
+  if (m_stopping) {
+    throw runtime_error("ThreadPool stopping - unable to accept new jobs");
+  }
+  m_tasks.push(task);
+  m_threadNeeded.notify_all();
+
+}
+
+/**
+ * Shut the pool down and join all worker threads. With processRemainingJobs the call
+ * first waits for the queue to empty; otherwise tasks still queued are discarded.
+ * Calling Stop() again after the pool has stopped is a no-op.
+ */
+void Moses::ThreadPool::Stop(bool processRemainingJobs)
+{
+  {
+    //prevent more jobs from being added to the queue
+    boost::mutex::scoped_lock lock(m_mutex);
+    if (m_stopped) return;
+    m_stopping = true;
+  }
+  if (processRemainingJobs) {
+    boost::mutex::scoped_lock lock(m_mutex);
+    //wait for queue to drain.
+    while (!m_tasks.empty() && !m_stopped) {
+      m_threadAvailable.wait(lock);
+    }
+  }
+  //tell all threads to stop
+  {
+    boost::mutex::scoped_lock lock(m_mutex);
+    m_stopped = true;
+  }
+  m_threadNeeded.notify_all();
+
+  cerr << m_threads.size() << endl;
+  m_threads.join_all();
+
+  // The pool owns submitted tasks (Execute deletes them after Run); free the ones
+  // that were never picked up so they are not leaked.
+  {
+    boost::mutex::scoped_lock lock(m_mutex);
+    while (!m_tasks.empty()) {
+      delete m_tasks.front();
+      m_tasks.pop();
+    }
+  }
+}
diff --git a/moses-cmd/src/ThreadPool.h b/moses-cmd/src/ThreadPool.h
new file mode 100644
index 000000000..eb8ced97f
--- /dev/null
+++ b/moses-cmd/src/ThreadPool.h
@@ -0,0 +1,118 @@
+// $Id: $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2009 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#ifndef moses_cmd_ThreadPool_h
+#define moses_cmd_ThreadPool_h
+
+#include
+#include
+#include
+
+
+#include
+#include
+
+#if defined(BOOST_HAS_PTHREADS)
+#include
+#endif
+
+
+#include "Util.h"
+
+
+/**
+ * Classes to implement a ThreadPool.
+ **/
+
+namespace Moses {
+
+
+/**
+* A task to be executed by the ThreadPool
+**/
+class Task {
+  public:
+    virtual void Run() = 0;
+    virtual ~Task() {}
+};
+
+class ThreadPool {
+  public:
+    /**
+     * Construct a thread pool of a fixed size.
+     **/
+    ThreadPool(size_t numThreads);
+
+
+    /**
+     * Add a job to the threadpool. The pool deletes the task once it has
+     * been run. Throws std::runtime_error if Stop() has already begun.
+     **/
+    void Submit(Task* task);
+
+    /**
+     * Shut down the ThreadPool and join its threads. If
+     * processRemainingJobs is true, first wait until the job queue has
+     * drained; otherwise jobs still waiting in the queue are not run.
+     **/
+    void Stop(bool processRemainingJobs = false);
+
+    ~ThreadPool() { Stop(); }
+
+
+
+  private:
+    /**
+     * The main loop executed by each thread.
+     **/
+    void Execute();
+
+    std::queue m_tasks;
+    boost::thread_group m_threads;
+    boost::mutex m_mutex;
+    boost::condition_variable m_threadNeeded;
+    boost::condition_variable m_threadAvailable;
+    bool m_stopped;
+    bool m_stopping;
+
+};
+
+
+class TestTask : public Task {
+  public:
+    TestTask(int id) : m_id(id) {}
+    virtual void Run() {
+#if defined(BOOST_HAS_PTHREADS)
+      int tid = (int)pthread_self();
+#else
+      int tid = 0;
+#endif
+      std::cerr << "Executing " << m_id << " in thread id " << tid << std::endl;
+    }
+
+    virtual ~TestTask() {}
+
+  private:
+    int m_id;
+};
+
+
+
+}
+#endif
diff --git a/moses-cmd/src/TranslationAnalysis.cpp b/moses-cmd/src/TranslationAnalysis.cpp
new file mode 100644
index 000000000..add50ff3c
--- /dev/null
+++ b/moses-cmd/src/TranslationAnalysis.cpp
@@ -0,0 +1,114 @@
+// $Id: TranslationAnalysis.cpp 2717 2010-01-28 15:32:04Z phkoehn $
+
+#include
+#include
+#include
+#include "StaticData.h"
+#include "Hypothesis.h"
+#include "TranslationAnalysis.h"
+
+using namespace Moses;
+
+namespace TranslationAnalysis {
+
+/**
+ * Write a human-readable analysis of the translation ending in hypo to os:
+ * the source and target phrase of every step on the back-pointer chain, the
+ * source/target span maps, the average LM n-gram length (when LM stats are
+ * available), the dropped words, and the score breakdown of the final
+ * hypothesis. Only the heading is printed when hypo is NULL.
+ **/
+void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo)
+{
+  os << std::endl << "TRANSLATION HYPOTHESIS DETAILS:" << std::endl;
+  std::vector translationPath;
+  // walk the back-pointers, then reverse so the path runs first-to-last
+  while (hypo) {
+    translationPath.push_back(hypo);
+    hypo = hypo->GetPrevHypo();
+  }
+  std::reverse(translationPath.begin(), translationPath.end());
+
+  if (translationPath.empty()) {
+    // No hypothesis was supplied: there is no path to step through and no
+    // final hypothesis to take scores from.
+    return;
+  }
+
+  std::vector droppedWords;
+  std::vector::iterator tpi = translationPath.begin();
+  ++tpi; // skip initial translation state
+  std::vector sourceMap;
+  std::vector targetMap;
+  std::vector lmAcc(0);
+  size_t lmCalls = 0;
+  // tpi is already at end() when the path holds only the initial state, so
+  // it must not be dereferenced unconditionally.
+  bool doLMStats = (tpi != translationPath.end()) && ((*tpi)->GetLMStats() != 0);
+  if (doLMStats)
+    lmAcc.resize((*tpi)->GetLMStats()->size(), 0);
+  for (; tpi != translationPath.end(); ++tpi) {
+    std::ostringstream sms;
+    std::ostringstream tms;
+    std::string target = (*tpi)->GetTargetPhraseStringRep();
+    std::string source = (*tpi)->GetSourcePhraseStringRep();
+    WordsRange twr = (*tpi)->GetCurrTargetWordsRange();
+    WordsRange swr = (*tpi)->GetCurrSourceWordsRange();
+
+    // language model backoff stats,
+    if (doLMStats) {
+      std::vector >& lmstats = *(*tpi)->GetLMStats();
+      std::vector >::iterator i = lmstats.begin();
+      std::vector::iterator acc = lmAcc.begin();
+
+      // lmAcc was sized from the first step; never run acc past its end
+      for (; i != lmstats.end() && acc != lmAcc.end(); ++i, ++acc) {
+        std::vector::iterator j = i->begin();
+        lmCalls += i->size();
+        for (; j != i->end(); ++j) {
+          (*acc) += *j;
+        }
+      }
+    }
+
+    bool epsilon = false;
+    if (target == "") {
+      target="";
+      epsilon = true;
+      droppedWords.push_back(source);
+    }
+    os << " SOURCE: " << swr << " " << source << std::endl
+       << " TRANSLATED AS: " << target << std::endl;
+    size_t twr_i = twr.GetStartPos();
+    size_t swr_i = swr.GetStartPos();
+    if (!epsilon) { sms << twr_i; }
+    if (epsilon) { tms << "del(" << swr_i << ")"; } else { tms << swr_i; }
+    swr_i++; twr_i++;
+    for (; twr_i <= twr.GetEndPos() && twr.GetEndPos() != NOT_FOUND; twr_i++) {
+      sms << '-' << twr_i;
+    }
+    for (; swr_i <= swr.GetEndPos() && swr.GetEndPos() != NOT_FOUND; swr_i++) {
+      tms << '-' << swr_i;
+    }
+    if (!epsilon) targetMap.push_back(sms.str());
+    sourceMap.push_back(tms.str());
+  }
+  std::vector::iterator si = sourceMap.begin();
+  std::vector::iterator ti = targetMap.begin();
+  os << std::endl << "SOURCE/TARGET SPANS:";
+  os << std::endl << " SOURCE:";
+  for (; si != sourceMap.end(); ++si) {
+    os << " " << *si;
+  }
+  os << std::endl << " TARGET:";
+  for (; ti != targetMap.end(); ++ti) {
+    os << " " << *ti;
+  }
+  os << std::endl << std::endl;
+  if (doLMStats && lmCalls > 0) {
+    std::vector::iterator acc = lmAcc.begin();
+    const LMList& lmlist = StaticData::Instance().GetAllLM();
+    LMList::const_iterator i = lmlist.begin();
+    // stop at whichever of the two sequences ends first
+    for (; acc != lmAcc.end() && i != lmlist.end(); ++acc, ++i) {
+      char buf[256];
+      sprintf(buf, "%.4f", (float)(*acc)/(float)lmCalls);
+      os << (*i)->GetScoreProducerDescription() <<", AVG N-GRAM LENGTH: " << buf << std::endl;
+    }
+  }
+
+  if (droppedWords.size() > 0) {
+    std::vector::iterator dwi = droppedWords.begin();
+    os << std::endl << "WORDS/PHRASES DROPPED:" << std::endl;
+    for (; dwi != droppedWords.end(); ++dwi) {
+      os << "\tdropped=" << *dwi << std::endl;
+    }
+  }
+  os << std::endl << "SCORES (UNWEIGHTED/WEIGHTED): ";
+  StaticData::Instance().GetScoreIndexManager().PrintLabeledWeightedScores(os, translationPath.back()->GetScoreBreakdown(), StaticData::Instance().GetAllWeights());
+  os << std::endl;
+}
+
+}
diff --git a/moses-cmd/src/TranslationAnalysis.h b/moses-cmd/src/TranslationAnalysis.h
new file mode 100644
index 000000000..877ca743d
--- /dev/null
+++ b/moses-cmd/src/TranslationAnalysis.h
@@ -0,0 +1,24 @@
+// $Id: TranslationAnalysis.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/*
+ * also see moses/SentenceStats
+ */
+
+#ifndef moses_cmd_TranslationAnalysis_h
+#define moses_cmd_TranslationAnalysis_h
+
+#include
+#include "Hypothesis.h"
+
+namespace TranslationAnalysis
+{
+
+/***
+ * print details about the translation represented in hypothesis to
+ * os. Included information: phrase alignment, words dropped, scores
+ */
+ void PrintTranslationAnalysis(std::ostream &os, const Moses::Hypothesis* hypo);
+
+}
+
+#endif
diff --git a/moses-cmd/src/mbr.cpp b/moses-cmd/src/mbr.cpp
new file mode 100644
index 000000000..9a77fbc5c
--- /dev/null
+++ b/moses-cmd/src/mbr.cpp
@@ -0,0 +1,185 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "TrellisPathList.h"
+#include "TrellisPath.h"
+#include "StaticData.h"
+#include "Util.h"
+#include "mbr.h"
+
+using namespace std ;
+using namespace Moses;
+
+
+/* Input :
+   1. a sorted n-best list, with duplicates filtered out in the following format
+      0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
+
+   2. a weight vector
+   3. bleu order ( default = 4)
+   4.
scaling factor to weigh the weight vector (default = 1.0)
+
+   Output :
+   translations that minimise the Bayes Risk of the n-best list
+
+
+*/
+
+int BLEU_ORDER = 4;
+int SMOOTH = 1;
+float min_interval = 1e-4;
+
+// Count every n-gram of length 1..BLEU_ORDER in sentence, adding the counts
+// to allngrams (keyed by the n-gram's factor sequence).
+void extract_ngrams(const vector& sentence, map < vector < const Factor* >, int > & allngrams)
+{
+  vector< const Factor* > ngram;
+  for (int k = 0; k < BLEU_ORDER; k++)
+  {
+    // k+1 is the n-gram length; the bound is clamped at 0 for short sentences
+    for(int i =0; i < max((int)sentence.size()-k,0); i++)
+    {
+      for ( int j = i; j<= i+k; j++)
+      {
+        ngram.push_back(sentence[j]);
+      }
+      ++allngrams[ngram];
+      ngram.clear();
+    }
+  }
+}
+
+// Score sentence hyp against sentence ref (both indices into sents and
+// ngram_stats) from their n-gram overlap; returns exp of the averaged log
+// precision after the brevity adjustment applied below.
+float calculate_score(const vector< vector > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats ) {
+  int comps_n = 2*BLEU_ORDER+1;
+  vector comps(comps_n);
+  float logbleu = 0.0, brevity;
+
+  int hyp_length = sents[hyp].size();
+
+  // NOTE(review): the next line is truncated in this copy of the patch (text
+  // between '<' and '>' was lost); recover it from the repository before use.
+  for (int i =0; i ,int > & hyp_ngrams = ngram_stats[hyp] ;
+  map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
+
+  // clipped match counts: for each hypothesis n-gram also present in the
+  // reference, add the smaller of the two counts to the slot for its length
+  for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
+       it != hyp_ngrams.end(); it++)
+  {
+    map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
+    if(ref_it != ref_ngrams.end())
+    {
+      comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
+    }
+  }
+  comps[comps_n-1] = sents[ref].size();
+
+  // NOTE(review): this loop header is truncated in this copy of the patch as
+  // well; recover it from the repository before use.
+  for (int i=0; i 0 )
+      logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
+    else
+      logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
+  }
+  logbleu /= BLEU_ORDER;
+  brevity = 1.0-(float)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
+  if (brevity < 0.0)
+    logbleu += brevity;
+  return exp(logbleu);
+}
+
+// Pick from nBestList the translation with the lowest expected loss
+// (1 - calculate_score) against the other entries, each weighted by its
+// normalised, scaled model score. Returns an empty translation for an empty
+// list.
+vector doMBR(const TrellisPathList& nBestList){
+  float marginal = 0;
+
+  vector joint_prob_vec;
+  vector< vector > translations;
+  float joint_prob;
+  vector< map < vector , int > > ngram_stats;
+
+  TrellisPathList::const_iterator iter;
+
+  // get max score to prevent underflow
+  float maxScore = -1e20;
+  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter)
+  {
+    const TrellisPath &path = **iter;
+    float score = StaticData::Instance().GetMBRScale()
+      * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights());
+    if (maxScore < score) maxScore = score;
+  }
+
+  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter)
+  {
+    const TrellisPath &path = **iter;
+    joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights()) - maxScore);
+    marginal += joint_prob;
+    joint_prob_vec.push_back(joint_prob);
+
+    // get words in translation
+    vector translation;
+    GetOutputFactors(path, translation);
+
+    // collect n-gram counts
+    map < vector < const Factor *>, int > counts;
+    extract_ngrams(translation,counts);
+
+    ngram_stats.push_back(counts);
+    translations.push_back(translation);
+  }
+
+  vector mbr_loss;
+  float bleu, weightedLoss;
+  float weightedLossCumul = 0;
+  float minMBRLoss = 1000000;
+  int minMBRLossIdx = -1;
+
+  /* Main MBR computation done here */
+  iter = nBestList.begin();
+  for (unsigned int i = 0; i < nBestList.GetSize(); i++){
+    weightedLossCumul = 0;
+    for (unsigned int j = 0; j < nBestList.GetSize(); j++){
+      if ( i != j) {
+        bleu = calculate_score(translations, j, i,ngram_stats );
+        weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
+        weightedLossCumul += weightedLoss;
+        // already worse than the best candidate so far: stop accumulating
+        if (weightedLossCumul > minMBRLoss)
+          break;
+      }
+    }
+    if (weightedLossCumul < minMBRLoss){
+      minMBRLoss = weightedLossCumul;
+      minMBRLossIdx = i;
+    }
+    iter++;
+  }
+  if (minMBRLossIdx < 0) {
+    // No candidate was selected: the list is empty, or no loss ever compared
+    // below the initial minimum (e.g. a NaN score). Indexing with -1 would
+    // read out of bounds, so fall back to the first entry of the list, or to
+    // an empty translation when there are no entries at all.
+    if (translations.empty())
+      translations.resize(1);
+    minMBRLossIdx = 0;
+  }
+  /* Find sentence that minimises Bayes Risk under 1- BLEU loss */
+  return translations[minMBRLossIdx];
+}
+
+// Append to translation the output factor of every word on path, visiting
+// the edges from last to first. Asserts that exactly one output factor is
+// configured.
+void GetOutputFactors(const TrellisPath &path, vector &translation){
+  const std::vector &edges = path.GetEdges();
+  const std::vector& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+  assert (outputFactorOrder.size() == 1);
+
+  // collect the surface factor of the translation
+  for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--)
+  {
+    const Hypothesis &edge = *edges[currEdge];
+    const Phrase &phrase = edge.GetCurrTargetPhrase();
+    size_t size = phrase.GetSize();
+    for (size_t pos = 0 ; pos < size ; pos++)
+    {
+
+      const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
+      translation.push_back(factor);
+    }
+  }
+}
+
diff --git a/moses-cmd/src/mbr.h b/moses-cmd/src/mbr.h
new file mode 100644
index 000000000..467bbfa54
--- /dev/null
+++ b/moses-cmd/src/mbr.h
@@ -0,0 +1,28 @@
+// $Id: mbr.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#ifndef moses_cmd_mbr_h
+#define moses_cmd_mbr_h
+
+std::vector doMBR(const Moses::TrellisPathList& nBestList);
+void GetOutputFactors(const Moses::TrellisPath &path, std::vector &translation);
+float calculate_score(const std::vector< std::vector > & sents, int ref, int hyp, std::vector < std::map < std::vector < const Moses::Factor *>, int > > & ngram_stats );
+#endif
-- cgit v1.2.3