github.com/moses-smt/mosesdecoder.git

author     Nicola Bertoldi <bertoldi@fbk.eu>  2014-12-13 14:52:47 +0300
committer  Nicola Bertoldi <bertoldi@fbk.eu>  2014-12-13 14:52:47 +0300
commit     e4eb201c52be74fee74399a6f35fcbe8eb85d834 (patch)
tree       7792ef96d63262f6e28f1857741e1162c7dccbc4 /moses-cmd
parent     cea2d9d8bb34a81660974cae20d66aefec4e0468 (diff)
parent     a0b6b6a341e74b47bbef4652ad7fd928cf91e17c (diff)
Merged master into dynamic-models and resolved conflicts
Diffstat (limited to 'moses-cmd')
-rw-r--r--  moses-cmd/IOWrapper.cpp            679
-rw-r--r--  moses-cmd/IOWrapper.h              166
-rw-r--r--  moses-cmd/Jamfile                    4
-rw-r--r--  moses-cmd/LatticeMBR.cpp           669
-rw-r--r--  moses-cmd/LatticeMBR.h             153
-rw-r--r--  moses-cmd/LatticeMBRGrid.cpp        20
-rw-r--r--  moses-cmd/Main.cpp                 705
-rw-r--r--  moses-cmd/Main.h                     5
-rw-r--r--  moses-cmd/TranslationAnalysis.cpp  137
-rw-r--r--  moses-cmd/TranslationAnalysis.h     24
-rw-r--r--  moses-cmd/mbr.cpp                  178
-rw-r--r--  moses-cmd/mbr.h                     28
12 files changed, 74 insertions, 2694 deletions
diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
deleted file mode 100644
index 120301dbe..000000000
--- a/moses-cmd/IOWrapper.cpp
+++ /dev/null
@@ -1,679 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
- ***********************************************************************/
-
-// Example file showing how to use the moses library.
-
-#include <iostream>
-#include <stack>
-#include <boost/algorithm/string.hpp>
-
-#include "moses/TypeDef.h"
-#include "moses/Util.h"
-#include "moses/Hypothesis.h"
-#include "moses/WordsRange.h"
-#include "moses/TrellisPathList.h"
-#include "moses/StaticData.h"
-#include "moses/FeatureVector.h"
-#include "moses/InputFileStream.h"
-#include "moses/FF/StatefulFeatureFunction.h"
-#include "moses/FF/StatelessFeatureFunction.h"
-#include "util/exception.hh"
-
-#include "IOWrapper.h"
-
-using namespace std;
-using namespace Moses;
-
-namespace MosesCmd
-{
-
-IOWrapper::IOWrapper(
- const vector<FactorType> &inputFactorOrder
- , const vector<FactorType> &outputFactorOrder
- , const FactorMask &inputFactorUsed
- , size_t nBestSize
- , const string &nBestFilePath)
- :m_inputFactorOrder(inputFactorOrder)
- ,m_outputFactorOrder(outputFactorOrder)
- ,m_inputFactorUsed(inputFactorUsed)
- ,m_inputFile(NULL)
- ,m_inputStream(&std::cin)
- ,m_nBestStream(NULL)
- ,m_outputWordGraphStream(NULL)
- ,m_outputSearchGraphStream(NULL)
- ,m_detailedTranslationReportingStream(NULL)
- ,m_alignmentOutputStream(NULL)
-{
- Initialization(inputFactorOrder, outputFactorOrder
- , inputFactorUsed
- , nBestSize, nBestFilePath);
-}
-
-IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
- , const std::vector<FactorType> &outputFactorOrder
- , const FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &inputFilePath)
- :m_inputFactorOrder(inputFactorOrder)
- ,m_outputFactorOrder(outputFactorOrder)
- ,m_inputFactorUsed(inputFactorUsed)
- ,m_inputFilePath(inputFilePath)
- ,m_inputFile(new InputFileStream(inputFilePath))
- ,m_nBestStream(NULL)
- ,m_outputWordGraphStream(NULL)
- ,m_outputSearchGraphStream(NULL)
- ,m_detailedTranslationReportingStream(NULL)
- ,m_alignmentOutputStream(NULL)
-{
- Initialization(inputFactorOrder, outputFactorOrder
- , inputFactorUsed
- , nBestSize, nBestFilePath);
-
- m_inputStream = m_inputFile;
-}
-
-IOWrapper::~IOWrapper()
-{
- if (m_inputFile != NULL)
- delete m_inputFile;
- if (m_nBestStream != NULL && !m_surpressSingleBestOutput) {
- // outputting n-best to file, rather than stdout. need to close file and delete obj
- delete m_nBestStream;
- }
- if (m_outputWordGraphStream != NULL) {
- delete m_outputWordGraphStream;
- }
- if (m_outputSearchGraphStream != NULL) {
- delete m_outputSearchGraphStream;
- }
- delete m_detailedTranslationReportingStream;
- delete m_alignmentOutputStream;
-}
-
-void IOWrapper::Initialization(const std::vector<FactorType> &/*inputFactorOrder*/
- , const std::vector<FactorType> &/*outputFactorOrder*/
- , const FactorMask &/*inputFactorUsed*/
- , size_t nBestSize
- , const std::string &nBestFilePath)
-{
- const StaticData &staticData = StaticData::Instance();
-
- // n-best
- m_surpressSingleBestOutput = false;
-
- if (nBestSize > 0) {
- if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") {
- m_nBestStream = &std::cout;
- m_surpressSingleBestOutput = true;
- } else {
- std::ofstream *file = new std::ofstream;
- m_nBestStream = file;
- file->open(nBestFilePath.c_str());
- }
- }
-
- // wordgraph output
- if (staticData.GetOutputWordGraph()) {
- string fileName = staticData.GetParam("output-word-graph")[0];
- std::ofstream *file = new std::ofstream;
- m_outputWordGraphStream = file;
- file->open(fileName.c_str());
- }
-
-
- // search graph output
- if (staticData.GetOutputSearchGraph()) {
- string fileName;
- if (staticData.GetOutputSearchGraphExtended())
- fileName = staticData.GetParam("output-search-graph-extended")[0];
- else
- fileName = staticData.GetParam("output-search-graph")[0];
- std::ofstream *file = new std::ofstream;
- m_outputSearchGraphStream = file;
- file->open(fileName.c_str());
- }
-
- // detailed translation reporting
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- const std::string &path = staticData.GetDetailedTranslationReportingFilePath();
- m_detailedTranslationReportingStream = new std::ofstream(path.c_str());
- UTIL_THROW_IF(!m_detailedTranslationReportingStream->good(),
- util::FileOpenException,
- "File for output of detailed translation report could not be open");
- }
-
- // sentence alignment output
- if (! staticData.GetAlignmentOutputFile().empty()) {
- m_alignmentOutputStream = new ofstream(staticData.GetAlignmentOutputFile().c_str());
- UTIL_THROW_IF(!m_alignmentOutputStream->good(),
- util::FileOpenException,
- "File for output of word alignment could not be open");
- }
-
-}
-
-InputType*
-IOWrapper::
-GetInput(InputType* inputType)
-{
- if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
- if (long x = inputType->GetTranslationId()) {
- if (x>=m_translationId) m_translationId = x+1;
- } else inputType->SetTranslationId(m_translationId++);
-
- return inputType;
- } else {
- delete inputType;
- return NULL;
- }
-}
-
-std::map<size_t, const Factor*> GetPlaceholders(const Hypothesis &hypo, FactorType placeholderFactor)
-{
- const InputPath &inputPath = hypo.GetTranslationOption().GetInputPath();
- const Phrase &inputPhrase = inputPath.GetPhrase();
-
- std::map<size_t, const Factor*> ret;
-
- for (size_t sourcePos = 0; sourcePos < inputPhrase.GetSize(); ++sourcePos) {
- const Factor *factor = inputPhrase.GetFactor(sourcePos, placeholderFactor);
- if (factor) {
- std::set<size_t> targetPos = hypo.GetTranslationOption().GetTargetPhrase().GetAlignTerm().GetAlignmentsForSource(sourcePos);
- UTIL_THROW_IF2(targetPos.size() != 1,
- "Placeholder should be aligned to 1, and only 1, word");
- ret[*targetPos.begin()] = factor;
- }
- }
-
- return ret;
-}
-
-/***
- * print surface factor only for the given phrase
- */
-void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
- char reportSegmentation, bool reportAllFactors)
-{
- UTIL_THROW_IF2(outputFactorOrder.size() == 0,
- "Must specific at least 1 output factor");
- const TargetPhrase& phrase = edge.GetCurrTargetPhrase();
- bool markUnknown = StaticData::Instance().GetMarkUnknown();
- if (reportAllFactors == true) {
- out << phrase;
- } else {
- FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor();
-
- std::map<size_t, const Factor*> placeholders;
- if (placeholderFactor != NOT_FOUND) {
- // creates map of target position -> factor for placeholders
- placeholders = GetPlaceholders(edge, placeholderFactor);
- }
-
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
-
- if (placeholders.size()) {
- // do placeholders
- std::map<size_t, const Factor*>::const_iterator iter = placeholders.find(pos);
- if (iter != placeholders.end()) {
- factor = iter->second;
- }
- }
-
- UTIL_THROW_IF2(factor == NULL,
- "No factor 0 at position " << pos);
-
- //preface surface form with UNK if marking unknowns
- const Word &word = phrase.GetWord(pos);
- if(markUnknown && word.IsOOV()) {
- out << "UNK" << *factor;
- } else {
- out << *factor;
- }
-
- for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
- UTIL_THROW_IF2(factor == NULL,
- "No factor " << i << " at position " << pos);
-
- out << "|" << *factor;
- }
- out << " ";
- }
- }
-
- // trace ("report segmentation") option "-t" / "-tt"
- if (reportSegmentation > 0 && phrase.GetSize() > 0) {
- const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
- const int sourceStart = sourceRange.GetStartPos();
- const int sourceEnd = sourceRange.GetEndPos();
- out << "|" << sourceStart << "-" << sourceEnd; // enriched "-tt"
- if (reportSegmentation == 2) {
- out << ",wa=";
- const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
- OutputAlignment(out, ai, 0, 0);
- out << ",total=";
- out << edge.GetScore() - edge.GetPrevHypo()->GetScore();
- out << ",";
- ScoreComponentCollection scoreBreakdown(edge.GetScoreBreakdown());
- scoreBreakdown.MinusEquals(edge.GetPrevHypo()->GetScoreBreakdown());
- OutputAllFeatureScores(scoreBreakdown, out);
- }
- out << "| ";
- }
-}
-
-void OutputPassthroughInformation(std::string& passthrough, const Hypothesis *hypo)
-{
- passthrough = hypo->GetManager().GetSource().GetPassthroughInformation();
-}
-
-void OutputPassthroughInformation(std::ostream &out, const Hypothesis *hypo)
-{
- std::string passthrough;
- passthrough = hypo->GetManager().GetSource().GetPassthroughInformation();
- out << passthrough;
-}
-
-void OutputBestSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder,
- char reportSegmentation, bool reportAllFactors)
-{
- if (hypo != NULL) {
- // recursively retrace this best path through the lattice, starting from the end of the hypothesis sentence
- OutputBestSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors);
- OutputSurface(out, *hypo, outputFactorOrder, reportSegmentation, reportAllFactors);
- }
-}
-
-void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset)
-{
- typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
- AlignVec alignments = ai.GetSortedAlignments();
-
- AlignVec::const_iterator it;
- for (it = alignments.begin(); it != alignments.end(); ++it) {
- const std::pair<size_t,size_t> &alignment = **it;
- out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
- }
-
-}
-
-void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
-{
- size_t targetOffset = 0;
-
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const TargetPhrase &tp = edge.GetCurrTargetPhrase();
- size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
-
- OutputAlignment(out, tp.GetAlignTerm(), sourceOffset, targetOffset);
-
- targetOffset += tp.GetSize();
- }
- out << std::endl;
-}
-
-void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
-{
- std::vector<const Hypothesis *> edges;
- const Hypothesis *currentHypo = hypo;
- while (currentHypo) {
- edges.push_back(currentHypo);
- currentHypo = currentHypo->GetPrevHypo();
- }
-
- OutputAlignment(out, edges);
-
-}
-
-void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
-{
- ostringstream out;
- OutputAlignment(out, edges);
-
- collector->Write(lineNo,out.str());
-}
-
-void OutputAlignment(OutputCollector* collector, size_t lineNo , const Hypothesis *hypo)
-{
- if (collector) {
- std::vector<const Hypothesis *> edges;
- const Hypothesis *currentHypo = hypo;
- while (currentHypo) {
- edges.push_back(currentHypo);
- currentHypo = currentHypo->GetPrevHypo();
- }
-
- OutputAlignment(collector,lineNo, edges);
- }
-}
-
-void OutputAlignment(OutputCollector* collector, size_t lineNo , const TrellisPath &path)
-{
- if (collector) {
- OutputAlignment(collector,lineNo, path.GetEdges());
- }
-}
-
-void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/, char reportSegmentation, bool reportAllFactors, std::ostream &out)
-{
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
-
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- OutputSurface(out, edge, StaticData::Instance().GetOutputFactorOrder(), reportSegmentation, reportAllFactors);
- }
- out << endl;
-}
-
-void IOWrapper::Backtrack(const Hypothesis *hypo)
-{
-
- if (hypo->GetPrevHypo() != NULL) {
- VERBOSE(3,hypo->GetId() << " <= ");
- Backtrack(hypo->GetPrevHypo());
- }
-}
-
-void OutputBestHypo(const std::vector<Word>& mbrBestHypo, long /*translationId*/, char /*reportSegmentation*/, bool /*reportAllFactors*/, ostream& out)
-{
-
- for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
- const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
- UTIL_THROW_IF2(factor == NULL,
- "No factor 0 at position " << i);
- if (i>0) out << " " << *factor;
- else out << *factor;
- }
- out << endl;
-}
-
-
-void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
-{
- if (hypo->GetPrevHypo()) {
- OutputInput(map, hypo->GetPrevHypo());
- map[hypo->GetCurrSourceWordsRange().GetStartPos()] = &hypo->GetTranslationOption().GetInputPath().GetPhrase();
- }
-}
-
-void OutputInput(std::ostream& os, const Hypothesis* hypo)
-{
- size_t len = hypo->GetInput().GetSize();
- std::vector<const Phrase*> inp_phrases(len, 0);
- OutputInput(inp_phrases, hypo);
- for (size_t i=0; i<len; ++i)
- if (inp_phrases[i]) os << *inp_phrases[i];
-}
-
-void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long /*translationId*/, char reportSegmentation, bool reportAllFactors)
-{
- if (hypo != NULL) {
- VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
- VERBOSE(3,"Best path: ");
- if (StaticData::Instance().IsPassthroughEnabled()) {
- OutputPassthroughInformation(cout, hypo);
- }
- Backtrack(hypo);
- VERBOSE(3,"0" << std::endl);
- if (!m_surpressSingleBestOutput) {
- if (StaticData::Instance().GetOutputHypoScore()) {
- cout << hypo->GetTotalScore() << " ";
- }
-
- if (StaticData::Instance().IsPathRecoveryEnabled()) {
- OutputInput(cout, hypo);
- cout << "||| ";
- }
- OutputBestSurface(cout, hypo, m_outputFactorOrder, reportSegmentation, reportAllFactors);
- cout << endl;
- }
- } else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
- if (!m_surpressSingleBestOutput) {
- cout << endl;
- }
- }
-}
-
-void OutputNBest(std::ostream& out
- , const Moses::TrellisPathList &nBestList
- , const std::vector<Moses::FactorType>& outputFactorOrder
- , long translationId
- , char reportSegmentation)
-{
- const StaticData &staticData = StaticData::Instance();
- bool reportAllFactors = staticData.GetReportAllFactorsNBest();
- bool includeSegmentation = staticData.NBestIncludesSegmentation();
- bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
-
- TrellisPathList::const_iterator iter;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const TrellisPath &path = **iter;
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
-
- // print the surface factor of the translation
- out << translationId << " ||| ";
- if (staticData.IsPassthroughInNBestEnabled()) {
- OutputPassthroughInformation(out, edges[edges.size() - 1]);
- }
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- OutputSurface(out, edge, outputFactorOrder, reportSegmentation, reportAllFactors);
- }
- out << " |||";
-
- // print scores with feature names
- OutputAllFeatureScores(path.GetScoreBreakdown(), out );
-
- // total
- out << " ||| " << path.GetTotalScore();
-
- //phrase-to-phrase segmentation
- if (includeSegmentation) {
- out << " |||";
- for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
- WordsRange targetRange = path.GetTargetWordsRange(edge);
- out << " " << sourceRange.GetStartPos();
- if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
- out << "-" << sourceRange.GetEndPos();
- }
- out<< "=" << targetRange.GetStartPos();
- if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
- out<< "-" << targetRange.GetEndPos();
- }
- }
- }
-
- if (includeWordAlignment) {
- out << " ||| ";
- for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
- WordsRange targetRange = path.GetTargetWordsRange(edge);
- const int sourceOffset = sourceRange.GetStartPos();
- const int targetOffset = targetRange.GetStartPos();
- const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
-
- OutputAlignment(out, ai, sourceOffset, targetOffset);
-
- }
- }
-
- if (StaticData::Instance().IsPathRecoveryEnabled()) {
- out << " ||| ";
- OutputInput(out, edges[0]);
- }
-
- out << endl;
- }
-
- out << std::flush;
-}
-
-void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out)
-{
- std::string lastName = "";
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for( size_t i=0; i<sff.size(); i++ ) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->GetScoreProducerDescription() != "BleuScoreFeature"
- && ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- for( size_t i=0; i<slf.size(); i++ ) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- OutputFeatureScores( out, features, ff, lastName );
- }
- }
-}
-
-void OutputFeatureScores( std::ostream& out
- , const ScoreComponentCollection &features
- , const FeatureFunction *ff
- , std::string &lastName )
-{
- const StaticData &staticData = StaticData::Instance();
- bool labeledOutput = staticData.IsLabeledNBestList();
-
- // regular features (not sparse)
- if (ff->GetNumScoreComponents() != 0) {
- if( labeledOutput && lastName != ff->GetScoreProducerDescription() ) {
- lastName = ff->GetScoreProducerDescription();
- out << " " << lastName << "=";
- }
- vector<float> scores = features.GetScoresForProducer( ff );
- for (size_t j = 0; j<scores.size(); ++j) {
- out << " " << scores[j];
- }
- }
-
- // sparse features
- const FVector scores = features.GetVectorForProducer( ff );
- for(FVector::FNVmap::const_iterator i = scores.cbegin(); i != scores.cend(); i++) {
- out << " " << i->first << "= " << i->second;
- }
-}
-
-void OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>& solutions,long translationId)
-{
- for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
- out << translationId;
- out << " |||";
- const vector<Word> mbrHypo = si->GetWords();
- for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
- const Factor *factor = mbrHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
- if (i>0) out << " " << *factor;
- else out << *factor;
- }
- out << " |||";
- out << " map: " << si->GetMapScore();
- out << " w: " << mbrHypo.size();
- const vector<float>& ngramScores = si->GetNgramScores();
- for (size_t i = 0; i < ngramScores.size(); ++i) {
- out << " " << ngramScores[i];
- }
- out << " ||| " << si->GetScore();
-
- out << endl;
- }
-}
-
-
-void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solutions,long translationId)
-{
- OutputLatticeMBRNBest(*m_nBestStream, solutions,translationId);
-}
-
-bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
-{
- if (source) delete source;
- switch(inputType) {
- case SentenceInput:
- source = ioWrapper.GetInput(new Sentence);
- break;
- case ConfusionNetworkInput:
- source = ioWrapper.GetInput(new ConfusionNet);
- break;
- case WordLatticeInput:
- source = ioWrapper.GetInput(new WordLattice);
- break;
- default:
- TRACE_ERR("Unknown input type: " << inputType << "\n");
- source = NULL;
- }
- return (source ? true : false);
-}
-
-
-
-IOWrapper *GetIOWrapper(const StaticData &staticData)
-{
- IOWrapper *ioWrapper;
- const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
- ,&outputFactorOrder = staticData.GetOutputFactorOrder();
- FactorMask inputFactorUsed(inputFactorOrder);
-
- // io
- if (staticData.GetParam("input-file").size() == 1) {
- VERBOSE(2,"IO from File" << endl);
- string filePath = staticData.GetParam("input-file")[0];
-
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath()
- , filePath);
- } else {
- VERBOSE(1,"IO from STDOUT/STDIN" << endl);
- ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath());
- }
- ioWrapper->ResetTranslationId();
-
- IFVERBOSE(1)
- PrintUserTime("Created input-output object");
-
- return ioWrapper;
-}
-
-}
-
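For orientation, the IOWrapper deleted above was driven from Main.cpp in a read-decode-write loop. The sketch below is reconstructed only from functions visible in this diff (GetIOWrapper, ReadInput, IOWrapper::OutputBestHypo, Manager::GetBestHypothesis, InputType::GetTranslationId); the Manager constructor and the StaticData accessors around them are assumptions, not part of this diff.

    // Minimal sketch of typical MosesCmd::IOWrapper usage (assumed surrounding API).
    const StaticData &staticData = StaticData::Instance();
    IOWrapper *ioWrapper = GetIOWrapper(staticData);   // declared in IOWrapper.h below
    InputType *source = NULL;
    while (ReadInput(*ioWrapper, staticData.GetInputType(), source)) {
      // Decode one sentence; constructor signature is an assumption for this era of Moses.
      Manager manager(*source, staticData.GetSearchAlgorithm());
      manager.ProcessSentence();
      // Write the single-best translation for this input.
      ioWrapper->OutputBestHypo(manager.GetBestHypothesis(),
                                source->GetTranslationId(),
                                staticData.GetReportSegmentation(),  // assumed accessor names
                                staticData.GetReportAllFactors());
    }
    delete ioWrapper;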
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
deleted file mode 100644
index 7afb18948..000000000
--- a/moses-cmd/IOWrapper.h
+++ /dev/null
@@ -1,166 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (c) 2006 University of Edinburgh
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the University of Edinburgh nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-***********************************************************************/
-
-// Example file showing how to use the moses library.
-
-#ifndef moses_cmd_IOWrapper_h
-#define moses_cmd_IOWrapper_h
-
-#include <cassert>
-#include <fstream>
-#include <ostream>
-#include <vector>
-
-#include "moses/TypeDef.h"
-#include "moses/Sentence.h"
-#include "moses/FactorTypeSet.h"
-#include "moses/FactorCollection.h"
-#include "moses/Hypothesis.h"
-#include "moses/OutputCollector.h"
-#include "moses/TrellisPathList.h"
-#include "moses/InputFileStream.h"
-#include "moses/InputType.h"
-#include "moses/WordLattice.h"
-#include "LatticeMBR.h"
-
-namespace Moses
-{
-class ScoreComponentCollection;
-class Hypothesis;
-class Factor;
-}
-
-namespace MosesCmd
-{
-
-/** Helper class that holds the streams and settings used to write decoder output to the command line.
- */
-class IOWrapper
-{
-protected:
- long m_translationId;
-
- const std::vector<Moses::FactorType> &m_inputFactorOrder;
- const std::vector<Moses::FactorType> &m_outputFactorOrder;
- const Moses::FactorMask &m_inputFactorUsed;
- std::string m_inputFilePath;
- Moses::InputFileStream *m_inputFile;
- std::istream *m_inputStream;
- std::ostream *m_nBestStream
- ,*m_outputWordGraphStream,*m_outputSearchGraphStream;
- std::ostream *m_detailedTranslationReportingStream;
- std::ofstream *m_alignmentOutputStream;
- bool m_surpressSingleBestOutput;
-
- void Initialization(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath);
-
-
-public:
- IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath);
-
- IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
- , const std::vector<Moses::FactorType> &outputFactorOrder
- , const Moses::FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &infilePath);
- ~IOWrapper();
-
- Moses::InputType* GetInput(Moses::InputType *inputType);
-
- void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, char reportSegmentation, bool reportAllFactors);
- void OutputLatticeMBRNBestList(const std::vector<LatticeMBRSolution>& solutions,long translationId);
- void Backtrack(const Moses::Hypothesis *hypo);
-
- void ResetTranslationId() {
- m_translationId = 0;
- }
-
- std::ofstream *GetAlignmentOutputStream() {
- return m_alignmentOutputStream;
- }
-
- std::ostream &GetOutputWordGraphStream() {
- return *m_outputWordGraphStream;
- }
- std::ostream &GetOutputSearchGraphStream() {
- return *m_outputSearchGraphStream;
- }
-
- std::ostream &GetDetailedTranslationReportingStream() {
- assert (m_detailedTranslationReportingStream);
- return *m_detailedTranslationReportingStream;
- }
-};
-
-IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
-bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source);
-void OutputLanguageModelOrder(std::ostream &out, const Moses::Hypothesis *hypo, Moses::Manager &manager);
-void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder, char reportSegmentation, bool reportAllFactors);
-void OutputLatticeMBRNBest(std::ostream& out, const std::vector<LatticeMBRSolution>& solutions,long translationId);
-void OutputBestHypo(const std::vector<Moses::Word>& mbrBestHypo, long /*translationId*/,
- char reportSegmentation, bool reportAllFactors, std::ostream& out);
-void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,char reportSegmentation, bool reportAllFactors, std::ostream &out);
-void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
-void OutputPassthroughInformation(std::string& passthrough, const Moses::Hypothesis* hypo);
-void OutputPassthroughInformation(std::ostream& os, const Moses::Hypothesis* hypo);
-void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
-void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
-void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
-void OutputAlignment(std::ostream &out, const Moses::AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset);
-
-void OutputNBest(std::ostream& out
- , const Moses::TrellisPathList &nBestList
- , const std::vector<Moses::FactorType>& outputFactorOrder
- , long translationId
- , char reportSegmentation);
-void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out);
-void OutputFeatureScores( std::ostream& out
- , const Moses::ScoreComponentCollection &features
- , const Moses::FeatureFunction *ff
- , std::string &lastName );
-
-// creates a map of TARGET positions which should be replaced by the word from the placeholder factor
-std::map<size_t, const Moses::Factor*> GetPlaceholders(const Moses::Hypothesis &hypo, Moses::FactorType placeholderFactor);
-
-}
-
-#endif
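The OutputNBest function declared above writes one line per hypothesis in the usual Moses n-best format: translation id, target surface string, labeled feature scores, and total score, separated by "|||" (optionally followed by segmentation, word alignment, and recovered input fields). An illustrative line with made-up scores and hypothetical feature names:

    0 ||| this is a small house ||| LM0= -21.4 TM0= -3.2 -4.1 ||| -12.7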
diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile
index bddc10911..ee762823e 100644
--- a/moses-cmd/Jamfile
+++ b/moses-cmd/Jamfile
@@ -1,6 +1,6 @@
-alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ;
+alias deps : ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ;
exe moses : Main.cpp deps ;
exe lmbrgrid : LatticeMBRGrid.cpp deps ;
-
alias programs : moses lmbrgrid ;
+
diff --git a/moses-cmd/LatticeMBR.cpp b/moses-cmd/LatticeMBR.cpp
deleted file mode 100644
index 148b44743..000000000
--- a/moses-cmd/LatticeMBR.cpp
+++ /dev/null
@@ -1,669 +0,0 @@
-/*
- * LatticeMBR.cpp
- * moses-cmd
- *
- * Created by Abhishek Arun on 26/01/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-
-#include "LatticeMBR.h"
-#include "moses/StaticData.h"
-#include <algorithm>
-#include <set>
-
-using namespace std;
-using namespace Moses;
-
-namespace MosesCmd
-{
-
-size_t bleu_order = 4;
-float UNKNGRAMLOGPROB = -20;
-void GetOutputWords(const TrellisPath &path, vector <Word> &translation)
-{
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
-
- // print the surface factor of the translation
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const Phrase &phrase = edge.GetCurrTargetPhrase();
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
- translation.push_back(phrase.GetWord(pos));
- }
- }
-}
-
-
-void extract_ngrams(const vector<Word >& sentence, map < Phrase, int > & allngrams)
-{
- for (int k = 0; k < (int)bleu_order; k++) {
- for(int i =0; i < max((int)sentence.size()-k,0); i++) {
- Phrase ngram( k+1);
- for ( int j = i; j<= i+k; j++) {
- ngram.AddWord(sentence[j]);
- }
- ++allngrams[ngram];
- }
- }
-}
-
-
-
-void NgramScores::addScore(const Hypothesis* node, const Phrase& ngram, float score)
-{
- set<Phrase>::const_iterator ngramIter = m_ngrams.find(ngram);
- if (ngramIter == m_ngrams.end()) {
- ngramIter = m_ngrams.insert(ngram).first;
- }
- map<const Phrase*,float>& ngramScores = m_scores[node];
- map<const Phrase*,float>::iterator scoreIter = ngramScores.find(&(*ngramIter));
- if (scoreIter == ngramScores.end()) {
- ngramScores[&(*ngramIter)] = score;
- } else {
- ngramScores[&(*ngramIter)] = log_sum(score,scoreIter->second);
- }
-}
-
-NgramScores::NodeScoreIterator NgramScores::nodeBegin(const Hypothesis* node)
-{
- return m_scores[node].begin();
-}
-
-
-NgramScores::NodeScoreIterator NgramScores::nodeEnd(const Hypothesis* node)
-{
- return m_scores[node].end();
-}
-
-LatticeMBRSolution::LatticeMBRSolution(const TrellisPath& path, bool isMap) :
- m_score(0.0f)
-{
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
-
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const Phrase &phrase = edge.GetCurrTargetPhrase();
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
- m_words.push_back(phrase.GetWord(pos));
- }
- }
- if (isMap) {
- m_mapScore = path.GetTotalScore();
- } else {
- m_mapScore = 0;
- }
-}
-
-
-void LatticeMBRSolution::CalcScore(map<Phrase, float>& finalNgramScores, const vector<float>& thetas, float mapWeight)
-{
- m_ngramScores.assign(thetas.size()-1, -10000);
-
- map < Phrase, int > counts;
- extract_ngrams(m_words,counts);
-
- //Now score this translation
- m_score = thetas[0] * m_words.size();
-
- //Calculate the ngramScores, working in log space at first
- for (map < Phrase, int >::iterator ngrams = counts.begin(); ngrams != counts.end(); ++ngrams) {
- float ngramPosterior = UNKNGRAMLOGPROB;
- map<Phrase,float>::const_iterator ngramPosteriorIt = finalNgramScores.find(ngrams->first);
- if (ngramPosteriorIt != finalNgramScores.end()) {
- ngramPosterior = ngramPosteriorIt->second;
- }
- size_t ngramSize = ngrams->first.GetSize();
- m_ngramScores[ngramSize-1] = log_sum(log((float)ngrams->second) + ngramPosterior,m_ngramScores[ngramSize-1]);
- }
-
- //convert from log to probability and create weighted sum
- for (size_t i = 0; i < m_ngramScores.size(); ++i) {
- m_ngramScores[i] = exp(m_ngramScores[i]);
- m_score += thetas[i+1] * m_ngramScores[i];
- }
-
-
- //The map score
- m_score += m_mapScore*mapWeight;
-}
-
-
-void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set <const Hypothesis* > > & outgoingHyps, map<const Hypothesis*, vector<Edge> >& incomingEdges,
- const vector< float> & estimatedScores, const Hypothesis* bestHypo, size_t edgeDensity, float scale)
-{
-
- //Need hyp 0 in connectedHyp - Find empty hypothesis
- VERBOSE(2,"Pruning lattice to edge density " << edgeDensity << endl);
- const Hypothesis* emptyHyp = connectedHyp.at(0);
- while (emptyHyp->GetId() != 0) {
- emptyHyp = emptyHyp->GetPrevHypo();
- }
- connectedHyp.push_back(emptyHyp); //Add it to list of hyps
-
- //Need hyp 0's outgoing Hyps
- for (size_t i = 0; i < connectedHyp.size(); ++i) {
- if (connectedHyp[i]->GetId() > 0 && connectedHyp[i]->GetPrevHypo()->GetId() == 0)
- outgoingHyps[emptyHyp].insert(connectedHyp[i]);
- }
-
- //sort hyps based on estimated scores - do so by copying to multimap
- multimap<float, const Hypothesis*> sortHypsByVal;
- for (size_t i =0; i < estimatedScores.size(); ++i) {
- sortHypsByVal.insert(make_pair(estimatedScores[i], connectedHyp[i]));
- }
-
- multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end();
- float bestScore = it->first;
- //store best score as score of hyp 0
- sortHypsByVal.insert(make_pair(bestScore, emptyHyp));
-
-
- IFVERBOSE(3) {
- for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
- const Hypothesis* currHyp = it->second;
- cerr << "Hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl;
- }
- }
-
-
- set <const Hypothesis*> survivingHyps; //store hyps that make the cut
-
- VERBOSE(2, "BEST HYPO TARGET LENGTH : " << bestHypo->GetSize() << endl)
- size_t numEdgesTotal = edgeDensity * bestHypo->GetSize(); //as per Shankar, aim for (density * target length of MAP solution) arcs
- size_t numEdgesCreated = 0;
- VERBOSE(2, "Target edge count: " << numEdgesTotal << endl);
-
- float prevScore = -999999;
-
- //now iterate over multimap
- for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
- float currEstimatedScore = it->first;
- const Hypothesis* currHyp = it->second;
-
- if (numEdgesCreated >= numEdgesTotal && prevScore > currEstimatedScore) //if this hyp has equal estimated score to previous, include its edges too
- break;
-
- prevScore = currEstimatedScore;
- VERBOSE(3, "Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
- VERBOSE(3, "Considering hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl)
-
- survivingHyps.insert(currHyp); //CurrHyp made the cut
-
- // is its best predecessor already included ?
- if (survivingHyps.find(currHyp->GetPrevHypo()) != survivingHyps.end()) { //yes, then add an edge
- vector <Edge>& edges = incomingEdges[currHyp];
- Edge winningEdge(currHyp->GetPrevHypo(),currHyp,scale*(currHyp->GetScore() - currHyp->GetPrevHypo()->GetScore()),currHyp->GetCurrTargetPhrase());
- edges.push_back(winningEdge);
- ++numEdgesCreated;
- }
-
- //let's try the arcs too
- const ArcList *arcList = currHyp->GetArcList();
- if (arcList != NULL) {
- ArcList::const_iterator iterArcList;
- for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
- const Hypothesis *loserHypo = *iterArcList;
- const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
- if (survivingHyps.find(loserPrevHypo) != survivingHyps.end()) { //found it, add edge
- double arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore();
- Edge losingEdge(loserPrevHypo, currHyp, arcScore*scale, loserHypo->GetCurrTargetPhrase());
- vector <Edge>& edges = incomingEdges[currHyp];
- edges.push_back(losingEdge);
- ++numEdgesCreated;
- }
- }
- }
-
- //Now if a successor node has already been visited, add an edge connecting the two
- map < const Hypothesis*, set < const Hypothesis* > >::const_iterator outgoingIt = outgoingHyps.find(currHyp);
-
- if (outgoingIt != outgoingHyps.end()) {//currHyp does have successors
- const set<const Hypothesis*> & outHyps = outgoingIt->second; //the successors
- for (set<const Hypothesis*>::const_iterator outHypIts = outHyps.begin(); outHypIts != outHyps.end(); ++outHypIts) {
- const Hypothesis* succHyp = *outHypIts;
-
- if (survivingHyps.find(succHyp) == survivingHyps.end()) //Have we encountered the successor yet?
- continue; //No, move on to next
-
- //Curr Hyp can be : a) the best predecessor of succ b) or an arc attached to succ
- if (succHyp->GetPrevHypo() == currHyp) { //best predecessor
- vector <Edge>& succEdges = incomingEdges[succHyp];
- Edge succWinningEdge(currHyp, succHyp, scale*(succHyp->GetScore() - currHyp->GetScore()), succHyp->GetCurrTargetPhrase());
- succEdges.push_back(succWinningEdge);
- survivingHyps.insert(succHyp);
- ++numEdgesCreated;
- }
-
- //now, let's find an arc
- const ArcList *arcList = succHyp->GetArcList();
- if (arcList != NULL) {
- ArcList::const_iterator iterArcList;
- //QUESTION: What happens if there's more than one loserPrevHypo?
- for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
- const Hypothesis *loserHypo = *iterArcList;
- const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
- if (loserPrevHypo == currHyp) { //found it
- vector <Edge>& succEdges = incomingEdges[succHyp];
- double arcScore = loserHypo->GetScore() - currHyp->GetScore();
- Edge losingEdge(currHyp, succHyp,scale* arcScore, loserHypo->GetCurrTargetPhrase());
- succEdges.push_back(losingEdge);
- ++numEdgesCreated;
- }
- }
- }
- }
- }
- }
-
- connectedHyp.clear();
- for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
- connectedHyp.push_back(*it);
- }
-
- VERBOSE(2, "Done! Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
-
- IFVERBOSE(3) {
- cerr << "Surviving hyps: " ;
- for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
- cerr << (*it)->GetId() << " ";
- }
- cerr << endl;
- }
-
-
-}
-
-void calcNgramExpectations(Lattice & connectedHyp, map<const Hypothesis*, vector<Edge> >& incomingEdges,
- map<Phrase, float>& finalNgramScores, bool posteriors)
-{
-
- sort(connectedHyp.begin(),connectedHyp.end(),ascendingCoverageCmp); //sort by increasing source word cov
-
- /*cerr << "Lattice:" << endl;
- for (Lattice::const_iterator i = connectedHyp.begin(); i != connectedHyp.end(); ++i) {
- const Hypothesis* h = *i;
- cerr << *h << endl;
- const vector<Edge>& edges = incomingEdges[h];
- for (size_t e = 0; e < edges.size(); ++e) {
- cerr << edges[e];
- }
- }*/
-
- map<const Hypothesis*, float> forwardScore;
- forwardScore[connectedHyp[0]] = 0.0f; //forward score of hyp 0 is 1 (or 0 in logprob space)
- set< const Hypothesis *> finalHyps; //store completed hyps
-
- NgramScores ngramScores;//ngram scores for each hyp
-
- for (size_t i = 1; i < connectedHyp.size(); ++i) {
- const Hypothesis* currHyp = connectedHyp[i];
- if (currHyp->GetWordsBitmap().IsComplete()) {
- finalHyps.insert(currHyp);
- }
-
- VERBOSE(3, "Processing hyp: " << currHyp->GetId() << ", num words cov= " << currHyp->GetWordsBitmap().GetNumWordsCovered() << endl)
-
- vector <Edge> & edges = incomingEdges[currHyp];
- for (size_t e = 0; e < edges.size(); ++e) {
- const Edge& edge = edges[e];
- if (forwardScore.find(currHyp) == forwardScore.end()) {
- forwardScore[currHyp] = forwardScore[edge.GetTailNode()] + edge.GetScore();
- VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] = fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
- } else {
- forwardScore[currHyp] = log_sum(forwardScore[currHyp], forwardScore[edge.GetTailNode()] + edge.GetScore());
- VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] += fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
- }
- }
-
- //Process ngrams now
- for (size_t j =0 ; j < edges.size(); ++j) {
- Edge& edge = edges[j];
- const NgramHistory & incomingPhrases = edge.GetNgrams(incomingEdges);
-
- //let's first score ngrams introduced by this edge
- for (NgramHistory::const_iterator it = incomingPhrases.begin(); it != incomingPhrases.end(); ++it) {
- const Phrase& ngram = it->first;
- const PathCounts& pathCounts = it->second;
- VERBOSE(4, "Calculating score for: " << it->first << endl)
-
- for (PathCounts::const_iterator pathCountIt = pathCounts.begin(); pathCountIt != pathCounts.end(); ++pathCountIt) {
- //Score of an n-gram is forward score of head node of leftmost edge + all edge scores
- const Path& path = pathCountIt->first;
- //cerr << "path count for " << ngram << " is " << pathCountIt->second << endl;
- float score = forwardScore[path[0]->GetTailNode()];
- for (size_t i = 0; i < path.size(); ++i) {
- score += path[i]->GetScore();
- }
- //if we're doing expectations, then the number of times the ngram
- //appears on the path is relevant.
- size_t count = posteriors ? 1 : pathCountIt->second;
- for (size_t k = 0; k < count; ++k) {
- ngramScores.addScore(currHyp,ngram,score);
- }
- }
- }
-
- //Now score ngrams that are just being propagated from the history
- for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(edge.GetTailNode());
- it != ngramScores.nodeEnd(edge.GetTailNode()); ++it) {
- const Phrase & currNgram = *(it->first);
- float currNgramScore = it->second;
- VERBOSE(4, "Calculating score for: " << currNgram << endl)
-
- // For posteriors, don't double count ngrams
- if (!posteriors || incomingPhrases.find(currNgram) == incomingPhrases.end()) {
- float score = edge.GetScore() + currNgramScore;
- ngramScores.addScore(currHyp,currNgram,score);
- }
- }
-
- }
- }
-
- float Z = 9999999; //the total score of the lattice
-
- //Done - Print out ngram posteriors for final hyps
- for (set< const Hypothesis *>::iterator finalHyp = finalHyps.begin(); finalHyp != finalHyps.end(); ++finalHyp) {
- const Hypothesis* hyp = *finalHyp;
-
- for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(hyp); it != ngramScores.nodeEnd(hyp); ++it) {
- const Phrase& ngram = *(it->first);
- if (finalNgramScores.find(ngram) == finalNgramScores.end()) {
- finalNgramScores[ngram] = it->second;
- } else {
- finalNgramScores[ngram] = log_sum(it->second, finalNgramScores[ngram]);
- }
- }
-
- if (Z == 9999999) {
- Z = forwardScore[hyp];
- } else {
- Z = log_sum(Z, forwardScore[hyp]);
- }
- }
-
- //Z *= scale; //scale the score
-
- for (map<Phrase, float>::iterator finalScoresIt = finalNgramScores.begin(); finalScoresIt != finalNgramScores.end(); ++finalScoresIt) {
- finalScoresIt->second = finalScoresIt->second - Z;
- IFVERBOSE(2) {
- VERBOSE(2,finalScoresIt->first << " [" << finalScoresIt->second << "]" << endl);
- }
- }
-
-}
-
-const NgramHistory& Edge::GetNgrams(map<const Hypothesis*, vector<Edge> > & incomingEdges)
-{
-
- if (m_ngrams.size() > 0)
- return m_ngrams;
-
- const Phrase& currPhrase = GetWords();
- //Extract the n-grams local to this edge
- for (size_t start = 0; start < currPhrase.GetSize(); ++start) {
- for (size_t end = start; end < start + bleu_order; ++end) {
- if (end < currPhrase.GetSize()) {
- Phrase edgeNgram(end-start+1);
- for (size_t index = start; index <= end; ++index) {
- edgeNgram.AddWord(currPhrase.GetWord(index));
- }
- //cout << "Inserting Phrase : " << edgeNgram << endl;
- vector<const Edge*> edgeHistory;
- edgeHistory.push_back(this);
- storeNgramHistory(edgeNgram, edgeHistory);
- } else {
- break;
- }
- }
- }
-
- map<const Hypothesis*, vector<Edge> >::iterator it = incomingEdges.find(m_tailNode);
- if (it != incomingEdges.end()) { //node has incoming edges
- vector<Edge> & inEdges = it->second;
-
- for (vector<Edge>::iterator edge = inEdges.begin(); edge != inEdges.end(); ++edge) {//add the ngrams straddling prev and curr edge
- const NgramHistory & edgeIncomingNgrams = edge->GetNgrams(incomingEdges);
- for (NgramHistory::const_iterator edgeInNgramHist = edgeIncomingNgrams.begin(); edgeInNgramHist != edgeIncomingNgrams.end(); ++edgeInNgramHist) {
- const Phrase& edgeIncomingNgram = edgeInNgramHist->first;
- const PathCounts & edgeIncomingNgramPaths = edgeInNgramHist->second;
- size_t back = min(edgeIncomingNgram.GetSize(), edge->GetWordsSize());
- const Phrase& edgeWords = edge->GetWords();
- IFVERBOSE(3) {
- cerr << "Edge: "<< *edge <<endl;
- cerr << "edgeWords: " << edgeWords << endl;
- cerr << "edgeInNgram: " << edgeIncomingNgram << endl;
- }
-
- Phrase edgeSuffix(ARRAY_SIZE_INCR);
- Phrase ngramSuffix(ARRAY_SIZE_INCR);
- GetPhraseSuffix(edgeWords,back,edgeSuffix);
- GetPhraseSuffix(edgeIncomingNgram,back,ngramSuffix);
-
- if (ngramSuffix == edgeSuffix) { //we've got the suffix of previous edge
- size_t edgeInNgramSize = edgeIncomingNgram.GetSize();
-
- for (size_t i = 0; i < GetWordsSize() && i + edgeInNgramSize < bleu_order ; ++i) {
- Phrase newNgram(edgeIncomingNgram);
- for (size_t j = 0; j <= i ; ++j) {
- newNgram.AddWord(GetWords().GetWord(j));
- }
- VERBOSE(3, "Inserting New Phrase : " << newNgram << endl)
-
- for (PathCounts::const_iterator pathIt = edgeIncomingNgramPaths.begin(); pathIt != edgeIncomingNgramPaths.end(); ++pathIt) {
- Path newNgramPath = pathIt->first;
- newNgramPath.push_back(this);
- storeNgramHistory(newNgram, newNgramPath, pathIt->second);
- }
- }
- }
- }
- }
- }
- return m_ngrams;
-}
-
-//Add the last lastN words of origPhrase to targetPhrase
-void Edge::GetPhraseSuffix(const Phrase& origPhrase, size_t lastN, Phrase& targetPhrase) const
-{
- size_t origSize = origPhrase.GetSize();
- size_t startIndex = origSize - lastN;
- for (size_t index = startIndex; index < origPhrase.GetSize(); ++index) {
- targetPhrase.AddWord(origPhrase.GetWord(index));
- }
-}
-
-bool Edge::operator< (const Edge& compare ) const
-{
- if (m_headNode->GetId() < compare.m_headNode->GetId())
- return true;
- if (compare.m_headNode->GetId() < m_headNode->GetId())
- return false;
- if (m_tailNode->GetId() < compare.m_tailNode->GetId())
- return true;
- if (compare.m_tailNode->GetId() < m_tailNode->GetId())
- return false;
- return GetScore() < compare.GetScore();
-}
-
-ostream& operator<< (ostream& out, const Edge& edge)
-{
- out << "Head: " << edge.m_headNode->GetId() << ", Tail: " << edge.m_tailNode->GetId() << ", Score: " << edge.m_score << ", Phrase: " << edge.m_targetPhrase << endl;
- return out;
-}
-
-bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b)
-{
- return a->GetWordsBitmap().GetNumWordsCovered() < b->GetWordsBitmap().GetNumWordsCovered();
-}
-
-void getLatticeMBRNBest(Manager& manager, TrellisPathList& nBestList,
- vector<LatticeMBRSolution>& solutions, size_t n)
-{
- const StaticData& staticData = StaticData::Instance();
- std::map < int, bool > connected;
- std::vector< const Hypothesis *> connectedList;
- map<Phrase, float> ngramPosteriors;
- std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
- map<const Hypothesis*, vector<Edge> > incomingEdges;
- vector< float> estimatedScores;
- manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
- pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
- calcNgramExpectations(connectedList, incomingEdges, ngramPosteriors,true);
-
- vector<float> mbrThetas = staticData.GetLatticeMBRThetas();
- float p = staticData.GetLatticeMBRPrecision();
- float r = staticData.GetLatticeMBRPRatio();
- float mapWeight = staticData.GetLatticeMBRMapWeight();
- if (mbrThetas.size() == 0) { //thetas not specified on the command line, use p and r instead
- mbrThetas.push_back(-1); //Theta 0
- mbrThetas.push_back(1/(bleu_order*p));
- for (size_t i = 2; i <= bleu_order; ++i) {
- mbrThetas.push_back(mbrThetas[i-1] / r);
- }
- }
- IFVERBOSE(2) {
- VERBOSE(2,"Thetas: ");
- for (size_t i = 0; i < mbrThetas.size(); ++i) {
- VERBOSE(2,mbrThetas[i] << " ");
- }
- VERBOSE(2,endl);
- }
- TrellisPathList::const_iterator iter;
- size_t ctr = 0;
- LatticeMBRSolutionComparator comparator;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter, ++ctr) {
- const TrellisPath &path = **iter;
- solutions.push_back(LatticeMBRSolution(path,iter==nBestList.begin()));
- solutions.back().CalcScore(ngramPosteriors,mbrThetas,mapWeight);
- sort(solutions.begin(), solutions.end(), comparator);
- while (solutions.size() > n) {
- solutions.pop_back();
- }
- }
- VERBOSE(2,"LMBR Score: " << solutions[0].GetScore() << endl);
-}
-
-vector<Word> doLatticeMBR(Manager& manager, TrellisPathList& nBestList)
-{
-
- vector<LatticeMBRSolution> solutions;
- getLatticeMBRNBest(manager, nBestList, solutions,1);
- return solutions.at(0).GetWords();
-}
-
-const TrellisPath doConsensusDecoding(Manager& manager, TrellisPathList& nBestList)
-{
- static const int BLEU_ORDER = 4;
- static const float SMOOTH = 1;
-
- //calculate the ngram expectations
- const StaticData& staticData = StaticData::Instance();
- std::map < int, bool > connected;
- std::vector< const Hypothesis *> connectedList;
- map<Phrase, float> ngramExpectations;
- std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
- map<const Hypothesis*, vector<Edge> > incomingEdges;
- vector< float> estimatedScores;
- manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
- pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
- calcNgramExpectations(connectedList, incomingEdges, ngramExpectations,false);
-
- //expected length is sum of expected unigram counts
- //cerr << "Thread " << pthread_self() << " Ngram expectations size: " << ngramExpectations.size() << endl;
- float ref_length = 0.0f;
- for (map<Phrase,float>::const_iterator ref_iter = ngramExpectations.begin();
- ref_iter != ngramExpectations.end(); ++ref_iter) {
- //cerr << "Ngram: " << ref_iter->first << " score: " <<
- // ref_iter->second << endl;
- if (ref_iter->first.GetSize() == 1) {
- ref_length += exp(ref_iter->second);
- // cerr << "Expected for " << ref_iter->first << " is " << exp(ref_iter->second) << endl;
- }
- }
-
- VERBOSE(2,"REF Length: " << ref_length << endl);
-
- //use the ngram expectations to rescore the nbest list.
- TrellisPathList::const_iterator iter;
- TrellisPathList::const_iterator best = nBestList.end();
- float bestScore = -100000;
- //cerr << "nbest list size: " << nBestList.GetSize() << endl;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const TrellisPath &path = **iter;
- vector<Word> words;
- map<Phrase,int> ngrams;
- GetOutputWords(path,words);
- /*for (size_t i = 0; i < words.size(); ++i) {
- cerr << words[i].GetFactor(0)->GetString() << " ";
- }
- cerr << endl;
- */
- extract_ngrams(words,ngrams);
-
- vector<float> comps(2*BLEU_ORDER+1);
- float logbleu = 0.0;
- float brevity = 0.0;
- int hyp_length = words.size();
- for (int i = 0; i < BLEU_ORDER; ++i) {
- comps[2*i] = 0.0;
- comps[2*i+1] = max(hyp_length-i,0);
- }
-
- for (map<Phrase,int>::const_iterator hyp_iter = ngrams.begin();
- hyp_iter != ngrams.end(); ++hyp_iter) {
- map<Phrase,float>::const_iterator ref_iter = ngramExpectations.find(hyp_iter->first);
- if (ref_iter != ngramExpectations.end()) {
- comps[2*(hyp_iter->first.GetSize()-1)] += min(exp(ref_iter->second), (float)(hyp_iter->second));
- }
-
- }
- comps[comps.size()-1] = ref_length;
- /*for (size_t i = 0; i < comps.size(); ++i) {
- cerr << comps[i] << " ";
- }
- cerr << endl;
- */
-
- float score = 0.0f;
- if (comps[0] != 0) {
- for (int i=0; i<BLEU_ORDER; i++) {
- if ( i > 0 ) {
- logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
- } else {
- logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
- }
- }
- logbleu /= BLEU_ORDER;
- brevity = 1.0-(float)comps[comps.size()-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
- if (brevity < 0.0) {
- logbleu += brevity;
- }
- score = exp(logbleu);
- }
-
- //cerr << "score: " << score << " bestScore: " << bestScore << endl;
- if (score > bestScore) {
- bestScore = score;
- best = iter;
- VERBOSE(2,"NEW BEST: " << score << endl);
- //for (size_t i = 0; i < comps.size(); ++i) {
- // cerr << comps[i] << " ";
- //}
- //cerr << endl;
- }
- }
-
- assert (best != nBestList.end());
- return **best;
- //vector<Word> bestWords;
- //GetOutputWords(**best,bestWords);
- //return bestWords;
-}
-
-}
-
-
diff --git a/moses-cmd/LatticeMBR.h b/moses-cmd/LatticeMBR.h
deleted file mode 100644
index ab8b3cb76..000000000
--- a/moses-cmd/LatticeMBR.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * LatticeMBR.h
- * moses-cmd
- *
- * Created by Abhishek Arun on 26/01/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-
-#ifndef moses_cmd_LatticeMBR_h
-#define moses_cmd_LatticeMBR_h
-
-#include <map>
-#include <vector>
-#include <set>
-#include "moses/Hypothesis.h"
-#include "moses/Manager.h"
-#include "moses/TrellisPathList.h"
-
-
-
-namespace MosesCmd
-{
-
-class Edge;
-
-typedef std::vector< const Moses::Hypothesis *> Lattice;
-typedef std::vector<const Edge*> Path;
-typedef std::map<Path, size_t> PathCounts;
-typedef std::map<Moses::Phrase, PathCounts > NgramHistory;
-
-class Edge
-{
- const Moses::Hypothesis* m_tailNode;
- const Moses::Hypothesis* m_headNode;
- float m_score;
- Moses::TargetPhrase m_targetPhrase;
- NgramHistory m_ngrams;
-
-public:
- Edge(const Moses::Hypothesis* from, const Moses::Hypothesis* to, float score, const Moses::TargetPhrase& targetPhrase) : m_tailNode(from), m_headNode(to), m_score(score), m_targetPhrase(targetPhrase) {
- //cout << "Creating new edge from Node " << from->GetId() << ", to Node : " << to->GetId() << ", score: " << score << " phrase: " << targetPhrase << endl;
- }
-
- const Moses::Hypothesis* GetHeadNode() const {
- return m_headNode;
- }
-
- const Moses::Hypothesis* GetTailNode() const {
- return m_tailNode;
- }
-
- float GetScore() const {
- return m_score;
- }
-
- size_t GetWordsSize() const {
- return m_targetPhrase.GetSize();
- }
-
- const Moses::Phrase& GetWords() const {
- return m_targetPhrase;
- }
-
- friend std::ostream& operator<< (std::ostream& out, const Edge& edge);
-
- const NgramHistory& GetNgrams( std::map<const Moses::Hypothesis*, std::vector<Edge> > & incomingEdges) ;
-
- bool operator < (const Edge & compare) const;
-
- void GetPhraseSuffix(const Moses::Phrase& origPhrase, size_t lastN, Moses::Phrase& targetPhrase) const;
-
- void storeNgramHistory(const Moses::Phrase& phrase, Path & path, size_t count = 1) {
- m_ngrams[phrase][path]+= count;
- }
-
-};
-
-/**
-* Data structure to hold the ngram scores as we traverse the lattice. Maps (hypo,ngram) to score
-*/
-class NgramScores
-{
-public:
- NgramScores() {}
-
- /** logsum this score to the existing score */
- void addScore(const Moses::Hypothesis* node, const Moses::Phrase& ngram, float score);
-
- /** Iterate through ngrams for selected node */
- typedef std::map<const Moses::Phrase*, float>::const_iterator NodeScoreIterator;
- NodeScoreIterator nodeBegin(const Moses::Hypothesis* node);
- NodeScoreIterator nodeEnd(const Moses::Hypothesis* node);
-
-private:
- std::set<Moses::Phrase> m_ngrams;
- std::map<const Moses::Hypothesis*, std::map<const Moses::Phrase*, float> > m_scores;
-};
-
-
-/** Holds a lattice mbr solution, and its scores */
-class LatticeMBRSolution
-{
-public:
- /** Read the words from the path */
- LatticeMBRSolution(const Moses::TrellisPath& path, bool isMap);
- const std::vector<float>& GetNgramScores() const {
- return m_ngramScores;
- }
- const std::vector<Moses::Word>& GetWords() const {
- return m_words;
- }
- float GetMapScore() const {
- return m_mapScore;
- }
- float GetScore() const {
- return m_score;
- }
-
- /** Initialise ngram scores */
- void CalcScore(std::map<Moses::Phrase, float>& finalNgramScores, const std::vector<float>& thetas, float mapWeight);
-
-private:
- std::vector<Moses::Word> m_words;
- float m_mapScore;
- std::vector<float> m_ngramScores;
- float m_score;
-};
-
-struct LatticeMBRSolutionComparator {
- bool operator()(const LatticeMBRSolution& a, const LatticeMBRSolution& b) {
- return a.GetScore() > b.GetScore();
- }
-};
-
-void pruneLatticeFB(Lattice & connectedHyp, std::map < const Moses::Hypothesis*, std::set <const Moses::Hypothesis* > > & outgoingHyps, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges,
- const std::vector< float> & estimatedScores, const Moses::Hypothesis*, size_t edgeDensity,float scale);
-
-//Use the ngram scores to rerank the nbest list, return at most n solutions
-void getLatticeMBRNBest(Moses::Manager& manager, Moses::TrellisPathList& nBestList, std::vector<LatticeMBRSolution>& solutions, size_t n);
-//calculate expected ngram counts, clipping at 1 (i.e. calculating posteriors) if posteriors==true.
-void calcNgramExpectations(Lattice & connectedHyp, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges, std::map<Moses::Phrase,
- float>& finalNgramScores, bool posteriors);
-void GetOutputFactors(const Moses::TrellisPath &path, std::vector <Moses::Word> &translation);
-void extract_ngrams(const std::vector<Moses::Word >& sentence, std::map < Moses::Phrase, int > & allngrams);
-bool ascendingCoverageCmp(const Moses::Hypothesis* a, const Moses::Hypothesis* b);
-std::vector<Moses::Word> doLatticeMBR(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
-const Moses::TrellisPath doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
-//std::vector<Moses::Word> doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
-
-}
-
-#endif
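
NgramScores::addScore above is documented as log-summing a new score into the existing one. The stable log-domain addition it relies on looks like the following sketch; Moses ships its own helper for this, so the name logAdd here is purely illustrative:

    #include <algorithm>
    #include <cmath>

    // Stable log(exp(a) + exp(b)): factor out the larger exponent so the
    // remaining exp() argument is <= 0 and cannot overflow.
    float logAdd(float a, float b)
    {
      if (a < b) std::swap(a, b);
      return a + std::log1p(std::exp(b - a));
    }

Repeated addScore calls for the same (hypothesis, ngram) key then accumulate probability mass instead of overwriting it.
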
diff --git a/moses-cmd/LatticeMBRGrid.cpp b/moses-cmd/LatticeMBRGrid.cpp
index 39d88f34d..9b2ee167c 100644
--- a/moses-cmd/LatticeMBRGrid.cpp
+++ b/moses-cmd/LatticeMBRGrid.cpp
@@ -46,8 +46,8 @@ POSSIBILITY OF SUCH DAMAGE.
#include <stdexcept>
#include <set>
-#include "IOWrapper.h"
-#include "LatticeMBR.h"
+#include "moses/IOWrapper.h"
+#include "moses/LatticeMBR.h"
#include "moses/Manager.h"
#include "moses/StaticData.h"
#include "util/exception.hh"
@@ -55,12 +55,11 @@ POSSIBILITY OF SUCH DAMAGE.
using namespace std;
using namespace Moses;
-using namespace MosesCmd;
//keys
enum gridkey {lmbr_p,lmbr_r,lmbr_prune,lmbr_scale};
-namespace MosesCmd
+namespace Moses
{
class Grid
@@ -159,8 +158,8 @@ int main(int argc, char* argv[])
StaticData& staticData = const_cast<StaticData&>(StaticData::Instance());
staticData.SetUseLatticeMBR(true);
- IOWrapper* ioWrapper = GetIOWrapper(staticData);
+ IOWrapper* ioWrapper = new IOWrapper();
if (!ioWrapper) {
throw runtime_error("Failed to initialise IOWrapper");
}
@@ -178,11 +177,12 @@ int main(int argc, char* argv[])
const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
const vector<float>& scale_grid = grid.getGrid(lmbr_scale);
- while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+ while(ioWrapper->ReadInput(staticData.GetInputType(),source)) {
++lineCount;
- Sentence sentence;
- Manager manager(lineCount, *source, staticData.GetSearchAlgorithm());
- manager.ProcessSentence();
+ source->SetTranslationId(lineCount);
+
+ Manager manager(*source, staticData.GetSearchAlgorithm());
+ manager.Decode();
TrellisPathList nBestList;
manager.CalcNBest(nBestSize, nBestList,true);
//grid search
@@ -200,7 +200,7 @@ int main(int argc, char* argv[])
staticData.SetMBRScale(scale);
cout << lineCount << " ||| " << p << " " << r << " " << prune << " " << scale << " ||| ";
vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
- OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
+ ioWrapper->OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
staticData.GetReportAllFactors(),cout);
}
}
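
Taken together, these hunks leave the per-sentence loop of LatticeMBRGrid.cpp in the following shape (condensed from the + lines above; a sketch, not the full file):

    while (ioWrapper->ReadInput(staticData.GetInputType(), source)) {
      ++lineCount;
      source->SetTranslationId(lineCount);
      Manager manager(*source, staticData.GetSearchAlgorithm());
      manager.Decode();                          // replaces manager.ProcessSentence()
      TrellisPathList nBestList;
      manager.CalcNBest(nBestSize, nBestList, true);
      // ... grid search over (p, r, prune, scale), each setting staticData and
      // calling doLatticeMBR(manager, nBestList), then ioWrapper->OutputBestHypo(...)
    }
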
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index c931ea3dc..03b3a5054 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -22,14 +22,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/**
* Moses main, for single-threaded and multi-threaded.
**/
-
-#include <boost/algorithm/string/predicate.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/iostreams/device/file.hpp>
-#include <boost/iostreams/filter/bzip2.hpp>
-#include <boost/iostreams/filter/gzip.hpp>
-#include <boost/iostreams/filtering_stream.hpp>
-
#include <exception>
#include <fstream>
#include <sstream>
@@ -42,537 +34,39 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//#include <vld.h>
#endif
-#include "TranslationAnalysis.h"
-#include "IOWrapper.h"
-#include "mbr.h"
-
+#include "moses/IOWrapper.h"
#include "moses/Hypothesis.h"
#include "moses/Manager.h"
#include "moses/StaticData.h"
+#include "moses/TypeDef.h"
#include "moses/Util.h"
#include "moses/Timer.h"
-#include "moses/ThreadPool.h"
-#include "moses/OutputCollector.h"
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/TranslationTask.h"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
#endif
-using namespace std;
-using namespace Moses;
-using namespace MosesCmd;
-
-namespace MosesCmd
-{
-// output floats with three decimal places
-static const size_t PRECISION = 3;
-
-/** Enforce rounding */
-void fix(std::ostream& stream, size_t size)
-{
- stream.setf(std::ios::fixed);
- stream.precision(size);
-}
-
-/** Translates a sentence.
- * - calls the search (Manager)
- * - applies the decision rule
- * - outputs best translation and additional reporting
- **/
-class TranslationTask : public Task
-{
-
-public:
-
- TranslationTask(size_t lineNumber,
- InputType* source, OutputCollector* outputCollector, OutputCollector* nbestCollector,
- OutputCollector* latticeSamplesCollector,
- OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
- OutputCollector* detailedTranslationCollector,
- OutputCollector* alignmentInfoCollector,
- OutputCollector* unknownsCollector,
- bool outputSearchGraphSLF,
- bool outputSearchGraphHypergraph) :
- m_source(source), m_lineNumber(lineNumber),
- m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
- m_latticeSamplesCollector(latticeSamplesCollector),
- m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
- m_detailedTranslationCollector(detailedTranslationCollector),
- m_alignmentInfoCollector(alignmentInfoCollector),
- m_unknownsCollector(unknownsCollector),
- m_outputSearchGraphSLF(outputSearchGraphSLF),
- m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}
-
- /** Translate one sentence
- * gets called by main function implemented at end of this source file */
- void Run() {
- // shorthand for "global data"
- const StaticData &staticData = StaticData::Instance();
-
- // input sentence
- Sentence sentence;
-
- // report wall time spent on translation
- Timer translationTime;
- translationTime.start();
-
- // report thread number
-#if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
- TRACE_ERR("Translating line " << m_lineNumber << " in thread id " << pthread_self() << std::endl);
-#endif
-
-
- // execute the translation
- // note: this executes the search, resulting in a search graph
- // we still need to apply the decision rule (MAP, MBR, ...)
- Timer initTime;
- initTime.start();
- Manager manager(m_lineNumber, *m_source,staticData.GetSearchAlgorithm());
- VERBOSE(1, "Line " << m_lineNumber << ": Initialize search took " << initTime << " seconds total" << endl);
- manager.ProcessSentence();
-
- // we are done with search, let's look what we got
- Timer additionalReportingTime;
- additionalReportingTime.start();
-
- // output word graph
- if (m_wordGraphCollector) {
- ostringstream out;
- fix(out,PRECISION);
- manager.GetWordGraph(m_lineNumber, out);
- m_wordGraphCollector->Write(m_lineNumber, out.str());
- }
-
- // output search graph
- if (m_searchGraphCollector) {
- ostringstream out;
- fix(out,PRECISION);
- manager.OutputSearchGraph(m_lineNumber, out);
- m_searchGraphCollector->Write(m_lineNumber, out.str());
-
-#ifdef HAVE_PROTOBUF
- if (staticData.GetOutputSearchGraphPB()) {
- ostringstream sfn;
- sfn << staticData.GetParam("output-search-graph-pb")[0] << '/' << m_lineNumber << ".pb" << ends;
- string fn = sfn.str();
- VERBOSE(2, "Writing search graph to " << fn << endl);
- fstream output(fn.c_str(), ios::trunc | ios::binary | ios::out);
- manager.SerializeSearchGraphPB(m_lineNumber, output);
- }
+#ifdef PT_UG
+#include <boost/foreach.hpp>
+#include "moses/TranslationModel/UG/mmsapt.h"
+#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
#endif
- }
-
- // Output search graph in HTK standard lattice format (SLF)
- if (m_outputSearchGraphSLF) {
- stringstream fileName;
- fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
- std::ofstream *file = new std::ofstream;
- file->open(fileName.str().c_str());
- if (file->is_open() && file->good()) {
- ostringstream out;
- fix(out,PRECISION);
- manager.OutputSearchGraphAsSLF(m_lineNumber, out);
- *file << out.str();
- file -> flush();
- } else {
- TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
- }
- delete file;
- }
-
- // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
- if (m_outputSearchGraphHypergraph) {
-
- vector<string> hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph");
-
- bool appendSuffix;
- if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") {
- appendSuffix = true;
- } else {
- appendSuffix = false;
- }
-
- string compression;
- if (hypergraphParameters.size() > 1) {
- compression = hypergraphParameters[1];
- } else {
- compression = "txt";
- }
-
- string hypergraphDir;
- if ( hypergraphParameters.size() > 2 ) {
- hypergraphDir = hypergraphParameters[2];
- } else {
- string nbestFile = staticData.GetNBestFilePath();
- if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
- boost::filesystem::path nbestPath(nbestFile);
-
- // In the Boost filesystem API version 2,
- // which was the default prior to Boost 1.46,
- // the filename() method returned a string.
- //
- // In the Boost filesystem API version 3,
- // which is the default starting with Boost 1.46,
- // the filename() method returns a path object.
- //
- // To get a string from the path object,
- // the native() method must be called.
- // hypergraphDir = nbestPath.parent_path().filename()
- //#if BOOST_VERSION >= 104600
- // .native()
- //#endif
- //;
-
- // Hopefully the following compiles under all versions of Boost.
- //
- // If this line gives you compile errors,
- // contact Lane Schwartz on the Moses mailing list
- hypergraphDir = nbestPath.parent_path().string();
-
- } else {
- stringstream hypergraphDirName;
- hypergraphDirName << boost::filesystem::current_path().string() << "/hypergraph";
- hypergraphDir = hypergraphDirName.str();
- }
- }
-
- if ( ! boost::filesystem::exists(hypergraphDir) ) {
- boost::filesystem::create_directory(hypergraphDir);
- }
-
- if ( ! boost::filesystem::exists(hypergraphDir) ) {
- TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl);
- } else if ( ! boost::filesystem::is_directory(hypergraphDir) ) {
- TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl);
- } else {
- stringstream fileName;
- fileName << hypergraphDir << "/" << m_lineNumber;
- if ( appendSuffix ) {
- fileName << "." << compression;
- }
- boost::iostreams::filtering_ostream *file
- = new boost::iostreams::filtering_ostream;
-
- if ( compression == "gz" ) {
- file->push( boost::iostreams::gzip_compressor() );
- } else if ( compression == "bz2" ) {
- file->push( boost::iostreams::bzip2_compressor() );
- } else if ( compression != "txt" ) {
- TRACE_ERR("Unrecognized hypergraph compression format ("
- << compression
- << ") - using uncompressed plain txt" << std::endl);
- compression = "txt";
- }
-
- file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
-
- if (file->is_complete() && file->good()) {
- fix(*file,PRECISION);
- manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
- file -> flush();
- } else {
- TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber
- << " because the output file " << fileName.str()
- << " is not open or not ready for writing"
- << std::endl);
- }
- file -> pop();
- delete file;
- }
- }
- additionalReportingTime.stop();
-
- // apply decision rule and output best translation(s)
- if (m_outputCollector) {
- ostringstream out;
- ostringstream debug;
- fix(debug,PRECISION);
-
- // all derivations - send them to debug stream
- if (staticData.PrintAllDerivations()) {
- additionalReportingTime.start();
- manager.PrintAllDerivations(m_lineNumber, debug);
- additionalReportingTime.stop();
- }
-
- Timer decisionRuleTime;
- decisionRuleTime.start();
-
- // MAP decoding: best hypothesis
- const Hypothesis* bestHypo = NULL;
- if (!staticData.UseMBR()) {
- bestHypo = manager.GetBestHypothesis();
- if (bestHypo) {
- if (staticData.GetOutputHypoScore()) {
- out << bestHypo->GetTotalScore() << ' ';
- }
- if (staticData.IsPathRecoveryEnabled()) {
- OutputInput(out, bestHypo);
- out << "||| ";
- }
- if (staticData.IsIDEnabled()) {
- out << m_source->GetTranslationId() << " ";
- }
- if (staticData.IsPassthroughEnabled()) {
- OutputPassthroughInformation(out, bestHypo);
- }
-
- if (staticData.GetReportSegmentation() == 2) {
- manager.GetOutputLanguageModelOrder(out, bestHypo);
- }
- OutputBestSurface(
- out,
- bestHypo,
- staticData.GetOutputFactorOrder(),
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors());
- if (staticData.PrintAlignmentInfo()) {
- out << "||| ";
- OutputAlignment(out, bestHypo);
- }
-
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
- IFVERBOSE(1) {
- debug << "BEST TRANSLATION: " << *bestHypo << endl;
- }
- } else {
- VERBOSE(1, "NO BEST TRANSLATION" << endl);
- }
-
- out << endl;
- }
-
- // MBR decoding (n-best MBR, lattice MBR, consensus)
- else {
- // we first need the n-best translations
- size_t nBestSize = staticData.GetMBRSize();
- if (nBestSize <= 0) {
- cerr << "ERROR: size of the MBR candidate list must be positive (option mbr-size)" << endl;
- exit(1);
- }
- TrellisPathList nBestList;
- manager.CalcNBest(nBestSize, nBestList,true);
- VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
- IFVERBOSE(2) {
- PrintUserTime("calculated n-best list for (L)MBR decoding");
- }
-
- // lattice MBR
- if (staticData.UseLatticeMBR()) {
- if (m_nbestCollector) {
- //lattice mbr nbest
- vector<LatticeMBRSolution> solutions;
- size_t n = min(nBestSize, staticData.GetNBestSize());
- getLatticeMBRNBest(manager,nBestList,solutions,n);
- ostringstream out;
- OutputLatticeMBRNBest(out, solutions,m_lineNumber);
- m_nbestCollector->Write(m_lineNumber, out.str());
- } else {
- //Lattice MBR decoding
- vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
- OutputBestHypo(mbrBestHypo, m_lineNumber, staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- IFVERBOSE(2) {
- PrintUserTime("finished Lattice MBR decoding");
- }
- }
- }
-
- // consensus decoding
- else if (staticData.UseConsensusDecoding()) {
- const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
- OutputBestHypo(conBestHypo, m_lineNumber,
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, conBestHypo);
- IFVERBOSE(2) {
- PrintUserTime("finished Consensus decoding");
- }
- }
-
- // n-best MBR decoding
- else {
- const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
- OutputBestHypo(mbrBestHypo, m_lineNumber,
- staticData.GetReportSegmentation(),
- staticData.GetReportAllFactors(),out);
- OutputAlignment(m_alignmentInfoCollector, m_lineNumber, mbrBestHypo);
- IFVERBOSE(2) {
- PrintUserTime("finished MBR decoding");
- }
- }
- }
-
- // report best translation to output collector
- m_outputCollector->Write(m_lineNumber,out.str(),debug.str());
- decisionRuleTime.stop();
- VERBOSE(1, "Line " << m_lineNumber << ": Decision rule took " << decisionRuleTime << " seconds total" << endl);
- }
-
- additionalReportingTime.start();
-
- // output n-best list
- if (m_nbestCollector && !staticData.UseLatticeMBR()) {
- TrellisPathList nBestList;
- ostringstream out;
- manager.CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest());
- OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
- m_nbestCollector->Write(m_lineNumber, out.str());
- }
-
- //lattice samples
- if (m_latticeSamplesCollector) {
- TrellisPathList latticeSamples;
- ostringstream out;
- manager.CalcLatticeSamples(staticData.GetLatticeSamplesSize(), latticeSamples);
- OutputNBest(out,latticeSamples, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
- m_latticeSamplesCollector->Write(m_lineNumber, out.str());
- }
-
- // detailed translation reporting
- if (m_detailedTranslationCollector) {
- ostringstream out;
- fix(out,PRECISION);
- TranslationAnalysis::PrintTranslationAnalysis(out, manager.GetBestHypothesis());
- m_detailedTranslationCollector->Write(m_lineNumber,out.str());
- }
-
- //list of unknown words
- if (m_unknownsCollector) {
- const vector<const Phrase*>& unknowns = manager.getSntTranslationOptions()->GetUnknownSources();
- ostringstream out;
- for (size_t i = 0; i < unknowns.size(); ++i) {
- out << *(unknowns[i]);
- }
- out << endl;
- m_unknownsCollector->Write(m_lineNumber, out.str());
- }
-
- // report additional statistics
- manager.CalcDecoderStatistics();
- VERBOSE(1, "Line " << m_lineNumber << ": Additional reporting took " << additionalReportingTime << " seconds total" << endl);
- VERBOSE(1, "Line " << m_lineNumber << ": Translation took " << translationTime << " seconds total" << endl);
- IFVERBOSE(2) {
- PrintUserTime("Sentence Decoding Time:");
- }
- }
-
- ~TranslationTask() {
- delete m_source;
- }
-
-private:
- InputType* m_source;
- size_t m_lineNumber;
- OutputCollector* m_outputCollector;
- OutputCollector* m_nbestCollector;
- OutputCollector* m_latticeSamplesCollector;
- OutputCollector* m_wordGraphCollector;
- OutputCollector* m_searchGraphCollector;
- OutputCollector* m_detailedTranslationCollector;
- OutputCollector* m_alignmentInfoCollector;
- OutputCollector* m_unknownsCollector;
- bool m_outputSearchGraphSLF;
- bool m_outputSearchGraphHypergraph;
- std::ofstream *m_alignmentStream;
-
-
-};
-
-static void PrintFeatureWeight(const FeatureFunction* ff)
-{
- cout << ff->GetScoreProducerDescription() << "=";
- size_t numScoreComps = ff->GetNumScoreComponents();
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- for (size_t i = 0; i < numScoreComps; ++i) {
- cout << " " << values[i];
- }
- cout << endl;
-}
-
-static void ShowWeights()
-{
- //TODO: Find a way of ensuring this order is synced with the nbest
- fix(cout,6);
- const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
-
- for (size_t i = 0; i < sff.size(); ++i) {
- const StatefulFeatureFunction *ff = sff[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- } else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- const StatelessFeatureFunction *ff = slf[i];
- if (ff->IsTuneable()) {
- PrintFeatureWeight(ff);
- } else {
- cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
- }
- }
-}
+using namespace std;
+using namespace Moses;
-size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream)
+namespace Moses
{
- size_t numScoreComps = ff->GetNumScoreComponents();
- if (numScoreComps != 0) {
- vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
- if (numScoreComps > 1) {
- for (size_t i = 0; i < numScoreComps; ++i) {
- outputSearchGraphStream << ff->GetScoreProducerDescription()
- << i
- << "=" << values[i] << endl;
- }
- } else {
- outputSearchGraphStream << ff->GetScoreProducerDescription()
- << "=" << values[0] << endl;
- }
- return index+numScoreComps;
- } else {
- UTIL_THROW2("Sparse features are not yet supported when outputting hypergraph format");
- }
-}
void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
{
outputSearchGraphStream.setf(std::ios::fixed);
outputSearchGraphStream.precision(6);
-
- const vector<const StatelessFeatureFunction*>& slf =StatelessFeatureFunction::GetStatelessFeatureFunctions();
- const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- size_t featureIndex = 1;
- for (size_t i = 0; i < sff.size(); ++i) {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
- }
- for (size_t i = 0; i < slf.size(); ++i) {
- /*
- if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
- slf[i]->GetScoreProducerWeightShortName() != "tm" &&
- slf[i]->GetScoreProducerWeightShortName() != "I" &&
- slf[i]->GetScoreProducerWeightShortName() != "g")
- */
- {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
- }
- }
- const vector<PhraseDictionary*>& pds = PhraseDictionary::GetColl();
- for( size_t i=0; i<pds.size(); i++ ) {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
- }
- const vector<GenerationDictionary*>& gds = GenerationDictionary::GetColl();
- for( size_t i=0; i<gds.size(); i++ ) {
- featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
- }
-
+ StaticData::Instance().GetAllWeights().Save(outputSearchGraphStream);
}
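
The deleted overload emitted one name=value line per score component, numbering the name when a feature has several components; the replacement delegates this to ScoreComponentCollection. A minimal sketch of the old emission format, for comparison (illustration only; emitWeights is not a repository function):

    #include <iostream>
    #include <string>
    #include <vector>

    // Multi-component features print "Name0=v0" .. "NameN-1=vN-1";
    // single-component features print just "Name=v", one line each.
    void emitWeights(const std::string& name, const std::vector<float>& v,
                     std::ostream& out)
    {
      if (v.size() == 1) { out << name << "=" << v[0] << "\n"; return; }
      for (size_t i = 0; i < v.size(); ++i)
        out << name << i << "=" << v[i] << "\n";
    }
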
@@ -586,7 +80,7 @@ int main(int argc, char** argv)
#ifdef HAVE_PROTOBUF
GOOGLE_PROTOBUF_VERIFY_VERSION;
#endif
-
+
// echo command line, if verbose
IFVERBOSE(1) {
TRACE_ERR("command: ");
@@ -595,8 +89,8 @@ int main(int argc, char** argv)
}
// set number of significant decimals in output
- fix(cout,PRECISION);
- fix(cerr,PRECISION);
+ FixPrecision(cout);
+ FixPrecision(cerr);
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
@@ -605,15 +99,13 @@ int main(int argc, char** argv)
exit(1);
}
- std::cerr <<"Before StaticData::LoadDataStatic" << std::endl;
+
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
if (!StaticData::LoadDataStatic(&params, argv[0])) {
exit(1);
}
- std::cerr <<"After StaticData::LoadDataStatic" << std::endl;
- std::cerr <<"Before ShowWeights" << std::endl;
// setting "-show-weights" -> just dump out weights and exit
if (params.isParamSpecified("show-weights")) {
ShowWeights();
@@ -628,8 +120,12 @@ int main(int argc, char** argv)
srand(time(NULL));
// set up read/writing class
- IOWrapper* ioWrapper = GetIOWrapper(staticData);
- if (!ioWrapper) {
+ IFVERBOSE(1) {
+ PrintUserTime("Created input-output object");
+ }
+
+ IOWrapper* ioWrapper = new IOWrapper();
+ if (ioWrapper == NULL) {
cerr << "Error; Failed to create IO object" << endl;
exit(1);
}
@@ -641,114 +137,6 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
- if (staticData.GetOutputSearchGraphHypergraph()) {
- ofstream* weightsOut = new std::ofstream;
- stringstream weightsFilename;
- if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) {
- weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3];
- } else {
- string nbestFile = staticData.GetNBestFilePath();
- if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
- boost::filesystem::path nbestPath(nbestFile);
- weightsFilename << nbestPath.parent_path().filename() << "/weights";
- } else {
- weightsFilename << boost::filesystem::current_path().string() << "/hypergraph/weights";
- }
- }
- boost::filesystem::path weightsFilePath(weightsFilename.str());
- if ( ! boost::filesystem::exists(weightsFilePath.parent_path()) ) {
- boost::filesystem::create_directory(weightsFilePath.parent_path());
- }
- TRACE_ERR("The weights file is " << weightsFilename.str() << "\n");
- weightsOut->open(weightsFilename.str().c_str());
- OutputFeatureWeightsForHypergraph(*weightsOut);
- weightsOut->flush();
- weightsOut->close();
- delete weightsOut;
- }
-
-
- // initialize output streams
- // note: we can't just write to STDOUT or files
- // because multithreading may return sentences in shuffled order
- auto_ptr<OutputCollector> outputCollector; // for translations
- auto_ptr<OutputCollector> nbestCollector; // for n-best lists
- auto_ptr<OutputCollector> latticeSamplesCollector; //for lattice samples
- auto_ptr<ofstream> nbestOut;
- auto_ptr<ofstream> latticeSamplesOut;
- size_t nbestSize = staticData.GetNBestSize();
- string nbestFile = staticData.GetNBestFilePath();
- bool output1best = true;
- if (nbestSize) {
- if (nbestFile == "-" || nbestFile == "/dev/stdout") {
- // nbest to stdout, no 1-best
- nbestCollector.reset(new OutputCollector());
- output1best = false;
- } else {
- // nbest to file, 1-best to stdout
- nbestOut.reset(new ofstream(nbestFile.c_str()));
- if (!nbestOut->good()) {
- TRACE_ERR("ERROR: Failed to open " << nbestFile << " for nbest lists" << endl);
- exit(1);
- }
- nbestCollector.reset(new OutputCollector(nbestOut.get()));
- }
- }
- size_t latticeSamplesSize = staticData.GetLatticeSamplesSize();
- string latticeSamplesFile = staticData.GetLatticeSamplesFilePath();
- if (latticeSamplesSize) {
- if (latticeSamplesFile == "-" || latticeSamplesFile == "/dev/stdout") {
- latticeSamplesCollector.reset(new OutputCollector());
- output1best = false;
- } else {
- latticeSamplesOut.reset(new ofstream(latticeSamplesFile.c_str()));
- if (!latticeSamplesOut->good()) {
- TRACE_ERR("ERROR: Failed to open " << latticeSamplesFile << " for lattice samples" << endl);
- exit(1);
- }
- latticeSamplesCollector.reset(new OutputCollector(latticeSamplesOut.get()));
- }
- }
- if (output1best) {
- outputCollector.reset(new OutputCollector());
- }
-
- // initialize stream for word graph (aka: output lattice)
- auto_ptr<OutputCollector> wordGraphCollector;
- if (staticData.GetOutputWordGraph()) {
- wordGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputWordGraphStream())));
- }
-
- // initialize stream for search graph
- // note: this is essentially the same as above, but in a different format
- auto_ptr<OutputCollector> searchGraphCollector;
- if (staticData.GetOutputSearchGraph()) {
- searchGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputSearchGraphStream())));
- }
-
- // initialize stream for details about the decoder run
- auto_ptr<OutputCollector> detailedTranslationCollector;
- if (staticData.IsDetailedTranslationReportingEnabled()) {
- detailedTranslationCollector.reset(new OutputCollector(&(ioWrapper->GetDetailedTranslationReportingStream())));
- }
-
- // initialize stream for word alignment between input and output
- auto_ptr<OutputCollector> alignmentInfoCollector;
- if (!staticData.GetAlignmentOutputFile().empty()) {
- alignmentInfoCollector.reset(new OutputCollector(ioWrapper->GetAlignmentOutputStream()));
- }
-
- //initialise stream for unknown (oov) words
- auto_ptr<OutputCollector> unknownsCollector;
- auto_ptr<ofstream> unknownsStream;
- if (!staticData.GetOutputUnknownsFile().empty()) {
- unknownsStream.reset(new ofstream(staticData.GetOutputUnknownsFile().c_str()));
- if (!unknownsStream->good()) {
- TRACE_ERR("Unable to open " << staticData.GetOutputUnknownsFile() << " for unknowns");
- exit(1);
- }
- unknownsCollector.reset(new OutputCollector(unknownsStream.get()));
- }
#ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount());
@@ -757,24 +145,51 @@ int main(int argc, char** argv)
// main loop over set of input sentences
InputType* source = NULL;
size_t lineCount = staticData.GetStartTranslationId();
- while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+ while(ioWrapper->ReadInput(staticData.GetInputType(),source)) {
+ source->SetTranslationId(lineCount);
IFVERBOSE(1) {
ResetUserTime();
}
+
+ FeatureFunction::CallChangeSource(source);
+
// set up task of translating one sentence
- TranslationTask* task =
- new TranslationTask(lineCount,source, outputCollector.get(),
- nbestCollector.get(),
- latticeSamplesCollector.get(),
- wordGraphCollector.get(),
- searchGraphCollector.get(),
- detailedTranslationCollector.get(),
- alignmentInfoCollector.get(),
- unknownsCollector.get(),
- staticData.GetOutputSearchGraphSLF(),
- staticData.GetOutputSearchGraphHypergraph());
+ TranslationTask* task;
+ if (staticData.IsChart()) {
+ // scfg
+ task = new TranslationTask(source, *ioWrapper, 2);
+ }
+ else {
+ // pb
+ task = new TranslationTask(source, *ioWrapper, 1);
+ }
+
// execute task
#ifdef WITH_THREADS
+#ifdef PT_UG
+ bool spe = params.isParamSpecified("spe-src");
+ if (spe) {
+ // simulated post-editing: always run single-threaded!
+ task->Run();
+ delete task;
+ string src,trg,aln;
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_src,src), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_trg,trg), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_aln,aln), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ BOOST_FOREACH (PhraseDictionary* pd, PhraseDictionary::GetColl())
+ {
+ Mmsapt* sapt = dynamic_cast<Mmsapt*>(pd);
+ if (sapt) sapt->add(src,trg,aln);
+ VERBOSE(1,"[" << HERE << " added src] " << src << endl);
+ VERBOSE(1,"[" << HERE << " added trg] " << trg << endl);
+ VERBOSE(1,"[" << HERE << " added aln] " << aln << endl);
+ }
+ }
+ else
+#endif
pool.Submit(task);
#else
task->Run();
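
After this patch, the main loop of Main.cpp reduces to the shape below (condensed from the + lines; loop bookkeeping and cleanup elided). The integer passed to TranslationTask selects chart (SCFG) versus phrase-based decoding:

    while (ioWrapper->ReadInput(staticData.GetInputType(), source)) {
      source->SetTranslationId(lineCount);
      FeatureFunction::CallChangeSource(source);
      TranslationTask* task = staticData.IsChart()
          ? new TranslationTask(source, *ioWrapper, 2)   // SCFG / chart
          : new TranslationTask(source, *ioWrapper, 1);  // phrase-based
    #ifdef WITH_THREADS
      pool.Submit(task);
    #else
      task->Run();
      // ... (cleanup as in the surrounding code)
    #endif
    }
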
diff --git a/moses-cmd/Main.h b/moses-cmd/Main.h
index 362c1f245..49fee0219 100644
--- a/moses-cmd/Main.h
+++ b/moses-cmd/Main.h
@@ -1,3 +1,4 @@
+#pragma once
// $Id$
/***********************************************************************
@@ -32,12 +33,10 @@ POSSIBILITY OF SUCH DAMAGE.
// example file on how to use moses library
-#ifndef moses_cmd_Main_h
-#define moses_cmd_Main_h
#include "moses/StaticData.h"
class IOWrapper;
int main(int argc, char* argv[]);
-#endif
+
diff --git a/moses-cmd/TranslationAnalysis.cpp b/moses-cmd/TranslationAnalysis.cpp
deleted file mode 100644
index e77486162..000000000
--- a/moses-cmd/TranslationAnalysis.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-// $Id$
-
-#include <iostream>
-#include <sstream>
-#include <algorithm>
-#include "moses/StaticData.h"
-#include "moses/Hypothesis.h"
-#include "TranslationAnalysis.h"
-#include "moses/FF/StatefulFeatureFunction.h"
-#include "moses/FF/StatelessFeatureFunction.h"
-#include "moses/LM/Base.h"
-
-using namespace Moses;
-
-namespace TranslationAnalysis
-{
-
-void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo)
-{
- os << std::endl << "TRANSLATION HYPOTHESIS DETAILS:" << std::endl;
- std::vector<const Hypothesis*> translationPath;
-
- while (hypo) {
- translationPath.push_back(hypo);
- hypo = hypo->GetPrevHypo();
- }
-
- std::reverse(translationPath.begin(), translationPath.end());
- std::vector<std::string> droppedWords;
- std::vector<const Hypothesis*>::iterator tpi = translationPath.begin();
- if(tpi == translationPath.end())
- return;
- ++tpi; // skip initial translation state
- std::vector<std::string> sourceMap;
- std::vector<std::string> targetMap;
- std::vector<unsigned int> lmAcc(0);
- size_t lmCalls = 0;
- bool doLMStats = ((*tpi)->GetLMStats() != 0);
- if (doLMStats)
- lmAcc.resize((*tpi)->GetLMStats()->size(), 0);
- for (; tpi != translationPath.end(); ++tpi) {
- std::ostringstream sms;
- std::ostringstream tms;
- std::string target = (*tpi)->GetTargetPhraseStringRep();
- std::string source = (*tpi)->GetSourcePhraseStringRep();
- WordsRange twr = (*tpi)->GetCurrTargetWordsRange();
- WordsRange swr = (*tpi)->GetCurrSourceWordsRange();
- const AlignmentInfo &alignmentInfo = (*tpi)->GetCurrTargetPhrase().GetAlignTerm();
- // language model backoff stats,
- if (doLMStats) {
- std::vector<std::vector<unsigned int> >& lmstats = *(*tpi)->GetLMStats();
- std::vector<std::vector<unsigned int> >::iterator i = lmstats.begin();
- std::vector<unsigned int>::iterator acc = lmAcc.begin();
-
- for (; i != lmstats.end(); ++i, ++acc) {
- std::vector<unsigned int>::iterator j = i->begin();
- lmCalls += i->size();
- for (; j != i->end(); ++j) {
- (*acc) += *j;
- }
- }
- }
-
- bool epsilon = false;
- if (target == "") {
- target="<EPSILON>";
- epsilon = true;
- droppedWords.push_back(source);
- }
- os << " SOURCE: " << swr << " " << source << std::endl
- << " TRANSLATED AS: " << target << std::endl
- << " WORD ALIGNED: " << alignmentInfo << std::endl;
- size_t twr_i = twr.GetStartPos();
- size_t swr_i = swr.GetStartPos();
- if (!epsilon) {
- sms << twr_i;
- }
- if (epsilon) {
- tms << "del(" << swr_i << ")";
- } else {
- tms << swr_i;
- }
- swr_i++;
- twr_i++;
- for (; twr_i <= twr.GetEndPos() && twr.GetEndPos() != NOT_FOUND; twr_i++) {
- sms << '-' << twr_i;
- }
- for (; swr_i <= swr.GetEndPos() && swr.GetEndPos() != NOT_FOUND; swr_i++) {
- tms << '-' << swr_i;
- }
- if (!epsilon) targetMap.push_back(sms.str());
- sourceMap.push_back(tms.str());
- }
- std::vector<std::string>::iterator si = sourceMap.begin();
- std::vector<std::string>::iterator ti = targetMap.begin();
- os << std::endl << "SOURCE/TARGET SPANS:";
- os << std::endl << " SOURCE:";
- for (; si != sourceMap.end(); ++si) {
- os << " " << *si;
- }
- os << std::endl << " TARGET:";
- for (; ti != targetMap.end(); ++ti) {
- os << " " << *ti;
- }
- os << std::endl << std::endl;
- if (doLMStats && lmCalls > 0) {
- std::vector<unsigned int>::iterator acc = lmAcc.begin();
-
- const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for (size_t i = 0; i < statefulFFs.size(); ++i) {
- const StatefulFeatureFunction *ff = statefulFFs[i];
- const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
-
- if (lm) {
- char buf[256];
- sprintf(buf, "%.4f", (float)(*acc)/(float)lmCalls);
- os << lm->GetScoreProducerDescription() <<", AVG N-GRAM LENGTH: " << buf << std::endl;
-
- ++acc;
- }
- }
- }
-
- if (droppedWords.size() > 0) {
- std::vector<std::string>::iterator dwi = droppedWords.begin();
- os << std::endl << "WORDS/PHRASES DROPPED:" << std::endl;
- for (; dwi != droppedWords.end(); ++dwi) {
- os << "\tdropped=" << *dwi << std::endl;
- }
- }
- os << std::endl << "SCORES (UNWEIGHTED/WEIGHTED): ";
- os << translationPath.back()->GetScoreBreakdown();
- os << " weighted(TODO)";
- os << std::endl;
-}
-
-}
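
The LM-statistics bookkeeping in the deleted file accumulates backoff counts across all phrases and then reports, per language model, the average matched n-gram length: the accumulated statistics divided by the number of LM calls. A standalone sketch of that final reduction:

    // Average matched n-gram length over a decoding run; guards the
    // division the deleted code performed with (*acc) / lmCalls.
    float avgNgramLength(unsigned long accumulated, unsigned long lmCalls)
    {
      return lmCalls ? static_cast<float>(accumulated) / lmCalls : 0.0f;
    }
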
diff --git a/moses-cmd/TranslationAnalysis.h b/moses-cmd/TranslationAnalysis.h
deleted file mode 100644
index 348cfe512..000000000
--- a/moses-cmd/TranslationAnalysis.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// $Id$
-
-/*
- * also see moses/SentenceStats
- */
-
-#ifndef moses_cmd_TranslationAnalysis_h
-#define moses_cmd_TranslationAnalysis_h
-
-#include <iostream>
-#include "moses/Hypothesis.h"
-
-namespace TranslationAnalysis
-{
-
-/***
- * Print details about the translation represented by the given hypothesis
- * to os. Information included: phrase alignment, dropped words, scores.
- */
-void PrintTranslationAnalysis(std::ostream &os, const Moses::Hypothesis* hypo);
-
-}
-
-#endif
diff --git a/moses-cmd/mbr.cpp b/moses-cmd/mbr.cpp
deleted file mode 100644
index 6a8dfa823..000000000
--- a/moses-cmd/mbr.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <iomanip>
-#include <vector>
-#include <map>
-#include <stdlib.h>
-#include <math.h>
-#include <algorithm>
-#include <stdio.h>
-#include "moses/TrellisPathList.h"
-#include "moses/TrellisPath.h"
-#include "moses/StaticData.h"
-#include "moses/Util.h"
-#include "mbr.h"
-
-using namespace std;
-using namespace Moses;
-
-
-/* Input:
- 1. a sorted n-best list, with duplicates filtered out, in the following format
- 0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
-
- 2. a weight vector
- 3. BLEU order (default = 4)
- 4. scaling factor applied to the weight vector (default = 1.0)
-
- Output:
- the translation that minimises the Bayes risk of the n-best list
-*/
-
-int BLEU_ORDER = 4;
-int SMOOTH = 1;
-float min_interval = 1e-4;
-void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
-{
- vector< const Factor* > ngram;
- for (int k = 0; k < BLEU_ORDER; k++) {
- for(int i =0; i < max((int)sentence.size()-k,0); i++) {
- for ( int j = i; j<= i+k; j++) {
- ngram.push_back(sentence[j]);
- }
- ++allngrams[ngram];
- ngram.clear();
- }
- }
-}
-
-float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
-{
- int comps_n = 2*BLEU_ORDER+1;
- vector<int> comps(comps_n);
- float logbleu = 0.0, brevity;
-
- int hyp_length = sents[hyp].size();
-
- for (int i =0; i<BLEU_ORDER; i++) {
- comps[2*i] = 0;
- comps[2*i+1] = max(hyp_length-i,0);
- }
-
- map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
- map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
-
- for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
- it != hyp_ngrams.end(); it++) {
- map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
- if(ref_it != ref_ngrams.end()) {
- comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
- }
- }
- comps[comps_n-1] = sents[ref].size();
-
- for (int i=0; i<BLEU_ORDER; i++) {
- if (comps[0] == 0)
- return 0.0;
- if ( i > 0 )
- logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
- else
- logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
- }
- logbleu /= BLEU_ORDER;
- brevity = 1.0-(float)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
- if (brevity < 0.0)
- logbleu += brevity;
- return exp(logbleu);
-}
-
-const TrellisPath doMBR(const TrellisPathList& nBestList)
-{
- float marginal = 0;
-
- vector<float> joint_prob_vec;
- vector< vector<const Factor*> > translations;
- float joint_prob;
- vector< map < vector <const Factor *>, int > > ngram_stats;
-
- TrellisPathList::const_iterator iter;
-
- // get max score to prevent underflow
- float maxScore = -1e20;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const TrellisPath &path = **iter;
- float score = StaticData::Instance().GetMBRScale()
- * path.GetScoreBreakdown().GetWeightedScore();
- if (maxScore < score) maxScore = score;
- }
-
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const TrellisPath &path = **iter;
- joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore() - maxScore);
- marginal += joint_prob;
- joint_prob_vec.push_back(joint_prob);
-
- // get words in translation
- vector<const Factor*> translation;
- GetOutputFactors(path, translation);
-
- // collect n-gram counts
- map < vector < const Factor *>, int > counts;
- extract_ngrams(translation,counts);
-
- ngram_stats.push_back(counts);
- translations.push_back(translation);
- }
-
- vector<float> mbr_loss;
- float bleu, weightedLoss;
- float weightedLossCumul = 0;
- float minMBRLoss = 1000000;
- int minMBRLossIdx = -1;
-
- /* Main MBR computation done here */
- iter = nBestList.begin();
- for (unsigned int i = 0; i < nBestList.GetSize(); i++) {
- weightedLossCumul = 0;
- for (unsigned int j = 0; j < nBestList.GetSize(); j++) {
- if ( i != j) {
- bleu = calculate_score(translations, j, i,ngram_stats );
- weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
- weightedLossCumul += weightedLoss;
- if (weightedLossCumul > minMBRLoss)
- break;
- }
- }
- if (weightedLossCumul < minMBRLoss) {
- minMBRLoss = weightedLossCumul;
- minMBRLossIdx = i;
- }
- iter++;
- }
- /* Find sentence that minimises Bayes Risk under 1- BLEU loss */
- return nBestList.at(minMBRLossIdx);
- //return translations[minMBRLossIdx];
-}
-
-void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation)
-{
- const std::vector<const Hypothesis *> &edges = path.GetEdges();
- const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
- assert (outputFactorOrder.size() == 1);
-
- // print the surface factor of the translation
- for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
- const Hypothesis &edge = *edges[currEdge];
- const Phrase &phrase = edge.GetCurrTargetPhrase();
- size_t size = phrase.GetSize();
- for (size_t pos = 0 ; pos < size ; pos++) {
-
- const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
- translation.push_back(factor);
- }
- }
-}
-
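
doMBR above implements the standard minimum-Bayes-risk decision rule under a 1-BLEU loss: it renormalises the scaled n-best scores into a posterior p(j), then picks argmin_i sum_{j != i} (1 - BLEU(i scored against j)) * p(j). A compact sketch of just the selection step, with bleu(hyp, ref) standing in for calculate_score (names here are illustrative):

    #include <vector>

    // Pick the candidate with minimum expected loss:
    //   best = argmin_i  sum_{j != i} (1 - bleu(i, j)) * posterior[j]
    int mbrBest(const std::vector<float>& posterior,
                float (*bleu)(int hyp, int ref))
    {
      int bestIdx = -1;
      float minLoss = 1e30f;
      for (int i = 0; i < (int)posterior.size(); ++i) {
        float loss = 0.0f;
        for (int j = 0; j < (int)posterior.size(); ++j) {
          if (i == j) continue;
          loss += (1.0f - bleu(i, j)) * posterior[j];
          if (loss > minLoss) break;       // same early exit as doMBR above
        }
        if (loss < minLoss) { minLoss = loss; bestIdx = i; }
      }
      return bestIdx;
    }
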
diff --git a/moses-cmd/mbr.h b/moses-cmd/mbr.h
deleted file mode 100644
index d08b11a98..000000000
--- a/moses-cmd/mbr.h
+++ /dev/null
@@ -1,28 +0,0 @@
-// $Id$
-
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
-
-#ifndef moses_cmd_mbr_h
-#define moses_cmd_mbr_h
-
-const Moses::TrellisPath doMBR(const Moses::TrellisPathList& nBestList);
-void GetOutputFactors(const Moses::TrellisPath &path, std::vector <const Moses::Factor*> &translation);
-float calculate_score(const std::vector< std::vector<const Moses::Factor*> > & sents, int ref, int hyp, std::vector < std::map < std::vector < const Moses::Factor *>, int > > & ngram_stats );
-#endif