From 774ed64f2e06c39d95f73192754c208d7bb53599 Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Fri, 15 Feb 2013 13:06:54 -0500 Subject: Work to allow output search graph in HTK standard lattice format --- moses-cmd/IOWrapper.cpp | 9 ++++++ moses-cmd/IOWrapper.h | 2 ++ moses-cmd/Main.cpp | 31 ++++++++++++++++++-- moses/Manager.cpp | 77 ++++++++++++++++++++++++++++++++++++++++++++++++- moses/Manager.h | 1 + moses/Parameter.cpp | 1 + moses/StaticData.cpp | 6 +++- moses/StaticData.h | 4 +++ 8 files changed, 126 insertions(+), 5 deletions(-) diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp index f11516839..451c53ae0 100644 --- a/moses-cmd/IOWrapper.cpp +++ b/moses-cmd/IOWrapper.cpp @@ -189,6 +189,15 @@ InputType*IOWrapper::GetInput(InputType* inputType) } } + ofstream* IOWrapper::GetOutputSearchGraphSLFStream(size_t sentenceNumber) { + const StaticData &staticData = StaticData::Instance(); + stringstream fileName; + fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << sentenceNumber << ".slf"; + std::ofstream *file = new std::ofstream; + file->open(fileName.str().c_str()); + return file; + } + /*** * print surface factor only for the given phrase */ diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h index 8f164dfb3..1fdc1c6e4 100644 --- a/moses-cmd/IOWrapper.h +++ b/moses-cmd/IOWrapper.h @@ -117,6 +117,8 @@ public: return *m_outputSearchGraphStream; } + std::ofstream *GetOutputSearchGraphSLFStream(size_t sentenceNumber); + std::ostream &GetDetailedTranslationReportingStream() { assert (m_detailedTranslationReportingStream); return *m_detailedTranslationReportingStream; diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index ac4527aae..16754b2fb 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -83,14 +83,16 @@ public: OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector, OutputCollector* detailedTranslationCollector, OutputCollector* alignmentInfoCollector, - OutputCollector* 
unknownsCollector) : + OutputCollector* unknownsCollector, + std::ofstream* searchGraphSLFStream) : m_source(source), m_lineNumber(lineNumber), m_outputCollector(outputCollector), m_nbestCollector(nbestCollector), m_latticeSamplesCollector(latticeSamplesCollector), m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector), m_detailedTranslationCollector(detailedTranslationCollector), m_alignmentInfoCollector(alignmentInfoCollector), - m_unknownsCollector(unknownsCollector) {} + m_unknownsCollector(unknownsCollector), + m_searchGraphSLFStream(searchGraphSLFStream) {} /** Translate one sentence * gets called by main function implemented at end of this source file */ @@ -143,6 +145,19 @@ public: #endif } + // Output search graph in HTK standard lattice format (SLF) + if (m_searchGraphSLFStream) { + if (m_searchGraphSLFStream->is_open() && m_searchGraphSLFStream->good()) { + ostringstream out; + fix(out,PRECISION); + manager.OutputSearchGraphAsSLF(m_lineNumber, out); + *m_searchGraphSLFStream << out.str(); + m_searchGraphSLFStream -> flush(); + } else { + TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); + } + } + // apply decision rule and output best translation(s) if (m_outputCollector) { ostringstream out; @@ -297,7 +312,14 @@ public: } ~TranslationTask() { + + if (m_searchGraphSLFStream) { + m_searchGraphSLFStream->close(); + } + + delete m_searchGraphSLFStream; delete m_source; + } private: @@ -311,6 +333,7 @@ private: OutputCollector* m_detailedTranslationCollector; OutputCollector* m_alignmentInfoCollector; OutputCollector* m_unknownsCollector; + std::ofstream *m_searchGraphSLFStream; std::ofstream *m_alignmentStream; @@ -533,7 +556,9 @@ int main(int argc, char** argv) searchGraphCollector.get(), detailedTranslationCollector.get(), alignmentInfoCollector.get(), - unknownsCollector.get() ); + unknownsCollector.get(), + 
staticData.GetOutputSearchGraphSLF() ? + ioWrapper->GetOutputSearchGraphSLFStream(lineCount) : NULL); // execute task #ifdef WITH_THREADS pool.Submit(task); diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 468db0de3..c80bd59e4 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #endif #include -#include #include +#include +#include +#include #include "Manager.h" #include "TypeDef.h" #include "Util.h" @@ -628,6 +630,79 @@ void Manager::GetSearchGraph(vector& searchGraph) const } +/**! Output search graph in HTK standard lattice format (SLF) */ +void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const +{ + + vector searchGraph; + GetSearchGraph(searchGraph); + + long numArcs = 0; + long numNodes = 0; + + map nodes; + set terminalNodes; + + // Unique start node + nodes[0] = 0; + numNodes += 1; + + for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) { + + numArcs += 1; + + int hypothesisID = searchGraph[arcNumber].hypo->GetId(); + if (nodes.count(hypothesisID) == 0) { + nodes[hypothesisID] = numNodes; + numNodes += 1; + + bool terminalNode = (searchGraph[arcNumber].forward == -1); + if (terminalNode) { + numArcs += 1; + } + } + + } + + // Unique end node + nodes[numNodes] = numNodes; + + outputSearchGraphStream << "UTTERANCE=\"Sentence " << translationId << "\"" << endl; + outputSearchGraphStream << "VERSION=1.1" << endl; + outputSearchGraphStream << "base=e" << endl; + outputSearchGraphStream << "NODES=" << numNodes << endl; + outputSearchGraphStream << "LINKS=" << numArcs << endl; + + const vector &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder(); + + for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) { + const Hypothesis *thisHypo = searchGraph[arcNumber].hypo; + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); + if (prevHypo) { + + int startNode = 
nodes[prevHypo->GetId()]; + int endNode = nodes[thisHypo->GetId()]; + bool terminalNode = (searchGraph[arcNumber].forward == -1); + + outputSearchGraphStream << "J=" << arcNumber + << " S=" << startNode + << " E=" << endNode + << " W=\"" << thisHypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\"" + << endl; + + if (terminalNode && terminalNodes.count(endNode) == 0) { + terminalNodes.insert(endNode); + outputSearchGraphStream << "J=" << arcNumber + << " S=" << endNode + << " E=" << numNodes + << endl; + + } + } + } + +} + void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream, const SearchGraphNode& searchNode) { diff --git a/moses/Manager.h b/moses/Manager.h index dd011bc84..0ae7cd6f1 100644 --- a/moses/Manager.h +++ b/moses/Manager.h @@ -137,6 +137,7 @@ public: #endif void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const; + void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const; void GetSearchGraph(std::vector& searchGraph) const; const InputType& GetSource() const { return m_source; diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 103277d34..876cbd224 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -130,6 +130,7 @@ Parameter::Parameter() AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename"); AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format"); AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. 
Note: stack pruning may have eliminated some hypotheses"); + AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)"); AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)"); #ifdef HAVE_PROTOBUF AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path."); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index d056dc78e..1d9d4907c 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -235,8 +235,12 @@ bool StaticData::LoadData(Parameter *parameter) } m_outputSearchGraph = true; m_outputSearchGraphExtended = true; - } else + } else { m_outputSearchGraph = false; + } + if (m_parameter->GetParam("output-search-graph-slf").size() > 0) { + m_outputSearchGraphSLF = true; + } #ifdef HAVE_PROTOBUF if (m_parameter->GetParam("output-search-graph-pb").size() > 0) { if (m_parameter->GetParam("output-search-graph-pb").size() != 1) { diff --git a/moses/StaticData.h b/moses/StaticData.h index 448f1a4e7..d644e59f7 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -216,6 +216,7 @@ protected: bool m_outputWordGraph; //! whether to output word graph bool m_outputSearchGraph; //! whether to output search graph bool m_outputSearchGraphExtended; //! ... in extended format + bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF) #ifdef HAVE_PROTOBUF bool m_outputSearchGraphPB; //! 
whether to output search graph as a protobuf #endif @@ -631,6 +632,9 @@ public: bool GetOutputSearchGraphExtended() const { return m_outputSearchGraphExtended; } + bool GetOutputSearchGraphSLF() const { + return m_outputSearchGraphSLF; + } #ifdef HAVE_PROTOBUF bool GetOutputSearchGraphPB() const { return m_outputSearchGraphPB; -- cgit v1.2.3 From e106e04dc3c3fe609f82780cbd8286d042a5e47d Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Fri, 15 Feb 2013 15:49:26 -0500 Subject: More work on outputting HTK SLF. Now, each arc emits exactly one word. --- moses/Manager.cpp | 55 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/moses/Manager.cpp b/moses/Manager.cpp index c80bd59e4..39eb7f917 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -645,16 +645,19 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea // Unique start node nodes[0] = 0; - numNodes += 1; + // numNodes += 1; for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) { - numArcs += 1; + int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize(); + numArcs += targetWordCount; int hypothesisID = searchGraph[arcNumber].hypo->GetId(); if (nodes.count(hypothesisID) == 0) { + + numNodes += targetWordCount; nodes[hypothesisID] = numNodes; - numNodes += 1; + //numNodes += 1; bool terminalNode = (searchGraph[arcNumber].forward == -1); if (terminalNode) { @@ -663,32 +666,54 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea } } + numNodes += 1; // Unique end node nodes[numNodes] = numNodes; - outputSearchGraphStream << "UTTERANCE=\"Sentence " << translationId << "\"" << endl; + outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl; outputSearchGraphStream << "VERSION=1.1" << endl; outputSearchGraphStream << "base=e" << endl; - outputSearchGraphStream << "NODES=" << numNodes << endl; + outputSearchGraphStream << 
"NODES=" << (numNodes+1) << endl; outputSearchGraphStream << "LINKS=" << numArcs << endl; - const vector &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder(); + // const vector &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder(); - for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) { - const Hypothesis *thisHypo = searchGraph[arcNumber].hypo; + for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) { + const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); if (prevHypo) { int startNode = nodes[prevHypo->GetId()]; int endNode = nodes[thisHypo->GetId()]; - bool terminalNode = (searchGraph[arcNumber].forward == -1); + bool terminalNode = (searchGraph[lineNumber].forward == -1); + const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); + int targetWordCount = targetPhrase.GetSize(); + + for (int targetWordIndex=0; targetWordIndexGetId()] - targetWordCount + 1, + // nextNode = startNode + 1; + // nextNode < endNode; startNode+=1, nextNode+=1) { + int x = (targetWordCount-targetWordIndex); + + outputSearchGraphStream << "J=" << arcNumber; + // outputSearchGraphStream << " startNode=" << startNode; + // outputSearchGraphStream << " endNode=" << endNode; + // outputSearchGraphStream << " targetWordCount=" << targetWordCount; + // outputSearchGraphStream << " targetWordIndex=" << targetWordIndex; + + if (targetWordIndex==0) { + outputSearchGraphStream << " S=" << startNode; + } else { + outputSearchGraphStream << " S=" << endNode - x; + } + + outputSearchGraphStream << " E=" << endNode - (x-1) //(startNode + targetWordIndex + 1) + << " W=" << targetPhrase.GetWord(targetWordIndex) + << endl; - outputSearchGraphStream << "J=" << arcNumber - << " S=" << startNode - << " E=" << endNode - << " W=\"" << thisHypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\"" - << endl; + arcNumber += 1; + } if (terminalNode && 
terminalNodes.count(endNode) == 0) { terminalNodes.insert(endNode); @@ -696,7 +721,7 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea << " S=" << endNode << " E=" << numNodes << endl; - + arcNumber += 1; } } } -- cgit v1.2.3 From e7563111de02c5e39ff297e58641b612ff02fb4b Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Wed, 20 Feb 2013 11:03:23 -0500 Subject: More work on outputting HTK lattice format --- moses/Manager.cpp | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- moses/Manager.h | 7 ++- 2 files changed, 150 insertions(+), 6 deletions(-) diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 39eb7f917..ce214c414 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -53,12 +53,12 @@ using namespace std; namespace Moses { Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system) - :m_lineNumber(lineNumber) - ,m_system(system) + :m_system(system) ,m_transOptColl(source.CreateTranslationOptionCollection(system)) ,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl)) ,interrupted_flag(0) ,m_hypoId(0) + ,m_lineNumber(lineNumber) ,m_source(source) { m_system->InitializeBeforeSentenceProcessing(source); @@ -630,6 +630,140 @@ void Manager::GetSearchGraph(vector& searchGraph) const } +void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector& slf =system.GetStatelessFeatureFunctions(); + const vector& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < 
slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream); + } + } + const vector& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i& gds = system.GetGenerationDictionaries(); + for( size_t i=0; iGetScoreBreakdown(); + // outputSearchGraphStream << scoreCollection << endl; + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector& slf =system.GetStatelessFeatureFunctions(); + const vector& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream); + } + } + const vector& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i& gds = system.GetGenerationDictionaries(); + for( size_t i=0; iGetNumScoreComponents(); + if (numScoreComps != ScoreProducer::unlimited) { + vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << "# " << ff->GetScoreProducerDescription() + << " " << ff->GetScoreProducerWeightShortName() + << " " << (i+1) << " of " << numScoreComps << endl + << "x" << (index+i) << "scale=" << values[i] << endl; + } + return index+numScoreComps; 
+ } else { + cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl; + assert(false); + return 0; + } +} + +size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + + // { const FeatureFunction* sp = ff; + // const FVector& m_scores = scoreCollection.GetScoresVector(); + // FVector& scores = const_cast(m_scores); + // std::string prefix = sp->GetScoreProducerDescription() + FName::SEP; + // // std::cout << "prefix==" << prefix << endl; + // // cout << "m_scores==" << m_scores << endl; + // // cout << "m_scores.size()==" << m_scores.size() << endl; + // // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl; + // // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl; + + + // // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) { + // // std::cout<first) << "\t" << (i->second) << std::endl; + // // } + // for(int i=0, n=v.size(); iGetScoreBreakdown(); + + vector featureValues = scoreCollection.GetScoresForProducer(ff); + size_t numScoreComps = featureValues.size();//featureValues.coreSize(); + // if (numScoreComps != ScoreProducer::unlimited) { + // vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " "; + } + return index+numScoreComps; + // } else { + // cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl; + // assert(false); + // return 0; + // } +} + /**! 
Output search graph in HTK standard lattice format (SLF) */ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const { @@ -673,10 +807,12 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl; outputSearchGraphStream << "VERSION=1.1" << endl; - outputSearchGraphStream << "base=e" << endl; + outputSearchGraphStream << "base=2.71828182845905" << endl; outputSearchGraphStream << "NODES=" << (numNodes+1) << endl; outputSearchGraphStream << "LINKS=" << numArcs << endl; + OutputFeatureWeightsForSLF(outputSearchGraphStream); + // const vector &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder(); for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) { @@ -709,8 +845,11 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea } outputSearchGraphStream << " E=" << endNode - (x-1) //(startNode + targetWordIndex + 1) - << " W=" << targetPhrase.GetWord(targetWordIndex) - << endl; + << " W=" << targetPhrase.GetWord(targetWordIndex); + + OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream); + + outputSearchGraphStream << endl; arcNumber += 1; } diff --git a/moses/Manager.h b/moses/Manager.h index 0ae7cd6f1..c5f54847b 100644 --- a/moses/Manager.h +++ b/moses/Manager.h @@ -93,6 +93,11 @@ class Manager Manager(Manager const&); void operator=(Manager const&); const TranslationSystem* m_system; +private: + void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, 
std::ostream &outputSearchGraphStream) const; protected: // data // InputType const& m_source; /**< source sentence to be translated */ @@ -103,6 +108,7 @@ protected: size_t interrupted_flag; std::auto_ptr m_sentenceStats; int m_hypoId; //used to number the hypos as they are created. + size_t m_lineNumber; void GetConnectedGraph( std::map< int, bool >* pConnected, @@ -113,7 +119,6 @@ protected: public: - size_t m_lineNumber; InputType const& m_source; /**< source sentence to be translated */ Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system); ~Manager(); -- cgit v1.2.3 From 04f107fbb02442638928c190dd3fa2f13225d570 Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Fri, 22 Feb 2013 12:24:35 -0500 Subject: Add flag to output search graph in Kenneth's hypergraph format. --- moses-cmd/IOWrapper.cpp | 9 +++ moses-cmd/IOWrapper.h | 1 + moses-cmd/Main.cpp | 24 +++++- moses/Manager.cpp | 204 ++++++++++++++++++++++++++++++++++++++++++++++++ moses/Manager.h | 11 +++ moses/Parameter.cpp | 1 + moses/StaticData.cpp | 7 ++ moses/StaticData.h | 4 + 8 files changed, 258 insertions(+), 3 deletions(-) diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp index 451c53ae0..7c27476d1 100644 --- a/moses-cmd/IOWrapper.cpp +++ b/moses-cmd/IOWrapper.cpp @@ -198,6 +198,15 @@ InputType*IOWrapper::GetInput(InputType* inputType) return file; } + ofstream* IOWrapper::GetOutputSearchGraphHypergraphStream(size_t sentenceNumber) { + const StaticData &staticData = StaticData::Instance(); + stringstream fileName; + fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << sentenceNumber; + std::ofstream *file = new std::ofstream; + file->open(fileName.str().c_str()); + return file; + } + /*** * print surface factor only for the given phrase */ diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h index 1fdc1c6e4..044e71491 100644 --- a/moses-cmd/IOWrapper.h +++ b/moses-cmd/IOWrapper.h @@ -118,6 
+118,7 @@ public: } std::ofstream *GetOutputSearchGraphSLFStream(size_t sentenceNumber); + std::ofstream *GetOutputSearchGraphHypergraphStream(size_t sentenceNumber); std::ostream &GetDetailedTranslationReportingStream() { assert (m_detailedTranslationReportingStream); diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 16754b2fb..afadddabf 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -84,7 +84,8 @@ public: OutputCollector* detailedTranslationCollector, OutputCollector* alignmentInfoCollector, OutputCollector* unknownsCollector, - std::ofstream* searchGraphSLFStream) : + std::ofstream* searchGraphSLFStream, + std::ofstream* searchGraphHypergraphStream) : m_source(source), m_lineNumber(lineNumber), m_outputCollector(outputCollector), m_nbestCollector(nbestCollector), m_latticeSamplesCollector(latticeSamplesCollector), @@ -92,7 +93,8 @@ public: m_detailedTranslationCollector(detailedTranslationCollector), m_alignmentInfoCollector(alignmentInfoCollector), m_unknownsCollector(unknownsCollector), - m_searchGraphSLFStream(searchGraphSLFStream) {} + m_searchGraphSLFStream(searchGraphSLFStream), + m_searchGraphHypergraphStream(searchGraphHypergraphStream) {} /** Translate one sentence * gets called by main function implemented at end of this source file */ @@ -158,6 +160,19 @@ public: } } + // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder + if (m_searchGraphHypergraphStream) { + if (m_searchGraphHypergraphStream->is_open() && m_searchGraphHypergraphStream->good()) { + ostringstream out; + fix(out,PRECISION); + manager.OutputSearchGraphAsHypergraph(m_lineNumber, out); + *m_searchGraphHypergraphStream << out.str(); + m_searchGraphHypergraphStream -> flush(); + } else { + TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); + } + } + // apply decision rule and output best translation(s) if (m_outputCollector) { 
ostringstream out; @@ -334,6 +349,7 @@ private: OutputCollector* m_alignmentInfoCollector; OutputCollector* m_unknownsCollector; std::ofstream *m_searchGraphSLFStream; + std::ofstream *m_searchGraphHypergraphStream; std::ofstream *m_alignmentStream; @@ -558,7 +574,9 @@ int main(int argc, char** argv) alignmentInfoCollector.get(), unknownsCollector.get(), staticData.GetOutputSearchGraphSLF() ? - ioWrapper->GetOutputSearchGraphSLFStream(lineCount) : NULL); + ioWrapper->GetOutputSearchGraphSLFStream(lineCount) : NULL, + staticData.GetOutputSearchGraphHypergraph() ? + ioWrapper->GetOutputSearchGraphHypergraphStream(lineCount) : NULL); // execute task #ifdef WITH_THREADS pool.Submit(task); diff --git a/moses/Manager.cpp b/moses/Manager.cpp index ce214c414..21f116f42 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -663,6 +663,39 @@ void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) } +void Manager::OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) const +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector& slf =system.GetStatelessFeatureFunctions(); + const vector& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream); + } + } + const vector& pds = system.GetPhraseDictionaries(); + 
for( size_t i=0; i& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i& slf =system.GetStatelessFeatureFunctions(); + const vector& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream); + } + } + const vector& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i& gds = system.GetGenerationDictionaries(); + for( size_t i=0; iGetNumScoreComponents(); + if (numScoreComps != ScoreProducer::unlimited) { + vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + if (numScoreComps > 1) { + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << i + << "=" << values[i] << endl; + } + } else { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << "=" << values[0] << endl; + } + return index+numScoreComps; + } else { + cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl; + assert(false); + return 0; + } +} + + size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const { @@ -764,6 +854,120 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth // } } +size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + + const ScoreComponentCollection& 
scoreCollection = hypo->GetScoreBreakdown(); + + vector featureValues = scoreCollection.GetScoresForProducer(ff); + size_t numScoreComps = featureValues.size(); + + if (numScoreComps > 1) { + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << i << "=" << featureValues[i] << " "; + } + } else { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << "=" << featureValues[0] << " "; + } + + return index+numScoreComps; +} + +void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream, + const SearchGraphNode& searchNode); +/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */ +void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const +{ + vector searchGraph; + GetSearchGraph(searchGraph); +outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << endl; + // long numArcs = 0; + long numNodes = 0; + + map nodes; + set terminalNodes; + multimap nodeToLines; + + // Unique start node + // nodes[0] = 0; + //numNodes += 1; + for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) { +OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]); + // Record that this arc ends at this node + // numArcs += 1; + nodeToLines.insert(pair(numNodes,arcNumber)); + + int hypothesisID = searchGraph[arcNumber].hypo->GetId(); + if (nodes.count(hypothesisID) == 0) { + + nodes[hypothesisID] = numNodes; + numNodes += 1; + + bool terminalNode = (searchGraph[arcNumber].forward == -1); + if (terminalNode) { + terminalNodes.insert(numNodes); + // numArcs += 1; // Final arc to end node, representing the end of the sentence + } + } + + } + + // Unique end node + nodes[numNodes] = numNodes; + numNodes += 1; + + long numArcs = searchGraph.size() + terminalNodes.size(); + // Unique start node + // numNodes += 1; + + // Print number of nodes and arcs + 
outputSearchGraphStream << numNodes << " " << numArcs << "(" << searchGraph.size() << ", " << terminalNodes.size() << ")" << endl; + + // Print node and arc for beginning of sentence + // outputSearchGraphStream << 1 << endl; + // outputSearchGraphStream << " ||| " << endl; + + for (int nodeNumber=0; nodeNumber <= numNodes; nodeNumber+=1) { + + size_t count = nodeToLines.count(nodeNumber); + if (count > 0) { + outputSearchGraphStream << count << endl; + + pair::iterator, multimap::iterator> range = nodeToLines.equal_range(nodeNumber); + for (multimap::iterator it=range.first; it!=range.second; ++it) { + int lineNumber = (*it).second; + const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); + if (prevHypo==NULL) { + outputSearchGraphStream << " ||| " << endl; + } else { + int startNode = nodes[prevHypo->GetId()]; + + const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); + int targetWordCount = targetPhrase.GetSize(); + + outputSearchGraphStream << "[" << startNode << "]"; + for (int targetWordIndex=0; targetWordIndex + outputSearchGraphStream << terminalNodes.size() << endl; + for (set::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) { + outputSearchGraphStream << "[" << (*it) << "] ||| " << endl; + } + +} + + /**! 
Output search graph in HTK standard lattice format (SLF) */ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const { diff --git a/moses/Manager.h b/moses/Manager.h index c5f54847b..d580674b4 100644 --- a/moses/Manager.h +++ b/moses/Manager.h @@ -94,10 +94,20 @@ class Manager void operator=(Manager const&); const TranslationSystem* m_system; private: + + // Helper functions to output search graph in HTK standard lattice format void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const; size_t OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const; size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + + // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder + void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + + protected: // data // InputType const& m_source; /**< source sentence to be translated */ @@ -143,6 +153,7 @@ public: void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const; void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const; + void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const; void GetSearchGraph(std::vector& searchGraph) 
const; const InputType& GetSource() const { return m_source; diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 876cbd224..359174280 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -131,6 +131,7 @@ Parameter::Parameter() AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format"); AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses"); AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)"); + AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)"); AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)"); #ifdef HAVE_PROTOBUF AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path."); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 1d9d4907c..cf797582b 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -240,6 +240,13 @@ bool StaticData::LoadData(Parameter *parameter) } if (m_parameter->GetParam("output-search-graph-slf").size() > 0) { m_outputSearchGraphSLF = true; + } else { + m_outputSearchGraphSLF = false; + } + if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) { + m_outputSearchGraphHypergraph = true; + } else { + m_outputSearchGraphHypergraph = false; } #ifdef HAVE_PROTOBUF if (m_parameter->GetParam("output-search-graph-pb").size() > 0) { diff --git a/moses/StaticData.h b/moses/StaticData.h index d644e59f7..8a9e65162 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -217,6 +217,7 @@ 
protected: bool m_outputSearchGraph; //! whether to output search graph bool m_outputSearchGraphExtended; //! ... in extended format bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF) + bool m_outputSearchGraphHypergraph; //! whether to output search graph in hypergraph #ifdef HAVE_PROTOBUF bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf #endif @@ -635,6 +636,9 @@ public: bool GetOutputSearchGraphSLF() const { return m_outputSearchGraphSLF; } + bool GetOutputSearchGraphHypergraph() const { + return m_outputSearchGraphHypergraph; + } #ifdef HAVE_PROTOBUF bool GetOutputSearchGraphPB() const { return m_outputSearchGraphPB; -- cgit v1.2.3 From 764ce067266bb3373d5fdd8cdd528484301907c5 Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Fri, 22 Feb 2013 15:48:40 -0500 Subject: More work on outputting search graph as hypergraph --- moses/Hypothesis.cpp | 2 +- moses/Manager.cpp | 6 +++--- moses/StaticData.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index 506193d5b..5bd3a4e2b 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList() */ const StaticData &staticData = StaticData::Instance(); size_t nBestSize = staticData.GetNBestSize(); - bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ; + bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ; if (!distinctNBest && m_arcList->size() > nBestSize * 5) { // prune arc list only if there too many arcs diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 21f116f42..0e72d90e6 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -880,7 +880,7 @@ void 
Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou { vector searchGraph; GetSearchGraph(searchGraph); -outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << endl; + //outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << endl; // long numArcs = 0; long numNodes = 0; @@ -892,7 +892,7 @@ outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << end // nodes[0] = 0; //numNodes += 1; for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) { -OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]); + //OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]); // Record that this arc ends at this node // numArcs += 1; nodeToLines.insert(pair(numNodes,arcNumber)); @@ -921,7 +921,7 @@ OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]); // numNodes += 1; // Print number of nodes and arcs - outputSearchGraphStream << numNodes << " " << numArcs << "(" << searchGraph.size() << ", " << terminalNodes.size() << ")" << endl; + outputSearchGraphStream << numNodes << " " << numArcs /*<< "(" << searchGraph.size() << ", " << terminalNodes.size() << ")"*/ << endl; // Print node and arc for beginning of sentence // outputSearchGraphStream << 1 << endl; diff --git a/moses/StaticData.h b/moses/StaticData.h index 8a9e65162..ce93a5629 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -460,7 +460,7 @@ public: return m_nBestFilePath; } bool IsNBestEnabled() const { - return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty() + return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty() #ifdef HAVE_PROTOBUF || m_outputSearchGraphPB #endif -- cgit 
v1.2.3 From 285661fec7fc24c8d328f1442c6b59edd508649f Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Fri, 22 Feb 2013 15:51:56 -0500 Subject: Deleted stale commented code --- moses/Manager.cpp | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 0e72d90e6..0c7471b88 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -873,28 +873,21 @@ size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* return index+numScoreComps; } -void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream, - const SearchGraphNode& searchNode); /**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const { vector searchGraph; GetSearchGraph(searchGraph); - //outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << endl; - // long numArcs = 0; + long numNodes = 0; map nodes; set terminalNodes; multimap nodeToLines; - // Unique start node - // nodes[0] = 0; - //numNodes += 1; for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) { - //OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]); + // Record that this arc ends at this node - // numArcs += 1; nodeToLines.insert(pair(numNodes,arcNumber)); int hypothesisID = searchGraph[arcNumber].hypo->GetId(); @@ -905,8 +898,8 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou bool terminalNode = (searchGraph[arcNumber].forward == -1); if (terminalNode) { + // Final arc to end node, representing the end of the sentence terminalNodes.insert(numNodes); - // numArcs += 1; // Final arc to end node, representing the end of the sentence } } @@ -917,15 +910,9 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou numNodes += 1; long numArcs = 
searchGraph.size() + terminalNodes.size(); - // Unique start node - // numNodes += 1; // Print number of nodes and arcs - outputSearchGraphStream << numNodes << " " << numArcs /*<< "(" << searchGraph.size() << ", " << terminalNodes.size() << ")"*/ << endl; - - // Print node and arc for beginning of sentence - // outputSearchGraphStream << 1 << endl; - // outputSearchGraphStream << " ||| " << endl; + outputSearchGraphStream << numNodes << " " << numArcs << endl; for (int nodeNumber=0; nodeNumber <= numNodes; nodeNumber+=1) { @@ -983,7 +970,6 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea // Unique start node nodes[0] = 0; - // numNodes += 1; for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) { @@ -1017,8 +1003,6 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea OutputFeatureWeightsForSLF(outputSearchGraphStream); - // const vector &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder(); - for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) { const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); @@ -1031,16 +1015,9 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea int targetWordCount = targetPhrase.GetSize(); for (int targetWordIndex=0; targetWordIndexGetId()] - targetWordCount + 1, - // nextNode = startNode + 1; - // nextNode < endNode; startNode+=1, nextNode+=1) { int x = (targetWordCount-targetWordIndex); outputSearchGraphStream << "J=" << arcNumber; - // outputSearchGraphStream << " startNode=" << startNode; - // outputSearchGraphStream << " endNode=" << endNode; - // outputSearchGraphStream << " targetWordCount=" << targetWordCount; - // outputSearchGraphStream << " targetWordIndex=" << targetWordIndex; if (targetWordIndex==0) { outputSearchGraphStream << " S=" << startNode; @@ -1048,7 +1025,7 @@ void 
Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea outputSearchGraphStream << " S=" << endNode - x; } - outputSearchGraphStream << " E=" << endNode - (x-1) //(startNode + targetWordIndex + 1) + outputSearchGraphStream << " E=" << endNode - (x-1) << " W=" << targetPhrase.GetWord(targetWordIndex); OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream); -- cgit v1.2.3 From 4adeb7e33868dfc44875af9534047da6cc6bfee0 Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Fri, 22 Feb 2013 16:20:03 -0500 Subject: Output feature weights to a separate file when producing hypergraph --- moses-cmd/IOWrapper.cpp | 9 +++++++ moses-cmd/IOWrapper.h | 1 + moses-cmd/Main.cpp | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ moses/Manager.cpp | 58 ------------------------------------------- moses/Manager.h | 2 -- 5 files changed, 76 insertions(+), 60 deletions(-) diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp index 7c27476d1..6fffead46 100644 --- a/moses-cmd/IOWrapper.cpp +++ b/moses-cmd/IOWrapper.cpp @@ -207,6 +207,15 @@ InputType*IOWrapper::GetInput(InputType* inputType) return file; } + ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream() { + const StaticData &staticData = StaticData::Instance(); + stringstream fileName; + fileName << staticData.GetParam("output-search-graph-hypergraph")[1]; + std::ofstream *file = new std::ofstream; + file->open(fileName.str().c_str()); + return file; + } + /*** * print surface factor only for the given phrase */ diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h index 044e71491..0376eff6f 100644 --- a/moses-cmd/IOWrapper.h +++ b/moses-cmd/IOWrapper.h @@ -119,6 +119,7 @@ public: std::ofstream *GetOutputSearchGraphSLFStream(size_t sentenceNumber); std::ofstream *GetOutputSearchGraphHypergraphStream(size_t sentenceNumber); + std::ofstream *GetOutputSearchGraphHypergraphWeightsStream(); std::ostream &GetDetailedTranslationReportingStream() { assert 
(m_detailedTranslationReportingStream); diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index afadddabf..0e48ae64f 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -333,6 +333,7 @@ public: } delete m_searchGraphSLFStream; + delete m_searchGraphHypergraphStream; delete m_source; } @@ -406,6 +407,63 @@ static void ShowWeights() } +size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) +{ + size_t numScoreComps = ff->GetNumScoreComponents(); + if (numScoreComps != ScoreProducer::unlimited) { + vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + if (numScoreComps > 1) { + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << i + << "=" << values[i] << endl; + } + } else { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << "=" << values[0] << endl; + } + return index+numScoreComps; + } else { + cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl; + assert(false); + return 0; + } +} + +void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector& slf =system.GetStatelessFeatureFunctions(); + const vector& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + 
slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream); + } + } + const vector& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i 1) { + ofstream* weightsOut = ioWrapper->GetOutputSearchGraphHypergraphWeightsStream(); + OutputFeatureWeightsForHypergraph(*weightsOut); + weightsOut->flush(); + weightsOut->close(); + delete weightsOut; + } + // initialize output streams // note: we can't just write to STDOUT or files diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 0c7471b88..760587a55 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -663,40 +663,6 @@ void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) } -void Manager::OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) const -{ - outputSearchGraphStream.setf(std::ios::fixed); - outputSearchGraphStream.precision(6); - - const StaticData& staticData = StaticData::Instance(); - const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); - const vector& slf =system.GetStatelessFeatureFunctions(); - const vector& sff = system.GetStatefulFeatureFunctions(); - size_t featureIndex = 1; - for (size_t i = 0; i < sff.size(); ++i) { - featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream); - } - for (size_t i = 0; i < slf.size(); ++i) { - if (slf[i]->GetScoreProducerWeightShortName() != "u" && - slf[i]->GetScoreProducerWeightShortName() != "tm" && - slf[i]->GetScoreProducerWeightShortName() != "I" && - slf[i]->GetScoreProducerWeightShortName() != "g") - { - featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream); - } - } - const vector& pds = system.GetPhraseDictionaries(); - for( size_t i=0; i& gds = system.GetGenerationDictionaries(); - for( size_t i=0; iGetNumScoreComponents(); - 
if (numScoreComps != ScoreProducer::unlimited) { - vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); - if (numScoreComps > 1) { - for (size_t i = 0; i < numScoreComps; ++i) { - outputSearchGraphStream << ff->GetScoreProducerWeightShortName() - << i - << "=" << values[i] << endl; - } - } else { - outputSearchGraphStream << ff->GetScoreProducerWeightShortName() - << "=" << values[0] << endl; - } - return index+numScoreComps; - } else { - cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl; - assert(false); - return 0; - } -} - - size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const { diff --git a/moses/Manager.h b/moses/Manager.h index d580674b4..e2f8ed8e5 100644 --- a/moses/Manager.h +++ b/moses/Manager.h @@ -102,8 +102,6 @@ private: size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder - void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) const; - size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const; size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; -- cgit v1.2.3 From 2eb0c5e11da9b00f8675688841ba118b91aa7471 Mon Sep 17 00:00:00 2001 From: amittai Date: Sun, 24 Feb 2013 18:07:11 -0800 Subject: let's be consistently case-insensitive with respect to the xml tags --- scripts/ems/support/wrap-xml.perl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index e941aa95b..4ef6a1de6 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -13,10 +13,10 @@ chomp(@OUT); while() { chomp; if (/^) { my $line = shift(@OUT); $line = "" if $line =~ /NO BEST TRANSLATION/; if (/<\/seg>/) { - s/(]+> *).*(<\/seg>)/$1$line$2/; + s/(]+> *).*(<\/seg>)/$1$line$2/i; } else { - s/(]+> *)[^<]*/$1$line/; + s/(]+> *)[^<]*/$1$line/i; } } print $_."\n"; -- cgit v1.2.3 From 8b6e98c633695f05190f66af644d9fc9295a52d9 Mon Sep 17 00:00:00 2001 From: amittai Date: Sun, 24 Feb 2013 18:10:19 -0800 Subject: Revert "let's be consistently case-insensitive with respect to the xml tags" This reverts commit 2eb0c5e11da9b00f8675688841ba118b91aa7471. --- scripts/ems/support/wrap-xml.perl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index 4ef6a1de6..e941aa95b 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -13,10 +13,10 @@ chomp(@OUT); while() { chomp; if (/^) { my $line = shift(@OUT); $line = "" if $line =~ /NO BEST TRANSLATION/; if (/<\/seg>/) { - s/(]+> *).*(<\/seg>)/$1$line$2/i; + s/(]+> *).*(<\/seg>)/$1$line$2/; } else { - s/(]+> *)[^<]*/$1$line/i; + s/(]+> *)[^<]*/$1$line/; } } print $_."\n"; -- cgit v1.2.3 From 1bba58b134a67e7fcb8d65f941a0570d4ed75a43 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 26 Feb 2013 11:01:01 +0000 Subject: eclipse project files --- contrib/other-builds/lm/.project | 20 -------------------- contrib/other-builds/moses-chart-cmd/.cproject | 10 ++++++++-- contrib/other-builds/moses-cmd/.cproject | 2 ++ 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/contrib/other-builds/lm/.project b/contrib/other-builds/lm/.project index e75388ac1..a1bde37c2 100644 --- a/contrib/other-builds/lm/.project +++ b/contrib/other-builds/lm/.project @@ -141,11 +141,6 @@ 1 
PARENT-3-PROJECT_LOC/lm/build_binary - - build_binary.cc - 1 - PARENT-3-PROJECT_LOC/lm/build_binary.cc - clean.sh 1 @@ -176,11 +171,6 @@ 1 PARENT-3-PROJECT_LOC/lm/facade.hh - - fragment.cc - 1 - PARENT-3-PROJECT_LOC/lm/fragment.cc - left.hh 1 @@ -211,11 +201,6 @@ 1 PARENT-3-PROJECT_LOC/lm/lm_exception.hh - - max_order.cc - 1 - PARENT-3-PROJECT_LOC/lm/max_order.cc - max_order.hh 1 @@ -241,11 +226,6 @@ 1 PARENT-3-PROJECT_LOC/lm/model_type.hh - - ngram_query.cc - 1 - PARENT-3-PROJECT_LOC/lm/ngram_query.cc - ngram_query.hh 1 diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject index fedda926b..7120f0b71 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -46,6 +46,7 @@ + terminalNodes.insert(hypergraphHypothesisID); + } + + hypergraphHypothesisID += 1; } } + + // Unique end node + endNode = hypergraphHypothesisID; + // mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID; + numNodes = endNode + 1; } - // Unique end node - nodes[numNodes] = numNodes; - numNodes += 1; long numArcs = searchGraph.size() + terminalNodes.size(); // Print number of nodes and arcs outputSearchGraphStream << numNodes << " " << numArcs << endl; - for (int nodeNumber=0; nodeNumber <= numNodes; nodeNumber+=1) { - - size_t count = nodeToLines.count(nodeNumber); + for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) { + // int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID]; + size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID); if (count > 0) { outputSearchGraphStream << count << endl; - pair::iterator, multimap::iterator> range = nodeToLines.equal_range(nodeNumber); + pair::iterator, multimap::iterator> range = + hypergraphIDToArcs.equal_range(hypergraphHypothesisID); for (multimap::iterator it=range.first; it!=range.second; ++it) { int lineNumber = (*it).second; const Hypothesis *thisHypo = 
searchGraph[lineNumber].hypo; + int mosesHypothesisID = thisHypo->GetId(); + // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID]; + UTIL_THROW_IF( + (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]), + util::Exception, + "Error while writing search lattice as hypergraph for sentence " << translationId << ". " << + "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID << + ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] << + ". There are " << numNodes << " nodes in the search lattice." + ); + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); if (prevHypo==NULL) { outputSearchGraphStream << " ||| " << endl; } else { - int startNode = nodes[prevHypo->GetId()]; + int startNode = mosesIDToHypergraphID[prevHypo->GetId()]; UTIL_THROW_IF( - (startNode >= nodeNumber), + (startNode >= hypergraphHypothesisID), util::Exception, - "Error while writing search lattice as hypergraph for sentence" << translationId << "." << + "Error while writing search lattice as hypergraph for sentence" << translationId << ". " << "The nodes must be output in topological order. The code attempted to violate this restriction." ); -- cgit v1.2.3 From f2536cddffe5dbb141387fea0a27da19e1da21e2 Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Wed, 6 Mar 2013 13:37:41 +0000 Subject: Added arrow based Moses training pipeline demonstration program to contrib. 
--- .gitmodules | 3 + .../training-pipeline/moses-pypeline.dia | Bin 0 -> 3532 bytes contrib/arrow-pipelines/python/README | 32 ++++ contrib/arrow-pipelines/python/libs/pypeline | 1 + contrib/arrow-pipelines/python/manager.py | 192 +++++++++++++++++++++ contrib/arrow-pipelines/python/test/__init__.py | 0 contrib/arrow-pipelines/python/test/test.py | 11 ++ .../arrow-pipelines/python/training/__init__.py | 0 .../python/training/components/__init__.py | 0 .../python/training/components/cleanup/__init__.py | 0 .../python/training/components/cleanup/cleanup.py | 125 ++++++++++++++ .../python/training/components/cleanup/cleanup3.py | 109 ++++++++++++ .../training/components/data_split/__init__.py | 0 .../training/components/data_split/data_split.py | 146 ++++++++++++++++ .../training/components/irstlm_build/__init__.py | 0 .../components/irstlm_build/irstlm_build.py | 106 ++++++++++++ .../python/training/components/mert/__init__.py | 0 .../python/training/components/mert/mert.py | 83 +++++++++ .../training/components/model_training/__init__.py | 0 .../components/model_training/model_training.py | 72 ++++++++ .../training/components/tokenizer/__init__.py | 0 .../training/components/tokenizer/src_tokenizer.py | 43 +++++ .../python/training/components/tokenizer/tmp.de | 3 + .../training/components/tokenizer/tokenizer.py | 36 ++++ .../training/components/tokenizer/trg_tokenizer.py | 43 +++++ 25 files changed, 1005 insertions(+) create mode 100644 contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia create mode 100644 contrib/arrow-pipelines/python/README create mode 160000 contrib/arrow-pipelines/python/libs/pypeline create mode 100644 contrib/arrow-pipelines/python/manager.py create mode 100644 contrib/arrow-pipelines/python/test/__init__.py create mode 100644 contrib/arrow-pipelines/python/test/test.py create mode 100644 contrib/arrow-pipelines/python/training/__init__.py create mode 100644 
contrib/arrow-pipelines/python/training/components/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py create mode 100644 contrib/arrow-pipelines/python/training/components/data_split/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/data_split/data_split.py create mode 100644 contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py create mode 100644 contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py create mode 100644 contrib/arrow-pipelines/python/training/components/mert/__init__.py create mode 100755 contrib/arrow-pipelines/python/training/components/mert/mert.py create mode 100644 contrib/arrow-pipelines/python/training/components/model_training/__init__.py create mode 100755 contrib/arrow-pipelines/python/training/components/model_training/model_training.py create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py create mode 100755 contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py create mode 100755 contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py diff --git a/.gitmodules b/.gitmodules index e69de29bb..d3a8cb4da 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "contrib/arrow-pipelines/python/libs/pypeline"] + path = contrib/arrow-pipelines/python/libs/pypeline + url = git://github.com/ianj-als/pypeline.git diff --git a/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia 
b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia new file mode 100644 index 000000000..1d35a1dea Binary files /dev/null and b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia differ diff --git a/contrib/arrow-pipelines/python/README b/contrib/arrow-pipelines/python/README new file mode 100644 index 000000000..e1e12975c --- /dev/null +++ b/contrib/arrow-pipelines/python/README @@ -0,0 +1,32 @@ +Arrow Based Moses Training Pipeline +=================================== + +To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command: + +$ git submodule init + +This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline: + +$ cd libs/pypeline +$ python setup.py install + +Alternatively, you can set an appropriate PYTHONPATH environment variable to the Pypeline library. + +This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia. + +Three environment variables need to be set before the manager.py script can be run, they are: + + - MOSES_HOME : The directory where Moses has been cloned, or installed, + - IRSTLM : The installation directory of your IRSTLM, and + - GIZA_HOME : The installation directory of GIZA++. + +The manager.py script takes four positional command-line arguments: + + - The source language code, + - The target language code, + - The source corpus file. This file *must* be cleaned prior to use, and + - The target corpus file. This file *must* be cleaned prior to use. 
+ +For example, run the manager.py script with: + +$ python manager.py en lt cleantrain.en cleantrain.lt diff --git a/contrib/arrow-pipelines/python/libs/pypeline b/contrib/arrow-pipelines/python/libs/pypeline new file mode 160000 index 000000000..a7084b686 --- /dev/null +++ b/contrib/arrow-pipelines/python/libs/pypeline @@ -0,0 +1 @@ +Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb diff --git a/contrib/arrow-pipelines/python/manager.py b/contrib/arrow-pipelines/python/manager.py new file mode 100644 index 000000000..1c3ece111 --- /dev/null +++ b/contrib/arrow-pipelines/python/manager.py @@ -0,0 +1,192 @@ +import logging +import os + +from concurrent.futures import Future, ThreadPoolExecutor +from functools import partial +from pypeline.helpers.parallel_helpers import eval_pipeline, \ + cons_function_component, \ + cons_wire, \ + cons_split_wire, \ + cons_unsplit_wire, \ + cons_dictionary_wire + + +# +# Some logging please +# +FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s' +logging.basicConfig(format = FORMAT, level = logging.DEBUG) +logger = logging.getLogger("manager") + + +# Build the pipeline components +def build_components(components, configuration, executor): + pipeline_components = dict() + pipeline_configuration = dict() + + for component_id, module_name in components.items(): + logger.info("Loading [%s] component from [%s]..." 
% (component_id, module_name)) + + module = __import__(module_name, fromlist = ['configure', 'initialise']) + + # Component builds its own configuration object + config_func = getattr(module, 'configure') + component_config = config_func(configuration) + pipeline_configuration.update(component_config) + + # Now build the component + init_func = getattr(module, 'initialise') + component_function = init_func(component_config) + + # A wrapper for the component's function that submits to the executor + def get_component_function_wrapper(inner_function, comp_id, mod_name): + def component_function_wrapper(a, s): + logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \ + (comp_id, mod_name, a, s)) + return inner_function(a, s) + + return component_function_wrapper + + # Arrowize the component + component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name)) + + # And store + pipeline_components[component_id] = component + + return pipeline_components, pipeline_configuration + + +# Go! +def main(src_lang, trg_lang, src_filename, trg_filename): + # Global configuration + # One day, this configuration shall be constructed from + # command line options, or a properties file. 
+ configuration = { + 'moses_installation_dir': os.environ['MOSES_HOME'], + 'irstlm_installation_dir': os.environ['IRSTLM'], + 'giza_installation_dir': os.environ['GIZA_HOME'], + 'src_lang': src_lang, + 'src_tokenisation_dir': './tokenisation', + 'trg_lang': trg_lang, + 'trg_tokenisation_dir': './tokenisation', + 'segment_length_limit': 60, + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': './language-model', + 'translation_model_directory': './translation-model', + 'mert_working_directory': './mert', + 'evaluation_data_size': 100, + 'development_data_size': 100 + } + + # The modules to load + # In the future, the components shall be specified in some kind + # pipeline description file. + component_modules = { + 'src_tokenizer': 'training.components.tokenizer.src_tokenizer', + 'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer', + 'cleanup': 'training.components.cleanup.cleanup', + 'data_split': 'training.components.data_split.data_split', + 'irstlm_build': 'training.components.irstlm_build.irstlm_build', + 'model_training': 'training.components.model_training.model_training', + 'mert': 'training.components.mert.mert' + } + + # The thread pool + executor = ThreadPoolExecutor(max_workers = 3) + + # Phew, build the required components + components, component_config = build_components(component_modules, configuration, executor) + + # + # Wire up components + # Description of wiring should be, in the future, alongside the component + # specification in some kind of confuguration file. Components shall be + # declared then used, i.e., bind a component instance to a unique component + # identifier, then wire component instances together by identifier. + # + + # + # Tokenisation of source and target... 
+ # + # IRSTLM Build components + irstlm_build_component = cons_split_wire() >> \ + (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \ + components['irstlm_build']).second() >> \ + cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'], + 'trg_language_model_filename': b['compiled_lm_filename']}) + + # The complete tokenisation component + tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \ + irstlm_build_component.second() >> \ + cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'], + 'trg_filename': b['tokenised_trg_filename'], + 'trg_language_model_filename': b['trg_language_model_filename']}) + + # + # Cleanup and Data Spliting... + # + + # + # A function that clips off the last '.' delimited string + # + def clip_last_bit(filename): + bn = os.path.basename(filename) + directory = os.path.dirname(filename) + bits = bn.split(".") + bits.pop() + return os.path.join(directory, ".".join(bits)) + + cleanup_datasplit_component = components['cleanup'] >> \ + cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'], + 'trg_filename': a['cleaned_trg_filename']}) >> \ + components['data_split'] >> \ + cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']), + 'eval_src_filename': a['eval_src_filename'], + 'eval_trg_filename': a['eval_trg_filename']}) + + # + # Translation model training + # + translation_model_component = cons_split_wire() >> \ + components['model_training'].first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': b['eval_src_filename']}) + + # + # The whole pipeline + # + pipeline = tokenisation_component >> \ + cons_split_wire() >> \ + (cleanup_datasplit_component >> translation_model_component).first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': 
clip_last_bit(t['development_data_filename']), + 'trg_language_model_filename': b['trg_language_model_filename'], + 'trg_language_model_order': 3, + 'trg_language_model_type': 9}) >> \ + components['mert'] + + + # + # The input to the pipeline + # + value = {'src_filename': src_filename, + 'trg_filename': trg_filename} + + # + # Evaluate the pipeline + # + logger.info("Evaluating pipeline with input [%s]..." % value) + new_value = eval_pipeline(executor, pipeline, value, component_config) + + # + # Wait for all components to finish + # + executor.shutdown(True) + + logger.info("Pipeline evaluated to %s" % new_value) + + +if __name__ == '__main__': + import sys + + main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) diff --git a/contrib/arrow-pipelines/python/test/__init__.py b/contrib/arrow-pipelines/python/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/test/test.py b/contrib/arrow-pipelines/python/test/test.py new file mode 100644 index 000000000..628796f7d --- /dev/null +++ b/contrib/arrow-pipelines/python/test/test.py @@ -0,0 +1,11 @@ +import subprocess + +def cat(filename, content): + fh = open(filename, "w") + for line in content: + #print(line, file=fh) + print >> fh, line + fh.close() + +def diff(filename1, filename2): + subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT) diff --git a/contrib/arrow-pipelines/python/training/__init__.py b/contrib/arrow-pipelines/python/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/__init__.py b/contrib/arrow-pipelines/python/training/components/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py new file mode 100644 index 000000000..cb2e057ce --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py @@ -0,0 +1,125 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print >>ofh1, l1, + print >>ofh2, l2, + + def _make_cleaned_filename(filename): + bits = filename.split(".") + bits[-1] = "clean" + return ".".join(bits) + + def _filter_main(value, config): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + print "Cleanup: Cleaning [%s] and [%s]..." 
% (input_src_filename, input_trg_filename) + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + + cleaned_src_filename = _make_cleaned_filename(input_src_filename) + cleaned_trg_filename = _make_cleaned_filename(input_trg_filename) + ofh1 = open(cleaned_src_filename, "w") + ofh2 = open(cleaned_trg_filename, "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': cleaned_src_filename, + 'cleaned_trg_filename': cleaned_trg_filename} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _filter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + try: + thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename']) + finally: + os.unlink(output['cleaned_src_filename']) + os.unlink(output['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + 
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py new file mode 100644 index 000000000..27625c612 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py @@ -0,0 +1,109 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print(l1, end='', file=ofh1) + print(l2, end='', file=ofh2) + + def _filter_main(config, value): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + ifh1 = open(value['src_filename'], "r") + ifh2 = open(value['trg_filename'], "r") + ofh1 = open(value['cleaned_src_filename'], "w") + ofh2 = open(value['cleaned_trg_filename'], "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': value['cleaned_src_filename'], + 'cleaned_trg_filename': value['cleaned_trg_filename']} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return cons_function_component(_filter_main) + + +if __name__ == '__main__': + import os + import tempfile + import 
training.components.shared.test as thelp + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_filename': src_filename[1] + ".clean", + 'cleaned_trg_filename': trg_filename[1] + ".clean", + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + + run_pipeline(box, box_config, box_eval) + thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/data_split/__init__.py b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/data_split/data_split.py 
b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py new file mode 100644 index 000000000..b8469cbf6 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py @@ -0,0 +1,146 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['evaluate_size'] = args['evaluation_data_size'] + result['development_size'] = args['development_data_size'] + return result + +def initialise(config): + + def _copy(size, inp, ofh1, ofh2): + try: + while size != 0: + (l1, l2) = inp.next() + print >>ofh1, l1, + print >>ofh2, l2, + size -= 1 + except StopIteration: + pass + + def _make_split_filename(filename, data_set): + bits = filename.split(".") + last = bits.pop() + lang_code = bits.pop() + + bits.append(last) + bits.append(data_set) + bits.append(lang_code) + + new_filename = ".".join(bits) + return new_filename + + def _splitter_main(value, config): + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + inp = iter(zip(ifh1, ifh2)) + + result = {} + for (data_set, size) in [ + ('devel', config['development_size']), + ('eval', config['evaluate_size']), + ('train', -1) + ]: + output_src_filename = _make_split_filename(input_src_filename, data_set) + output_trg_filename = _make_split_filename(input_trg_filename, data_set) + ofh1 = open(output_src_filename, "w") + ofh2 = open(output_trg_filename, "w") + + _copy(size, inp, ofh1, ofh2) + result[data_set + '_src_filename'] = output_src_filename + result[data_set + '_trg_filename'] = output_trg_filename + + return result + + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _splitter_main + + +if __name__ == '__main__': + import os + import 
tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = { + 'evaluation_data_size': 7, + 'development_data_size': 13, + } + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'devel_src_expected': src_filename[1] + ".devel.expected", + 'devel_trg_expected': trg_filename[1] + ".devel.expected", + 'eval_src_expected': src_filename[1] + ".eval.expected", + 'eval_trg_expected': trg_filename[1] + ".eval.expected", + 'train_src_expected': src_filename[1] + ".train.expected", + 'train_trg_expected': trg_filename[1] + ".train.expected", + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + for data_set in ['devel', 'eval', 'train']: + for lang in ['src', 'trg']: + filename = output[data_set + '_' + lang + '_filename'] + filename_expected = box_eval[data_set + '_' + lang + '_expected'] + thelp.diff(filename_expected, filename) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line(range(50))) + thelp.cat(box_eval['trg_filename'], _line(range(50))) + #expected output: + thelp.cat(box_eval['devel_src_expected'], _line(range(0,13))) + thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13))) + thelp.cat(box_eval['eval_src_expected'], _line(range(13,20))) + thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20))) + thelp.cat(box_eval['train_src_expected'], _line(range(20,50))) + thelp.cat(box_eval['train_trg_expected'], 
_line(range(20,50))) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py new file mode 100644 index 000000000..f65d61973 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py @@ -0,0 +1,106 @@ +import os +import shutil +import subprocess +import tempfile + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + config = dict() + config['irstlm_install_directory'] = args['irstlm_installation_dir'] + config['smoothing_method'] = args['irstlm_smoothing_method'] + config['lm_directory'] = args['language_model_directory'] + return config + +def initialise(config): + def process(a, s): + # Create the LM directory if we need to + if os.path.exists(s['lm_directory']) is False: + os.makedirs(s['lm_directory']) + + # The filename of the file to chew through + start_end_input_filename = a['input_filename'] + if os.path.exists(start_end_input_filename) is False: + raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename) + + # Derive the output file name for the add start-end marker processor + filename_bits = os.path.basename(start_end_input_filename).split(".") + filename_bits[2] = "sb"; + start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the output file name of the LM build + filename_bits[2] = "lm" + lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the compiled LM file name + filename_bits[2] = "arpa" + 
compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # First thing to do is add start and end markers + start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")] + infile = open(start_end_input_filename, 'r') + outfile = open(start_end_output_filename, 'w') + print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline) + return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile) + if return_code: + raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \ + start_end_input_filename, start_end_output_filename, return_code) + + # Next build the language model + tmp_dir = tempfile.mkdtemp(dir = "/tmp") + try: + build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"), + "-i", start_end_output_filename, + "-t", tmp_dir, + "-p", + "-s", s['smoothing_method'], + "-o", lm_filename] + print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline) + return_code = subprocess.check_call(build_lm_cmdline) + if return_code: + raise Exception("IRST language model failed to build: return code = [%d]" % return_code) + finally: + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + # Compile the LM + lm_filename = lm_filename + ".gz" + compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"), + "--text", "yes", + lm_filename, + compiled_lm_filename] + print "IRSTLM Build: Invoking [%s]..." 
% " ".join(compile_lm_cmdline) + return_code = subprocess.check_call(compile_lm_cmdline) + if return_code: + raise Exception("IRST language model compilation failed: return code = [%d]" % return_code) + + output = {'add_start_end_filename': start_end_output_filename, + 'lm_filename': lm_filename, + 'compiled_lm_filename': compiled_lm_filename} + + print "IRSTLM Build: Output = %s" % output + + return output + + return process + + +if __name__ == '__main__': + from pypeline.helpers.helpers import eval_pipeline + + lm_dir = os.environ["PWD"] + configuration = {'irstlm_root': os.environ["IRSTLM"], + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': lm_dir} + component_config = configure(configuration) + component = initialise(component_config) + + value = eval_pipeline(component, + {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'}, + component_config) + target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'), + 'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'), + 'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')} + print "Target: %s" % target + if value != target: + raise Exception("Massive fail!") diff --git a/contrib/arrow-pipelines/python/training/components/mert/__init__.py b/contrib/arrow-pipelines/python/training/components/mert/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/mert/mert.py b/contrib/arrow-pipelines/python/training/components/mert/mert.py new file mode 100755 index 000000000..2b60b1720 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/mert/mert.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + 
result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['mert_working_dir'] = args['mert_working_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['development_data_filename']) + lm_file = os.path.abspath(a['trg_language_model_filename']) + lm_order = int(a['trg_language_model_order']) + lm_type = int(a['trg_language_model_type']) + orig_moses_ini = os.path.abspath(a['moses_ini_file']) + + if not os.path.exists(orig_moses_ini): + raise Exception, "Error: Input moses.ini does not exist" + + workdir = os.path.abspath(config['mert_working_dir']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + moses_install_dir = os.path.abspath(config['moses_installation_dir']) + mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl') + bin_dir = os.path.join(moses_install_dir, 'bin') + moses_bin = os.path.join(moses_install_dir, 'bin', 'moses') + src_file = infilename + '.' + config['src_lang'] + ref_file = infilename + '.' 
+ config['trg_lang'] + logfile = os.path.join(workdir, 'log') + #change lm configuration in moses ini + moses_ini = os.path.join(workdir, 'trained-moses.ini') + cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s" + cmd = cmd % locals() + os.system(cmd) + + #the command + cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s' + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + new_mosesini = os.path.join(workdir, 'moses.ini') + if not os.path.exists(new_mosesini): + raise Exception, 'Failed MERT' + + return {'moses_ini_file':new_mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.path.abspath('../../../../'), + 'mert_working_dir':'../../../../../tuning'} + values = {'development_data_filename':'../../../../../corpus/tune', + 'moses_ini_file':'../../../../../model/model/moses.ini', + 'trg_language_model_filename':'../../../../../corpus/train.lt.lm', + 'trg_language_model_type':9, + 'trg_language_model_order':4} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/model_training/__init__.py b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/model_training/model_training.py b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py new file mode 100755 index 000000000..e990307d2 --- /dev/null +++ 
b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['external_bin_dir'] = args['giza_installation_dir'] + result['model_directory'] = args['translation_model_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['training_data_filename']) + workdir = os.path.abspath(config['model_directory']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl' + src_lang = config['src_lang'].lower() + trg_lang = config['trg_lang'].lower() + external_bin = os.path.abspath(config['external_bin_dir']) + #create a dummy lm file + dummy_lmfile = workdir + os.sep + 'dummy.lm' + f = open(dummy_lmfile, 'w') + print >> f, "dummy lm file" + f.close() + logfile = workdir + os.sep + 'log' + + #the command + cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s' + + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini' + if not os.path.exists(mosesini): + raise Exception, 'Failed training model' + + return {'moses_ini_file':mosesini} + + return process + +if 
__name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.environ['MOSES_HOME'], + 'giza_installation_dir':os.environ['GIZA_HOME'], + 'translation_model_directory':'model-dir'} + values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py new file mode 100755 index 000000000..57f8771df --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['src_tokenisation_dir'] = args['src_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['src_filename'] + outfilename = Tokenizer.batch_tokenise( + config['src_lang'], + config['moses_installation_dir'], + infilename, + config['src_tokenisation_dir']) + return {'tokenised_src_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'de', + 'src_tokenisation_dir':'tmptok', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'src_filename':'tmp.de'} + from pypeline.helpers.helpers import 
run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de new file mode 100644 index 000000000..c6b41edbe --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de @@ -0,0 +1,3 @@ +asdfweoih +awfwoeijf awefo +what's this diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py new file mode 100644 index 000000000..354ec1abc --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import sys, os, subprocess + +class Tokenizer: + + @staticmethod + def batch_tokenise(lang, mosesdir, infilename, workdir): + print "Tokenizing [%s] in working directory [%s]..." 
% (infilename, workdir) + if not os.path.exists(workdir): + os.makedirs(workdir) + tok = Tokenizer(lang, mosesdir) + basefilename = os.path.basename(infilename) + outfilename = workdir + os.sep + basefilename + '.tok' + tok.file_tokenise(infilename, outfilename) + return outfilename + + def __init__(self, lang, mosesdir): + self.arrows = None + self.lang = lang + #check the perl tokenizer is here + #path = os.path.dirname(os.path.abspath(__file__)) + path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer' + self.perltok = path + os.sep + 'tokenizer.perl' + if not os.path.exists(path): + raise Exception, "Perl tokenizer does not exists" + + def file_tokenise(self, infilename, outfilename): + cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename) + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + +if __name__ == '__main__': + #do some test + pass + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py new file mode 100755 index 000000000..3852e296f --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['trg_lang'] = args['trg_lang'] + result['trg_tokenisation_dir'] = args['trg_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['trg_filename'] + outfilename = Tokenizer.batch_tokenise( + config['trg_lang'], + config['moses_installation_dir'], + infilename, + config['trg_tokenisation_dir']) + return {'tokenised_trg_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + 
configuration = {'trg_lang':'de', + 'trg_tokenisation_dir':'tmptoktrg', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'trg_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + -- cgit v1.2.3 From 3e73ac29b2c54ac82f7553b4a17fd42ef73e84f1 Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Wed, 6 Mar 2013 14:53:45 +0000 Subject: Added the RPM installer builder to contrib. --- contrib/rpm/README | 42 ++++++++++++++++++++++ contrib/rpm/build_source.sh | 63 +++++++++++++++++++++++++++++++++ contrib/rpm/rpmbuild/SPECS/moses.spec | 65 +++++++++++++++++++++++++++++++++++ 3 files changed, 170 insertions(+) create mode 100644 contrib/rpm/README create mode 100755 contrib/rpm/build_source.sh create mode 100644 contrib/rpm/rpmbuild/SPECS/moses.spec diff --git a/contrib/rpm/README b/contrib/rpm/README new file mode 100644 index 000000000..8ba7ef4da --- /dev/null +++ b/contrib/rpm/README @@ -0,0 +1,42 @@ +Building Moses RPM +================== + +*** WARNING *** +Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer. +*** WARNING *** + + +Building the RPM SPEC file +-------------------------- + +The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information: + + - The Git repository from which an installer will be built, + - The branch in the Git repository to build, and + - The version of the installed Moses distribution. 
+ +For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git): + +$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0 + +This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS. + + +Building the RPM +---------------- + +Change directory to $HOME/rpmbuild, and build the binary RPM with: + +$ rpmbuild -bb SPECS/moses.spec + +This will download IRSTLM v5.70.04 and GIZA++ v2, then build them along with Moses and make the RPM in the directory $HOME/rpmbuild/RPMS/<architecture>/moses-<version>-1.<architecture>.rpm. + +For example building on a 64 bit Intel architecture, and building v1.0 the RPM would be called moses-1.0-1.x86_64.rpm. + + +Building a Debian package +------------------------- + +The Alien tool converts RPM packages to Debian packages. If a Debian package is required then follow the instructions on the following web-page: + +https://help.ubuntu.com/community/RPM/AlienHowto diff --git a/contrib/rpm/build_source.sh b/contrib/rpm/build_source.sh new file mode 100755 index 000000000..d0fac6a33 --- /dev/null +++ b/contrib/rpm/build_source.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +BRANCH="master" +declare -i NO_RPM_BUILD=0 +declare -r RPM_VERSION_TAG="___RPM_VERSION__" + +function usage() { + echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]" + exit 1 +} + +if [ $# -lt 4 ]; then + usage +fi + +while getopts r:b:v:nh OPTION +do + case "$OPTION" in + r) REPO="${OPTARG}";; + b) BRANCH="${OPTARG}";; + v) VERSION="${OPTARG}";; + n) NO_RPM_BUILD=1;; + [h\?]) usage;; + esac +done + +if [ ! -d ./rpmbuild ]; then + echo "RPM build directory not in current working direcotry" + exit 1 +fi + +declare -r MOSES_DIR="moses-${VERSION}" +git clone ${REPO} ${MOSES_DIR} +if [ $? -ne 0 ]; then + echo "Failed to clone Git repository ${REPO}" + exit 3 +fi + +cd ${MOSES_DIR} + +git checkout ${BRANCH} +if [ $? 
-ne 0 ]; then + echo "Failed to checkout branch ${BRANCH}" + exit 3 +fi + +cd .. + +tar -cf moses-${VERSION}.tar ${MOSES_DIR} +gzip -f9 moses-${VERSION}.tar + +if [ ${NO_RPM_BUILD} -eq 0 ]; then + if [ ! -d ${HOME}/rpmbuild/SPECS ]; then + mkdir -p ${HOME}/rpmbuild/SPECS + fi + eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec + if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then + mkdir -p ${HOME}/rpmbuild/SOURCES + fi + mv moses-${VERSION}.tar.gz ${HOME}/rpmbuild/SOURCES +fi + +rm -Rf ${MOSES_DIR} diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec new file mode 100644 index 000000000..1ae8082ef --- /dev/null +++ b/contrib/rpm/rpmbuild/SPECS/moses.spec @@ -0,0 +1,65 @@ +Name: moses +Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. +Version: ___RPM_VERSION__ +Release: 1 +URL: http://www.statmt.org/moses/ +Source0: %{name}-%{version}.tar.gz +License: LGPL +Group: Development/Tools +Vendor: Capita Translation and Interpreting +Packager: Ian Johnson +Requires: boost >= 1.48, python >= 2.6, perl >= 5 +BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release} +%description +Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices. 
+%prep +%setup -q + +mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v2 + +wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz +wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz + +cd $RPM_BUILD_DIR + +tar -zxf irstlm-5.70.04.tgz +tar -zxf giza-pp-v1.0.7.tgz + +cd irstlm-5.70.04 +bash regenerate-makefiles.sh --force +./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 +make +make install + +cd ../giza-pp +make +cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v2 +%build +./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v2 -j2 +%install +mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts +cp -R bin $RPM_BUILD_ROOT/opt/moses +cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts +%clean +%files +%defattr(-,root,root) +/opt/moses/bin/* +/opt/moses/scripts/analysis/* +/opt/moses/scripts/ems/* +/opt/moses/scripts/generic/* +/opt/moses/scripts/other/* +/opt/moses/scripts/recaser/* +/opt/moses/scripts/regression-testing/* +/opt/moses/scripts/share/* +/opt/moses/scripts/tokenizer/* +/opt/moses/scripts/training/* +/opt/moses/irstlm-5.70.04/* +/opt/moses/giza++-v2/* -- cgit v1.2.3 From fe5e737589b94c5a82fa3a075a5fc1372f9bab3f Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Thu, 7 Mar 2013 12:35:29 -0500 Subject: Subtract prior scores when outputting 
phrase-based hypergraph arcs. --- moses/Manager.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/moses/Manager.cpp b/moses/Manager.cpp index f8f2402a6..4c4fa267e 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -801,8 +801,11 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const { - const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); - + ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown(); + const Hypothesis *prevHypo = hypo->GetPrevHypo(); + if (prevHypo) { + scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() ); + } vector featureValues = scoreCollection.GetScoresForProducer(ff); size_t numScoreComps = featureValues.size(); -- cgit v1.2.3 From b968eae6291af3c6e2f4893a50dd643ebd297768 Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Thu, 7 Mar 2013 15:13:08 -0500 Subject: Take hypothesis recombination into account when outputting phrase-based lattice as hypergraph --- moses/Manager.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 4c4fa267e..011187cda 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -848,11 +848,14 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou } } - // Record that this arc ends at this node - hypergraphIDToArcs.insert(pair(hypergraphHypothesisID,arcNumber)); - // Get an id number for this hypothesis - int mosesHypothesisID = searchGraph[arcNumber].hypo->GetId(); + int mosesHypothesisID; + if (searchGraph[arcNumber].recombinationHypo) { + mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId(); + } else { + mosesHypothesisID = searchGraph[arcNumber].hypo->GetId(); + } + if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) { 
mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID; @@ -866,6 +869,10 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou hypergraphHypothesisID += 1; } + + // Record that this arc ends at this node + hypergraphIDToArcs.insert(pair(mosesIDToHypergraphID[mosesHypothesisID],arcNumber)); + } // Unique end node @@ -892,7 +899,12 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou for (multimap::iterator it=range.first; it!=range.second; ++it) { int lineNumber = (*it).second; const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; - int mosesHypothesisID = thisHypo->GetId(); + int mosesHypothesisID;// = thisHypo->GetId(); + if (searchGraph[lineNumber].recombinationHypo) { + mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId(); + } else { + mosesHypothesisID = searchGraph[lineNumber].hypo->GetId(); + } // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID]; UTIL_THROW_IF( (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]), -- cgit v1.2.3 From 5f1be3217b5ec3b69fb10b098e212e940c0b855c Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 7 Mar 2013 21:40:43 +0000 Subject: bugifx format of extract file for instance weighting --- phrase-extract/extract-main.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 92c8a470e..cab91e92d 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) { if (m_options.isOrientationFlag()) outextractstrOrientation << orientationInfo; + if (m_options.isIncludeSentenceIdFlag()) { + outextractstr << " ||| " << sentence.sentenceID; + } + if (m_options.getInstanceWeightsFile().length()) { if (m_options.isTranslationFlag()) { outextractstr << " ||| " << sentence.weightString; @@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) { } 
} - if (m_options.isIncludeSentenceIdFlag()) { - outextractstr << " ||| " << sentence.sentenceID; - } if (m_options.isTranslationFlag()) outextractstr << "\n"; if (m_options.isTranslationFlag()) outextractstrInv << "\n"; -- cgit v1.2.3 From 13b6973ceeea4a1327d162c28093dd6cd4e9497c Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Mon, 11 Mar 2013 13:04:10 +0000 Subject: Fixed GIZA++ installation directory name. --- contrib/rpm/rpmbuild/SPECS/moses.spec | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec index 1ae8082ef..0f4a6c6ec 100644 --- a/contrib/rpm/rpmbuild/SPECS/moses.spec +++ b/contrib/rpm/rpmbuild/SPECS/moses.spec @@ -15,7 +15,7 @@ Moses is a statistical machine translation system that allows you to automatical %prep %setup -q -mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v2 +mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz @@ -33,9 +33,9 @@ make install cd ../giza-pp make -cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v2 +cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 %build -./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v2 -j2 +./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2 %install mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts cp -R bin $RPM_BUILD_ROOT/opt/moses @@ -62,4 +62,4 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts /opt/moses/scripts/tokenizer/* /opt/moses/scripts/training/* 
/opt/moses/irstlm-5.70.04/* -/opt/moses/giza++-v2/* +/opt/moses/giza++-v1.0.7/* -- cgit v1.2.3 From 7b1c062cdcd04b48ce3fface6a08511d05558f41 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 12 Mar 2013 02:03:44 -0400 Subject: Update Boost install link --- BUILD-INSTRUCTIONS.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt index 318956ccd..3dac64f60 100644 --- a/BUILD-INSTRUCTIONS.txt +++ b/BUILD-INSTRUCTIONS.txt @@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES Generally, for trouble installing external libraries, you should get support directly from the library maker: -Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html +Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user -- cgit v1.2.3 From 21c51194fab91f8e79409545db98642cbac9b505 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 13 Mar 2013 12:12:33 +0000 Subject: add -print-alignment-info --- moses-cmd/IOWrapper.cpp | 13 +++++++++++++ moses-cmd/IOWrapper.h | 2 +- moses-cmd/Main.cpp | 9 +++++++-- moses/Parameter.cpp | 1 + moses/StaticData.cpp | 14 ++++++++++---- moses/StaticData.h | 4 ++++ 6 files changed, 36 insertions(+), 7 deletions(-) diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp index f7fed9998..2da30f380 100644 --- a/moses-cmd/IOWrapper.cpp +++ b/moses-cmd/IOWrapper.cpp @@ -271,6 +271,19 @@ void OutputAlignment(ostream &out, const vector &edges) out << std::endl; } +void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo) +{ + std::vector edges; + const Hypothesis *currentHypo = hypo; + while (currentHypo) { + edges.push_back(currentHypo); + currentHypo = currentHypo->GetPrevHypo(); + } + + OutputAlignment(out, edges); + +} + void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector &edges) 
{ ostringstream out; diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h index 5decaa122..267a3a0bc 100644 --- a/moses-cmd/IOWrapper.h +++ b/moses-cmd/IOWrapper.h @@ -139,7 +139,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo); void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo); void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path); - +void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo); } diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 5a33c214c..624c31994 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -197,7 +197,7 @@ public: // MAP decoding: best hypothesis const Hypothesis* bestHypo = NULL; if (!staticData.UseMBR()) - { + { bestHypo = manager.GetBestHypothesis(); if (bestHypo) { if (staticData.IsPathRecoveryEnabled()) { @@ -214,13 +214,18 @@ public: staticData.GetOutputFactorOrder(), staticData.GetReportSegmentation(), staticData.GetReportAllFactors()); + if (staticData.PrintAlignmentInfo()) { + out << "||| "; + OutputAlignment(out, bestHypo); + } + OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo); IFVERBOSE(1) { debug << "BEST TRANSLATION: " << *bestHypo << endl; } } out << endl; - } + } // MBR decoding (n-best MBR, lattice MBR, consensus) else diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 359174280..356cf219b 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -179,6 +179,7 @@ Parameter::Parameter() AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory"); AddParam("minphr-memory", "Load phrase table in minphr format into memory"); + AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. 
Default is false"); AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false"); AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false"); AddParam("alignment-output-file", "print output word alignments into given file"); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index cf797582b..9c27d9634 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -162,10 +162,6 @@ bool StaticData::LoadData(Parameter *parameter) } } - if(m_parameter->GetParam("sort-word-alignment").size()) { - m_wordAlignmentSort = (WordAlignmentSort) Scan(m_parameter->GetParam("sort-word-alignment")[0]); - } - // factor delimiter if (m_parameter->GetParam("factor-delimiter").size() > 0) { m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0]; @@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter) SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false ); //word-to-word alignment + // alignments + SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false ); + if (m_PrintAlignmentInfo) { + m_needAlignmentInfo = true; + } + + if(m_parameter->GetParam("sort-word-alignment").size()) { + m_wordAlignmentSort = (WordAlignmentSort) Scan(m_parameter->GetParam("sort-word-alignment")[0]); + } + SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false ); if (m_PrintAlignmentInfoNbest) { m_needAlignmentInfo = true; diff --git a/moses/StaticData.h b/moses/StaticData.h index ce93a5629..20d36e4b8 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -171,6 +171,7 @@ protected: bool m_reportAllFactorsNBest; std::string m_detailedTranslationReportingFilePath; bool m_onlyDistinctNBest; + bool m_PrintAlignmentInfo; bool m_needAlignmentInfo; bool m_PrintAlignmentInfoNbest; @@ -730,6 +731,9 @@ public: const std::string 
&GetAlignmentOutputFile() const { return m_alignmentOutputFile; } + bool PrintAlignmentInfo() const { + return m_PrintAlignmentInfo; + } bool PrintAlignmentInfoInNbest() const { return m_PrintAlignmentInfoNbest; } -- cgit v1.2.3 From 5ba153806b253c8ab4768882d34ac61c32f25b62 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Wed, 13 Mar 2013 17:52:24 +0000 Subject: =?UTF-8?q?fixed=20kneserNey=20phrase=20probability=20smoothing=20?= =?UTF-8?q?bug=20reported=20by=20=09=20=C4=8Ceslav=20Przywara=20?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- phrase-extract/consolidate-main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index 70de9678b..fd33907de 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC if (kneserNeyFlag) { float D = kneserNey_D3; if (countEF < 2) D = kneserNey_D1; - if (countEF < 3) D = kneserNey_D2; + else if (countEF < 3) D = kneserNey_D2; if (D > countEF) D = countEF - 0.01; // sanity constraint float p_b_E = n1_E / totalCount; // target phrase prob based on distinct -- cgit v1.2.3 From 3a7f4f776a34049e2dd57622451694068c604bad Mon Sep 17 00:00:00 2001 From: phikoehn Date: Wed, 13 Mar 2013 17:54:29 +0000 Subject: minor --- scripts/generic/compound-splitter.perl | 43 ++++++++++++++++++---------------- scripts/generic/mteval-v13a.pl | 2 +- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index 8f82ab8d9..beca70eb0 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -16,15 +16,15 @@ $HELP = 1 unless &GetOptions('corpus=s' => \$CORPUS, 'model=s' => \$MODEL, 'filler=s' => \$FILLER, - 'factored' => \$FACTORED, + 'factored' => \$FACTORED, 
'min-size=i' => \$MIN_SIZE, 'min-count=i' => \$MIN_COUNT, 'max-count=i' => \$MAX_COUNT, 'help' => \$HELP, 'verbose' => \$VERBOSE, - 'syntax' => \$SYNTAX, - 'binarize' => \$BINARIZE, - 'mark-split' => \$MARK_SPLIT, + 'syntax' => \$SYNTAX, + 'binarize' => \$BINARIZE, + 'mark-split' => \$MARK_SPLIT, 'train' => \$TRAIN); if ($HELP || @@ -155,34 +155,37 @@ sub apply { next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; $COUNT{$lc} = $count; $TRUECASE{$lc} = $factored_word; - $LABEL{$lc} = $label if $SYNTAX; + $LABEL{$lc} = $label if $SYNTAX; } close(MODEL); while() { my $first = 1; chop; s/\s+/ /g; s/^ //; s/ $//; - my @BUFFER; # for xml tags + my @BUFFER; # for xml tags foreach my $factored_word (split) { print " " unless $first; $first = 0; - # syntax: don't split xml - if ($SYNTAX && ($factored_word =~ /^$/)) { - push @BUFFER,$factored_word; - $first = 1; - next; - } - - # get case class - my $word = $factored_word; - $word =~ s/\|.+//g; # just first factor - my $lc = lc($word); - + # syntax: don't split xml + if ($SYNTAX && ($factored_word =~ /^$/)) { + push @BUFFER,$factored_word; + $first = 1; + next; + } + + # get case class + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + + print STDERR "considering $word ($lc)...\n" if $VERBOSE; # don't split frequent words - if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) { - print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) || + $lc !~ /[a-zA-Z]/) {; # has to have at least one letter + print join(" ",@BUFFER)." 
" if scalar(@BUFFER); @BUFFER = (); # clear buffer print $factored_word; + print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE; next; } diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl index 879212e6e..f1f8f9ef6 100755 --- a/scripts/generic/mteval-v13a.pl +++ b/scripts/generic/mteval-v13a.pl @@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span sub extract_sgml_tag_attribute { my ($name, $data) = @_; - ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : (); + ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : (); } ################################# -- cgit v1.2.3 From 946fbc45e1b0a4e7fda9377e8c3a847d5cce99a0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 14 Mar 2013 09:58:33 +0000 Subject: testing commit emails function --- NOTICE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NOTICE b/NOTICE index 7d631cd88..23d8b2ad1 100644 --- a/NOTICE +++ b/NOTICE @@ -1,3 +1,5 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations). This code includes data from czech wiktionary (also czech abbreviations). + + -- cgit v1.2.3 From 08330adfe6a69d107bf67c92742034fcaf700fc9 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 14 Mar 2013 10:44:00 +0000 Subject: testing commit emails function. 2 --- NOTICE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NOTICE b/NOTICE index 23d8b2ad1..cea4ab1da 100644 --- a/NOTICE +++ b/NOTICE @@ -3,3 +3,5 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations) This code includes data from czech wiktionary (also czech abbreviations). + + -- cgit v1.2.3 From 763f6b84a74f3d56f418edb810a2f847a776bbcc Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 14 Mar 2013 17:48:43 +0000 Subject: testing commit emails function. 
3 --- NOTICE | 2 -- 1 file changed, 2 deletions(-) diff --git a/NOTICE b/NOTICE index cea4ab1da..23d8b2ad1 100644 --- a/NOTICE +++ b/NOTICE @@ -3,5 +3,3 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations) This code includes data from czech wiktionary (also czech abbreviations). - - -- cgit v1.2.3 From a3d50bab1a2f65eb07b021246d2773418c5c9a07 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 14 Mar 2013 17:50:58 +0000 Subject: testing commit emails function. 4 --- NOTICE | 1 + 1 file changed, 1 insertion(+) diff --git a/NOTICE b/NOTICE index 23d8b2ad1..c2ff560fc 100644 --- a/NOTICE +++ b/NOTICE @@ -3,3 +3,4 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations) This code includes data from czech wiktionary (also czech abbreviations). + -- cgit v1.2.3 From df3e379d2bc5af4dce66fd2b894b3ad23cda2895 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 14 Mar 2013 17:58:57 +0000 Subject: testing commit emails function. 5 --- NOTICE | 1 + 1 file changed, 1 insertion(+) diff --git a/NOTICE b/NOTICE index c2ff560fc..cea4ab1da 100644 --- a/NOTICE +++ b/NOTICE @@ -4,3 +4,4 @@ This code includes data from czech wiktionary (also czech abbreviations). + -- cgit v1.2.3 From b639f0b2101021b86ec8d6f8dd01921f30b073fe Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 14 Mar 2013 18:02:38 +0000 Subject: testing commit emails function. 6 --- NOTICE | 3 --- 1 file changed, 3 deletions(-) diff --git a/NOTICE b/NOTICE index cea4ab1da..b0dcba070 100644 --- a/NOTICE +++ b/NOTICE @@ -2,6 +2,3 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations) This code includes data from czech wiktionary (also czech abbreviations). - - - -- cgit v1.2.3 From d931006dbc1e7e86c39f225af2e795adc1b3d342 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 14 Mar 2013 18:04:11 +0000 Subject: testing commit emails function. 
7 --- NOTICE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NOTICE b/NOTICE index b0dcba070..c2ff560fc 100644 --- a/NOTICE +++ b/NOTICE @@ -2,3 +2,5 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations) This code includes data from czech wiktionary (also czech abbreviations). + + -- cgit v1.2.3 From d7806b9351762342ea3e6d550c52a8f802995b62 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 14 Mar 2013 18:08:15 +0000 Subject: testing commit emails function. 8 --- NOTICE | 2 -- 1 file changed, 2 deletions(-) diff --git a/NOTICE b/NOTICE index c2ff560fc..b0dcba070 100644 --- a/NOTICE +++ b/NOTICE @@ -2,5 +2,3 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations) This code includes data from czech wiktionary (also czech abbreviations). - - -- cgit v1.2.3 From e60c4f74ef0e9d1a16af4a0df398962694750d5b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 15 Mar 2013 10:11:36 +0000 Subject: test commit. One last time on sourceforge --- NOTICE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NOTICE b/NOTICE index b0dcba070..c2ff560fc 100644 --- a/NOTICE +++ b/NOTICE @@ -2,3 +2,5 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations) This code includes data from czech wiktionary (also czech abbreviations). + + -- cgit v1.2.3 From 802f5ab3b78cfb12cff05753a3677a0f998c24c0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 15 Mar 2013 11:19:15 +0000 Subject: testing commit emails function. 9 --- NOTICE | 1 - 1 file changed, 1 deletion(-) diff --git a/NOTICE b/NOTICE index c2ff560fc..23d8b2ad1 100644 --- a/NOTICE +++ b/NOTICE @@ -3,4 +3,3 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations) This code includes data from czech wiktionary (also czech abbreviations). 
- -- cgit v1.2.3 From 3da09b921c8278f5a808f6e951e329d04ab186f7 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 15 Mar 2013 12:30:39 +0000 Subject: memory leak --- moses-cmd/Main.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 624c31994..9f610204f 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -495,20 +495,20 @@ int main(int argc, char** argv) // load all the settings into the Parameter class // (stores them as strings, or array of strings) - Parameter* params = new Parameter(); - if (!params->LoadParam(argc,argv)) { + Parameter params; + if (!params.LoadParam(argc,argv)) { exit(1); } // initialize all "global" variables, which are stored in StaticData // note: this also loads models such as the language model, etc. - if (!StaticData::LoadDataStatic(params, argv[0])) { + if (!StaticData::LoadDataStatic(¶ms, argv[0])) { exit(1); } // setting "-show-weights" -> just dump out weights and exit - if (params->isParamSpecified("show-weights")) { + if (params.isParamSpecified("show-weights")) { ShowWeights(); exit(0); } -- cgit v1.2.3 From 974bdd979b4b79b4eb252665fd253a0e335e371f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 15 Mar 2013 14:19:36 +0000 Subject: memory leak --- moses/LM/SingleFactor.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp index 3418aefe2..c061d0fed 100644 --- a/moses/LM/SingleFactor.cpp +++ b/moses/LM/SingleFactor.cpp @@ -36,8 +36,9 @@ using namespace std; namespace Moses { -LanguageModelSingleFactor::~LanguageModelSingleFactor() {} - +LanguageModelSingleFactor::~LanguageModelSingleFactor() +{ +} struct PointerState : public FFState { const void* lmstate; @@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState() m_beginSentenceState = new PointerState(NULL); } -LanguageModelPointerState::~LanguageModelPointerState() {} 
+LanguageModelPointerState::~LanguageModelPointerState() +{ + delete m_nullContextState; + delete m_beginSentenceState; +} const FFState *LanguageModelPointerState::GetNullContextState() const { -- cgit v1.2.3 From 2d252d2dd0c97e1d1def55b9d421db4122d762f2 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 15 Mar 2013 16:11:15 +0000 Subject: memory leak --- moses-cmd/Main.cpp | 2 ++ moses/AlignmentInfoCollection.cpp | 3 +++ moses/AlignmentInfoCollection.h | 1 + 3 files changed, 6 insertions(+) diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 9f610204f..117cac3f9 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -665,6 +665,8 @@ int main(int argc, char** argv) pool.Stop(true); //flush remaining jobs #endif + delete ioWrapper; + } catch (const std::exception &e) { std::cerr << "Exception: " << e.what() << std::endl; return EXIT_FAILURE; diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp index 5daba9ba1..53b83d8cd 100644 --- a/moses/AlignmentInfoCollection.cpp +++ b/moses/AlignmentInfoCollection.cpp @@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection() m_emptyAlignmentInfo = Add(pairs); } +AlignmentInfoCollection::~AlignmentInfoCollection() +{} + const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const { return *m_emptyAlignmentInfo; diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h index 9c7f75e13..de0949f8f 100644 --- a/moses/AlignmentInfoCollection.h +++ b/moses/AlignmentInfoCollection.h @@ -55,6 +55,7 @@ class AlignmentInfoCollection //! Only a single static variable should be created. 
AlignmentInfoCollection(); + ~AlignmentInfoCollection(); static AlignmentInfoCollection s_instance; -- cgit v1.2.3 From de2519fb889e2de8037e9317f1c676f1efbca475 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 15 Mar 2013 19:48:11 +0000 Subject: eclipse --- contrib/other-builds/OnDiskPt/.cproject | 11 ++++++++--- contrib/other-builds/extractor/.cproject | 12 ++++++++++-- contrib/other-builds/lm/.cproject | 11 +++++++++-- contrib/other-builds/mert_lib/.cproject | 5 +---- contrib/other-builds/moses-chart-cmd/.cproject | 2 +- contrib/other-builds/moses-cmd/.cproject | 11 ++++++++--- contrib/other-builds/moses/.cproject | 18 +++++++++++------- contrib/other-builds/search/.cproject | 11 +++++++++-- contrib/other-builds/search/.project | 5 ----- contrib/other-builds/util/.cproject | 11 ++++++++--- 10 files changed, 65 insertions(+), 32 deletions(-) diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index e135b8886..f551380fd 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -24,7 +24,7 @@ - + @@ -133,8 +133,13 @@ - - + + + + + + + diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject index 7529a7799..1ccfe2578 100644 --- a/contrib/other-builds/extractor/.cproject +++ b/contrib/other-builds/extractor/.cproject @@ -18,7 +18,7 @@ - + - + + + + + + + + + diff --git a/contrib/other-builds/lm/.cproject b/contrib/other-builds/lm/.cproject index 2036e6b18..e3e47fd7e 100644 --- a/contrib/other-builds/lm/.cproject +++ b/contrib/other-builds/lm/.cproject @@ -24,7 +24,7 @@ - + @@ -131,7 +131,14 @@ - + + + + + + + + diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject index 79dffb294..e1c19b822 100644 --- a/contrib/other-builds/mert_lib/.cproject +++ b/contrib/other-builds/mert_lib/.cproject @@ -23,7 +23,7 @@ - + - - - diff --git a/contrib/other-builds/moses-chart-cmd/.cproject 
b/contrib/other-builds/moses-chart-cmd/.cproject index 7120f0b71..90a730cf7 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -19,7 +19,7 @@ - + - - + + + + + + + diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index e54a1385b..787024533 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -1,7 +1,5 @@ - - - + @@ -9,7 +7,7 @@ - + @@ -26,7 +24,7 @@ - + - - + + + + + + + + diff --git a/contrib/other-builds/search/.cproject b/contrib/other-builds/search/.cproject index 9ccb8f8e9..2de36fecd 100644 --- a/contrib/other-builds/search/.cproject +++ b/contrib/other-builds/search/.cproject @@ -24,7 +24,7 @@ - + - + + + + + + + + diff --git a/contrib/other-builds/search/.project b/contrib/other-builds/search/.project index efad842ea..95f074aae 100644 --- a/contrib/other-builds/search/.project +++ b/contrib/other-builds/search/.project @@ -156,11 +156,6 @@ 1 PARENT-3-PROJECT_LOC/search/vertex.hh - - vertex_generator.cc - 1 - PARENT-3-PROJECT_LOC/search/vertex_generator.cc - vertex_generator.hh 1 diff --git a/contrib/other-builds/util/.cproject b/contrib/other-builds/util/.cproject index ab37362a4..2fd4d2dfb 100644 --- a/contrib/other-builds/util/.cproject +++ b/contrib/other-builds/util/.cproject @@ -24,7 +24,7 @@ - + @@ -136,8 +136,13 @@ - - + + + + + + + -- cgit v1.2.3 From df5f0934be559418177ffa9a68c2e561918a310f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 15 Mar 2013 19:48:51 +0000 Subject: eclipse --- contrib/other-builds/extractor/.cproject | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject index 1ccfe2578..fc08b4c3d 100644 --- a/contrib/other-builds/extractor/.cproject +++ b/contrib/other-builds/extractor/.cproject @@ -23,6 +23,9 @@ -- cgit v1.2.3 From 8523a27768a44acbd35a04573fee90c17a753056 Mon Sep 17 00:00:00 2001 From: Hieu 
Hoang Date: Fri, 15 Mar 2013 20:38:26 +0000 Subject: fix single-threaded --- moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp index c680d7245..065368ca7 100644 --- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp @@ -552,7 +552,9 @@ namespace tmmt bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const { +#ifdef WITH_THREADS boost::shared_lock read_lock(m_accessLock); +#endif map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key ); if (lookup != m_lsed.end()) { value = lookup->second; @@ -564,7 +566,9 @@ namespace tmmt void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value) { +#ifdef WITH_THREADS boost::unique_lock lock(m_accessLock); +#endif m_lsed[ key ] = value; } -- cgit v1.2.3 From 9f4824b2be25e13bf450cddbe683b790c257cf9e Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sat, 16 Mar 2013 13:25:31 +0000 Subject: single threaded compile error --- moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp index c680d7245..065368ca7 100644 --- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp @@ -552,7 +552,9 @@ namespace tmmt bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const { +#ifdef WITH_THREADS boost::shared_lock read_lock(m_accessLock); +#endif map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key ); if (lookup != m_lsed.end()) { value = lookup->second; @@ 
-564,7 +566,9 @@ namespace tmmt void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value) { +#ifdef WITH_THREADS boost::unique_lock lock(m_accessLock); +#endif m_lsed[ key ] = value; } -- cgit v1.2.3 From 1b83b85f443be34686792f0c9b7a2997adca1f4f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 18 Mar 2013 16:48:40 +0000 Subject: debug info from sort command --- scripts/generic/extract-parallel.perl | 6 +++--- scripts/generic/score-parallel.perl | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 7533b39e0..192169c86 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -153,9 +153,9 @@ if (defined($baselineExtract)) { $catOCmd .= "$baselineExtract.o$sorted.gz "; } -$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.sorted.gz \n"; -$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.inv.sorted.gz \n"; -$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.o.sorted.gz \n"; +$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.sorted.gz 2>> /dev/stderr \n"; +$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.inv.sorted.gz 2>> /dev/stderr \n"; +$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.o.sorted.gz 2>> /dev/stderr \n"; @children = (); diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 520fbddbe..3f763e5d9 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -163,7 +163,7 @@ else $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR "; } - $cmd .= " | gzip -c > $ptHalf"; + $cmd .= " | gzip -c > $ptHalf 2>> /dev/stderr "; } print STDERR $cmd; systemCheck($cmd); -- cgit v1.2.3 From 038871fdb36359e1ec2027de1ef0227162f5473a Mon Sep 17 00:00:00 2001 From: Achim Date: Mon, 18 
Mar 2013 17:17:35 -0400 Subject: Hungarian and Latvian non-breaking prefix files --- .../nonbreaking_prefixes/nonbreaking_prefix.hu | 103 +++++++++++++++++++++ .../nonbreaking_prefixes/nonbreaking_prefix.lv | 100 ++++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu create mode 100644 scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu new file mode 100644 index 000000000..c6b9af8ca --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu @@ -0,0 +1,103 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +Á +É +Í +Ó +Ö +Ő +Ú +Ü +Ű + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Dr +dr +kb +Kb +vö +Vö +pl +Pl +ca +Ca +min +Min +max +Max +ún +Ún +prof +Prof +de +De +du +Du +Szt +St + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." 
which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix + +# Month name abbreviations +jan #NUMERIC_ONLY# +Jan #NUMERIC_ONLY# +Feb #NUMERIC_ONLY# +feb #NUMERIC_ONLY# +márc #NUMERIC_ONLY# +Márc #NUMERIC_ONLY# +ápr #NUMERIC_ONLY# +Ápr #NUMERIC_ONLY# +máj #NUMERIC_ONLY# +Máj #NUMERIC_ONLY# +jún #NUMERIC_ONLY# +Jún #NUMERIC_ONLY# +Júl #NUMERIC_ONLY# +júl #NUMERIC_ONLY# +aug #NUMERIC_ONLY# +Aug #NUMERIC_ONLY# +Szept #NUMERIC_ONLY# +szept #NUMERIC_ONLY# +okt #NUMERIC_ONLY# +Okt #NUMERIC_ONLY# +nov #NUMERIC_ONLY# +Nov #NUMERIC_ONLY# +dec #NUMERIC_ONLY# +Dec #NUMERIC_ONLY# + +# Other abbreviations +tel #NUMERIC_ONLY# +Tel #NUMERIC_ONLY# +Fax #NUMERIC_ONLY# +fax #NUMERIC_ONLY# diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv new file mode 100644 index 000000000..81754a17a --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv @@ -0,0 +1,100 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +Ā +B +C +Č +D +E +Ē +F +G +Ģ +H +I +Ī +J +K +Ķ +L +Ļ +M +N +Ņ +O +P +Q +R +S +Š +T +U +Ū +V +W +X +Y +Z +Ž + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +dr +Dr +med +prof +Prof +inž +Inž +ist.loc +Ist.loc +kor.loc +Kor.loc +v.i +vietn +Vietn + +#misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall into this category - it sometimes ends a sentence) +a.l +t.p +pārb +Pārb +vec +Vec +inv +Inv +sk +Sk +spec +Spec +vienk +Vienk +virz +Virz +māksl +Māksl +mūz +Mūz +akad +Akad +soc +Soc +galv +Galv +vad +Vad +sertif +Sertif +folkl +Folkl +hum +Hum + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +Nr #NUMERIC_ONLY# -- cgit v1.2.3 From 8efeb5922816ab3df527c29742961a19b522f5ad Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Mon, 18 Mar 2013 21:29:17 +0000 Subject: don't lowercase reference if there's a recaser --- scripts/ems/experiment.meta | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index d74677a69..7c84839f5 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -1000,6 +1000,7 @@ lowercase-reference out: reference default-name: evaluation/reference pass-unless: output-lowercaser + pass-if: recaser multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-lowercaser < IN > OUT nist-bleu -- cgit v1.2.3 From 7dc4faa97ed2021981c69551d2af9a68787e867d Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 19 Mar 2013 11:17:17 +0000 Subject: Fix cd error when running bjam from non-top --- bjam | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bjam b/bjam index d0d94dedb..0ebf105c3 100755 --- a/bjam +++ b/bjam @@ -1,17 +1,17 @@ #!/bin/bash set -e +top="$(dirname "$0")" if bjam="$(which bjam 2>/dev/null)" && #exists [ ${#bjam} != 0 ] && #paranoia about which printing nothing then returning true ! 
grep UFIHGUFIHBDJKNCFZXAEVA "${bjam}" /dev/null && #bjam in path isn't this script "${bjam}" --sanity-test 2>/dev/null |grep Sane >/dev/null && #The test in jam-files/sanity.jam passes - (cd jam-files/fail && ! "${bjam}") >/dev/null #Returns non-zero on failure + (cd "${top}/jam-files/fail" && ! "${bjam}") >/dev/null #Returns non-zero on failure then #Delegate to system bjam exec "${bjam}" "$@" fi -top="$(dirname "$0")" if [ ! -x "$top"/jam-files/bjam ] || "$top"/jam-files/bjam -v |grep 2011.4 >/dev/null; then pushd "$top/jam-files/engine" ./build.sh -- cgit v1.2.3 From 55f02f2fecc9bfdb6720f84beb1b94733166ec64 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 19 Mar 2013 14:46:52 +0000 Subject: Accept concatenated bzip2 files --- util/read_compressed.cc | 100 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 27 deletions(-) diff --git a/util/read_compressed.cc b/util/read_compressed.cc index b81549e42..b62a6e833 100644 --- a/util/read_compressed.cc +++ b/util/read_compressed.cc @@ -180,12 +180,73 @@ class GZip : public ReadBase { }; #endif // HAVE_ZLIB +const uint8_t kBZMagic[3] = {'B', 'Z', 'h'}; + #ifdef HAVE_BZLIB class BZip : public ReadBase { public: - explicit BZip(int fd, void *already_data, std::size_t already_size) { + BZip(int fd, void *already_data, std::size_t already_size) { scoped_fd hold(fd); closer_.reset(FDOpenReadOrThrow(hold)); + file_ = NULL; + Open(already_data, already_size); + } + + BZip(FILE *file, void *already_data, std::size_t already_size) { + closer_.reset(file); + file_ = NULL; + Open(already_data, already_size); + } + + ~BZip() { + Close(file_); + } + + std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { + assert(file_); + int bzerror = BZ_OK; + int ret = BZ2_bzRead(&bzerror, file_, to, std::min(static_cast(INT_MAX), amount)); + long pos = ftell(closer_.get()); + if (pos != -1) ReadCount(thunk) = pos; + switch (bzerror) { + case BZ_STREAM_END: + /* bzip2 files can be 
concatenated by e.g. pbzip2. Annoyingly, the + * library doesn't handle this internally. This gets the trailing + * data, grows it up to magic as needed, validates the magic, and + * reopens. + */ + { + bzerror = BZ_OK; + void *trailing_data; + int trailing_size; + BZ2_bzReadGetUnused(&bzerror, file_, &trailing_data, &trailing_size); + UTIL_THROW_IF(bzerror != BZ_OK, BZException, "bzip2 error in BZ2_bzReadGetUnused " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror); + std::string trailing(static_cast(trailing_data), trailing_size); + Close(file_); + + if (trailing_size < (int)sizeof(kBZMagic)) { + trailing.resize(sizeof(kBZMagic)); + if (1 != fread(&trailing[trailing_size], sizeof(kBZMagic) - trailing_size, 1, closer_.get())) { + UTIL_THROW_IF(trailing_size, BZException, "File has trailing cruft"); + // Legitimate end of file. + ReplaceThis(new Complete(), thunk); + return ret; + } + } + UTIL_THROW_IF(memcmp(trailing.data(), kBZMagic, sizeof(kBZMagic)), BZException, "Trailing cruft is not another bzip2 stream"); + Open(&trailing[0], trailing.size()); + } + return ret; + case BZ_OK: + return ret; + default: + UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror); + } + } + + private: + void Open(void *already_data, std::size_t already_size) { + assert(!file_); int bzerror = BZ_OK; file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size); switch (bzerror) { @@ -199,38 +260,23 @@ class BZip : public ReadBase { UTIL_THROW(BZException, "IO error reading file"); case BZ_MEM_ERROR: throw std::bad_alloc(); + default: + UTIL_THROW(BZException, "Unknown bzip2 error code " << bzerror); } + assert(file_); } - ~BZip() { + static void Close(BZFILE *&file) { + if (file == NULL) return; int bzerror = BZ_OK; - BZ2_bzReadClose(&bzerror, file_); + BZ2_bzReadClose(&bzerror, file); if (bzerror != BZ_OK) { - std::cerr << "bz2 readclose error" << std::endl; + std::cerr << "bz2 readclose error number " << bzerror << 
std::endl; abort(); } + file = NULL; } - std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { - int bzerror = BZ_OK; - int ret = BZ2_bzRead(&bzerror, file_, to, std::min(static_cast(INT_MAX), amount)); - long pos; - switch (bzerror) { - case BZ_STREAM_END: - pos = ftell(closer_.get()); - if (pos != -1) ReadCount(thunk) = pos; - ReplaceThis(new Complete(), thunk); - return ret; - case BZ_OK: - pos = ftell(closer_.get()); - if (pos != -1) ReadCount(thunk) = pos; - return ret; - default: - UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror); - } - } - - private: scoped_FILE closer_; BZFILE *file_; }; @@ -346,11 +392,11 @@ MagicResult DetectMagic(const void *from_void) { if (header[0] == 0x1f && header[1] == 0x8b) { return GZIP; } - if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h') { + if (!memcmp(header, kBZMagic, sizeof(kBZMagic))) { return BZIP; } - const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; - if (!memcmp(header, xzmagic, 6)) { + const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; + if (!memcmp(header, kXZMagic, sizeof(kXZMagic))) { return XZIP; } return UNKNOWN; -- cgit v1.2.3 From 6efa1681fc9f385bcdec1243ef059203812018c0 Mon Sep 17 00:00:00 2001 From: Christian Buck Date: Tue, 19 Mar 2013 18:21:35 +0000 Subject: added operator< to SearchGraphNode - compares Ids --- moses/Manager.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/moses/Manager.h b/moses/Manager.h index e2f8ed8e5..11762ec37 100644 --- a/moses/Manager.h +++ b/moses/Manager.h @@ -56,6 +56,10 @@ struct SearchGraphNode { hypo(theHypo), recombinationHypo(theRecombinationHypo), forward(theForward), fscore(theFscore) {} + bool operator<(const SearchGraphNode& sgn) const { + return this->hypo->GetId() < sgn.hypo->GetId(); + } + }; /** The Manager class implements a stack decoding algorithm for phrase-based decoding @@ -104,7 +108,7 @@ private: // Helper functions to output search 
graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const; size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; - + protected: // data -- cgit v1.2.3 From e7f54efa72c346cc9880d564def6785449dac4af Mon Sep 17 00:00:00 2001 From: Christian Buck Date: Tue, 19 Mar 2013 18:22:21 +0000 Subject: mosesserver gives search graph ordered by hyp-id --- contrib/server/mosesserver.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 98024c891..5d9c40a9b 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -1,6 +1,8 @@ #include "util/check.hh" #include #include +#include +#include #include "moses/ChartManager.h" @@ -54,7 +56,7 @@ public: PhraseDictionaryDynSuffixArray* pdsa = (PhraseDictionaryDynSuffixArray*) pdf->GetDictionary(); cerr << "Inserting into address " << pdsa << endl; pdsa->insertSnt(source_, target_, alignment_); - if(add2ORLM_) { + if(add2ORLM_) { updateORLM(); } cerr << "Done inserting\n"; @@ -83,8 +85,8 @@ public: const std::string sBOS = orlm->GetSentenceStart()->GetString(); const std::string sEOS = orlm->GetSentenceEnd()->GetString(); Utils::splitToStr(target_, vl, " "); - // insert BOS and EOS - vl.insert(vl.begin(), sBOS); + // insert BOS and EOS + vl.insert(vl.begin(), sBOS); vl.insert(vl.end(), sEOS); for(int j=0; j < vl.size(); ++j) { int i = (j retData; if (staticData.IsChart()) { - TreeInput tinput; + TreeInput tinput; const vector &inputFactorOrder = staticData.GetInputFactorOrder(); stringstream in(source + "\n"); @@ -260,10 +262,16 @@ public: } + + bool compareSearchGraphNode(const SearchGraphNode& a, const SearchGraphNode b) { + return a.hypo->GetId() < b.hypo->GetId(); + } + void 
insertGraphInfo(Manager& manager, map& retData) { vector searchGraphXml; vector searchGraph; manager.GetSearchGraph(searchGraph); + std::sort(searchGraph.begin(), searchGraph.end()); for (vector::const_iterator i = searchGraph.begin(); i != searchGraph.end(); ++i) { map searchGraphXmlNode; searchGraphXmlNode["forward"] = xmlrpc_c::value_double(i->forward); -- cgit v1.2.3 From 34c8975aa7c4505486b3f8494f48abd3d3b645d1 Mon Sep 17 00:00:00 2001 From: Christian Buck Date: Tue, 19 Mar 2013 18:22:43 +0000 Subject: ported perl xmlrpc example to python --- contrib/server/client.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100755 contrib/server/client.py diff --git a/contrib/server/client.py b/contrib/server/client.py new file mode 100755 index 000000000..43e77555a --- /dev/null +++ b/contrib/server/client.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# python port of client.perl + +import xmlrpclib +import datetime + +url = "http://localhost:8080/RPC2" +proxy = xmlrpclib.ServerProxy(url) + +text = u"il a souhaité que la présidence trace à nice le chemin pour l' avenir ." 
+params = {"text":text, "align":"true", "report-all-factors":"true"} + +result = proxy.translate(params) +print result['text'] +if 'align' in result: + print "Phrase alignments:" + aligns = result['align'] + for align in aligns: + print "%s,%s,%s" %(align['tgt-start'], align['src-start'], align['src-end']) -- cgit v1.2.3 From 7b9c5c1194528ddeea7b0c33fa7d1c43f4a3b373 Mon Sep 17 00:00:00 2001 From: Matous Machacek Date: Tue, 19 Mar 2013 23:08:28 +0100 Subject: fixed bug in InterpolatedScorer --- mert/InterpolatedScorer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp index e610cbdd0..af3f26bf2 100644 --- a/mert/InterpolatedScorer.cpp +++ b/mert/InterpolatedScorer.cpp @@ -164,7 +164,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats { stringstream buff; string align = text; - string sentence = ""; + string sentence = text; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { -- cgit v1.2.3 From 0cc07c5c9ae232710e95889da4249e483d877823 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 20 Mar 2013 17:17:16 +0000 Subject: --max-factors=1 and Sparse features segfault -> different check suggested by Philipp --- moses/SourceWordDeletionFeature.cpp | 7 +------ moses/TargetWordInsertionFeature.cpp | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/moses/SourceWordDeletionFeature.cpp b/moses/SourceWordDeletionFeature.cpp index c5a61111f..c312a3b03 100644 --- a/moses/SourceWordDeletionFeature.cpp +++ b/moses/SourceWordDeletionFeature.cpp @@ -55,12 +55,7 @@ void SourceWordDeletionFeature::ComputeFeatures(const TargetPhrase& targetPhrase // handle special case: unknown words (they have no word alignment) size_t targetLength = targetPhrase.GetSize(); size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize(); - if (targetLength == 1 && sourceLength == 1) { - const Factor* f1 
= targetPhrase.GetWord(0).GetFactor(1); - if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) { - return; - } - } + if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; // flag aligned words bool aligned[16]; diff --git a/moses/TargetWordInsertionFeature.cpp b/moses/TargetWordInsertionFeature.cpp index 537c5c9cb..3b9bf36ba 100644 --- a/moses/TargetWordInsertionFeature.cpp +++ b/moses/TargetWordInsertionFeature.cpp @@ -56,12 +56,7 @@ void TargetWordInsertionFeature::ComputeFeatures(const TargetPhrase& targetPhras // handle special case: unknown words (they have no word alignment) size_t targetLength = targetPhrase.GetSize(); size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize(); - if (targetLength == 1 && sourceLength == 1) { - const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1); - if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) { - return; - } - } + if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; // flag aligned words bool aligned[16]; -- cgit v1.2.3 From 22c77f73310e1afda4f232b06a3a4b212c40593c Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Thu, 21 Mar 2013 12:17:16 -0400 Subject: Work on decreasing memory requirement for outputting hypergraph --- moses-cmd/Main.cpp | 9 +++++---- moses/Manager.cpp | 31 +++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 117cac3f9..f4cdb4388 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -171,10 +171,11 @@ public: std::ofstream *file = new std::ofstream; file->open(fileName.str().c_str()); if (file->is_open() && file->good()) { - ostringstream out; - fix(out,PRECISION); - manager.OutputSearchGraphAsHypergraph(m_lineNumber, out); - *file << out.str(); + // ostringstream out; + // fix(out,PRECISION); + fix(*file,PRECISION); + manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file); + // *file << out.str(); file -> flush(); } else { 
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 011187cda..2ca689bb0 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -800,7 +800,6 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const { - ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown(); const Hypothesis *prevHypo = hypo->GetPrevHypo(); if (prevHypo) { @@ -823,14 +822,20 @@ size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* /**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const { + + VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << translationId << std::endl) + vector searchGraph; GetSearchGraph(searchGraph); + map mosesIDToHypergraphID; // map hypergraphIDToMosesID; set terminalNodes; multimap hypergraphIDToArcs; + VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << translationId << std::endl) + long numNodes = 0; long endNode = 0; { @@ -888,11 +893,21 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou // Print number of nodes and arcs outputSearchGraphStream << numNodes << " " << numArcs << endl; + VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId + << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl) + + VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl) + + for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) { + if 
(hypergraphHypothesisID % 100000 == 0) { + VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << translationId << std::endl); + } // int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID]; size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID); + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has " << count << " incoming arcs" << std::endl) if (count > 0) { - outputSearchGraphStream << count << endl; + outputSearchGraphStream << count << "\n"; pair::iterator, multimap::iterator> range = hypergraphIDToArcs.equal_range(hypergraphHypothesisID); @@ -917,10 +932,11 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); if (prevHypo==NULL) { - outputSearchGraphStream << " ||| " << endl; + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl) + outputSearchGraphStream << " ||| \n"; } else { int startNode = mosesIDToHypergraphID[prevHypo->GetId()]; - + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl) UTIL_THROW_IF( (startNode >= hypergraphHypothesisID), util::Exception, @@ -937,17 +953,16 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou } outputSearchGraphStream << " ||| "; OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream); - outputSearchGraphStream << endl; + outputSearchGraphStream << "\n"; } - } } } // Print node and arc(s) for end of sentence - outputSearchGraphStream << terminalNodes.size() << endl; + outputSearchGraphStream << terminalNodes.size() << "\n"; for (set::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) { - outputSearchGraphStream << "[" << (*it) << "] ||| " << endl; + outputSearchGraphStream << "[" << (*it) << "] ||| \n"; } } -- cgit v1.2.3 From 28c980d58e4d3b53c257d2979de99a746bb7b0e3 Mon Sep 17 00:00:00 2001 From: Lane 
Schwartz Date: Thu, 21 Mar 2013 15:19:31 -0400 Subject: Allow hypergraph output to be in plain text, gzip, or bzip2. The output-search-graph-hypergraph flag now takes two params: * The first param must be "none", "gzip", or "bzip2" * The second param is the hypergraph directory, which must already exist --- moses-cmd/Jamfile | 2 +- moses-cmd/Main.cpp | 23 ++++++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile index 04f395a81..42d54568f 100644 --- a/moses-cmd/Jamfile +++ b/moses-cmd/Jamfile @@ -1,4 +1,4 @@ -alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ../moses//moses ; +alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ../moses//moses ; exe moses : Main.cpp deps ; exe lmbrgrid : LatticeMBRGrid.cpp deps ; diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index f4cdb4388..90c924a08 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -23,6 +23,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Moses main, for single-threaded and multi-threaded. 
**/ +#include +#include +#include +#include + #include #include #include @@ -167,10 +172,18 @@ public: // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder if (m_outputSearchGraphHypergraph) { stringstream fileName; - fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << m_lineNumber; - std::ofstream *file = new std::ofstream; - file->open(fileName.str().c_str()); - if (file->is_open() && file->good()) { + fileName << staticData.GetParam("output-search-graph-hypergraph")[1] << "/" << m_lineNumber; + boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream; + // file->open(fileName.str().c_str()); + string compression = staticData.GetParam("output-search-graph-hypergraph")[0]; + if ( compression == "gzip" || compression == "gz" ) { + file->push( boost::iostreams::gzip_compressor() ); + } else if ( compression == "bzip2" || compression == "bz2" ) { + file->push( boost::iostreams::bzip2_compressor() ); + } + file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) ); + // if (file->is_open() && file->good()) { + if (file->is_complete() && file->good()) { // ostringstream out; // fix(out,PRECISION); fix(*file,PRECISION); @@ -180,7 +193,7 @@ public: } else { TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); } - file -> close(); + file -> pop(); delete file; } -- cgit v1.2.3 From b2bba0bae344cdace0ec3fe9dc4292bcae0a2a43 Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Thu, 21 Mar 2013 16:48:47 -0400 Subject: Work on compression and defaults for outputting hypergraphs. 
--- moses-cmd/IOWrapper.cpp | 9 ----- moses-cmd/IOWrapper.h | 2 -- moses-cmd/Jamfile | 2 +- moses-cmd/Main.cpp | 95 +++++++++++++++++++++++++++++++++++++------------ 4 files changed, 73 insertions(+), 35 deletions(-) diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp index 2da30f380..335a570a6 100644 --- a/moses-cmd/IOWrapper.cpp +++ b/moses-cmd/IOWrapper.cpp @@ -189,15 +189,6 @@ InputType*IOWrapper::GetInput(InputType* inputType) } } - ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream() { - const StaticData &staticData = StaticData::Instance(); - stringstream fileName; - fileName << staticData.GetParam("output-search-graph-hypergraph")[1]; - std::ofstream *file = new std::ofstream; - file->open(fileName.str().c_str()); - return file; - } - /*** * print surface factor only for the given phrase */ diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h index 267a3a0bc..8dbdeda9c 100644 --- a/moses-cmd/IOWrapper.h +++ b/moses-cmd/IOWrapper.h @@ -117,8 +117,6 @@ public: return *m_outputSearchGraphStream; } - std::ofstream *GetOutputSearchGraphHypergraphWeightsStream(); - std::ostream &GetDetailedTranslationReportingStream() { assert (m_detailedTranslationReportingStream); return *m_detailedTranslationReportingStream; diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile index 42d54568f..bddc10911 100644 --- a/moses-cmd/Jamfile +++ b/moses-cmd/Jamfile @@ -1,4 +1,4 @@ -alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ../moses//moses ; +alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ; exe moses : Main.cpp deps ; exe lmbrgrid : LatticeMBRGrid.cpp deps ; diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 90c924a08..8f6a6c069 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -23,6 +23,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Moses main, for 
single-threaded and multi-threaded. **/ +#include #include #include #include @@ -171,30 +172,76 @@ public: // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder if (m_outputSearchGraphHypergraph) { - stringstream fileName; - fileName << staticData.GetParam("output-search-graph-hypergraph")[1] << "/" << m_lineNumber; - boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream; - // file->open(fileName.str().c_str()); - string compression = staticData.GetParam("output-search-graph-hypergraph")[0]; - if ( compression == "gzip" || compression == "gz" ) { - file->push( boost::iostreams::gzip_compressor() ); - } else if ( compression == "bzip2" || compression == "bz2" ) { - file->push( boost::iostreams::bzip2_compressor() ); + + vector hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph"); + + bool appendSuffix; + if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") { + appendSuffix = true; + } else { + appendSuffix = false; + } + + string compression; + if (hypergraphParameters.size() > 1) { + compression = hypergraphParameters[1]; + } else { + compression = "txt"; + } + + string hypergraphDir; + if ( hypergraphParameters.size() > 2 ) { + hypergraphDir = hypergraphParameters[2]; + } else { + string nbestFile = staticData.GetNBestFilePath(); + if ( ! nbestFile.empty()) { + boost::filesystem::path nbestPath(nbestFile); + hypergraphDir = nbestPath.parent_path().filename(); + } else { + stringstream hypergraphDirName; + hypergraphDirName << boost::filesystem::current_path() << "/hypergraph"; + hypergraphDir = hypergraphDirName.str(); + } + } + + if ( ! 
boost::filesystem::exists(hypergraphDir) ) { + boost::filesystem::create_directory(hypergraphDir); } - file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) ); - // if (file->is_open() && file->good()) { - if (file->is_complete() && file->good()) { - // ostringstream out; - // fix(out,PRECISION); - fix(*file,PRECISION); - manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file); - // *file << out.str(); - file -> flush(); + + if ( ! boost::filesystem::exists(hypergraphDir) ) { + TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl); + } else if ( ! boost::filesystem::is_directory(hypergraphDir) ) { + TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl); } else { - TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); + + stringstream fileName; + fileName << hypergraphDir << "/" << m_lineNumber; + if ( appendSuffix ) { + fileName << "." 
<< compression; + } + boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream; + + if ( compression == "gz" ) { + file->push( boost::iostreams::gzip_compressor() ); + } else if ( compression == "bz2" ) { + file->push( boost::iostreams::bzip2_compressor() ); + } else if ( compression != "txt" ) { + TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl); + compression = "txt"; + } + + file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) ); + + if (file->is_complete() && file->good()) { + fix(*file,PRECISION); + manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file); + file -> flush(); + } else { + TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl); + } + file -> pop(); + delete file; } - file -> pop(); - delete file; } // apply decision rule and output best translation(s) @@ -548,8 +595,10 @@ int main(int argc, char** argv) TRACE_ERR(weights); TRACE_ERR("\n"); } - if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 1) { - ofstream* weightsOut = ioWrapper->GetOutputSearchGraphHypergraphWeightsStream(); + if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 3) { + ofstream* weightsOut = new std::ofstream; + string weightsFilename = staticData.GetParam("output-search-graph-hypergraph")[3]; + weightsOut->open(weightsFilename.c_str()); OutputFeatureWeightsForHypergraph(*weightsOut); weightsOut->flush(); weightsOut->close(); -- cgit v1.2.3 From 3a4e63c558a5d5beaefbb2e45497b66e4d9ae72f Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Fri, 22 Mar 2013 12:14:28 -0400 Subject: Ensure directory exists for outputting hypergraphs --- moses-cmd/Main.cpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 
deletions(-) diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 8f6a6c069..8fd46ba38 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -23,6 +23,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Moses main, for single-threaded and multi-threaded. **/ +#include #include #include #include @@ -194,7 +195,7 @@ public: hypergraphDir = hypergraphParameters[2]; } else { string nbestFile = staticData.GetNBestFilePath(); - if ( ! nbestFile.empty()) { + if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { boost::filesystem::path nbestPath(nbestFile); hypergraphDir = nbestPath.parent_path().filename(); } else { @@ -595,10 +596,26 @@ int main(int argc, char** argv) TRACE_ERR(weights); TRACE_ERR("\n"); } - if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 3) { + if (staticData.GetOutputSearchGraphHypergraph()) { ofstream* weightsOut = new std::ofstream; - string weightsFilename = staticData.GetParam("output-search-graph-hypergraph")[3]; - weightsOut->open(weightsFilename.c_str()); + stringstream weightsFilename; + if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) { + weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3]; + } else { + string nbestFile = staticData.GetNBestFilePath(); + if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { + boost::filesystem::path nbestPath(nbestFile); + weightsFilename << nbestPath.parent_path().filename() << "/weights"; + } else { + weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights"; + } + } + boost::filesystem::path weightsFilePath(weightsFilename.str()); + if ( ! 
boost::filesystem::exists(weightsFilePath.parent_path()) ) { + boost::filesystem::create_directory(weightsFilePath.parent_path()); + } + TRACE_ERR("The weights file is " << weightsFilename.str() << "\n"); + weightsOut->open(weightsFilename.str().c_str()); OutputFeatureWeightsForHypergraph(*weightsOut); weightsOut->flush(); weightsOut->close(); -- cgit v1.2.3 From db005f6503a5179e0427b460d8039e8920c8d65b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sat, 23 Mar 2013 15:58:07 +0000 Subject: compile error caused by different versions of boost --- contrib/other-builds/moses-chart-cmd/.cproject | 5 +++-- contrib/other-builds/moses-cmd/.cproject | 5 +++-- moses-cmd/Main.cpp | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject index 90a730cf7..4ca560326 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -1,5 +1,7 @@ - + + + @@ -74,7 +76,6 @@ - diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject index 573fe715f..3d6d32a72 100644 --- a/contrib/other-builds/moses-cmd/.cproject +++ b/contrib/other-builds/moses-cmd/.cproject @@ -1,5 +1,7 @@ - + + + @@ -73,7 +75,6 @@ - diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 8fd46ba38..ef85fd66b 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -192,12 +192,12 @@ public: string hypergraphDir; if ( hypergraphParameters.size() > 2 ) { - hypergraphDir = hypergraphParameters[2]; + hypergraphDir = hypergraphParameters[2]; } else { string nbestFile = staticData.GetNBestFilePath(); if ( ! 
nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { boost::filesystem::path nbestPath(nbestFile); - hypergraphDir = nbestPath.parent_path().filename(); + //hypergraphDir = nbestPath.parent_path().filename(); } else { stringstream hypergraphDirName; hypergraphDirName << boost::filesystem::current_path() << "/hypergraph"; -- cgit v1.2.3 From 4a954f2be7cc4cef32c08f6748d39f7430c1e4a5 Mon Sep 17 00:00:00 2001 From: Joan Puigcerver Date: Sun, 24 Mar 2013 20:01:59 +0100 Subject: Fixes compilation error using libboost_1_50 Problem spotted in https://github.com/moses-smt/mosesdecoder/issues/32 fixed. According to the Boost doc, nbestPath.parent_path().filename() returns a path object, to get the correspondent std::string representation, one must call one of the methods listed in: http://www.boost.org/doc/libs/1_53_0/libs/filesystem/doc/reference.html#path-native-format-observers native() is supposed to return the path in the specific OS path format (using backslashes for Windows). Anyway, since we are considering only the filename here, the result is the same. --- moses-cmd/Main.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index ef85fd66b..68d8049c4 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -197,7 +197,7 @@ public: string nbestFile = staticData.GetNBestFilePath(); if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { boost::filesystem::path nbestPath(nbestFile); - //hypergraphDir = nbestPath.parent_path().filename(); + hypergraphDir = nbestPath.parent_path().filename().native(); } else { stringstream hypergraphDirName; hypergraphDirName << boost::filesystem::current_path() << "/hypergraph"; @@ -214,7 +214,6 @@ public: } else if ( ! 
boost::filesystem::is_directory(hypergraphDir) ) { TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl); } else { - stringstream fileName; fileName << hypergraphDir << "/" << m_lineNumber; if ( appendSuffix ) { -- cgit v1.2.3 From f2acca0943a7594740b0dbaa8ec70b6b8bbcb67d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 26 Mar 2013 11:50:10 +0000 Subject: eclipse --- contrib/other-builds/moses-chart-cmd/.cproject | 9 +++++---- contrib/other-builds/moses-cmd/.cproject | 8 +++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject index 4ca560326..aae6822b6 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -1,7 +1,5 @@ - - - + @@ -62,6 +60,7 @@ diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject index 3d6d32a72..3a098bb9f 100644 --- a/contrib/other-builds/moses-cmd/.cproject +++ b/contrib/other-builds/moses-cmd/.cproject @@ -1,7 +1,5 @@ - - - + @@ -62,6 +60,7 @@ -- cgit v1.2.3 From 51a59b881ec7c37761c80809b44e46d34bee39a5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 26 Mar 2013 12:45:27 +0000 Subject: move RedVoc() as method in PDTimp class. 
Ready for getting rid of static variable --- moses/TranslationModel/PhraseDictionaryTree.cpp | 35 ++++++++++++++----------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp index 515d2f649..024294c61 100644 --- a/moses/TranslationModel/PhraseDictionaryTree.cpp +++ b/moses/TranslationModel/PhraseDictionaryTree.cpp @@ -156,22 +156,6 @@ PhraseDictionaryTree::PrefixPtr::operator bool() const typedef LVoc WordVoc; -static WordVoc* ReadVoc(const std::string& filename) -{ - static std::map vocs; -#ifdef WITH_THREADS - boost::mutex mutex; - boost::mutex::scoped_lock lock(mutex); -#endif - std::map::iterator vi = vocs.find(filename); - if (vi == vocs.end()) { - WordVoc* voc = new WordVoc(); - voc->Read(filename); - vocs[filename] = voc; - } - return vocs[filename]; -} - class PDTimp { public: @@ -190,6 +174,8 @@ public: ObjectPool pPool; // a comparison with the Boost MemPools might be useful + std::map vocs; + bool needwordalign, haswordAlign; bool printwordalign; @@ -304,6 +290,8 @@ public: return PPtr(); } + + WordVoc* ReadVoc(const std::string& filename); }; @@ -376,6 +364,21 @@ void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const } } +WordVoc* PDTimp::ReadVoc(const std::string& filename) +{ + #ifdef WITH_THREADS + boost::mutex mutex; + boost::mutex::scoped_lock lock(mutex); + #endif + std::map::iterator vi = vocs.find(filename); + if (vi == vocs.end()) { + WordVoc* voc = new WordVoc(); + voc->Read(filename); + vocs[filename] = voc; + } + return vocs[filename]; +} + //////////////////////////////////////////////////////////// // // member functions of PhraseDictionaryTree -- cgit v1.2.3 From e2b18c5337ca0b2e7a2cd94dfdae5fd7708f263b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 26 Mar 2013 13:29:59 +0000 Subject: no leak message due to static variable in binary phrase table. 
Doesn't actually solve the mem leak though --- moses/TranslationModel/PhraseDictionaryTree.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp index 024294c61..05b7afc4d 100644 --- a/moses/TranslationModel/PhraseDictionaryTree.cpp +++ b/moses/TranslationModel/PhraseDictionaryTree.cpp @@ -186,6 +186,12 @@ public: if(os) fClose(os); if(ot) fClose(ot); FreeMemory(); + + std::map::iterator iter; + for (iter = vocs.begin(); iter != vocs.end(); ++iter) { + WordVoc *voc = iter->second; + delete voc; + } } inline void NeedAlignmentInfo(bool a) { -- cgit v1.2.3 From f8afc7356965701fe4cb5fef51aa7e30a07e360e Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 26 Mar 2013 14:05:50 +0000 Subject: get rid of locking altogether. PDTimp already has separated source & target vocab variable. Use those instead --- moses/TranslationModel/PhraseDictionaryTree.cpp | 59 +++++++------------------ 1 file changed, 16 insertions(+), 43 deletions(-) diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp index 05b7afc4d..675656112 100644 --- a/moses/TranslationModel/PhraseDictionaryTree.cpp +++ b/moses/TranslationModel/PhraseDictionaryTree.cpp @@ -168,14 +168,12 @@ public: std::vector srcOffsets; FILE *os,*ot; - WordVoc* sv; - WordVoc* tv; + WordVoc sv; + WordVoc tv; ObjectPool pPool; // a comparison with the Boost MemPools might be useful - std::map vocs; - bool needwordalign, haswordAlign; bool printwordalign; @@ -186,12 +184,6 @@ public: if(os) fClose(os); if(ot) fClose(ot); FreeMemory(); - - std::map::iterator iter; - for (iter = vocs.begin(); iter != vocs.end(); ++iter) { - WordVoc *voc = iter->second; - delete voc; - } } inline void NeedAlignmentInfo(bool a) { @@ -261,12 +253,12 @@ public: rv.back().tokens.reserve(iphrase.size()); for(size_t j=0; jsymbol(iphrase[j])); + rv.back().tokens.push_back(&tv.symbol(iphrase[j])); } 
rv.back().scores = i->GetScores(); const IPhrase& fnames = i->GetFeatureNames(); for (size_t j = 0; j < fnames.size(); ++j) { - rv.back().fnames.push_back(&tv->symbol(fnames[j])); + rv.back().fnames.push_back(&tv.symbol(fnames[j])); } rv.back().fvalues = i->GetFeatureValues(); if (wa) wa->push_back(i->GetAlignment()); @@ -281,7 +273,7 @@ public: CHECK(p); if(w.empty() || w==EPSILON) return p; - LabelId wi=sv->index(w); + LabelId wi=sv.index(w); if(wi==InvalidLabelId) return PPtr(); // unknown word else if(p.imp->isRoot()) { @@ -344,10 +336,8 @@ int PDTimp::Read(const std::string& fn) for(size_t i=0; isymbol(iphr[j])<<" "; + for(size_t j=0; j::iterator vi = vocs.find(filename); - if (vi == vocs.end()) { - WordVoc* voc = new WordVoc(); - voc->Read(filename); - vocs[filename] = voc; - } - return vocs[filename]; -} - //////////////////////////////////////////////////////////// // // member functions of PhraseDictionaryTree @@ -432,7 +407,7 @@ GetTargetCandidates(const std::vector& src, { IPhrase f(src.size()); for(size_t i=0; isv->index(src[i]); + f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) return; } @@ -448,7 +423,7 @@ GetTargetCandidates(const std::vector& src, { IPhrase f(src.size()); for(size_t i=0; isv->index(src[i]); + f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) return; } @@ -464,7 +439,7 @@ PrintTargetCandidates(const std::vector& src, { IPhrase f(src.size()); for(size_t i=0; isv->index(src[i]); + f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) { TRACE_ERR("the source phrase '"< vo; size_t lnc=0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info - imp->sv = new WordVoc(); - imp->tv = new WordVoc(); size_t missingAlignmentCount = 0; while(getline(inFile, line)) { @@ -541,11 +514,11 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) std::vector wordVec = Tokenize(sourcePhraseString); for (size_t i = 0 ; i < wordVec.size() ; ++i) - 
f.push_back(imp->sv->add(wordVec[i])); + f.push_back(imp->sv.add(wordVec[i])); wordVec = Tokenize(targetPhraseString); for (size_t i = 0 ; i < wordVec.size() ; ++i) - e.push_back(imp->tv->add(wordVec[i])); + e.push_back(imp->tv.add(wordVec[i])); // while(is>>w && w!="|||") sc.push_back(atof(w.c_str())); // Mauro: to handle 0 probs in phrase tables @@ -585,7 +558,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) abort(); } for (size_t i = 0; i < sparseTokens.size(); i+=2) { - fnames.push_back(imp->tv->add(sparseTokens[i])); + fnames.push_back(imp->tv.add(sparseTokens[i])); fvalues.push_back(Scan(sparseTokens[i+1])); } } @@ -672,8 +645,8 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) fWriteVector(oi,vo); fClose(oi); - imp->sv->Write(ofsv); - imp->tv->Write(oftv); + imp->sv.Write(ofsv); + imp->tv.Write(oftv); return 1; } -- cgit v1.2.3 From eeeda717a41b41db784564c88020f15a2f5bef57 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 26 Mar 2013 15:47:30 +0000 Subject: eclipse --- contrib/other-builds/moses-cmd/.cproject | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject index 3a098bb9f..42d2100d8 100644 --- a/contrib/other-builds/moses-cmd/.cproject +++ b/contrib/other-builds/moses-cmd/.cproject @@ -60,7 +60,6 @@ -- cgit v1.2.3 From ae53bc91d1a0186d791a98570c544b9999f39bd8 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 28 Mar 2013 10:27:11 +0000 Subject: Nicer error message for too many factor delimiters --- moses/Word.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/moses/Word.cpp b/moses/Word.cpp index c23e8de8c..2c1ac09ea 100644 --- a/moses/Word.cpp +++ b/moses/Word.cpp @@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "Word.h" #include "TypeDef.h" #include "StaticData.h" // needed to determine the 
FactorDelimiter +#include "util/exception.hh" #include "util/tokenize_piece.hh" using namespace std; @@ -95,6 +96,8 @@ std::string Word::GetString(FactorType factorType) const return NULL; } +class StrayFactorException : public util::Exception {}; + void Word::CreateFromString(FactorDirection direction , const std::vector &factorOrder , const StringPiece &str @@ -106,7 +109,7 @@ void Word::CreateFromString(FactorDirection direction for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) { m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit); } - CHECK(!fit); + UTIL_THROW_IF(fit, StrayFactorException, "You have configured " << factorOrder.size() << " factors but the word " << str << " contains factor delimiter " << StaticData::Instance().GetFactorDelimiter() << " too many times."); // assume term/non-term same for all factors m_isNonTerminal = isNonTerminal; -- cgit v1.2.3 From 627f3f908cbc4a1dc4c5970a13b45bd5fcc66f82 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 31 Mar 2013 15:58:34 +0100 Subject: OS X returns EINVAL for write > 2^31 --- util/file.cc | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/util/file.cc b/util/file.cc index 86d9b12de..c7d8e23b2 100644 --- a/util/file.cc +++ b/util/file.cc @@ -111,15 +111,26 @@ void ResizeOrThrow(int fd, uint64_t to) { UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes"); } +namespace { +std::size_t GuardLarge(std::size_t size) { + // The following operating systems have broken read/write/pread/pwrite that + // only supports up to 2^31. 
+#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) + return std::min(static_cast(INT_MAX), size); +#else + return size; +#endif +} +} + std::size_t PartialRead(int fd, void *to, std::size_t amount) { #if defined(_WIN32) || defined(_WIN64) - amount = min(static_cast(INT_MAX), amount); - int ret = _read(fd, to, amount); + int ret = _read(fd, to, GuardLarge(amount)); #else errno = 0; ssize_t ret; do { - ret = read(fd, to, amount); + ret = read(fd, to, GuardLarge(amount)); } while (ret == -1 && errno == EINTR); #endif UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes"); @@ -169,11 +180,13 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { ssize_t ret; errno = 0; do { + ret = #ifdef OS_ANDROID - ret = pread64(fd, to, size, off); + pread64 #else - ret = pread(fd, to, size, off); + pread #endif + (fd, to, GuardLarge(size), off); } while (ret == -1 && errno == EINTR); if (ret <= 0) { UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd)); @@ -190,14 +203,20 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) { const uint8_t *data = static_cast(data_void); while (size) { #if defined(_WIN32) || defined(_WIN64) - int ret = write(fd, data, min(static_cast(INT_MAX), size)); + int ret; #else - errno = 0; ssize_t ret; +#endif + errno = 0; do { - ret = write(fd, data, size); - } while (ret == -1 && errno == EINTR); + ret = +#if defined(_WIN32) || defined(_WIN64) + _write +#else + write #endif + (fd, data, GuardLarge(size)); + } while (ret == -1 && errno == EINTR); UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes"); data += ret; size -= ret; -- cgit v1.2.3 From b144e48fcc5c84b24595899ec53eae5dc4bf2428 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 1 Apr 2013 11:05:55 +0100 Subject: Make failure to parse a boolean argument fatal instead of log + interpret as 
false. --- moses/Util.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/moses/Util.cpp b/moses/Util.cpp index 98de1241e..495e05124 100644 --- a/moses/Util.cpp +++ b/moses/Util.cpp @@ -35,6 +35,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "TypeDef.h" #include "Util.h" #include "Timer.h" +#include "util/exception.hh" #include "util/file.hh" using namespace std; @@ -65,6 +66,8 @@ const std::string ToLower(const std::string& str) return lc; } +class BoolValueException : public util::Exception {}; + template<> bool Scan(const std::string &input) { @@ -73,8 +76,7 @@ bool Scan(const std::string &input) return true; if (lc == "no" || lc == "n" || lc =="false" || lc == "0") return false; - TRACE_ERR( "Scan: didn't understand '" << lc << "', returning false" << std::endl); - return false; + UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0."); } bool FileExists(const std::string& filePath) -- cgit v1.2.3 From 0a978e9f01c566292da6586ca4aa82ae5e77b53c Mon Sep 17 00:00:00 2001 From: phikoehn Date: Mon, 1 Apr 2013 14:31:32 +0100 Subject: bug fixes --- OnDiskPt/Main.cpp | 1 + scripts/ems/support/analysis.perl | 17 +++++++++++++---- scripts/training/train-model.perl | 3 ++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp index 5f6da5a33..5d4e0be8d 100644 --- a/OnDiskPt/Main.cpp +++ b/OnDiskPt/Main.cpp @@ -174,6 +174,7 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr break; } default: + cerr << "ERROR in line " << line << endl; assert(false); break; } diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index 29962ca71..a2f9580a9 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -745,7 +745,8 @@ sub hierarchical_segmentation { 
open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!"; open(NODE,">$dir/node") or die "Cannot open: $!"; while() { - /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_"); + /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || + /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_"); my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7); if ($last_sentence >= 0 && $sentence != $last_sentence) { &hs_process($last_sentence,\@DERIVATION,\%STATS); @@ -1137,9 +1138,17 @@ sub process_search_graph { `mkdir -p $dir/search-graph`; my $last_sentence = -1; while() { - /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] core/) { + ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12); + $heuristic_rule_score = $rule_score; # hmmmm.... 
+ } + else { + die("ERROR: buggy search graph line: $_"); + } chop($alignment) if $alignment; chop($children) if $children; $recomb = 0 unless $recomb; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 5b0553581..e4292007e 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -38,8 +38,9 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_ $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS, @_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE, - $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_CORPUS, $_BASELINE_ALIGNMENT, + $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT, $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $IGNORE); +my $_BASELINE_CORPUS = ""; my $_CORES = 1; my $debug = 0; # debug this script, do not delete any files in debug mode -- cgit v1.2.3 From 354d1a9474eef9aaa6ae569b4b82f2d16b2d6395 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 2 Apr 2013 14:35:20 +0100 Subject: add back -early-distortion-cost accidently deleted --- moses/Parameter.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 356cf219b..6a9745ade 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -107,6 +107,7 @@ Parameter::Parameter() AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation"); AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables"); AddParam("distortion", "configurations for each factorized/lexicalized reordering model."); + AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. 
Default is no"); AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'"); AddParam("xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" ); AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation"); -- cgit v1.2.3 From 93433cf0157ecadd19162929a077aa6179dc0d8b Mon Sep 17 00:00:00 2001 From: Ondrej Bojar Date: Wed, 3 Apr 2013 18:07:42 +0200 Subject: support --translation-details OUTFILE in moses-parallel --- scripts/generic/moses-parallel.pl | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index d1840fc55..b8d393e71 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -64,6 +64,7 @@ my $wordgraphfile=undef; my $wordgraphflag=0; my $robust=5; # resubmit crashed jobs robust-times my $alifile=undef; +my $detailsfile=undef; my $logfile=""; my $logflag=""; my $searchgraphlist=""; @@ -93,6 +94,7 @@ sub init(){ 'output-search-graph|osg=s'=> \$searchgraphlist, 'output-word-graph|owg=s'=> \$wordgraphlist, 'alignment-output-file=s'=> \$alifile, + 'translation-details|T=s'=> \$detailsfile, 'qsub-prefix=s'=> \$qsubname, 'queue-parameters=s'=> \$queueparameters, 'inputtype=i'=> \$inputtype, @@ -539,6 +541,7 @@ while ($robust && scalar @idx_todo) { concatenate_1best(); concatenate_logs() if $logflag; concatenate_ali() if defined $alifile; +concatenate_details() if defined $detailsfile; concatenate_nbest() if $nbestflag; safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-'; @@ -580,6 +583,11 @@ sub preparing_script(){ $tmpalioutfile="-alignment-output-file $tmpdir/$alifile.$splitpfx$idx"; } + my $tmpdetailsoutfile = 
""; + if (defined $detailsfile){ + $tmpdetailsoutfile="-translation-details $tmpdir/$detailsfile.$splitpfx$idx"; + } + my $tmpsearchgraphlist=""; if ($searchgraphflag){ $tmpsearchgraphlist="-output-search-graph $tmpdir/$searchgraphfile.$splitpfx$idx"; @@ -592,13 +600,17 @@ sub preparing_script(){ my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId"; - print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; + print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; print OUT "echo exit status \$\?\n\n"; if (defined $alifile){ print OUT "\\mv -f $tmpdir/${alifile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } + if (defined $detailsfile){ + print OUT "\\mv -f $tmpdir/${detailsfile}.$splitpfx$idx .\n\n"; + print OUT "echo exit status \$\?\n\n"; + } if ($nbestflag){ print OUT "\\mv -f $tmpdir/${nbestfile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; @@ -827,6 +839,18 @@ sub concatenate_ali(){ close(OUT); } +sub concatenate_details(){ + open (OUT, "> ${detailsfile}"); + foreach my $idx (@idxlist){ + my @in=(); + open (IN, "$detailsfile.$splitpfx$idx"); + @in=; + print OUT "@in"; + close(IN); + } + close(OUT); +} + sub check_exit_status(){ print STDERR "check_exit_status\n"; @@ -925,6 +949,7 @@ sub remove_temporary_files(){ unlink("${inputfile}.${splitpfx}${idx}.trans"); unlink("${inputfile}.${splitpfx}${idx}"); if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); } + if (defined $detailsfile){ unlink("${detailsfile}.${splitpfx}${idx}"); } if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); } if ($searchgraphflag){ 
unlink("${searchgraphfile}.${splitpfx}${idx}"); } if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); } -- cgit v1.2.3 From ac82be3120ac0b143039e2e36dff8b8538bdcb68 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Wed, 3 Apr 2013 21:59:03 +0100 Subject: Hal moved. We follow. --- scripts/training/mert-moses.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 0ac3b414f..f73b58120 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -360,7 +360,7 @@ my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt"); # or set t if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) { print "Could not find $pro_optimizer, installing it in $mertdir\n"; - my $megam_url = "http://www.umiacs.umd.edu/~hal/megam/"; + my $megam_url = "http://hal3.name/megam"; if (&is_mac_osx()) { die "Error: Sorry for Mac OS X users! Please get the source code of megam and compile by hand. 
Please see $megam_url for details."; } -- cgit v1.2.3 From c016b6e04b9a0c6eddc423b0b839021f00599bbe Mon Sep 17 00:00:00 2001 From: phikoehn Date: Fri, 5 Apr 2013 11:26:00 +0100 Subject: extended display options for biconcor --- biconcor/PhrasePair.cpp | 37 ++++++++++++++++- biconcor/PhrasePair.h | 3 +- biconcor/PhrasePairCollection.cpp | 87 ++++++++++++++++++++++----------------- biconcor/PhrasePairCollection.h | 10 ++--- biconcor/biconcor.cpp | 65 +++++++++++++++++++++++++---- 5 files changed, 150 insertions(+), 52 deletions(-) diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp index 9c16be77c..038fa3a31 100644 --- a/biconcor/PhrasePair.cpp +++ b/biconcor/PhrasePair.cpp @@ -8,7 +8,42 @@ using namespace std; -void PhrasePair::Print( ostream* out, int width ) const +void PhrasePair::Print( ostream* out ) const +{ + // source + int sentence_start = m_source_position - m_source_start; + char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) ); + + for( char i=0; i0) *out << " "; + *out << m_suffixArray->GetWord( sentence_start + i ); + } + + // target + *out << " |||"; + for( char i=0; iGetWord( m_sentence_id, i); + } + + // source span + *out << " ||| " << (int)m_source_start << " " << (int)m_source_end; + + // target span + *out << " ||| " << (int)m_target_start << " " << (int)m_target_end; + + // word alignment + *out << " |||"; + + INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id ); + for( INDEX i=0; iGetSourceWord( m_sentence_id, i ) + << "-" << m_alignment->GetTargetWord( m_sentence_id, i ); + } + + *out << endl; +} + +void PhrasePair::PrintPretty( ostream* out, int width ) const { vector< WORD_ID >::const_iterator t; diff --git a/biconcor/PhrasePair.h b/biconcor/PhrasePair.h index f8a7881a0..f1dadb637 100644 --- a/biconcor/PhrasePair.h +++ b/biconcor/PhrasePair.h @@ -43,7 +43,8 @@ public: ~PhrasePair () {} void PrintTarget( std::ostream* out ) const; - void Print( std::ostream* 
out, int width ) const; + void Print( std::ostream* out ) const; + void PrintPretty( std::ostream* out, int width ) const; void PrintHTML( std::ostream* out ) const; void PrintClippedHTML( std::ostream* out, int width ) const; }; diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp index 17c95d24a..7497b2af8 100644 --- a/biconcor/PhrasePairCollection.cpp +++ b/biconcor/PhrasePairCollection.cpp @@ -13,31 +13,32 @@ using namespace std; -PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a ) +PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a, int max_translation, int max_example ) :m_suffixArray(sa) ,m_targetCorpus(tc) ,m_alignment(a) ,m_size(0) - ,m_max_lookup(10000) - ,m_max_pp_target(50) - ,m_max_pp(50) + ,m_max_lookup(10000) // maximum number of source occurrences sampled + ,m_max_translation(max_translation) // max number of different distinct translations returned + ,m_max_example(max_example) // max number of examples returned for each distinct translation {} PhrasePairCollection::~PhrasePairCollection() {} -bool PhrasePairCollection::GetCollection( const vector< string >& sourceString ) +int PhrasePairCollection::GetCollection( const vector< string >& sourceString ) { INDEX first_match, last_match; if (! 
m_suffixArray->FindMatches( sourceString, first_match, last_match )) { - return false; + return 0; } - cerr << "\tfirst match " << first_match << endl; - cerr << "\tlast match " << last_match << endl; + //cerr << "\tfirst match " << first_match << endl; + //cerr << "\tlast match " << last_match << endl; INDEX found = last_match - first_match +1; map< vector< WORD_ID >, INDEX > index; + int real_count = 0; for( INDEX i=first_match; i<=last_match; i++ ) { int position = m_suffixArray->GetPosition( i ); int source_start = m_suffixArray->GetWordInSentence( position ); @@ -45,23 +46,23 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString ) INDEX sentence_id = m_suffixArray->GetSentence( position ); int sentence_length = m_suffixArray->GetSentenceLength( sentence_id ); int target_length = m_targetCorpus->GetSentenceLength( sentence_id ); - cerr << "match " << (i-first_match) - << " in sentence " << sentence_id - << ", starting at word " << source_start - << " of " << sentence_length - << ". target sentence has " << target_length << " words."; + //cerr << "match " << (i-first_match) + //<< " in sentence " << sentence_id + //<< ", starting at word " << source_start + //<< " of " << sentence_length + //<< ". 
target sentence has " << target_length << " words."; int target_start, target_end, pre_null, post_null; if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) { - cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; - cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; + //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; + //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; bool null_boundary_words = false; for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) { for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) { vector< WORD_ID > targetString; - cerr << "; "; + //cerr << "; "; for (int target = target_start - pre; target <= target_end + post; target++) { targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) ); - cerr << m_targetCorpus->GetWord( sentence_id, target) << " "; + //cerr << m_targetCorpus->GetWord( sentence_id, target) << " "; } PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post); // matchCollection.Add( sentence_id, ) @@ -76,37 +77,47 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString ) } } else { - cerr << "mismatch " << (i-first_match) - << " in sentence " << sentence_id - << ", starting at word " << source_start - << " of " << sentence_length - << ". target sentence has " << target_length << " words."; + //cerr << "mismatch " << (i-first_match) + // << " in sentence " << sentence_id + // << ", starting at word " << source_start + // << " of " << sentence_length + // << ". 
target sentence has " << target_length << " words."; Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end ); if (mismatch->Unaligned()) m_unaligned.push_back( mismatch ); else m_mismatch.push_back( mismatch ); } - cerr << endl; + //cerr << endl; if (found > (INDEX)m_max_lookup) { i += found/m_max_lookup-1; } + real_count++; } sort(m_collection.begin(), m_collection.end(), CompareBySize()); - return true; + return real_count; } -void PhrasePairCollection::Print() const +void PhrasePairCollection::Print(bool pretty) const { vector< vector >::const_iterator ppWithSameTarget; - for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) { + int i=0; + for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && ibegin()))->PrintTarget( &cout ); int count = ppWithSameTarget->size(); cout << "(" << count << ")" << endl; - vector< PhrasePair* >::const_iterator p; - for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) { - (*p)->Print( &cout, 100 ); + vector< PhrasePair* >::const_iterator p = ppWithSameTarget->begin(); + for(int j=0; jsize() && jPrintPretty( &cout, 100 ); + } + else { + (*p)->Print( &cout ); + } + if (ppWithSameTarget->size() > m_max_example) { + p += ppWithSameTarget->size()/m_max_example-1; + } } } } @@ -117,7 +128,7 @@ void PhrasePairCollection::PrintHTML() const bool singleton = false; // loop over all translations vector< vector >::const_iterator ppWithSameTarget; - for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_targetsize(); if (!singleton) { @@ -143,9 +154,9 @@ void PhrasePairCollection::PrintHTML() const int i=0; for(p = ppWithSameTarget->begin(); i<10 && ppend(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); - if (count > m_max_pp) { - p += count/m_max_pp-1; - pp += count/m_max_pp-1; + if (count > 
m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; } } if (i == 10 && pp < count) { @@ -153,11 +164,11 @@ void PhrasePairCollection::PrintHTML() const cout << "(more)"; cout << "
"; cout << ""; - for(i=0, pp=0, p = ppWithSameTarget->begin(); iend(); p++, pp++, i++ ) { + for(i=0, pp=0, p = ppWithSameTarget->begin(); iend(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); - if (count > m_max_pp) { - p += count/m_max_pp-1; - pp += count/m_max_pp-1; + if (count > m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; } } } @@ -172,7 +183,7 @@ void PhrasePairCollection::PrintHTML() const if (singleton) cout << "
\n"; else if (pp_target > 9) cout << ""; - size_t max_mismatch = m_max_pp/3; + size_t max_mismatch = m_max_example/3; // unaligned phrases if (m_unaligned.size() > 0) { cout << "

unaligned" diff --git a/biconcor/PhrasePairCollection.h b/biconcor/PhrasePairCollection.h index f88bfc10f..e076eba9b 100644 --- a/biconcor/PhrasePairCollection.h +++ b/biconcor/PhrasePairCollection.h @@ -22,19 +22,19 @@ private: std::vector< Mismatch* > m_mismatch, m_unaligned; int m_size; int m_max_lookup; - int m_max_pp_target; - int m_max_pp; + int m_max_translation; + int m_max_example; // No copying allowed. PhrasePairCollection(const PhrasePairCollection&); void operator=(const PhrasePairCollection&); public: - PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment * ); + PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment *, int, int ); ~PhrasePairCollection (); - bool GetCollection( const std::vector& sourceString ); - void Print() const; + int GetCollection( const std::vector& sourceString ); + void Print(bool pretty) const; void PrintHTML() const; }; diff --git a/biconcor/biconcor.cpp b/biconcor/biconcor.cpp index a25e63cb7..f4e7c03fb 100644 --- a/biconcor/biconcor.cpp +++ b/biconcor/biconcor.cpp @@ -19,8 +19,12 @@ int main(int argc, char* argv[]) int saveFlag = false; int createFlag = false; int queryFlag = false; - int htmlFlag = false; - string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n"; + int htmlFlag = false; // output as HTML + int prettyFlag = false; // output readable on screen + int stdioFlag = false; // receive requests from STDIN, respond to STDOUT + int max_translation = 20; + int max_example = 50; + string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n\t[--translations count]\n\t[--examples count]\n\t[--html]\n\t[--stdio]\n"; while(1) { static struct option long_options[] = { {"load", required_argument, 0, 'l'}, @@ -29,11 +33,15 @@ int main(int argc, char* argv[]) {"query", 
required_argument, 0, 'q'}, {"target", required_argument, 0, 't'}, {"alignment", required_argument, 0, 'a'}, - {"html", no_argument, &htmlFlag, 0}, + {"html", no_argument, 0, 'h'}, + {"pretty", no_argument, 0, 'p'}, + {"stdio", no_argument, 0, 'i'}, + {"translations", required_argument, 0, 'o'}, + {"examples", required_argument, 0, 'e'}, {0, 0, 0, 0} }; int option_index = 0; - int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:h", long_options, &option_index); + int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:hpio:e:", long_options, &option_index); if (c == -1) break; switch (c) { case 'l': @@ -62,11 +70,29 @@ int main(int argc, char* argv[]) query = string(optarg); queryFlag = true; break; + case 'o': + max_translation = atoi(optarg); + break; + case 'e': + max_example = atoi(optarg); + break; + case 'p': + prettyFlag = true; + break; + case 'h': + htmlFlag = true; + break; + case 'i': + stdioFlag = true; + break; default: cerr << info; exit(1); } } + if (stdioFlag) { + queryFlag = true; + } // check if parameter settings are legal if (saveFlag && !createFlag) { @@ -111,12 +137,37 @@ int main(int argc, char* argv[]) targetCorpus.Load( fileNameSuffix ); alignment.Load( fileNameSuffix ); } - if (queryFlag) { + if (stdioFlag) { + cout << "-|||- BICONCOR START -|||-" << endl << flush; + while(true) { + string query; + if (getline(cin, query, '\n').eof()) { + return 0; + } + vector< string > queryString = alignment.Tokenize( query.c_str() ); + PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); + int total = ppCollection.GetCollection( queryString ); + cout << "TOTAL: " << total << endl; + if (htmlFlag) { + ppCollection.PrintHTML(); + } + else { + ppCollection.Print(prettyFlag); + } + cout << "-|||- BICONCOR END -|||-" << endl << flush; + } + } + else if (queryFlag) { cerr << "query is " << query << endl; vector< string > queryString = alignment.Tokenize( query.c_str() ); - PhrasePairCollection ppCollection( 
&suffixArray, &targetCorpus, &alignment ); + PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); ppCollection.GetCollection( queryString ); - ppCollection.PrintHTML(); + if (htmlFlag) { + ppCollection.PrintHTML(); + } + else { + ppCollection.Print(prettyFlag); + } } return 0; -- cgit v1.2.3 From 38aa0c74b0f6ac994ccc4affd69435935f450ae6 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Tue, 9 Apr 2013 11:13:11 +0200 Subject: sigtest-filter: hierarchical mode now works with syntactic models (labels other than X) --- contrib/sigtest-filter/filter-pt.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/contrib/sigtest-filter/filter-pt.cpp b/contrib/sigtest-filter/filter-pt.cpp index f06d2b430..6ab1a5657 100644 --- a/contrib/sigtest-filter/filter-pt.cpp +++ b/contrib/sigtest-filter/filter-pt.cpp @@ -287,24 +287,24 @@ SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicati if (hierarchical) { // std::cerr << "splitting up phrase: " << phrase << "\n"; int pos = 0; - int endPos = 0; + int NTStartPos, NTEndPos; vector phrases; - - while (rule.find("[X][X] ", pos) < rule.size()) { - endPos = rule.find("[X][X] ",pos) - 1; // -1 to cut space before NT - if (endPos < pos) { // no space: NT at start of rule (or two consecutive NTs) - pos += 7; + while (rule.find("] ", pos) < rule.size()) { + NTStartPos = rule.find("[",pos) - 1; // -1 to cut space before NT + NTEndPos = rule.find("] ",pos); + if (NTStartPos < pos) { // no space: NT at start of rule (or two consecutive NTs) + pos = NTEndPos + 2; continue; } - phrases.push_back(rule.substr(pos,endPos-pos)); - pos = endPos + 8; + phrases.push_back(rule.substr(pos,NTStartPos-pos)); + pos = NTEndPos + 2; } - // cut LHS of rule - endPos = rule.size()-4; - if (endPos > pos) { - phrases.push_back(rule.substr(pos,endPos-pos)); + NTStartPos = rule.find("[",pos) - 1; // LHS of rule + if (NTStartPos > pos) { + 
phrases.push_back(rule.substr(pos,NTStartPos-pos)); } + sa_set = lookup_multiple_phrases(phrases, my_sa, rule, cache); } else { -- cgit v1.2.3 From 5dce1463e715295fa9442d9b5eac079a76f890b6 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Tue, 9 Apr 2013 11:15:28 +0200 Subject: documentation: -phrase-word-alignment is on by default. --- contrib/tmcombine/README.md | 2 +- contrib/tmcombine/tmcombine.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/tmcombine/README.md b/contrib/tmcombine/README.md index 2cbc83299..7b8ebd45e 100644 --- a/contrib/tmcombine/README.md +++ b/contrib/tmcombine/README.md @@ -58,7 +58,7 @@ Regression tests (check if the output files (`test/phrase-table_testN`) differ f FURTHER NOTES ------------- - - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models. + - Different combination algorithms require different statistics. To be on the safe side, use the option and `-write-lexical-counts` when training models. - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). Sort the tables with `LC_ALL=C`. Phrase tables produced by Moses are sorted correctly. diff --git a/contrib/tmcombine/tmcombine.py b/contrib/tmcombine/tmcombine.py index 0bbcf7c78..5b65cc590 100755 --- a/contrib/tmcombine/tmcombine.py +++ b/contrib/tmcombine/tmcombine.py @@ -15,7 +15,7 @@ # Some general things to note: -# - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models. +# - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models. # - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). sort with LC_ALL=C. 
# - Some configurations require additional statistics that are loaded in memory (lexical tables; complete list of target phrases). If memory consumption is a problem, use the option --lowmem (slightly slower and writes temporary files to disk), or consider pruning your phrase table before combining (e.g. using Johnson et al. 2007). # - The script can read/write gzipped files, but the Python implementation is slow. You're better off unzipping the files on the command line and working with the unzipped files. @@ -306,7 +306,7 @@ class Moses(): # assuming that alignment is empty elif len(line) == 4: if self.require_alignment: - sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment\n') + sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n') exit() self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')] -- cgit v1.2.3 From 44a0e52e3052371a850f7ee463279f4cc0522ea5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 9 Apr 2013 14:44:32 +0100 Subject: fixed ShowWeights() for confusion networks. 
This is a reason why we should get rid of ShortNames and move to refactored moses pdq --- contrib/other-builds/moses-chart-cmd/.cproject | 2 +- moses-cmd/Main.cpp | 2 +- scripts/training/mert-moses.pl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject index aae6822b6..71462b5df 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -60,7 +60,6 @@ diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index 68d8049c4..b08ba532a 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -431,7 +431,7 @@ static void PrintFeatureWeight(const FeatureFunction* ff) vector values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); for (size_t i = 0; i < numScoreComps; ++i) cout << ff->GetScoreProducerDescription() << " " - << ff->GetScoreProducerWeightShortName() << " " + << ff->GetScoreProducerWeightShortName(i) << " " << values[i] << endl; } else { diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index f73b58120..175fa12fb 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl -w # $Id$ # Usage: # mert-moses.pl -- cgit v1.2.3 From 73035543d6086d98185b38d313a670176adde1f2 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 10 Apr 2013 18:27:25 +0100 Subject: Binary phrase table does string ops, at least make them fast --- moses/PDTAimp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/moses/PDTAimp.h b/moses/PDTAimp.h index 25131b98a..5680b8ecb 100644 --- a/moses/PDTAimp.h +++ b/moses/PDTAimp.h @@ -11,6 +11,7 @@ #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h" #include "SparsePhraseDictionaryFeature.h" #include "Util.h" +#include "util/tokenize_piece.hh" namespace Moses { @@ -284,11 +285,10 @@ protected: FactorCollection 
&factorCollection = FactorCollection::Instance(); for(size_t k=0; k factors=TokenizeMultiCharSeparator(*factorStrings[k],StaticData::Instance().GetFactorDelimiter()); - CHECK(factors.size()==m_output.size()); + util::TokenIter word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter()); Word& w=targetPhrase.AddWord(); - for(size_t l=0; l Date: Wed, 10 Apr 2013 18:40:25 +0100 Subject: add score breakdown to target phrase debugging output --- moses/TargetPhrase.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index b1d99ab50..6f14657a3 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -326,8 +326,10 @@ TO_STRING_BODY(TargetPhrase); std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp) { - os << static_cast(tp) << ":" << tp.GetAlignNonTerm(); - os << ": c=" << tp.m_fullScore; + os << static_cast(tp) << ":" << flush; + os << tp.GetAlignNonTerm() << flush; + os << ": c=" << tp.m_fullScore << flush; + os << " " << tp.m_scoreBreakdown << flush; return os; } -- cgit v1.2.3