diff options
-rw-r--r-- | moses/FF/FeatureFunction.cpp | 4 | ||||
-rw-r--r-- | moses/FF/FeatureFunction.h | 1 | ||||
-rw-r--r-- | moses/FF/PhraseOrientationFeature.cpp | 616 | ||||
-rw-r--r-- | moses/FF/PhraseOrientationFeature.h | 159 | ||||
-rw-r--r-- | moses/PP/OrientationPhraseProperty.cpp | 6 | ||||
-rw-r--r-- | moses/PP/OrientationPhraseProperty.h | 16 | ||||
-rw-r--r-- | moses/StaticData.cpp | 2 | ||||
-rw-r--r-- | moses/Util.h | 5 | ||||
-rw-r--r-- | phrase-extract/ExtractionPhrasePair.cpp | 10 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/ExtractGHKM.cpp | 10 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/PhraseOrientation.cpp | 210 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/PhraseOrientation.h | 41 |
12 files changed, 846 insertions, 234 deletions
diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp index 5d4e0f91e..4b5faa91e 100644 --- a/moses/FF/FeatureFunction.cpp +++ b/moses/FF/FeatureFunction.cpp @@ -45,6 +45,7 @@ void FeatureFunction::CallChangeSource(InputType *&input) FeatureFunction:: FeatureFunction(const std::string& line) : m_tuneable(true) + , m_verbosity(1) , m_numScoreComponents(1) { Initialize(line); @@ -54,6 +55,7 @@ FeatureFunction:: FeatureFunction(size_t numScoreComponents, const std::string& line) : m_tuneable(true) + , m_verbosity(0) , m_numScoreComponents(numScoreComponents) { Initialize(line); @@ -115,6 +117,8 @@ void FeatureFunction::SetParameter(const std::string& key, const std::string& va { if (key == "tuneable") { m_tuneable = Scan<bool>(value); + } else if (key == "verbosity") { + m_verbosity = Scan<size_t>(value); } else if (key == "filterable") { //ignore } else { UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value); diff --git a/moses/FF/FeatureFunction.h b/moses/FF/FeatureFunction.h index b30815e05..115797228 100644 --- a/moses/FF/FeatureFunction.h +++ b/moses/FF/FeatureFunction.h @@ -34,6 +34,7 @@ protected: std::string m_description, m_argLine; std::vector<std::vector<std::string> > m_args; bool m_tuneable; + size_t m_verbosity; size_t m_numScoreComponents; //In case there's multiple producers with the same description static std::multiset<std::string> description_counts; diff --git a/moses/FF/PhraseOrientationFeature.cpp b/moses/FF/PhraseOrientationFeature.cpp index 0f6d8bcb1..4e2a8c637 100644 --- a/moses/FF/PhraseOrientationFeature.cpp +++ b/moses/FF/PhraseOrientationFeature.cpp @@ -1,4 +1,13 @@ -#include <vector> +// +// REFERENCE +// --------- +// When using this feature, please cite: +// +// Matthias Huck, Joern Wuebker, Felix Rietig, and Hermann Ney. +// A Phrase Orientation Model for Hierarchical Machine Translation. +// In ACL 2013 Eighth Workshop on Statistical Machine Translation (WMT 2013), pages 452-463, Sofia, Bulgaria, August 2013. +// + #include "PhraseOrientationFeature.h" #include "moses/InputFileStream.h" #include "moses/ScoreComponentCollection.h" @@ -10,82 +19,83 @@ #include "moses/PP/OrientationPhraseProperty.h" #include "phrase-extract/extract-ghkm/Alignment.h" -using namespace std; namespace Moses { PhraseOrientationFeature::PhraseOrientationFeature(const std::string &line) - : StatelessFeatureFunction(8, line) + : StatefulFeatureFunction(6, line) + , m_glueTargetLHSStr("Q") + , m_glueTargetLHS(true) + , m_offsetR2LScores(0) { VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ..."); ReadParameters(); - VERBOSE(1, " Done."); + FactorCollection &fc = FactorCollection::Instance(); + const Factor *factor = fc.AddFactor(m_glueTargetLHSStr, true); + m_glueTargetLHS.SetFactor(0, factor); + m_offsetR2LScores = m_numScoreComponents / 2; + VERBOSE(1, " Done." << std::endl); } void PhraseOrientationFeature::SetParameter(const std::string& key, const std::string& value) { - if (key == "tuneable") { + if (key == "tuneable") + { m_tuneable = Scan<bool>(value); - } else { - StatelessFeatureFunction::SetParameter(key, value); + } + else if (key == "glueTargetLHS") + { + m_glueTargetLHSStr = value; + } + else + { + StatefulFeatureFunction::SetParameter(key, value); } } -void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const -{ - targetPhrase.SetRuleSource(source); -} - -void PhraseOrientationFeature::EvaluateWhenApplied( +FFState* PhraseOrientationFeature::EvaluateWhenApplied( const ChartHypothesis& hypo, + int featureID, // used to index the state in the previous hypotheses ScoreComponentCollection* accumulator) const { // Dense scores - std::vector<float> newScores(m_numScoreComponents,0); // m_numScoreComponents == 8 + std::vector<float> newScores(m_numScoreComponents,0); + + // State: ignored wrt. recombination; used to propagate orientation probabilities in case of boundary non-terminals + PhraseOrientationFeatureState *state = new PhraseOrientationFeatureState(); // Read Orientation property const TargetPhrase &currTarPhr = hypo.GetCurrTargetPhrase(); + const Word &currTarPhrLHS = currTarPhr.GetTargetLHS(); const Phrase *currSrcPhr = currTarPhr.GetRuleSource(); // const Factor* targetLHS = currTarPhr.GetTargetLHS()[0]; // bool isGlueGrammarRule = false; - std::map<size_t,size_t> alignMap; - alignMap.insert( - currTarPhr.GetAlignTerm().begin(), - currTarPhr.GetAlignTerm().end()); - alignMap.insert( - currTarPhr.GetAlignNonTerm().begin(), - currTarPhr.GetAlignNonTerm().end()); + FEATUREVERBOSE(2, *currSrcPhr << std::endl); + FEATUREVERBOSE(2, currTarPhr << std::endl); Moses::GHKM::Alignment alignment; std::vector<int> alignmentNTs(currTarPhr.GetSize(),-1); // TODO: can be smaller (number of right-hand side non-terminals) for (AlignmentInfo::const_iterator it=currTarPhr.GetAlignTerm().begin(); - it!=currTarPhr.GetAlignTerm().end(); ++it) { + it!=currTarPhr.GetAlignTerm().end(); ++it) + { alignment.push_back(std::make_pair(it->first, it->second)); -// std::cerr << "alignTerm " << it->first << " " << it->second << std::endl; + FEATUREVERBOSE(2, "alignTerm " << it->first << " " << it->second << std::endl); } for (AlignmentInfo::const_iterator it=currTarPhr.GetAlignNonTerm().begin(); - it!=currTarPhr.GetAlignNonTerm().end(); ++it) { + it!=currTarPhr.GetAlignNonTerm().end(); ++it) + { alignment.push_back(std::make_pair(it->first, it->second)); alignmentNTs[it->second] = it->first; -// std::cerr << "alignNonTerm " << it->first << " " << it->second << std::endl; + FEATUREVERBOSE(2, "alignNonTerm " << it->first << " " << it->second << std::endl); } // Initialize phrase orientation scoring object - Moses::GHKM::PhraseOrientation phraseOrientation(currSrcPhr->GetSize(), currTarPhr.GetSize(), alignment); - // TODO: Efficiency! This should be precomputed. - -// std::cerr << *currSrcPhr << std::endl; -// std::cerr << currTarPhr << std::endl; -// std::cerr << currSrcPhr->GetSize() << std::endl; -// std::cerr << currTarPhr.GetSize() << std::endl; + Moses::GHKM::PhraseOrientation phraseOrientation(currSrcPhr->GetSize(), currTarPhr.GetSize(), alignment); // TODO: Efficiency! This should be precomputed. // Get index map for underlying hypotheses const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = @@ -95,94 +105,376 @@ void PhraseOrientationFeature::EvaluateWhenApplied( size_t nonTerminalNumber = 0; - for (size_t phrasePos=0; phrasePos<currTarPhr.GetSize(); ++phrasePos) { + for (size_t targetIndex=0; targetIndex<currTarPhr.GetSize(); ++targetIndex) + { // consult rule for either word or non-terminal - const Word &word = currTarPhr.GetWord(phrasePos); - if ( word.IsNonTerminal() ) { + const Word &word = currTarPhr.GetWord(targetIndex); + if ( word.IsNonTerminal() ) + { + + int sourceIndex = alignmentNTs[targetIndex]; + FEATUREVERBOSE(2, "Scoring nonTerminalNumber= " << nonTerminalNumber << " targetIndex= " << targetIndex << " sourceIndex= " << sourceIndex << std::endl); + // non-terminal: consult subderivation - size_t nonTermIndex = nonTermIndexMap[phrasePos]; + size_t nonTermIndex = nonTermIndexMap[targetIndex]; const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex); const TargetPhrase &prevTarPhr = prevHypo->GetCurrTargetPhrase(); - if (const PhraseProperty *property = prevTarPhr.GetProperty("Orientation")) { + if (const PhraseProperty *property = prevTarPhr.GetProperty("Orientation")) + { const OrientationPhraseProperty *orientationPhraseProperty = static_cast<const OrientationPhraseProperty*>(property); -// std::cerr << "L2R_Mono " << orientationPhraseProperty->GetLeftToRightProbabilityMono(); -// std::cerr << " L2R_Swap " << orientationPhraseProperty->GetLeftToRightProbabilitySwap(); -// std::cerr << " L2R_Dright " << orientationPhraseProperty->GetLeftToRightProbabilityDright(); -// std::cerr << " L2R_Dleft " << orientationPhraseProperty->GetLeftToRightProbabilityDleft(); -// std::cerr << " R2L_Mono " << orientationPhraseProperty->GetRightToLeftProbabilityMono(); -// std::cerr << " R2L_Swap " << orientationPhraseProperty->GetRightToLeftProbabilitySwap(); -// std::cerr << " R2L_Dright " << orientationPhraseProperty->GetRightToLeftProbabilityDright(); -// std::cerr << " R2L_Dleft " << orientationPhraseProperty->GetRightToLeftProbabilityDleft(); -// std::cerr << std::endl; - - Moses::GHKM::REO_POS l2rOrientation=Moses::GHKM::UNKNOWN, r2lOrientation=Moses::GHKM::UNKNOWN; - int sourceIndex = alignmentNTs[phrasePos]; -// std::cerr << "targetIndex " << phrasePos << " sourceIndex " << sourceIndex << std::endl; - l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::L2R); - r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::R2L); - -// std::cerr << "l2rOrientation "; - switch(l2rOrientation) { - case Moses::GHKM::LEFT: + FEATUREVERBOSE(5, "orientationPhraseProperty: " + << "L2R_Mono " << orientationPhraseProperty->GetLeftToRightProbabilityMono() + << " L2R_Swap " << orientationPhraseProperty->GetLeftToRightProbabilitySwap() + << " L2R_Dright " << orientationPhraseProperty->GetLeftToRightProbabilityDright() + << " L2R_Dleft " << orientationPhraseProperty->GetLeftToRightProbabilityDleft() + << " R2L_Mono " << orientationPhraseProperty->GetRightToLeftProbabilityMono() + << " R2L_Swap " << orientationPhraseProperty->GetRightToLeftProbabilitySwap() + << " R2L_Dright " << orientationPhraseProperty->GetRightToLeftProbabilityDright() + << " R2L_Dleft " << orientationPhraseProperty->GetRightToLeftProbabilityDleft() + << std::endl); + + const PhraseOrientationFeatureState* prevState = + static_cast<const PhraseOrientationFeatureState*>(prevHypo->GetFFState(featureID)); + + + // LEFT-TO-RIGHT DIRECTION + + Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::PhraseOrientation::REO_DIR_L2R); + + IFFEATUREVERBOSE(2) + { + FEATUREVERBOSE(2, "l2rOrientation "); + switch (l2rOrientation) + { + case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT: + FEATUREVERBOSE2(2, "mono" << std::endl); + break; + case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT: + FEATUREVERBOSE2(2, "swap" << std::endl); + break; + case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + FEATUREVERBOSE2(2, "dleft" << std::endl); + break; + case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + FEATUREVERBOSE2(2, "dright" << std::endl); + break; + case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR + FEATUREVERBOSE2(2, "unknown->dleft" << std::endl); + break; + default: + UTIL_THROW2(GetScoreProducerDescription() + << ": Unsupported orientation type."); + break; + } + } + + bool delayedScoringL2R = false; + + if ( ((targetIndex == 0) || !phraseOrientation.TargetSpanIsAligned(0,targetIndex)) // boundary non-terminal in rule-initial position (left boundary) + && (currTarPhrLHS != m_glueTargetLHS) ) // and not glue rule + { + // delay left-to-right scoring + + FEATUREVERBOSE(3, "Left boundary"); + if (targetIndex != 0) { + FEATUREVERBOSE2(3, " (with targetIndex!=0)"); + } + FEATUREVERBOSE2(3, std::endl); + + bool previousSourceSpanIsAligned = ( (sourceIndex > 0) && phraseOrientation.SourceSpanIsAligned(0,sourceIndex-1) ); + bool followingSourceSpanIsAligned = ( (sourceIndex < ((int)currSrcPhr->GetSize())-1) && phraseOrientation.SourceSpanIsAligned(sourceIndex,currSrcPhr->GetSize()-1) ); + + FEATUREVERBOSE(4, "previousSourceSpanIsAligned = " << previousSourceSpanIsAligned << std::endl); + FEATUREVERBOSE(4, "followingSourceSpanIsAligned = " << followingSourceSpanIsAligned << std::endl;); + + if (previousSourceSpanIsAligned && followingSourceSpanIsAligned) + { + // discontinuous + l2rOrientation = Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT; + } + else + { + FEATUREVERBOSE(3, "Delaying left-to-right scoring" << std::endl); + + delayedScoringL2R = true; + std::bitset<3> possibleFutureOrientationsL2R(0x7); + possibleFutureOrientationsL2R[0] = !previousSourceSpanIsAligned; + possibleFutureOrientationsL2R[1] = !followingSourceSpanIsAligned; + + // add heuristic scores + + std::vector<float> weightsVector = StaticData::Instance().GetAllWeights().GetScoresForProducer(this); + std::vector<float> scoresL2R; + scoresL2R.push_back( std::log(orientationPhraseProperty->GetLeftToRightProbabilityMono()) ); + scoresL2R.push_back( std::log(orientationPhraseProperty->GetLeftToRightProbabilitySwap()) ); + scoresL2R.push_back( std::log(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous()) ); + std::vector<float> weightedScoresL2R; + for ( size_t i=0; i<3;++i ) + { + weightedScoresL2R.push_back( weightsVector[i] * scoresL2R[i] ); + } + + size_t heuristicScoreIndex = 0; + for (size_t i=1; i<3; ++i) + { + if (possibleFutureOrientationsL2R[i]) + { + if (weightedScoresL2R[i] > weightedScoresL2R[heuristicScoreIndex]) + { + heuristicScoreIndex = i; + } + } + } + + IFFEATUREVERBOSE(5) + { + FEATUREVERBOSE(5, "Heuristic score computation (L2R): " + << "heuristicScoreIndex= " << heuristicScoreIndex); + for (size_t i=0; i<3; ++i) + FEATUREVERBOSE2(5, " weightsVector[" << i << "]= " << weightsVector[i]); + for (size_t i=0; i<3; ++i) + FEATUREVERBOSE2(5, " scoresL2R[" << i << "]= " << scoresL2R[i]); + for (size_t i=0; i<3; ++i) + FEATUREVERBOSE2(5, " weightedScoresL2R[" << i << "]= " << weightedScoresL2R[i]); + for (size_t i=0; i<3; ++i) + FEATUREVERBOSE2(5, " possibleFutureOrientationsL2R[" << i << "]= " << possibleFutureOrientationsL2R[i]); + if ( possibleFutureOrientationsL2R == 0x7 ) + { + FEATUREVERBOSE2(5, " (all orientations possible)"); + } + FEATUREVERBOSE2(5, std::endl); + } + + newScores[heuristicScoreIndex] += scoresL2R[heuristicScoreIndex]; + state->SetLeftBoundaryL2R(scoresL2R, heuristicScoreIndex, possibleFutureOrientationsL2R, nonTermIndex); + + if ( (possibleFutureOrientationsL2R & prevState->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations) == 0x4 ) + { + // recursive: discontinuous orientation + FEATUREVERBOSE(5, "previous state: L2R discontinuous orientation " + << possibleFutureOrientationsL2R << " & " << prevState->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations + << " = " << (possibleFutureOrientationsL2R & prevState->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations) + << std::endl); + LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores); + state->m_leftBoundaryRecursionGuard = true; // prevent subderivation from being scored recursively multiple times + } + } + } + + if (!delayedScoringL2R) + { + switch (l2rOrientation) + { + case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT: newScores[0] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityMono()); -// std::cerr << "mono" << std::endl; + // if sub-derivation has left-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x1, newScores); break; - case Moses::GHKM::RIGHT: + case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT: newScores[1] += std::log(orientationPhraseProperty->GetLeftToRightProbabilitySwap()); -// std::cerr << "swap" << std::endl; + // if sub-derivation has left-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x2, newScores); break; - case Moses::GHKM::DRIGHT: - newScores[2] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityDright()); -// std::cerr << "dright" << std::endl; + case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + newScores[2] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous()); + // if sub-derivation has left-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores); break; - case Moses::GHKM::DLEFT: - newScores[3] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityDleft()); -// std::cerr << "dleft" << std::endl; + case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + newScores[2] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous()); + // if sub-derivation has left-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores); break; - case Moses::GHKM::UNKNOWN: - // modelType == Moses::GHKM::REO_MSLR - newScores[2] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityDright()); -// std::cerr << "unknown->dright" << std::endl; + case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR + newScores[2] += std::log(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous()); + // if sub-derivation has left-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores); break; default: UTIL_THROW2(GetScoreProducerDescription() << ": Unsupported orientation type."); break; + } + } + + + // RIGHT-TO-LEFT DIRECTION + + Moses::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,Moses::GHKM::PhraseOrientation::REO_DIR_R2L); + + IFFEATUREVERBOSE(2) + { + FEATUREVERBOSE(2, "r2lOrientation "); + switch (r2lOrientation) + { + case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT: + FEATUREVERBOSE2(2, "mono" << std::endl); + break; + case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT: + FEATUREVERBOSE2(2, "swap" << std::endl); + break; + case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + FEATUREVERBOSE2(2, "dleft" << std::endl); + break; + case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + FEATUREVERBOSE2(2, "dright" << std::endl); + break; + case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR + FEATUREVERBOSE2(2, "unknown->dleft" << std::endl); + break; + default: + UTIL_THROW2(GetScoreProducerDescription() + << ": Unsupported orientation type."); + break; + } } -// std::cerr << "r2lOrientation "; - switch(r2lOrientation) { - case Moses::GHKM::LEFT: - newScores[4] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityMono()); -// std::cerr << "mono" << std::endl; + bool delayedScoringR2L = false; + + if ( ((targetIndex == currTarPhr.GetSize()-1) || !phraseOrientation.TargetSpanIsAligned(targetIndex,currTarPhr.GetSize()-1)) // boundary non-terminal in rule-final position (right boundary) + && (currTarPhrLHS != m_glueTargetLHS) ) // and not glue rule + { + // delay right-to-left scoring + + FEATUREVERBOSE(3, "Right boundary"); + if (targetIndex != currTarPhr.GetSize()-1) { + FEATUREVERBOSE2(3, " (with targetIndex!=currTarPhr.GetSize()-1)"); + } + FEATUREVERBOSE2(3, std::endl); + + bool previousSourceSpanIsAligned = ( (sourceIndex > 0) && phraseOrientation.SourceSpanIsAligned(0,sourceIndex-1) ); + bool followingSourceSpanIsAligned = ( (sourceIndex < ((int)currSrcPhr->GetSize())-1) && phraseOrientation.SourceSpanIsAligned(sourceIndex,currSrcPhr->GetSize()-1) ); + + FEATUREVERBOSE(4, "previousSourceSpanIsAligned = " << previousSourceSpanIsAligned << std::endl); + FEATUREVERBOSE(4, "followingSourceSpanIsAligned = " << followingSourceSpanIsAligned << std::endl;); + + if (previousSourceSpanIsAligned && followingSourceSpanIsAligned) + { + // discontinuous + r2lOrientation = Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT; + } + else + { + FEATUREVERBOSE(3, "Delaying right-to-left scoring" << std::endl); + + delayedScoringR2L = true; + std::bitset<3> possibleFutureOrientationsR2L(0x7); + possibleFutureOrientationsR2L[0] = !followingSourceSpanIsAligned; + possibleFutureOrientationsR2L[1] = !previousSourceSpanIsAligned; + + // add heuristic scores + + std::vector<float> weightsVector = StaticData::Instance().GetAllWeights().GetScoresForProducer(this); + std::vector<float> scoresR2L; + scoresR2L.push_back( std::log(orientationPhraseProperty->GetRightToLeftProbabilityMono()) ); + scoresR2L.push_back( std::log(orientationPhraseProperty->GetRightToLeftProbabilitySwap()) ); + scoresR2L.push_back( std::log(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous()) ); + std::vector<float> weightedScoresR2L; + for ( size_t i=0; i<3;++i ) + { + weightedScoresR2L.push_back( weightsVector[m_offsetR2LScores+i] * scoresR2L[i] ); + } + + size_t heuristicScoreIndex = 0; + for (size_t i=1; i<3; ++i) + { + if (possibleFutureOrientationsR2L[i]) + { + if (weightedScoresR2L[i] > weightedScoresR2L[heuristicScoreIndex]) + { + heuristicScoreIndex = i; + } + } + } + + IFFEATUREVERBOSE(5) + { + FEATUREVERBOSE(5, "Heuristic score computation (R2L): " + << "heuristicScoreIndex= " << heuristicScoreIndex); + for (size_t i=0; i<3; ++i) + FEATUREVERBOSE2(5, " weightsVector[" << m_offsetR2LScores+i << "]= " << weightsVector[m_offsetR2LScores+i]); + for (size_t i=0; i<3; ++i) + FEATUREVERBOSE2(5, " scoresR2L[" << i << "]= " << scoresR2L[i]); + for (size_t i=0; i<3; ++i) + FEATUREVERBOSE2(5, " weightedScoresR2L[" << i << "]= " << weightedScoresR2L[i]); + for (size_t i=0; i<3; ++i) + FEATUREVERBOSE2(5, " possibleFutureOrientationsR2L[" << i << "]= " << possibleFutureOrientationsR2L[i]); + if ( possibleFutureOrientationsR2L == 0x7 ) + { + FEATUREVERBOSE2(5, " (all orientations possible)"); + } + FEATUREVERBOSE2(5, std::endl); + } + + newScores[m_offsetR2LScores+heuristicScoreIndex] += scoresR2L[heuristicScoreIndex]; + state->SetRightBoundaryR2L(scoresR2L, heuristicScoreIndex, possibleFutureOrientationsR2L, nonTermIndex); + + if ( (possibleFutureOrientationsR2L & prevState->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations) == 0x4 ) + { + // recursive: discontinuous orientation + FEATUREVERBOSE(5, "previous state: R2L discontinuous orientation " + << possibleFutureOrientationsR2L << " & " << prevState->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations + << " = " << (possibleFutureOrientationsR2L & prevState->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations) + << std::endl); + RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores); + state->m_rightBoundaryRecursionGuard = true; // prevent subderivation from being scored recursively multiple times + } + } + } + + if (!delayedScoringR2L) + { + switch (r2lOrientation) + { + case Moses::GHKM::PhraseOrientation::REO_CLASS_LEFT: + newScores[m_offsetR2LScores+0] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityMono()); + // if sub-derivation has right-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x1, newScores); break; - case Moses::GHKM::RIGHT: - newScores[5] += std::log(orientationPhraseProperty->GetRightToLeftProbabilitySwap()); -// std::cerr << "swap" << std::endl; + case Moses::GHKM::PhraseOrientation::REO_CLASS_RIGHT: + newScores[m_offsetR2LScores+1] += std::log(orientationPhraseProperty->GetRightToLeftProbabilitySwap()); + // if sub-derivation has right-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x2, newScores); break; - case Moses::GHKM::DRIGHT: - newScores[6] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityDright()); -// std::cerr << "dright" << std::endl; + case Moses::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + newScores[m_offsetR2LScores+2] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous()); + // if sub-derivation has right-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores); break; - case Moses::GHKM::DLEFT: - newScores[7] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityDleft()); -// std::cerr << "dleft" << std::endl; + case Moses::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + newScores[m_offsetR2LScores+2] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous()); + // if sub-derivation has right-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores); break; - case Moses::GHKM::UNKNOWN: - // modelType == Moses::GHKM::REO_MSLR - newScores[6] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityDright()); -// std::cerr << "unknown->dright" << std::endl; + case Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == Moses::GHKM::PhraseOrientation::REO_MSLR + newScores[m_offsetR2LScores+2] += std::log(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous()); + // if sub-derivation has right-boundary non-terminal: + // add recursive actual score of boundary non-terminal from subderivation + RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, 0x4, newScores); break; default: UTIL_THROW2(GetScoreProducerDescription() << ": Unsupported orientation type."); break; + } } - - // TODO: Handle degenerate cases (boundary non-terminals) - - } else { + } + else + { // abort with error message if the phrase does not translate an unknown word UTIL_THROW_IF2(!prevTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription() << ": Missing Orientation property. " @@ -194,6 +486,142 @@ void PhraseOrientationFeature::EvaluateWhenApplied( } accumulator->PlusEquals(this, newScores); + + return state; +} + +void PhraseOrientationFeature::LeftBoundaryL2RScoreRecursive(int featureID, + const ChartHypothesis *hypo, + const PhraseOrientationFeatureState *state, + const std::bitset<3> orientation, + std::vector<float>& newScores) const +{ + if (state->m_leftBoundaryIsSet) + { + // subtract heuristic score from subderivation + newScores[state->m_leftBoundaryNonTerminalL2RHeuristicScoreIndex] -= state->m_leftBoundaryNonTerminalL2RScores[state->m_leftBoundaryNonTerminalL2RHeuristicScoreIndex]; + + // add actual score + std::bitset<3> recursiveOrientation = orientation; + if ( (orientation == 0x4) || (orientation == 0x0) ) + { + // discontinuous + newScores[2] += state->GetLeftBoundaryL2RScoreDiscontinuous(); + } + else + { + recursiveOrientation &= state->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations; + if ( recursiveOrientation == 0x1 ) + { + // monotone + newScores[0] += state->GetLeftBoundaryL2RScoreMono(); + } + else if ( recursiveOrientation == 0x2 ) + { + // swap + newScores[1] += state->GetLeftBoundaryL2RScoreSwap(); + } + else if ( recursiveOrientation == 0x4 ) + { + // discontinuous + newScores[2] += state->GetLeftBoundaryL2RScoreDiscontinuous(); + } + else if ( recursiveOrientation == 0x0 ) + { + // discontinuous + newScores[2] += state->GetLeftBoundaryL2RScoreDiscontinuous(); + } + else + { + UTIL_THROW2(GetScoreProducerDescription() + << ": Error in recursive scoring."); + } + } + + FEATUREVERBOSE(6, "Left boundary recursion: " << orientation << " & " << state->m_leftBoundaryNonTerminalL2RPossibleFutureOrientations << " = " << recursiveOrientation + << " --- Subtracted heuristic score: " << state->m_leftBoundaryNonTerminalL2RScores[state->m_leftBoundaryNonTerminalL2RHeuristicScoreIndex] << std::endl); + + if (!state->m_leftBoundaryRecursionGuard) + { + // recursive call + const ChartHypothesis *prevHypo = hypo->GetPrevHypo(state->m_leftBoundaryNonTerminalIndex); + const PhraseOrientationFeatureState* prevState = + static_cast<const PhraseOrientationFeatureState*>(prevHypo->GetFFState(featureID)); + + LeftBoundaryL2RScoreRecursive(featureID, prevHypo, prevState, recursiveOrientation, newScores); + } + else + { + FEATUREVERBOSE(6, "m_leftBoundaryRecursionGuard" << std::endl); + } + } +} + +void PhraseOrientationFeature::RightBoundaryR2LScoreRecursive(int featureID, + const ChartHypothesis *hypo, + const PhraseOrientationFeatureState *state, + const std::bitset<3> orientation, + std::vector<float>& newScores) const +{ + if (state->m_rightBoundaryIsSet) + { + // subtract heuristic score from subderivation + newScores[m_offsetR2LScores+state->m_rightBoundaryNonTerminalR2LHeuristicScoreIndex] -= state->m_rightBoundaryNonTerminalR2LScores[state->m_rightBoundaryNonTerminalR2LHeuristicScoreIndex]; + + // add actual score + std::bitset<3> recursiveOrientation = orientation; + if ( (orientation == 0x4) || (orientation == 0x0) ) + { + // discontinuous + newScores[m_offsetR2LScores+2] += state->GetRightBoundaryR2LScoreDiscontinuous(); + } + else + { + recursiveOrientation &= state->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations; + if ( recursiveOrientation == 0x1 ) + { + // monotone + newScores[m_offsetR2LScores+0] += state->GetRightBoundaryR2LScoreMono(); + } + else if ( recursiveOrientation == 0x2 ) + { + // swap + newScores[m_offsetR2LScores+1] += state->GetRightBoundaryR2LScoreSwap(); + } + else if ( recursiveOrientation == 0x4 ) + { + // discontinuous + newScores[m_offsetR2LScores+2] += state->GetRightBoundaryR2LScoreDiscontinuous(); + } + else if ( recursiveOrientation == 0x0 ) + { + // discontinuous + newScores[m_offsetR2LScores+2] += state->GetRightBoundaryR2LScoreDiscontinuous(); + } + else + { + UTIL_THROW2(GetScoreProducerDescription() + << ": Error in recursive scoring."); + } + } + + FEATUREVERBOSE(6, "Right boundary recursion: " << orientation << " & " << state->m_rightBoundaryNonTerminalR2LPossibleFutureOrientations << " = " << recursiveOrientation + << " --- Subtracted heuristic score: " << state->m_rightBoundaryNonTerminalR2LScores[state->m_rightBoundaryNonTerminalR2LHeuristicScoreIndex] << std::endl); + + if (!state->m_rightBoundaryRecursionGuard) + { + // recursive call + const ChartHypothesis *prevHypo = hypo->GetPrevHypo(state->m_rightBoundaryNonTerminalIndex); + const PhraseOrientationFeatureState* prevState = + static_cast<const PhraseOrientationFeatureState*>(prevHypo->GetFFState(featureID)); + + RightBoundaryR2LScoreRecursive(featureID, prevHypo, prevState, recursiveOrientation, newScores); + } + else + { + FEATUREVERBOSE(6, "m_rightBoundaryRecursionGuard" << std::endl); + } + } } diff --git a/moses/FF/PhraseOrientationFeature.h b/moses/FF/PhraseOrientationFeature.h index a367bc58d..e56e394a2 100644 --- a/moses/FF/PhraseOrientationFeature.h +++ b/moses/FF/PhraseOrientationFeature.h @@ -1,18 +1,134 @@ +// +// REFERENCE +// --------- +// When using this feature, please cite: +// +// Matthias Huck, Joern Wuebker, Felix Rietig, and Hermann Ney. +// A Phrase Orientation Model for Hierarchical Machine Translation. +// In ACL 2013 Eighth Workshop on Statistical Machine Translation (WMT 2013), pages 452-463, Sofia, Bulgaria, August 2013. +// + #pragma once +#include <bitset> #include <string> -#include "StatelessFeatureFunction.h" +#include <vector> +#include "StatefulFeatureFunction.h" #include "FFState.h" #include "moses/Factor.h" #include "phrase-extract/extract-ghkm/PhraseOrientation.h" + namespace Moses { +class PhraseOrientationFeatureState : public FFState +{ +public: + + friend class PhraseOrientationFeature; + + PhraseOrientationFeatureState() + : m_leftBoundaryNonTerminalL2RScores(3,0) + , m_rightBoundaryNonTerminalR2LScores(3,0) + , m_leftBoundaryNonTerminalL2RPossibleFutureOrientations(0x7) + , m_rightBoundaryNonTerminalR2LPossibleFutureOrientations(0x7) + , m_leftBoundaryRecursionGuard(false) + , m_rightBoundaryRecursionGuard(false) + , m_leftBoundaryIsSet(false) + , m_rightBoundaryIsSet(false) + {} + + void SetLeftBoundaryL2R(const std::vector<float> &scores, + size_t heuristicScoreIndex, + std::bitset<3> &possibleFutureOrientations, + size_t nonTerminalIndex) + { + for (size_t i=0; i<3; ++i) + { + m_leftBoundaryNonTerminalL2RScores[i] = scores[i]; + m_leftBoundaryNonTerminalL2RPossibleFutureOrientations[i] = possibleFutureOrientations[i]; + } + m_leftBoundaryNonTerminalL2RHeuristicScoreIndex = heuristicScoreIndex; + m_leftBoundaryNonTerminalIndex = nonTerminalIndex; + m_leftBoundaryIsSet = true; + } + + void SetRightBoundaryR2L(const std::vector<float> &scores, + size_t heuristicScoreIndex, + std::bitset<3> &possibleFutureOrientations, + size_t nonTerminalIndex) + { + for (size_t i=0; i<3; ++i) + { + m_rightBoundaryNonTerminalR2LScores[i] = scores[i]; + m_rightBoundaryNonTerminalR2LPossibleFutureOrientations[i] = possibleFutureOrientations[i]; + } + m_rightBoundaryNonTerminalR2LHeuristicScoreIndex = heuristicScoreIndex; + m_rightBoundaryNonTerminalIndex = nonTerminalIndex; + m_rightBoundaryIsSet = true; + } + + + float GetLeftBoundaryL2RScoreMono() const + { + return m_leftBoundaryNonTerminalL2RScores[0]; + } + + float GetLeftBoundaryL2RScoreSwap() const + { + return m_leftBoundaryNonTerminalL2RScores[1]; + } + + float GetLeftBoundaryL2RScoreDiscontinuous() const + { + return m_leftBoundaryNonTerminalL2RScores[2]; + } + + + float GetRightBoundaryR2LScoreMono() const + { + return m_rightBoundaryNonTerminalR2LScores[0]; + } + + float GetRightBoundaryR2LScoreSwap() const + { + return m_rightBoundaryNonTerminalR2LScores[1]; + } + + float GetRightBoundaryR2LScoreDiscontinuous() const + { + return m_rightBoundaryNonTerminalR2LScores[2]; + } + + + int Compare(const FFState& other) const { return 0; }; + +private: + + std::vector<float> m_leftBoundaryNonTerminalL2RScores; + std::vector<float> m_rightBoundaryNonTerminalR2LScores; + + size_t m_leftBoundaryNonTerminalL2RHeuristicScoreIndex; + size_t m_rightBoundaryNonTerminalR2LHeuristicScoreIndex; + + std::bitset<3> m_leftBoundaryNonTerminalL2RPossibleFutureOrientations; + std::bitset<3> m_rightBoundaryNonTerminalR2LPossibleFutureOrientations; + + size_t m_leftBoundaryNonTerminalIndex; + size_t m_rightBoundaryNonTerminalIndex; + bool m_leftBoundaryRecursionGuard; + bool m_rightBoundaryRecursionGuard; + bool m_leftBoundaryIsSet; + bool m_rightBoundaryIsSet; +}; -class PhraseOrientationFeature : public StatelessFeatureFunction + + +class PhraseOrientationFeature : public StatefulFeatureFunction { public: + PhraseOrientationFeature(const std::string &line); ~PhraseOrientationFeature() { @@ -22,12 +138,19 @@ public: return true; } + virtual const FFState* EmptyHypothesisState(const InputType &input) const { + return new PhraseOrientationFeatureState(); + } + void SetParameter(const std::string& key, const std::string& value); void EvaluateInIsolation(const Phrase &source , const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; + , ScoreComponentCollection &estimatedFutureScore) const + { + targetPhrase.SetRuleSource(source); + }; void EvaluateWithSourceContext(const InputType &input , const InputPath &inputPath @@ -37,15 +160,37 @@ public: , ScoreComponentCollection *estimatedFutureScore = NULL) const {}; - void EvaluateWhenApplied( + FFState* EvaluateWhenApplied( const Hypothesis& cur_hypo, - ScoreComponentCollection* accumulator) const - {}; + const FFState* prev_state, + ScoreComponentCollection* accumulator) const + { + return new PhraseOrientationFeatureState(); + }; - void EvaluateWhenApplied( + FFState* EvaluateWhenApplied( const ChartHypothesis& cur_hypo, + int featureID, // used to index the state in the previous hypotheses ScoreComponentCollection* accumulator) const; +protected: + + void LeftBoundaryL2RScoreRecursive(int featureID, + const ChartHypothesis *hypo, + const PhraseOrientationFeatureState *state, + const std::bitset<3> orientation, + std::vector<float>& newScores) const; + + void RightBoundaryR2LScoreRecursive(int featureID, + const ChartHypothesis *hypo, + const PhraseOrientationFeatureState *state, + const std::bitset<3> orientation, + std::vector<float>& newScores) const; + + std::string m_glueTargetLHSStr; + Word m_glueTargetLHS; + size_t m_offsetR2LScores; + }; diff --git a/moses/PP/OrientationPhraseProperty.cpp b/moses/PP/OrientationPhraseProperty.cpp index 653a1bf3b..1722a5383 100644 --- a/moses/PP/OrientationPhraseProperty.cpp +++ b/moses/PP/OrientationPhraseProperty.cpp @@ -8,13 +8,13 @@ namespace Moses void OrientationPhraseProperty::ProcessValue(const std::string &value) { // bidirectional MSLR phrase orientation with 2x4 orientation classes: - // mono swap dright dleft + // mono swap dleft dright std::istringstream tokenizer(value); try { - if (! (tokenizer >> m_l2rMonoProbability >> m_l2rSwapProbability >> m_l2rDrightProbability >> m_l2rDleftProbability - >> m_r2lMonoProbability >> m_r2lSwapProbability >> m_r2lDrightProbability >> m_r2lDleftProbability)) { + if (! (tokenizer >> m_l2rMonoProbability >> m_l2rSwapProbability >> m_l2rDleftProbability >> m_l2rDrightProbability + >> m_r2lMonoProbability >> m_r2lSwapProbability >> m_r2lDleftProbability >> m_r2lDrightProbability)) { UTIL_THROW2("OrientationPhraseProperty: Not able to read value. Flawed property?"); } } catch (const std::exception &e) { diff --git a/moses/PP/OrientationPhraseProperty.h b/moses/PP/OrientationPhraseProperty.h index 32c6ff208..f6344062c 100644 --- a/moses/PP/OrientationPhraseProperty.h +++ b/moses/PP/OrientationPhraseProperty.h @@ -24,12 +24,16 @@ public: return m_l2rSwapProbability; }; + double GetLeftToRightProbabilityDleft() const { + return m_l2rDleftProbability; + }; + double GetLeftToRightProbabilityDright() const { return m_l2rDrightProbability; }; - double GetLeftToRightProbabilityDleft() const { - return m_l2rDleftProbability; + double GetLeftToRightProbabilityDiscontinuous() const { + return m_l2rDleftProbability + m_l2rDrightProbability; }; @@ -41,12 +45,16 @@ public: return m_r2lSwapProbability; }; + double GetRightToLeftProbabilityDleft() const { + return m_r2lDleftProbability; + }; + double GetRightToLeftProbabilityDright() const { return m_r2lDrightProbability; }; - double GetRightToLeftProbabilityDleft() const { - return m_r2lDleftProbability; + double GetRightToLeftProbabilityDiscontinuous() const { + return m_r2lDleftProbability + m_r2lDrightProbability; }; diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 0b5adaba8..49ec0ef99 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -63,8 +63,8 @@ StaticData::StaticData() ,m_lmEnableOOVFeature(false) ,m_isAlwaysCreateDirectTranslationOption(false) ,m_currentWeightSetting("default") - ,m_treeStructure(NULL) ,m_useS2TDecoder(false) + ,m_treeStructure(NULL) { m_xmlBrackets.first="<"; m_xmlBrackets.second=">"; diff --git a/moses/Util.h b/moses/Util.h index 4d2ccea10..ca34fcfb0 100644 --- a/moses/Util.h +++ b/moses/Util.h @@ -59,8 +59,11 @@ namespace Moses #define VERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR(str); } } #define IFVERBOSE(level) if (StaticData::Instance().GetVerboseLevel() >= level) -#define XVERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR("[" << __FILE__ << ":" << __LINE__ << "] ");TRACE_ERR(str); } } +#define XVERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR("[" << __FILE__ << ":" << __LINE__ << "] "); TRACE_ERR(str); } } #define HERE __FILE__ << ":" << __LINE__ +#define FEATUREVERBOSE(level,str) { if (m_verbosity >= level) { TRACE_ERR("[" << GetScoreProducerDescription() << "] "); FEATUREVERBOSE2(level,str); } } +#define FEATUREVERBOSE2(level,str) { if (m_verbosity >= level) { TRACE_ERR(str); } } +#define IFFEATUREVERBOSE(level) if (m_verbosity >= level) #if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2) diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp index ccf0fc275..b281a05b0 100644 --- a/phrase-extract/ExtractionPhrasePair.cpp +++ b/phrase-extract/ExtractionPhrasePair.cpp @@ -469,7 +469,7 @@ void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key, double smoothingFactor, std::ostream &out) const { - assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dright dleft + assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright const PROPERTY_VALUES *allPropertyValues = GetProperty( key ); @@ -507,10 +507,10 @@ void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key, if (!l2rOrientationClass.compare("swap")) { l2rOrientationClassId = 1; } - if (!l2rOrientationClass.compare("dright")) { + if (!l2rOrientationClass.compare("dleft")) { l2rOrientationClassId = 2; } - if (!l2rOrientationClass.compare("dleft")) { + if (!l2rOrientationClass.compare("dright")) { l2rOrientationClassId = 3; } if (l2rOrientationClassId == -1) { @@ -525,10 +525,10 @@ void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key, if (!r2lOrientationClass.compare("swap")) { r2lOrientationClassId = 1; } - if (!r2lOrientationClass.compare("dright")) { + if (!r2lOrientationClass.compare("dleft")) { r2lOrientationClassId = 2; } - if (!r2lOrientationClass.compare("dleft")) { + if (!r2lOrientationClass.compare("dright")) { r2lOrientationClassId = 3; } if (r2lOrientationClassId == -1) { diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 70d08e41a..7c210541d 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -264,12 +264,12 @@ int ExtractGHKM::Main(int argc, char *argv[]) const std::vector<const Subgraph *> &rules = (*p)->GetRules(); - REO_POS l2rOrientation=UNKNOWN, r2lOrientation=UNKNOWN; + Moses::GHKM::PhraseOrientation::REO_CLASS l2rOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=Moses::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN; if (options.phraseOrientation && !rules.empty()) { int sourceSpanBegin = *((*p)->GetSpan().begin()); int sourceSpanEnd = *((*p)->GetSpan().rbegin()); - l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,L2R); - r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,R2L); + l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_L2R); + r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,Moses::GHKM::PhraseOrientation::REO_DIR_R2L); // std::cerr << "span " << sourceSpanBegin << " " << sourceSpanEnd << std::endl; // std::cerr << "phraseOrientation " << phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd) << std::endl; } @@ -304,8 +304,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) fwdExtractStream << " "; phraseOrientation.WriteOrientation(fwdExtractStream,r2lOrientation); fwdExtractStream << "}}"; - phraseOrientation.IncrementPriorCount(L2R,l2rOrientation,1); - phraseOrientation.IncrementPriorCount(R2L,r2lOrientation,1); + phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_L2R,l2rOrientation,1); + phraseOrientation.IncrementPriorCount(Moses::GHKM::PhraseOrientation::REO_DIR_R2L,r2lOrientation,1); } fwdExtractStream << std::endl; invExtractStream << std::endl; diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp index aa843c3c1..5a8452f42 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.cpp +++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp @@ -22,6 +22,7 @@ #include <iostream> #include <sstream> #include <limits> +#include <cassert> #include <boost/assign/list_of.hpp> @@ -100,13 +101,15 @@ PhraseOrientation::PhraseOrientation(int sourceSize, } } + m_minAndMaxAlignedToTargetSpan[ std::pair<int,int>(startE,endE) ] = std::pair<int,int>(minF,maxF); + if (maxF >= 0) { // aligned to any source words at all - // check if source words are aligned to out of bound target words + // check if source words are aligned to out of bounds target words bool out_of_bounds = false; for (int fi=minF; fi<=maxF && !out_of_bounds; ++fi) if (usedF[fi]>0) { - // cout << "ouf of bounds: " << fi << "\n"; + // cout << "out of bounds: " << fi << "\n"; out_of_bounds = true; } @@ -175,7 +178,7 @@ const std::string PhraseOrientation::GetOrientationInfoString(int startF, int en // << std::endl; return GetOrientationInfoString(startF, startE, endF, endE, direction); } else { - std::cerr << "Error: not able to determine phrase orientation" << std::endl; + std::cerr << "PhraseOrientation::GetOrientationInfoString(): Error: not able to determine phrase orientation" << std::endl; std::exit(1); } } @@ -183,46 +186,33 @@ const std::string PhraseOrientation::GetOrientationInfoString(int startF, int en const std::string PhraseOrientation::GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction) const { - REO_POS hierPrevOrient=UNKNOWN, hierNextOrient=UNKNOWN; - - bool connectedLeftTopP = IsAligned( startF-1, startE-1 ); - bool connectedRightTopP = IsAligned( endF+1, startE-1 ); - bool connectedLeftTopN = IsAligned( endF+1, endE+1 ); - bool connectedRightTopN = IsAligned( startF-1, endE+1 ); - - if ( direction == L2R || direction == BIDIR ) - hierPrevOrient = GetOrientHierModel(REO_MSLR, - connectedLeftTopP, connectedRightTopP, - startF, endF, startE, endE, m_countF-1, 0, 1, - &ge, <, - m_bottomRight, m_bottomLeft); - - if ( direction == R2L || direction == BIDIR ) - hierNextOrient = GetOrientHierModel(REO_MSLR, - connectedLeftTopN, connectedRightTopN, - endF, startF, endE, startE, 0, m_countF-1, -1, - <, &ge, - m_bottomLeft, m_bottomRight); + REO_CLASS hierPrevOrient=REO_CLASS_UNKNOWN, hierNextOrient=REO_CLASS_UNKNOWN; + + if ( direction == REO_DIR_L2R || direction == REO_DIR_BIDIR ) + hierPrevOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_L2R); + + if ( direction == REO_DIR_R2L || direction == REO_DIR_BIDIR ) + hierNextOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_R2L); switch (direction) { - case L2R: - return GetOrientationString(hierPrevOrient, REO_MSLR); + case REO_DIR_L2R: + return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR); break; - case R2L: - return GetOrientationString(hierNextOrient, REO_MSLR); + case REO_DIR_R2L: + return GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR); break; - case BIDIR: - return GetOrientationString(hierPrevOrient, REO_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MSLR); + case REO_DIR_BIDIR: + return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR); break; default: - return GetOrientationString(hierPrevOrient, REO_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MSLR); + return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR); break; } return "PhraseOrientationERROR"; } -REO_POS PhraseOrientation::GetOrientationInfo(int startF, int endF, REO_DIR direction) const +PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int endF, REO_DIR direction) const { boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) ); @@ -238,86 +228,114 @@ REO_POS PhraseOrientation::GetOrientationInfo(int startF, int endF, REO_DIR dire // << std::endl; return GetOrientationInfo(startF, startE, endF, endE, direction); } else { - std::cerr << "Error: not able to determine phrase orientation" << std::endl; + std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: not able to determine phrase orientation" << std::endl; std::exit(1); } } -REO_POS PhraseOrientation::GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const +PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const { - if ( direction != L2R && direction != R2L ) { - std::cerr << "PhraseOrientation::GetOrientationInfo(): direction should be either L2R or R2L" << std::endl; + if ( direction != REO_DIR_L2R && direction != REO_DIR_R2L ) { + std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: direction should be either L2R or R2L" << std::endl; std::exit(1); } - bool connectedLeftTopP = IsAligned( startF-1, startE-1 ); - bool connectedRightTopP = IsAligned( endF+1, startE-1 ); - bool connectedLeftTopN = IsAligned( endF+1, endE+1 ); - bool connectedRightTopN = IsAligned( startF-1, endE+1 ); - - if ( direction == L2R ) - return GetOrientHierModel(REO_MSLR, - connectedLeftTopP, connectedRightTopP, - startF, endF, startE, endE, m_countF-1, 0, 1, - &ge, <, + if ( direction == REO_DIR_L2R ) + return GetOrientHierModel(REO_MODEL_TYPE_MSLR, + startF, endF, startE, endE, m_countF-1, 0, 0, 1, + &ge, &le, m_bottomRight, m_bottomLeft); - if ( direction == R2L ) - return GetOrientHierModel(REO_MSLR, - connectedLeftTopN, connectedRightTopN, - endF, startF, endE, startE, 0, m_countF-1, -1, - <, &ge, - m_bottomLeft, m_bottomRight); + if ( direction == REO_DIR_R2L ) + return GetOrientHierModel(REO_MODEL_TYPE_MSLR, + endF, startF, endE, startE, 0, m_countF-1, m_countE-1, -1, + &le, &ge, + m_topLeft, m_topRight); - return UNKNOWN; + return REO_CLASS_UNKNOWN; } // to be called with countF-1 instead of countF -REO_POS PhraseOrientation::GetOrientHierModel(REO_MODEL_TYPE modelType, - bool connectedLeftTop, bool connectedRightTop, - int startF, int endF, int startE, int endE, int countF, int zero, int unit, - bool (*ge)(int, int), bool (*lt)(int, int), +PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientHierModel(REO_MODEL_TYPE modelType, + int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit, + bool (*ge)(int, int), bool (*le)(int, int), const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const { + bool leftSourceSpanIsAligned = ( (startF != zeroF) && SourceSpanIsAligned(zeroF,startF-unit) ); + bool topTargetSpanIsAligned = ( (startE != zeroE) && TargetSpanIsAligned(zeroE,startE-unit) ); + + if (!topTargetSpanIsAligned && !leftSourceSpanIsAligned) + return REO_CLASS_LEFT; + HSentenceVertices::const_iterator it; - if ((connectedLeftTop && !connectedRightTop) || + if (//(connectedLeftTop && !connectedRightTop) || ((it = bottomRight.find(startE - unit)) != bottomRight.end() && it->second.find(startF-unit) != it->second.end())) - return LEFT; + return REO_CLASS_LEFT; - if (modelType == REO_MONO) - return UNKNOWN; + if (modelType == REO_MODEL_TYPE_MONO) + return REO_CLASS_UNKNOWN; - if ((!connectedLeftTop && connectedRightTop) || + if (//(!connectedLeftTop && connectedRightTop) || ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() && it->second.find(endF + unit) != it->second.end())) - return RIGHT; + return REO_CLASS_RIGHT; - if (modelType == REO_MSD) - return UNKNOWN; + if (modelType == REO_MODEL_TYPE_MSD) + return REO_CLASS_UNKNOWN; - connectedLeftTop = false; - for (int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) { - if ((connectedLeftTop = ((it = bottomRight.find(startE - unit)) != bottomRight.end() && - it->second.find(indexF) != it->second.end()))) - return DRIGHT; + for (int indexF=startF-2*unit; (*ge)(indexF, zeroF); indexF=indexF-unit) + { + if ((it = bottomRight.find(startE - unit)) != bottomRight.end() && + it->second.find(indexF) != it->second.end()) + return REO_CLASS_DLEFT; } - connectedRightTop = false; - for (int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) { - if ((connectedRightTop = ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() && - it->second.find(indexF) != it->second.end()))) - return DLEFT; + for (int indexF=endF+2*unit; (*le)(indexF, countF); indexF=indexF+unit) + { + if ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() && + it->second.find(indexF) != it->second.end()) + return REO_CLASS_DRIGHT; } - return UNKNOWN; + return REO_CLASS_UNKNOWN; +} + +bool PhraseOrientation::SourceSpanIsAligned(int index1, int index2) const +{ + return SpanIsAligned(index1, index2, m_minAndMaxAlignedToSourceSpan); } +bool PhraseOrientation::TargetSpanIsAligned(int index1, int index2) const +{ + return SpanIsAligned(index1, index2, m_minAndMaxAlignedToTargetSpan); +} -const std::string PhraseOrientation::GetOrientationString(const REO_POS orient, const REO_MODEL_TYPE modelType) +bool PhraseOrientation::SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const +{ + boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator itMinAndMaxAligned = + minAndMaxAligned.find(std::pair<int,int>(std::min(index1,index2),std::max(index1,index2))); + + if (itMinAndMaxAligned == minAndMaxAligned.end()) + { + std::cerr << "PhraseOrientation::SourceSpanIsAligned(): Error" << std::endl; + std::exit(1); + } + else + { + if (itMinAndMaxAligned->second.first == std::numeric_limits<int>::max()) + { + return false; + } + } + return true; +} + + +const std::string PhraseOrientation::GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType) { std::ostringstream oss; WriteOrientation(oss, orient, modelType); @@ -325,31 +343,31 @@ const std::string PhraseOrientation::GetOrientationString(const REO_POS orient, } -void PhraseOrientation::WriteOrientation(std::ostream& out, const REO_POS orient, const REO_MODEL_TYPE modelType) +void PhraseOrientation::WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType) { switch(orient) { - case LEFT: + case REO_CLASS_LEFT: out << "mono"; break; - case RIGHT: + case REO_CLASS_RIGHT: out << "swap"; break; - case DRIGHT: - out << "dright"; - break; - case DLEFT: + case REO_CLASS_DLEFT: out << "dleft"; break; - case UNKNOWN: + case REO_CLASS_DRIGHT: + out << "dright"; + break; + case REO_CLASS_UNKNOWN: switch(modelType) { - case REO_MONO: + case REO_MODEL_TYPE_MONO: out << "nomono"; break; - case REO_MSD: + case REO_MODEL_TYPE_MSD: out << "other"; break; - case REO_MSLR: - out << "dright"; + case REO_MODEL_TYPE_MSLR: + out << "dleft"; break; } break; @@ -379,12 +397,12 @@ bool PhraseOrientation::IsAligned(int fi, int ei) const } -void PhraseOrientation::IncrementPriorCount(REO_DIR direction, REO_POS orient, float increment) +void PhraseOrientation::IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment) { - assert(direction==L2R || direction==R2L); - if (direction == L2R) { + assert(direction==REO_DIR_L2R || direction==REO_DIR_R2L); + if (direction == REO_DIR_L2R) { m_l2rOrientationPriorCounts[orient] += increment; - } else if (direction == R2L) { + } else if (direction == REO_DIR_R2L) { m_r2lOrientationPriorCounts[orient] += increment; } } @@ -394,11 +412,11 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE { std::map<std::string,float> l2rOrientationPriorCountsMap; std::map<std::string,float> r2lOrientationPriorCountsMap; - for (int orient=0; orient<=UNKNOWN; ++orient) { - l2rOrientationPriorCountsMap[GetOrientationString((REO_POS)orient, modelType)] += m_l2rOrientationPriorCounts[orient]; + for (int orient=0; orient<=REO_CLASS_UNKNOWN; ++orient) { + l2rOrientationPriorCountsMap[GetOrientationString((REO_CLASS)orient, modelType)] += m_l2rOrientationPriorCounts[orient]; } - for (int orient=0; orient<=UNKNOWN; ++orient) { - r2lOrientationPriorCountsMap[GetOrientationString((REO_POS)orient, modelType)] += m_r2lOrientationPriorCounts[orient]; + for (int orient=0; orient<=REO_CLASS_UNKNOWN; ++orient) { + r2lOrientationPriorCountsMap[GetOrientationString((REO_CLASS)orient, modelType)] += m_r2lOrientationPriorCounts[orient]; } for (std::map<std::string,float>::const_iterator l2rOrientationPriorCountsMapIt = l2rOrientationPriorCountsMap.begin(); l2rOrientationPriorCountsMapIt != l2rOrientationPriorCountsMap.end(); ++l2rOrientationPriorCountsMapIt) { diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h index 8ef05987f..313c1f3df 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.h +++ b/phrase-extract/extract-ghkm/PhraseOrientation.h @@ -33,10 +33,6 @@ namespace Moses namespace GHKM { -enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO}; -enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN}; -enum REO_DIR {L2R, R2L, BIDIR}; - // The key of the map is the English index and the value is a set of the source ones typedef std::map <int, std::set<int> > HSentenceVertices; @@ -45,18 +41,25 @@ class PhraseOrientation { public: - PhraseOrientation(int sourceSize, - int targetSize, - const Alignment &alignment); + enum REO_MODEL_TYPE {REO_MODEL_TYPE_MSD, REO_MODEL_TYPE_MSLR, REO_MODEL_TYPE_MONO}; + enum REO_CLASS {REO_CLASS_LEFT, REO_CLASS_RIGHT, REO_CLASS_DLEFT, REO_CLASS_DRIGHT, REO_CLASS_UNKNOWN}; + enum REO_DIR {REO_DIR_L2R, REO_DIR_R2L, REO_DIR_BIDIR}; + - REO_POS GetOrientationInfo(int startF, int endF, REO_DIR direction) const; - REO_POS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const; - const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=BIDIR) const; - const std::string GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction=BIDIR) const; - static const std::string GetOrientationString(const REO_POS orient, const REO_MODEL_TYPE modelType=REO_MSLR); - static void WriteOrientation(std::ostream& out, const REO_POS orient, const REO_MODEL_TYPE modelType=REO_MSLR); - void IncrementPriorCount(REO_DIR direction, REO_POS orient, float increment); - static void WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType=REO_MSLR); + PhraseOrientation(int sourceSize, + int targetSize, + const Alignment &alignment); + + REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const; + REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const; + const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=REO_DIR_BIDIR) const; + const std::string GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction=REO_DIR_BIDIR) const; + static const std::string GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR); + static void WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR); + void IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment); + static void WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR); + bool SourceSpanIsAligned(int index1, int index2) const; + bool TargetSpanIsAligned(int index1, int index2) const; private: @@ -68,12 +71,13 @@ private: HSentenceVertices & bottomRight, int startF, int startE, int endF, int endE); - REO_POS GetOrientHierModel(REO_MODEL_TYPE modelType, - bool connectedLeftTop, bool connectedRightTop, - int startF, int endF, int startE, int endE, int countF, int zero, int unit, + REO_CLASS GetOrientHierModel(REO_MODEL_TYPE modelType, + int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit, bool (*ge)(int, int), bool (*lt)(int, int), const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const; + bool SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const; + bool IsAligned(int fi, int ei) const; static bool ge(int first, int second) { return first >= second; }; @@ -91,6 +95,7 @@ private: HSentenceVertices m_bottomRight; boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToSourceSpan; + boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToTargetSpan; static std::vector<float> m_l2rOrientationPriorCounts; static std::vector<float> m_r2lOrientationPriorCounts; |