From 1659d6b4c8c0b2a261678c012b9eab32f8c7b296 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Fri, 12 Feb 2016 17:46:57 +0000 Subject: Option for target constituent constrained phrase extraction. TargetConstituentAdjacencyFeature. --- phrase-extract/ExtractionPhrasePair.cpp | 10 +- phrase-extract/PhraseExtractionOptions.h | 16 +++ phrase-extract/SyntaxNodeCollection.cpp | 32 +++++ phrase-extract/SyntaxNodeCollection.h | 8 ++ phrase-extract/extract-main.cpp | 232 ++++++++++++++++++++++++++----- phrase-extract/score-main.cpp | 16 +++ 6 files changed, 275 insertions(+), 39 deletions(-) (limited to 'phrase-extract') diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp index 57821fe44..9a2884858 100644 --- a/phrase-extract/ExtractionPhrasePair.cpp +++ b/phrase-extract/ExtractionPhrasePair.cpp @@ -311,12 +311,14 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke std::ostringstream oss; for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin(); iter!=allPropertyValues->end(); ++iter) { - if (iter!=allPropertyValues->begin()) { + if (!(iter->first).empty()) { + if (iter!=allPropertyValues->begin()) { + oss << " "; + } + oss << iter->first; oss << " "; + oss << iter->second; } - oss << iter->first; - oss << " "; - oss << iter->second; } std::string allPropertyValuesString(oss.str()); diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h index 38e570b79..859ab92d7 100644 --- a/phrase-extract/PhraseExtractionOptions.h +++ b/phrase-extract/PhraseExtractionOptions.h @@ -50,6 +50,8 @@ private: bool onlyOutputSpanInfo; bool gzOutput; std::string instanceWeightsFile; //weights for each sentence + bool targetConstituentConstrainedFlag; + bool targetConstituentBoundariesFlag; bool flexScoreFlag; bool singleWordHeuristicFlag; @@ -73,6 +75,8 @@ public: includeSentenceIdFlag(false), onlyOutputSpanInfo(false), gzOutput(false), + targetConstituentConstrainedFlag(false), + targetConstituentBoundariesFlag(false), flexScoreFlag(false), singleWordHeuristicFlag(false), debug(false) { @@ -118,6 +122,12 @@ public: void initInstanceWeightsFile(const char* initInstanceWeightsFile) { instanceWeightsFile = std::string(initInstanceWeightsFile); } + void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) { + targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag; + } + void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) { + targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag; + } void initFlexScoreFlag(const bool initflexScoreFlag) { flexScoreFlag=initflexScoreFlag; } @@ -165,6 +175,12 @@ public: std::string getInstanceWeightsFile() const { return instanceWeightsFile; } + bool isTargetConstituentConstrainedFlag() const { + return targetConstituentConstrainedFlag; + } + bool isTargetConstituentBoundariesFlag() const { + return targetConstituentBoundariesFlag; + } bool isFlexScoreFlag() const { return flexScoreFlag; } diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 70f52317e..2a321f1e2 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -47,6 +47,8 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); + m_endPositionsIndex[ endPos ].push_back( newNode ); + m_startPositionsIndex[ startPos ].push_back( newNode ); // TODO: may not need this: access m_index by startPos and iterate over its InnerNodeIndex (= end positions)? m_numWords = std::max(endPos+1, m_numWords); return newNode; } @@ -70,6 +72,36 @@ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( return endIndex->second; } +bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const +{ + return GetNodesByStartPosition(startPos).size() > 0; +} + +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition( + int startPos ) const +{ + InnerNodeIndex::const_iterator startIndex = m_startPositionsIndex.find( startPos ); + if (startIndex == m_startPositionsIndex.end() ) + return m_emptyNode; + + return startIndex->second; +} + +bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const +{ + return GetNodesByEndPosition(endPos).size() > 0; +} + +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition( + int endPos ) const +{ + InnerNodeIndex::const_iterator endIndex = m_endPositionsIndex.find( endPos ); + if (endIndex == m_endPositionsIndex.end() ) + return m_emptyNode; + + return endIndex->second; +} + std::auto_ptr SyntaxNodeCollection::ExtractTree() { std::map nodeToTree; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index ef0989cd0..83aa66bb4 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -50,6 +50,11 @@ public: //! Lookup the SyntaxNodes for a given span. const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; + bool HasNodeStartingAtPosition( int startPos ) const; + const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const; + bool HasNodeEndingAtPosition( int endPos ) const; + const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const; + //! Get a vector of pointers to all SyntaxNodes (unordered). const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; @@ -78,6 +83,9 @@ private: NodeIndex m_index; int m_numWords; std::vector< SyntaxNode* > m_emptyNode; + + InnerNodeIndex m_endPositionsIndex; + InnerNodeIndex m_startPositionsIndex; }; } // namespace MosesTraining diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 0e77368f4..e4d074e15 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -1,11 +1,3 @@ -/* - * extract.cpp - * Modified by: Rohit Gupta CDAC, Mumbai, India - * on July 15, 2012 to implement parallel processing - * Modified by: Nadi Tomeh - LIMSI/CNRS - * Machine Translation Marathon 2010, Dublin - */ - #include #include #include @@ -20,11 +12,12 @@ #include #include -#include "SentenceAlignment.h" #include "tables-core.h" #include "InputFileStream.h" #include "OutputFileStream.h" #include "PhraseExtractionOptions.h" +#include "SentenceAlignmentWithSyntax.h" +#include "SyntaxNode.h" using namespace std; using namespace MosesTraining; @@ -46,14 +39,14 @@ typedef vector < HPhrase > HPhraseVector; // The key of the map is the English index and the value is a set of the source ones typedef map > HSentenceVertices; -REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, +REO_POS getOrientWordModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int)); -REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, +REO_POS getOrientPhraseModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int), const HSentenceVertices &, const HSentenceVertices &); -REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, +REO_POS getOrientHierModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int), const HSentenceVertices &, const HSentenceVertices &, @@ -69,7 +62,7 @@ bool ge(int, int); bool le(int, int); bool lt(int, int); -bool isAligned (SentenceAlignment &, int, int); +bool isAligned (SentenceAlignmentWithSyntax &, int, int); int sentenceOffset = 0; @@ -87,7 +80,7 @@ class ExtractTask { public: ExtractTask( - size_t id, SentenceAlignment &sentence, + size_t id, SentenceAlignmentWithSyntax &sentence, PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv, @@ -109,14 +102,17 @@ private: vector< string > m_extractedPhrasesSid; vector< string > m_extractedPhrasesContext; vector< string > m_extractedPhrasesContextInv; - void extractBase(SentenceAlignment &); - void extract(SentenceAlignment &); - void addPhrase(SentenceAlignment &, int, int, int, int, string &); + void extractBase(SentenceAlignmentWithSyntax &); + void extract(SentenceAlignmentWithSyntax &); + void addPhrase(const SentenceAlignmentWithSyntax &, int, int, int, int, const std::string &, const std::string &); void writePhrasesToFile(); - bool checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF); + bool checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF); bool isPlaceholder(const string &word); + bool checkTargetConstituentBoundaries(const SentenceAlignmentWithSyntax &sentence, + int startE, int endE, int startF, int endF, + std::string &phrasePropertiesString); - SentenceAlignment &m_sentence; + SentenceAlignmentWithSyntax &m_sentence; const PhraseExtractionOptions &m_options; Moses::OutputFileStream &m_extractFile; Moses::OutputFileStream &m_extractFileInv; @@ -133,7 +129,8 @@ int main(int argc, char* argv[]) if (argc < 6) { cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] "; - cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n"; + cerr << "| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename "; + cerr << "| --TargetConstituentConstrained | --TargetConstituentBoundaries ]" << std::endl; exit(1); } @@ -153,6 +150,10 @@ int main(int argc, char* argv[]) options.initOnlyOutputSpanInfo(true); } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { options.initOrientationFlag(true); + } else if (strcmp(argv[i],"--TargetConstituentConstrained") == 0) { + options.initTargetConstituentConstrainedFlag(true); + } else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) { + options.initTargetConstituentBoundariesFlag(true); } else if (strcmp(argv[i],"--FlexibilityScore") == 0) { options.initFlexScoreFlag(true); } else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) { @@ -280,6 +281,11 @@ int main(int argc, char* argv[]) extractFileContextInv.Open(fileNameExtractContextInv.c_str()); } + // stats on labels for glue grammar and unknown word label probabilities + set< string > targetLabelCollection, sourceLabelCollection; + map< string, int > targetTopLabelCollection, sourceTopLabelCollection; + const bool targetSyntax = true; + int i = sentenceOffset; string englishString, foreignString, alignmentString, weightString; @@ -295,7 +301,10 @@ int main(int argc, char* argv[]) getline(*iwFileP, weightString); } - SentenceAlignment sentence; + SentenceAlignmentWithSyntax sentence + (targetLabelCollection, sourceLabelCollection, + targetTopLabelCollection, sourceTopLabelCollection, + targetSyntax, false); // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line if (options.isOnlyOutputSpanInfo()) { @@ -360,7 +369,7 @@ void ExtractTask::Run() } -void ExtractTask::extract(SentenceAlignment &sentence) +void ExtractTask::extract(SentenceAlignmentWithSyntax &sentence) { int countE = sentence.target.size(); int countF = sentence.source.size(); @@ -454,7 +463,15 @@ void ExtractTask::extract(SentenceAlignment &sentence) // if(m_options.isAllModelsOutputFlag()) // " | | "; } - addPhrase(sentence, startE, endE, startF, endF, orientationInfo); + std::string phrasePropertiesString; + bool doAdd = !m_options.isTargetConstituentBoundariesFlag(); + if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) { + bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString); + doAdd = doAdd || isTargetConstituentCovered; + } + if (doAdd) { + addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString); + } } } } @@ -510,12 +527,20 @@ void ExtractTask::extract(SentenceAlignment &sentence) ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " + ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : ""); - addPhrase(sentence, startE, endE, startF, endF, orientationInfo); + std::string phrasePropertiesString; + bool doAdd = !m_options.isTargetConstituentBoundariesFlag(); + if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) { + bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString); + doAdd = doAdd || isTargetConstituentCovered; + } + if (doAdd) { + addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString); + } } } } -REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType, +REO_POS getOrientWordModel(SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType, bool connectedLeftTop, bool connectedRightTop, int startF, int endF, int startE, int endE, int countF, int zero, int unit, bool (*ge)(int, int), bool (*lt)(int, int) ) @@ -541,7 +566,7 @@ REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelTyp } // to be called with countF-1 instead of countF -REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, +REO_POS getOrientPhraseModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType, bool connectedLeftTop, bool connectedRightTop, int startF, int endF, int startE, int endE, int countF, int zero, int unit, bool (*ge)(int, int), bool (*lt)(int, int), @@ -577,7 +602,7 @@ REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE model } // to be called with countF-1 instead of countF -REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, +REO_POS getOrientHierModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType, bool connectedLeftTop, bool connectedRightTop, int startF, int endF, int startE, int endE, int countF, int zero, int unit, bool (*ge)(int, int), bool (*lt)(int, int), @@ -629,7 +654,7 @@ REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelTy return UNKNOWN; } -bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) +bool isAligned ( SentenceAlignmentWithSyntax &sentence, int fi, int ei ) { if (ei == -1 && fi == -1) return true; @@ -715,8 +740,138 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType) } return ""; } + -void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo) +bool ExtractTask::checkTargetConstituentBoundaries( const SentenceAlignmentWithSyntax &sentence, + int startE, int endE, int startF, int endF, + std::string &phrasePropertiesString) +{ + ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesLeft; + + if (m_options.isTargetConstituentBoundariesFlag()) { + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "{{TargetConstituentBoundariesLeft "; + } + + bool validTargetConstituentBoundaries = false; + bool outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true; + + if (m_options.isTargetConstituentBoundariesFlag()) { + if (startE==0) { + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false; + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "BOS_"; + } + } + + if (!sentence.targetTree.HasNodeStartingAtPosition(startE)) { + + validTargetConstituentBoundaries = false; + + } else { + + const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(startE); + for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); iter != startingNodes.rend(); ++iter ) { + if ( (*iter)->end == endE ) { + validTargetConstituentBoundaries = true; + if (!m_options.isTargetConstituentBoundariesFlag()) { + break; + } + } + if (m_options.isTargetConstituentBoundariesFlag()) { + if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) { + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false; + } else { + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<"; + } + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << (*iter)->label; + } + } + } + + if (m_options.isTargetConstituentBoundariesFlag()) { + if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) { + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<"; + } + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "}}"; + } + + + if (m_options.isTargetConstituentConstrainedFlag() && !validTargetConstituentBoundaries) { + // skip over all boundary punctuation and check again + bool relaxedValidTargetConstituentBoundaries = false; + int relaxedStartE = startE; + int relaxedEndE = endE; + const std::string punctuation = ",;.:!?"; + while ( (relaxedStartE < endE) && + (sentence.target[relaxedStartE].size() == 1) && + (punctuation.find(sentence.target[relaxedStartE].at(0)) != std::string::npos) ) { + ++relaxedStartE; + } + while ( (relaxedEndE > relaxedStartE) && + (sentence.target[relaxedEndE].size() == 1) && + (punctuation.find(sentence.target[relaxedEndE].at(0)) != std::string::npos) ) { + --relaxedEndE; + } + + if ( (relaxedStartE != startE) || (relaxedEndE !=endE) ) { + const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(relaxedStartE); + for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); + (iter != startingNodes.rend() && !relaxedValidTargetConstituentBoundaries); + ++iter ) { + if ( (*iter)->end == relaxedEndE ) { + relaxedValidTargetConstituentBoundaries = true; + } + } + } + + if (!relaxedValidTargetConstituentBoundaries) { + return false; + } + } + + + if (m_options.isTargetConstituentBoundariesFlag()) { + + ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent; + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "{{TargetConstituentBoundariesRightAdjacent "; + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true; + + if (endE==sentence.target.size()-1) { + + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "EOS_"; + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false; + + } else { + + const std::vector< SyntaxNode* >& adjacentNodes = sentence.targetTree.GetNodesByStartPosition(endE+1); + for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = adjacentNodes.rbegin(); iter != adjacentNodes.rend(); ++iter ) { + if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) { + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false; + } else { + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<"; + } + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << (*iter)->label; + } + } + + if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) { + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<"; + } + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "}}"; + + phrasePropertiesString += " "; + phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesLeft.str(); + phrasePropertiesString += " "; + phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent.str(); + } + + return true; +} + + +void ExtractTask::addPhrase( const SentenceAlignmentWithSyntax &sentence, + int startE, int endE, int startF, int endF, + const std::string &orientationInfo, + const std::string &phrasePropertiesString) { // source // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; @@ -746,11 +901,18 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, if (m_options.isTranslationFlag()) outextractstr << "||| "; if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + // target for(int ei=startE; ei<=endE; ei++) { - if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " "; - if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " "; - if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " "; + + if (m_options.isTranslationFlag()) { + outextractstr << sentence.target[ei] << " "; + outextractstrInv << sentence.target[ei] << " "; + } + + if (m_options.isOrientationFlag()) { + outextractstrOrientation << sentence.target[ei] << " "; + } } if (m_options.isTranslationFlag()) outextractstr << "|||"; if (m_options.isTranslationFlag()) outextractstrInv << "||| "; @@ -792,7 +954,7 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, } } - + outextractstr << phrasePropertiesString; // generate two lines for every extracted phrase: // once with left, once with right context @@ -901,7 +1063,7 @@ void ExtractTask::writePhrasesToFile() // if proper conditioning, we need the number of times a source phrase occured -void ExtractTask::extractBase( SentenceAlignment &sentence ) +void ExtractTask::extractBase( SentenceAlignmentWithSyntax &sentence ) { ostringstream outextractFile; ostringstream outextractFileInv; @@ -935,7 +1097,7 @@ void ExtractTask::extractBase( SentenceAlignment &sentence ) } -bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF) +bool ExtractTask::checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF) { for (size_t pos = startF; pos <= endF; ++pos) { const string &sourceWord = sentence.source[pos]; diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 9095df01b..081ee8ef1 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -68,6 +68,7 @@ bool spanLength = false; bool ruleLength = false; bool nonTermContext = false; bool nonTermContextTarget = false; +bool targetConstituentBoundariesFlag = false; int countOfCounts[COC_MAX+1]; int totalDistinct = 0; @@ -286,6 +287,9 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--NonTermContextTarget") == 0) { nonTermContextTarget = true; std::cerr << "non-term context (target)" << std::endl; + } else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) { + targetConstituentBoundariesFlag = true; + std::cerr << "including target constituent boundaries information" << std::endl; } else { featureArgs.push_back(argv[i]); ++i; @@ -957,6 +961,18 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, } } + // target constituent boundaries + if (targetConstituentBoundariesFlag && !inverseFlag) { + const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft"); + if (!targetConstituentBoundariesLeftValues.empty()) { + phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}"; + } + const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent"); + if (!targetConstituentBoundariesRightAdjacentValues.empty()) { + phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}"; + } + } + phraseTableFile << std::endl; } -- cgit v1.2.3