diff options
author | Matthias Huck <mhuck@inf.ed.ac.uk> | 2016-01-10 02:02:31 +0300 |
---|---|---|
committer | Matthias Huck <mhuck@inf.ed.ac.uk> | 2016-01-10 02:02:31 +0300 |
commit | 1d3feba8d04645fb0111fb23e4561dc0ea13d2a0 (patch) | |
tree | 92b097548a6ad531000a459bde9ed9525d0f9a4a /phrase-extract | |
parent | 8750c71ef4149f43509c1affa558e1d363edc647 (diff) |
preparing extraction of Hiero soft syntactic preferences (target syntax)
Diffstat (limited to 'phrase-extract')
-rw-r--r-- | phrase-extract/ExtractedRule.h | 2 | ||||
-rw-r--r-- | phrase-extract/PropertiesConsolidator.cpp | 100 | ||||
-rw-r--r-- | phrase-extract/PropertiesConsolidator.h | 10 | ||||
-rw-r--r-- | phrase-extract/RuleExtractionOptions.h | 2 | ||||
-rw-r--r-- | phrase-extract/consolidate-main.cpp | 17 | ||||
-rw-r--r-- | phrase-extract/extract-rules-main.cpp | 94 | ||||
-rw-r--r-- | phrase-extract/score-main.cpp | 76 |
7 files changed, 249 insertions, 52 deletions
diff --git a/phrase-extract/ExtractedRule.h b/phrase-extract/ExtractedRule.h index adbde43e8..d8c632f90 100644 --- a/phrase-extract/ExtractedRule.h +++ b/phrase-extract/ExtractedRule.h @@ -45,6 +45,7 @@ public: std::string targetContextRight; std::string sourceHoleString; std::string targetHoleString; + std::string targetSyntacticPreference; int startT; int endT; int startS; @@ -65,6 +66,7 @@ public: , targetContextRight() , sourceHoleString() , targetHoleString() + , targetSyntacticPreference() , startT(sT) , endT(eT) , startS(sS) diff --git a/phrase-extract/PropertiesConsolidator.cpp b/phrase-extract/PropertiesConsolidator.cpp index 94b6ea13a..71899e15d 100644 --- a/phrase-extract/PropertiesConsolidator.cpp +++ b/phrase-extract/PropertiesConsolidator.cpp @@ -83,6 +83,32 @@ void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string & } +void PropertiesConsolidator::ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile) +{ + Moses::InputFileStream inFile(targetSyntacticPreferencesLabelSetFile); + + // read target syntactic preferences label set + m_targetSyntacticPreferencesLabels.clear(); + std::string line; + while (getline(inFile, line)) { + std::istringstream tokenizer(line); + std::string label; + size_t index; + try { + tokenizer >> label >> index; + } catch (const std::exception &e) { + UTIL_THROW2("Error reading target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " ."); + } + std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_targetSyntacticPreferencesLabels.insert( std::pair<std::string,size_t>(label,index) ); + UTIL_THROW_IF2(!inserted.second,"Target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " should contain each syntactic label only once."); + } + + inFile.Close(); + + m_targetSyntacticPreferencesFlag = true; +} + + void PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const { if ( propertiesString.empty() ) { @@ -129,6 +155,19 @@ void PropertiesConsolidator::ProcessPropertiesString(const std::string &properti } */ + } else if ( !keyValue[0].compare("TargetPreferences") ) { + + if ( m_targetSyntacticPreferencesFlag ) { + + // TargetPreferences property: replace strings with vocabulary indices + out << " {{" << keyValue[0]; + ProcessTargetSyntacticPreferencesPropertyValue(keyValue[1], out); + out << "}}"; + + } else { // don't process TargetPreferences property + out << " {{" << keyValue[0] << " " << keyValue[1] << "}}"; + } + } else { // output other property @@ -246,5 +285,66 @@ bool PropertiesConsolidator::GetPOSPropertyValueFromPropertiesString(const std:: } +void PropertiesConsolidator::ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const +{ + // TargetPreferences property: replace strings with vocabulary indices + std::istringstream tokenizer(value); + + size_t nNTs; + double totalCount; + + if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side) + UTIL_THROW2("Not able to read number of non-terminals from TargetPreferences property. " + << "Flawed TargetPreferences property?"); + } + assert( nNTs > 0 ); + out << " " << nNTs; + + if (! (tokenizer >> totalCount)) { // second token: overall rule count + UTIL_THROW2("Not able to read overall rule count from TargetPreferences property. " + << "Flawed TargetPreferences property?"); + } + assert( totalCount > 0.0 ); + out << " " << totalCount; + + while (tokenizer.peek() != EOF) { + try { + + size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max(); + + std::string token; + + if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule + for (size_t i=0; i<nNTs-1; ++i) { // RHS target preference non-terminal labels + tokenizer >> token; // RHS target preference non-terminal label + std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token); + UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set."); + out << " " << found->second; + } + + tokenizer >> token; // targetPreferenceRHSCount + out << " " << token; + + tokenizer >> numberOfLHSsGivenRHS; + out << " " << numberOfLHSsGivenRHS; + } + + for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS target preference non-terminal labels seen with this RHS + tokenizer >> token; // LHS target preference non-terminal label + std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token); + UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set."); + out << " " << found->second; + + tokenizer >> token; // ruleTargetPreferenceLabelledCount + out << " " << token; + } + + } catch (const std::exception &e) { + UTIL_THROW2("Flawed item in TargetPreferences property?"); + } + } +} + + } // namespace MosesTraining diff --git a/phrase-extract/PropertiesConsolidator.h b/phrase-extract/PropertiesConsolidator.h index dbb64e0dd..69a046d8f 100644 --- a/phrase-extract/PropertiesConsolidator.h +++ b/phrase-extract/PropertiesConsolidator.h @@ -34,10 +34,15 @@ class PropertiesConsolidator { public: - PropertiesConsolidator() : m_sourceLabelsFlag(false) {}; + PropertiesConsolidator() + : m_sourceLabelsFlag(false) + , m_partsOfSpeechFlag(false) + , m_targetSyntacticPreferencesFlag(false) + {}; void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile); void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile); + void ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile); bool GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const; @@ -47,11 +52,14 @@ protected: void ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const; void ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const; + void ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const; bool m_sourceLabelsFlag; std::map<std::string,size_t> m_sourceLabels; bool m_partsOfSpeechFlag; std::map<std::string,size_t> m_partsOfSpeechVocabulary; + bool m_targetSyntacticPreferencesFlag; + std::map<std::string,size_t> m_targetSyntacticPreferencesLabels; }; diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h index aab059cf9..43f7b205b 100644 --- a/phrase-extract/RuleExtractionOptions.h +++ b/phrase-extract/RuleExtractionOptions.h @@ -46,6 +46,7 @@ public: bool requireAlignedWord; bool sourceSyntax; bool targetSyntax; + bool targetSyntacticPreferences; bool duplicateRules; bool fractionalCounting; bool pcfgScore; @@ -80,6 +81,7 @@ public: , requireAlignedWord(true) , sourceSyntax(false) , targetSyntax(false) + , targetSyntacticPreferences(false) , duplicateRules(true) , fractionalCounting(true) , pcfgScore(false) diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index 0f276144b..2c1198d4c 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -38,6 +38,7 @@ bool onlyDirectFlag = false; bool partsOfSpeechFlag = false; bool phraseCountFlag = false; bool sourceLabelsFlag = false; +bool targetSyntacticPreferencesFlag = false; bool sparseCountBinFeatureFlag = false; std::vector< int > countBin; @@ -49,7 +50,7 @@ std::vector< float > goodTuringDiscount; float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1; -void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& ); +void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& ); void loadCountOfCounts( const std::string& ); void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse ); bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item ); @@ -93,6 +94,7 @@ int main(int argc, char* argv[]) std::string fileNameCountOfCounts; std::string fileNameSourceLabelSet; std::string fileNamePartsOfSpeechVocabulary; + std::string fileNameTargetSyntacticPreferencesLabelSet; for(int i=4; i<argc; i++) { if (strcmp(argv[i],"--Hierarchical") == 0) { @@ -150,6 +152,11 @@ int main(int argc, char* argv[]) UTIL_THROW_IF2(i+1==argc, "specify parts-of-speech file!"); fileNamePartsOfSpeechVocabulary = argv[++i]; std::cerr << "processing parts-of-speech property" << std::endl; + } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) { + targetSyntacticPreferencesFlag = true; + UTIL_THROW_IF2(i+1==argc, "specify target syntactic preferences label set file!"); + fileNameTargetSyntacticPreferencesLabelSet = argv[++i]; + std::cerr << "processing target syntactic preferences property" << std::endl; } else if (strcmp(argv[i],"--MinScore") == 0) { std::string setting = argv[++i]; bool done = false; @@ -182,7 +189,7 @@ int main(int argc, char* argv[]) } } - processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary ); + processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary, fileNameTargetSyntacticPreferencesLabelSet ); } @@ -231,7 +238,8 @@ void processFiles( const std::string& fileNameDirect, const std::string& fileNameConsolidated, const std::string& fileNameCountOfCounts, const std::string& fileNameSourceLabelSet, - const std::string& fileNamePartsOfSpeechVocabulary ) + const std::string& fileNamePartsOfSpeechVocabulary, + const std::string& fileNameTargetSyntacticPreferencesLabelSet ) { if (goodTuringFlag || kneserNeyFlag) loadCountOfCounts( fileNameCountOfCounts ); @@ -256,6 +264,9 @@ void processFiles( const std::string& fileNameDirect, if (partsOfSpeechFlag) { propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary); } + if (targetSyntacticPreferencesFlag) { + propertiesConsolidator.ActivateTargetSyntacticPreferencesProcessing(fileNameTargetSyntacticPreferencesLabelSet); + } // loop through all extracted phrase translations int i=0; diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index f77e0b5eb..5319bcc6e 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -79,14 +79,15 @@ private: , RuleExist &ruleExist, HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS); void saveHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, LabelIndex &labelIndex, int countS); - string saveTargetHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS); + string saveTargetHieroPhrase( int startT, int endT, int startS, int endS + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS); string saveSourceHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, const LabelIndex &labelIndex); void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex); void saveHieroAlignment( int startT, int endT, int startS, int endS , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule); + void saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule); void saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS); inline string IntToString( int i ) { @@ -225,6 +226,8 @@ int main(int argc, char* argv[]) // allow consecutive non-terminals (X Y | X Y) else if (strcmp(argv[i],"--TargetSyntax") == 0) { options.targetSyntax = true; + } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) { + options.targetSyntacticPreferences = true; } else if (strcmp(argv[i],"--SourceSyntax") == 0) { options.sourceSyntax = true; } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) { @@ -422,7 +425,8 @@ void ExtractTask::extractRules() int endT = startT + lengthT - 1; // if there is target side syntax, there has to be a node - if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT)) + if (m_options.targetSyntax && !m_options.targetSyntacticPreferences && !m_sentence.targetTree.HasNode(startT,endT)) +// if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT)) continue; // find find aligned source words @@ -566,7 +570,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int int labelI = labelIndex[ 2+holeCount ]; string targetLabel; - if (m_options.targetSyntax) { + if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) { targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; @@ -628,7 +632,7 @@ string ExtractTask::saveSourceHieroPhrase( int startT, int endT, int startS, int if (m_options.unpairedExtractFormat) { out += "[" + sourceLabel + "] "; } else { - out += "[" + sourceLabel + "][" + targetLabel + "] "; + out += "[" + sourceLabel + "][" + (m_options.targetSyntacticPreferences ? "X" : targetLabel) + "] "; } currPos = hole.GetEnd(0); @@ -682,6 +686,33 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS } } +void ExtractTask::saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule) +{ + rule.targetSyntacticPreference = ""; + int holeCount = 0; + for (HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin(); + iterHoleList != holeColl.GetHoles().end(); + ++iterHoleList) { + + const Hole &hole = *iterHoleList; + + int labelI = labelIndex[ 2+holeCount ]; + string targetLabel = "X"; + int startT = hole.GetStart(1); + int endT = hole.GetEnd(1); + if (m_sentence.targetTree.HasNode(startT,endT)) { + rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelI]->label; + rule.targetSyntacticPreference += " "; + } else { + rule.targetSyntacticPreference += "X "; + } + ++holeCount; + } + + rule.targetSyntacticPreference.erase(rule.targetSyntacticPreference.size()-1); +} + + void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, LabelIndex &labelIndex, int countS) { @@ -691,7 +722,8 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS // phrase labels string targetLabel; - if (m_options.targetSyntax) { +// if (m_options.targetSyntax && m_sentence.targetTree.HasNode(startT,endT)) { + if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) { targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; @@ -776,6 +808,17 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS // std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L) << std::endl; } + // target syntactic preferences + if (m_options.targetSyntacticPreferences) { + saveTargetSyntacticPreference(holeColl, labelIndex, rule); + if (m_sentence.targetTree.HasNode(startT,endT)) { + rule.targetSyntacticPreference += " "; + rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label; + } else { + rule.targetSyntacticPreference += " X"; + } + } + addRuleToCollection( rule ); } @@ -785,6 +828,9 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end // number of target head labels int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1; + if (m_options.targetSyntacticPreferences && !numLabels) { + numLabels++; + } labelCount.push_back(numLabels); labelIndex.push_back(0); @@ -796,7 +842,10 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end // number of target hole labels for( HoleList::const_iterator hole = holeColl.GetHoles().begin(); hole != holeColl.GetHoles().end(); hole++ ) { - int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ; + int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ; + if (m_options.targetSyntacticPreferences && !numLabels) { + numLabels++; + } labelCount.push_back(numLabels); labelIndex.push_back(0); } @@ -973,12 +1022,19 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count // phrase labels string targetLabel,sourceLabel; if (m_options.targetSyntax && m_options.conditionOnTargetLhs) { - sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label; + if (m_sentence.targetTree.HasNode(startT,endT) && !m_options.targetSyntacticPreferences) { + sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label; + } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { + sourceLabel = "S"; + } else { + sourceLabel = "X"; + } } else { sourceLabel = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X"; - if (m_options.targetSyntax) { + if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) { +// if (m_options.targetSyntax && !m_options.targetSyntacticPreferences && !m_sentence.targetTree.HasNode(startT,endT)) targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label; } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; @@ -1037,6 +1093,15 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count // std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L) << std::endl; } + // target syntactic preferences + if (m_options.targetSyntacticPreferences) { + if (m_sentence.targetTree.HasNode(startT,endT)) { + rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[0]->label; + } else { + rule.targetSyntacticPreference += "X"; + } + } + addRuleToCollection( rule ); } @@ -1114,6 +1179,11 @@ void ExtractTask::writeRulesToFile() m_phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,rule->r2lOrientation,1); out << "}}"; } + if (m_options.targetSyntacticPreferences) { + out << " {{TargetPreferences "; + out << rule->targetSyntacticPreference; + out << "}}"; + } out << "\n"; if (!m_options.onlyDirectFlag) { @@ -1167,12 +1237,12 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options, if (options.phraseOrientation) { glueRulesPhraseProperty.append(" ||| ||| {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}"); } - if (!options.targetSyntax) { + if (!options.targetSyntax || options.targetSyntacticPreferences) { grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| 0-0 ||| 0" << glueRulesPhraseProperty << endl << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl; } else { - // chose a top label that is not already a label + // choose a top label that is not already a label string topLabel = "QQQQQQ"; for( unsigned int i=1; i<=topLabel.length(); i++) { if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) { @@ -1202,7 +1272,7 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options, // collect counts for labels for each word // ( labels of singleton words are used to estimate -// distribution oflabels for unknown words ) +// distribution of labels for unknown words ) map<string,int> wordCount; map<string,string> wordLabel; diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 0ad5844bf..1feb6b7f2 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -51,7 +51,7 @@ bool treeFragmentsFlag = false; bool partsOfSpeechFlag = false; bool sourceSyntaxLabelsFlag = false; bool sourceSyntaxLabelCountsLHSFlag = false; -bool targetPreferenceLabelsFlag = false; +bool targetSyntacticPreferencesFlag = false; bool unpairedExtractFormatFlag = false; bool conditionOnTargetLhsFlag = false; bool wordAlignmentFlag = true; @@ -83,11 +83,11 @@ std::vector<std::string> sourceLabelsByIndex; std::set<std::string> partsOfSpeechSet; -boost::unordered_map<std::string,float> targetPreferenceLHSCounts; -boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetPreferenceLHSJointCounts; -std::set<std::string> targetPreferenceLabelSet; -std::map<std::string,size_t> targetPreferenceLabels; -std::vector<std::string> targetPreferenceLabelsByIndex; +boost::unordered_map<std::string,float> targetSyntacticPreferencesLHSCounts; +boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts; +std::set<std::string> targetSyntacticPreferencesLabelSet; +std::map<std::string,size_t> targetSyntacticPreferencesLabels; +std::vector<std::string> targetSyntacticPreferencesLabelsByIndex; std::vector<float> orientationClassPriorsL2R(4,0); // mono swap dleft dright std::vector<float> orientationClassPriorsR2L(4,0); // mono swap dleft dright @@ -150,7 +150,7 @@ int main(int argc, char* argv[]) "[--TreeFragments] " "[--SourceLabels] " "[--SourceLabelCountsLHS] " - "[--TargetPreferenceLabels] " + "[--TargetSyntacticPreferences] " "[--UnpairedExtractFormat] " "[--ConditionOnTargetLHS] " "[--CrossedNonTerm]" @@ -167,9 +167,9 @@ int main(int argc, char* argv[]) std::string fileNameFunctionWords; std::string fileNameLeftHandSideSourceLabelCounts; std::string fileNameLeftHandSideTargetSourceLabelCounts; - std::string fileNameTargetPreferenceLabelSet; - std::string fileNameLeftHandSideTargetPreferenceLabelCounts; - std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts; + std::string fileNameTargetSyntacticPreferencesLabelSet; + std::string fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts; + std::string fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts; std::string fileNamePhraseOrientationPriors; // All unknown args are passed to feature manager. std::vector<std::string> featureArgs; @@ -205,14 +205,18 @@ int main(int argc, char* argv[]) fileNameLeftHandSideSourceLabelCounts = std::string(fileNamePhraseTable) + ".src.lhs"; fileNameLeftHandSideTargetSourceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-src.lhs"; std::cerr << "counting left-hand side source labels and writing them to files " << fileNameLeftHandSideSourceLabelCounts << " and " << fileNameLeftHandSideTargetSourceLabelCounts << std::endl; - } else if (strcmp(argv[i],"--TargetPreferenceLabels") == 0) { - targetPreferenceLabelsFlag = true; - std::cerr << "including target preference label information" << std::endl; - fileNameTargetPreferenceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref"; - std::cerr << "writing target preference label set to file " << fileNameTargetPreferenceLabelSet << std::endl; - fileNameLeftHandSideTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs"; - fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs"; - std::cerr << "counting left-hand side target preference labels and writing them to files " << fileNameLeftHandSideTargetPreferenceLabelCounts << " and " << fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts << std::endl; + } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) { + targetSyntacticPreferencesFlag = true; + std::cerr << "including target syntactic preferences information" << std::endl; + fileNameTargetSyntacticPreferencesLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref"; + std::cerr << "writing target syntactic preferences label set to file " << fileNameTargetSyntacticPreferencesLabelSet << std::endl; + fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs"; + fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs"; + std::cerr << "counting left-hand side target syntactic preferences labels and writing them to files " + << fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts + << " and " + << fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts + << std::endl; } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) { unpairedExtractFormatFlag = true; std::cerr << "processing unpaired extract format" << std::endl; @@ -508,13 +512,13 @@ int main(int argc, char* argv[]) writeLabelSet( partsOfSpeechSet, fileNamePartsOfSpeechSet ); } - // target preference labels - if (targetPreferenceLabelsFlag && !inverseFlag) { - writeLabelSet( targetPreferenceLabelSet, fileNameTargetPreferenceLabelSet ); - writeLeftHandSideLabelCounts( targetPreferenceLHSCounts, - ruleTargetLHSAndTargetPreferenceLHSJointCounts, - fileNameLeftHandSideTargetPreferenceLabelCounts, - fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts ); + // target syntactic preferences labels + if (targetSyntacticPreferencesFlag && !inverseFlag) { + writeLabelSet( targetSyntacticPreferencesLabelSet, fileNameTargetSyntacticPreferencesLabelSet ); + writeLeftHandSideLabelCounts( targetSyntacticPreferencesLHSCounts, + ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts, + fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts, + fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts ); } } @@ -874,7 +878,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, } // syntax labels - if ((sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && !inverseFlag) { + if ((sourceSyntaxLabelsFlag || targetSyntacticPreferencesFlag) && !inverseFlag) { unsigned nNTs = 1; for(size_t j=0; j<phraseSource->size()-1; ++j) { if (isNonTerminal(vcbS.getWord( phraseSource->at(j) ))) @@ -897,20 +901,20 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, << "}}"; } } - // target preference labels - if (targetPreferenceLabelsFlag) { - std::string targetPreferenceLabelCounts; - targetPreferenceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences", - targetPreferenceLabelSet, - targetPreferenceLHSCounts, - ruleTargetLHSAndTargetPreferenceLHSJointCounts, - vcbT); - if ( !targetPreferenceLabelCounts.empty() ) { + // target syntactic preferences labels + if (targetSyntacticPreferencesFlag) { + std::string targetSyntacticPreferencesLabelCounts; + targetSyntacticPreferencesLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences", + targetSyntacticPreferencesLabelSet, + targetSyntacticPreferencesLHSCounts, + ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts, + vcbT); + if (!targetSyntacticPreferencesLabelCounts.empty()) { phraseTableFile << " {{TargetPreferences " << nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT) << " " << count // rule count - << targetPreferenceLabelCounts + << targetSyntacticPreferencesLabelCounts << "}}"; } } |