Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthias Huck <mhuck@inf.ed.ac.uk>2016-01-10 02:02:31 +0300
committerMatthias Huck <mhuck@inf.ed.ac.uk>2016-01-10 02:02:31 +0300
commit1d3feba8d04645fb0111fb23e4561dc0ea13d2a0 (patch)
tree92b097548a6ad531000a459bde9ed9525d0f9a4a /phrase-extract
parent8750c71ef4149f43509c1affa558e1d363edc647 (diff)
preparing extraction of Hiero soft syntactic preferences (target syntax)
Diffstat (limited to 'phrase-extract')
-rw-r--r--phrase-extract/ExtractedRule.h2
-rw-r--r--phrase-extract/PropertiesConsolidator.cpp100
-rw-r--r--phrase-extract/PropertiesConsolidator.h10
-rw-r--r--phrase-extract/RuleExtractionOptions.h2
-rw-r--r--phrase-extract/consolidate-main.cpp17
-rw-r--r--phrase-extract/extract-rules-main.cpp94
-rw-r--r--phrase-extract/score-main.cpp76
7 files changed, 249 insertions, 52 deletions
diff --git a/phrase-extract/ExtractedRule.h b/phrase-extract/ExtractedRule.h
index adbde43e8..d8c632f90 100644
--- a/phrase-extract/ExtractedRule.h
+++ b/phrase-extract/ExtractedRule.h
@@ -45,6 +45,7 @@ public:
std::string targetContextRight;
std::string sourceHoleString;
std::string targetHoleString;
+ std::string targetSyntacticPreference;
int startT;
int endT;
int startS;
@@ -65,6 +66,7 @@ public:
, targetContextRight()
, sourceHoleString()
, targetHoleString()
+ , targetSyntacticPreference()
, startT(sT)
, endT(eT)
, startS(sS)
diff --git a/phrase-extract/PropertiesConsolidator.cpp b/phrase-extract/PropertiesConsolidator.cpp
index 94b6ea13a..71899e15d 100644
--- a/phrase-extract/PropertiesConsolidator.cpp
+++ b/phrase-extract/PropertiesConsolidator.cpp
@@ -83,6 +83,32 @@ void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &
}
+/**
+ * Loads the target syntactic preferences label set and switches on the
+ * rewriting of the TargetPreferences property.
+ *
+ * Every line of the file has to hold a label followed by its numeric index.
+ * The pairs are stored in m_targetSyntacticPreferencesLabels; anything loaded
+ * before is discarded first.
+ *
+ * @param targetSyntacticPreferencesLabelSetFile path of the label set file
+ *
+ * Raises an error via UTIL_THROW2 / UTIL_THROW_IF2 when a line cannot be read
+ * as "label index" or when a label occurs more than once.
+ */
+void PropertiesConsolidator::ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile)
+{
+  Moses::InputFileStream inFile(targetSyntacticPreferencesLabelSetFile);
+
+  // read target syntactic preferences label set
+  m_targetSyntacticPreferencesLabels.clear();
+  std::string line;
+  while (getline(inFile, line)) {
+    std::istringstream tokenizer(line);
+    std::string label;
+    size_t index = 0;
+    // Stream extraction signals failure through the stream state and does not
+    // throw by default, so the outcome has to be tested explicitly. Otherwise
+    // a malformed line would be inserted with a meaningless index.
+    if (!(tokenizer >> label >> index)) {
+      UTIL_THROW2("Error reading target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " .");
+    }
+    std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_targetSyntacticPreferencesLabels.insert( std::pair<std::string,size_t>(label,index) );
+    UTIL_THROW_IF2(!inserted.second,"Target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " should contain each syntactic label only once.");
+  }
+
+  inFile.Close();
+
+  m_targetSyntacticPreferencesFlag = true;
+}
+
+
void PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const
{
if ( propertiesString.empty() ) {
@@ -129,6 +155,19 @@ void PropertiesConsolidator::ProcessPropertiesString(const std::string &properti
}
*/
+ } else if ( !keyValue[0].compare("TargetPreferences") ) {
+
+ if ( m_targetSyntacticPreferencesFlag ) {
+
+ // TargetPreferences property: replace strings with vocabulary indices
+ out << " {{" << keyValue[0];
+ ProcessTargetSyntacticPreferencesPropertyValue(keyValue[1], out);
+ out << "}}";
+
+ } else { // don't process TargetPreferences property
+ out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
+ }
+
} else {
// output other property
@@ -246,5 +285,66 @@ bool PropertiesConsolidator::GetPOSPropertyValueFromPropertiesString(const std::
}
+/**
+ * Writes the value of a TargetPreferences property to out, replacing every
+ * label by its index from m_targetSyntacticPreferencesLabels.
+ *
+ * Tokens are read from value in this order (whitespace separated):
+ *   - number of non-terminals nNTs (incl. left-hand side)
+ *   - overall rule count
+ *   - then, repeated until the input is exhausted:
+ *       - only if nNTs > 1: nNTs-1 RHS labels, the RHS count, and the number
+ *         of LHS labels seen with this RHS
+ *       - pairs of (LHS label, count); as many as announced, or until the end
+ *         of input when the rule has no RHS non-terminals
+ * Counts inside the items are copied through as plain tokens.
+ *
+ * @param value  property value as found in the phrase table
+ * @param out    stream the rewritten value is appended to
+ *
+ * Raises an error via UTIL_THROW2 / UTIL_THROW_IF2 when the header numbers
+ * cannot be read, when an item is truncated, or when a label is not part of
+ * the loaded label set.
+ */
+void PropertiesConsolidator::ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
+{
+  // TargetPreferences property: replace strings with vocabulary indices
+  std::istringstream tokenizer(value);
+
+  size_t nNTs;
+  double totalCount;
+
+  if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
+    UTIL_THROW2("Not able to read number of non-terminals from TargetPreferences property. "
+                << "Flawed TargetPreferences property?");
+  }
+  assert( nNTs > 0 );
+  out << " " << nNTs;
+
+  if (! (tokenizer >> totalCount)) { // second token: overall rule count
+    UTIL_THROW2("Not able to read overall rule count from TargetPreferences property. "
+                << "Flawed TargetPreferences property?");
+  }
+  assert( totalCount > 0.0 );
+  out << " " << totalCount;
+
+  while (tokenizer.peek() != EOF) {
+    // NOTE(review): if the exception raised by UTIL_THROW_IF2 derives from
+    // std::exception, the catch below replaces its message by the generic
+    // "Flawed item" one - confirm against the util exception header.
+    try {
+
+      // without RHS non-terminals the LHS list simply runs to the end of input
+      size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
+
+      std::string token;
+
+      if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
+        for (size_t i=0; i<nNTs-1; ++i) { // RHS target preference non-terminal labels
+          // Extraction does not throw on failure; without this check a
+          // truncated item would be processed with a stale token.
+          UTIL_THROW_IF2(!(tokenizer >> token), "Truncated right-hand side label list in TargetPreferences property.");
+          std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
+          UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
+          out << " " << found->second;
+        }
+
+        // targetPreferenceRHSCount
+        UTIL_THROW_IF2(!(tokenizer >> token), "Missing right-hand side count in TargetPreferences property.");
+        out << " " << token;
+
+        UTIL_THROW_IF2(!(tokenizer >> numberOfLHSsGivenRHS), "Missing number of left-hand side labels in TargetPreferences property.");
+        out << " " << numberOfLHSsGivenRHS;
+      }
+
+      for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS target preference non-terminal labels seen with this RHS
+        // LHS target preference non-terminal label
+        UTIL_THROW_IF2(!(tokenizer >> token), "Missing left-hand side label in TargetPreferences property.");
+        std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
+        UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
+        out << " " << found->second;
+
+        // ruleTargetPreferenceLabelledCount
+        UTIL_THROW_IF2(!(tokenizer >> token), "Missing left-hand side count in TargetPreferences property.");
+        out << " " << token;
+      }
+
+    } catch (const std::exception &) {
+      UTIL_THROW2("Flawed item in TargetPreferences property?");
+    }
+  }
+}
+
+
} // namespace MosesTraining
diff --git a/phrase-extract/PropertiesConsolidator.h b/phrase-extract/PropertiesConsolidator.h
index dbb64e0dd..69a046d8f 100644
--- a/phrase-extract/PropertiesConsolidator.h
+++ b/phrase-extract/PropertiesConsolidator.h
@@ -34,10 +34,15 @@ class PropertiesConsolidator
{
public:
- PropertiesConsolidator() : m_sourceLabelsFlag(false) {};
+ PropertiesConsolidator() // default constructor: each of the three processing flags starts out false
+ : m_sourceLabelsFlag(false)
+ , m_partsOfSpeechFlag(false)
+ , m_targetSyntacticPreferencesFlag(false)
+ {};
void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile);
+ void ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile);
bool GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const;
@@ -47,11 +52,14 @@ protected:
void ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
void ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
+ void ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
bool m_sourceLabelsFlag;
std::map<std::string,size_t> m_sourceLabels;
bool m_partsOfSpeechFlag;
std::map<std::string,size_t> m_partsOfSpeechVocabulary;
+ bool m_targetSyntacticPreferencesFlag;
+ std::map<std::string,size_t> m_targetSyntacticPreferencesLabels;
};
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index aab059cf9..43f7b205b 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -46,6 +46,7 @@ public:
bool requireAlignedWord;
bool sourceSyntax;
bool targetSyntax;
+ bool targetSyntacticPreferences;
bool duplicateRules;
bool fractionalCounting;
bool pcfgScore;
@@ -80,6 +81,7 @@ public:
, requireAlignedWord(true)
, sourceSyntax(false)
, targetSyntax(false)
+ , targetSyntacticPreferences(false)
, duplicateRules(true)
, fractionalCounting(true)
, pcfgScore(false)
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index 0f276144b..2c1198d4c 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -38,6 +38,7 @@ bool onlyDirectFlag = false;
bool partsOfSpeechFlag = false;
bool phraseCountFlag = false;
bool sourceLabelsFlag = false;
+bool targetSyntacticPreferencesFlag = false;
bool sparseCountBinFeatureFlag = false;
std::vector< int > countBin;
@@ -49,7 +50,7 @@ std::vector< float > goodTuringDiscount;
float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
-void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& );
+void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& );
void loadCountOfCounts( const std::string& );
void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse );
bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item );
@@ -93,6 +94,7 @@ int main(int argc, char* argv[])
std::string fileNameCountOfCounts;
std::string fileNameSourceLabelSet;
std::string fileNamePartsOfSpeechVocabulary;
+ std::string fileNameTargetSyntacticPreferencesLabelSet;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) {
@@ -150,6 +152,11 @@ int main(int argc, char* argv[])
UTIL_THROW_IF2(i+1==argc, "specify parts-of-speech file!");
fileNamePartsOfSpeechVocabulary = argv[++i];
std::cerr << "processing parts-of-speech property" << std::endl;
+ } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
+ targetSyntacticPreferencesFlag = true;
+ UTIL_THROW_IF2(i+1==argc, "specify target syntactic preferences label set file!");
+ fileNameTargetSyntacticPreferencesLabelSet = argv[++i];
+ std::cerr << "processing target syntactic preferences property" << std::endl;
} else if (strcmp(argv[i],"--MinScore") == 0) {
std::string setting = argv[++i];
bool done = false;
@@ -182,7 +189,7 @@ int main(int argc, char* argv[])
}
}
- processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary );
+ processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary, fileNameTargetSyntacticPreferencesLabelSet );
}
@@ -231,7 +238,8 @@ void processFiles( const std::string& fileNameDirect,
const std::string& fileNameConsolidated,
const std::string& fileNameCountOfCounts,
const std::string& fileNameSourceLabelSet,
- const std::string& fileNamePartsOfSpeechVocabulary )
+ const std::string& fileNamePartsOfSpeechVocabulary,
+ const std::string& fileNameTargetSyntacticPreferencesLabelSet )
{
if (goodTuringFlag || kneserNeyFlag)
loadCountOfCounts( fileNameCountOfCounts );
@@ -256,6 +264,9 @@ void processFiles( const std::string& fileNameDirect,
if (partsOfSpeechFlag) {
propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
}
+ if (targetSyntacticPreferencesFlag) {
+ propertiesConsolidator.ActivateTargetSyntacticPreferencesProcessing(fileNameTargetSyntacticPreferencesLabelSet);
+ }
// loop through all extracted phrase translations
int i=0;
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index f77e0b5eb..5319bcc6e 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -79,14 +79,15 @@ private:
, RuleExist &ruleExist, HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
void saveHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
- string saveTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
+ string saveTargetHieroPhrase( int startT, int endT, int startS, int endS
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
string saveSourceHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, const LabelIndex &labelIndex);
void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
, WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
void saveHieroAlignment( int startT, int endT, int startS, int endS
, const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
+ void saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule);
void saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
inline string IntToString( int i ) {
@@ -225,6 +226,8 @@ int main(int argc, char* argv[])
// allow consecutive non-terminals (X Y | X Y)
else if (strcmp(argv[i],"--TargetSyntax") == 0) {
options.targetSyntax = true;
+ } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
+ options.targetSyntacticPreferences = true;
} else if (strcmp(argv[i],"--SourceSyntax") == 0) {
options.sourceSyntax = true;
} else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
@@ -422,7 +425,8 @@ void ExtractTask::extractRules()
int endT = startT + lengthT - 1;
// if there is target side syntax, there has to be a node
- if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
+ if (m_options.targetSyntax && !m_options.targetSyntacticPreferences && !m_sentence.targetTree.HasNode(startT,endT))
+// if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
continue;
// find find aligned source words
@@ -566,7 +570,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
int labelI = labelIndex[ 2+holeCount ];
string targetLabel;
- if (m_options.targetSyntax) {
+ if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
@@ -628,7 +632,7 @@ string ExtractTask::saveSourceHieroPhrase( int startT, int endT, int startS, int
if (m_options.unpairedExtractFormat) {
out += "[" + sourceLabel + "] ";
} else {
- out += "[" + sourceLabel + "][" + targetLabel + "] ";
+ out += "[" + sourceLabel + "][" + (m_options.targetSyntacticPreferences ? "X" : targetLabel) + "] ";
}
currPos = hole.GetEnd(0);
@@ -682,6 +686,33 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
}
}
+/**
+ * Stores the target syntactic preference labels of all holes in
+ * rule.targetSyntacticPreference, separated by single spaces.
+ *
+ * For each hole in holeColl, in list order, the label of the target tree node
+ * covering the hole's target span is appended. If the target tree has no node
+ * for that span, "X" is used instead. The string is left empty when there are
+ * no holes.
+ *
+ * @param holeColl    holes of the rule
+ * @param labelIndex  label choice per position; entry 2+i selects the node
+ *                    used for the i-th hole
+ * @param rule        rule whose targetSyntacticPreference is overwritten
+ */
+void ExtractTask::saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule)
+{
+  rule.targetSyntacticPreference = "";
+  int holeCount = 0;
+  for (HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
+       iterHoleList != holeColl.GetHoles().end();
+       ++iterHoleList) {
+
+    const Hole &hole = *iterHoleList;
+
+    int startT = hole.GetStart(1);
+    int endT = hole.GetEnd(1);
+    if (m_sentence.targetTree.HasNode(startT,endT)) {
+      // the label choice is only meaningful when the span has tree nodes
+      int labelI = labelIndex[ 2+holeCount ];
+      rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelI]->label;
+      rule.targetSyntacticPreference += " ";
+    } else {
+      rule.targetSyntacticPreference += "X ";
+    }
+    ++holeCount;
+  }
+
+  // Drop the trailing separator. With no holes the string is empty, and
+  // erase(size()-1) would then be called with an out-of-range position.
+  if (!rule.targetSyntacticPreference.empty()) {
+    rule.targetSyntacticPreference.erase(rule.targetSyntacticPreference.size()-1);
+  }
+}
+
+
void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
{
@@ -691,7 +722,8 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
// phrase labels
string targetLabel;
- if (m_options.targetSyntax) {
+// if (m_options.targetSyntax && m_sentence.targetTree.HasNode(startT,endT)) {
+ if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
@@ -776,6 +808,17 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
// std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L) << std::endl;
}
+ // target syntactic preferences
+ if (m_options.targetSyntacticPreferences) {
+ saveTargetSyntacticPreference(holeColl, labelIndex, rule);
+ if (m_sentence.targetTree.HasNode(startT,endT)) {
+ rule.targetSyntacticPreference += " ";
+ rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
+ } else {
+ rule.targetSyntacticPreference += " X";
+ }
+ }
+
addRuleToCollection( rule );
}
@@ -785,6 +828,9 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end
// number of target head labels
int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
+ if (m_options.targetSyntacticPreferences && !numLabels) {
+ numLabels++;
+ }
labelCount.push_back(numLabels);
labelIndex.push_back(0);
@@ -796,7 +842,10 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end
// number of target hole labels
for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
hole != holeColl.GetHoles().end(); hole++ ) {
- int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
+ int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
+ if (m_options.targetSyntacticPreferences && !numLabels) {
+ numLabels++;
+ }
labelCount.push_back(numLabels);
labelIndex.push_back(0);
}
@@ -973,12 +1022,19 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
// phrase labels
string targetLabel,sourceLabel;
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
- sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
+ if (m_sentence.targetTree.HasNode(startT,endT) && !m_options.targetSyntacticPreferences) {
+ sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ sourceLabel = "S";
+ } else {
+ sourceLabel = "X";
+ }
} else {
sourceLabel = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X";
- if (m_options.targetSyntax) {
+ if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
+// if (m_options.targetSyntax && !m_options.targetSyntacticPreferences && !m_sentence.targetTree.HasNode(startT,endT))
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
@@ -1037,6 +1093,15 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
// std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L) << std::endl;
}
+ // target syntactic preferences
+ if (m_options.targetSyntacticPreferences) {
+ if (m_sentence.targetTree.HasNode(startT,endT)) {
+ rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
+ } else {
+ rule.targetSyntacticPreference += "X";
+ }
+ }
+
addRuleToCollection( rule );
}
@@ -1114,6 +1179,11 @@ void ExtractTask::writeRulesToFile()
m_phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,rule->r2lOrientation,1);
out << "}}";
}
+ if (m_options.targetSyntacticPreferences) {
+ out << " {{TargetPreferences ";
+ out << rule->targetSyntacticPreference;
+ out << "}}";
+ }
out << "\n";
if (!m_options.onlyDirectFlag) {
@@ -1167,12 +1237,12 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
if (options.phraseOrientation) {
glueRulesPhraseProperty.append(" ||| ||| {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}");
}
- if (!options.targetSyntax) {
+ if (!options.targetSyntax || options.targetSyntacticPreferences) {
grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| 0-0 ||| 0" << glueRulesPhraseProperty << endl
<< "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl;
} else {
- // chose a top label that is not already a label
+ // choose a top label that is not already a label
string topLabel = "QQQQQQ";
for( unsigned int i=1; i<=topLabel.length(); i++) {
if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) {
@@ -1202,7 +1272,7 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
// collect counts for labels for each word
// ( labels of singleton words are used to estimate
-// distribution oflabels for unknown words )
+// distribution of labels for unknown words )
map<string,int> wordCount;
map<string,string> wordLabel;
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index 0ad5844bf..1feb6b7f2 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -51,7 +51,7 @@ bool treeFragmentsFlag = false;
bool partsOfSpeechFlag = false;
bool sourceSyntaxLabelsFlag = false;
bool sourceSyntaxLabelCountsLHSFlag = false;
-bool targetPreferenceLabelsFlag = false;
+bool targetSyntacticPreferencesFlag = false;
bool unpairedExtractFormatFlag = false;
bool conditionOnTargetLhsFlag = false;
bool wordAlignmentFlag = true;
@@ -83,11 +83,11 @@ std::vector<std::string> sourceLabelsByIndex;
std::set<std::string> partsOfSpeechSet;
-boost::unordered_map<std::string,float> targetPreferenceLHSCounts;
-boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetPreferenceLHSJointCounts;
-std::set<std::string> targetPreferenceLabelSet;
-std::map<std::string,size_t> targetPreferenceLabels;
-std::vector<std::string> targetPreferenceLabelsByIndex;
+boost::unordered_map<std::string,float> targetSyntacticPreferencesLHSCounts;
+boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts;
+std::set<std::string> targetSyntacticPreferencesLabelSet;
+std::map<std::string,size_t> targetSyntacticPreferencesLabels;
+std::vector<std::string> targetSyntacticPreferencesLabelsByIndex;
std::vector<float> orientationClassPriorsL2R(4,0); // mono swap dleft dright
std::vector<float> orientationClassPriorsR2L(4,0); // mono swap dleft dright
@@ -150,7 +150,7 @@ int main(int argc, char* argv[])
"[--TreeFragments] "
"[--SourceLabels] "
"[--SourceLabelCountsLHS] "
- "[--TargetPreferenceLabels] "
+ "[--TargetSyntacticPreferences] "
"[--UnpairedExtractFormat] "
"[--ConditionOnTargetLHS] "
"[--CrossedNonTerm]"
@@ -167,9 +167,9 @@ int main(int argc, char* argv[])
std::string fileNameFunctionWords;
std::string fileNameLeftHandSideSourceLabelCounts;
std::string fileNameLeftHandSideTargetSourceLabelCounts;
- std::string fileNameTargetPreferenceLabelSet;
- std::string fileNameLeftHandSideTargetPreferenceLabelCounts;
- std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts;
+ std::string fileNameTargetSyntacticPreferencesLabelSet;
+ std::string fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts;
+ std::string fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts;
std::string fileNamePhraseOrientationPriors;
// All unknown args are passed to feature manager.
std::vector<std::string> featureArgs;
@@ -205,14 +205,18 @@ int main(int argc, char* argv[])
fileNameLeftHandSideSourceLabelCounts = std::string(fileNamePhraseTable) + ".src.lhs";
fileNameLeftHandSideTargetSourceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-src.lhs";
std::cerr << "counting left-hand side source labels and writing them to files " << fileNameLeftHandSideSourceLabelCounts << " and " << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
- } else if (strcmp(argv[i],"--TargetPreferenceLabels") == 0) {
- targetPreferenceLabelsFlag = true;
- std::cerr << "including target preference label information" << std::endl;
- fileNameTargetPreferenceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
- std::cerr << "writing target preference label set to file " << fileNameTargetPreferenceLabelSet << std::endl;
- fileNameLeftHandSideTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
- fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
- std::cerr << "counting left-hand side target preference labels and writing them to files " << fileNameLeftHandSideTargetPreferenceLabelCounts << " and " << fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts << std::endl;
+ } else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
+ targetSyntacticPreferencesFlag = true;
+ std::cerr << "including target syntactic preferences information" << std::endl;
+ fileNameTargetSyntacticPreferencesLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
+ std::cerr << "writing target syntactic preferences label set to file " << fileNameTargetSyntacticPreferencesLabelSet << std::endl;
+ fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
+ fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
+ std::cerr << "counting left-hand side target syntactic preferences labels and writing them to files "
+ << fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts
+ << " and "
+ << fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts
+ << std::endl;
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
unpairedExtractFormatFlag = true;
std::cerr << "processing unpaired extract format" << std::endl;
@@ -508,13 +512,13 @@ int main(int argc, char* argv[])
writeLabelSet( partsOfSpeechSet, fileNamePartsOfSpeechSet );
}
- // target preference labels
- if (targetPreferenceLabelsFlag && !inverseFlag) {
- writeLabelSet( targetPreferenceLabelSet, fileNameTargetPreferenceLabelSet );
- writeLeftHandSideLabelCounts( targetPreferenceLHSCounts,
- ruleTargetLHSAndTargetPreferenceLHSJointCounts,
- fileNameLeftHandSideTargetPreferenceLabelCounts,
- fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts );
+ // target syntactic preferences labels
+ if (targetSyntacticPreferencesFlag && !inverseFlag) {
+ writeLabelSet( targetSyntacticPreferencesLabelSet, fileNameTargetSyntacticPreferencesLabelSet );
+ writeLeftHandSideLabelCounts( targetSyntacticPreferencesLHSCounts,
+ ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
+ fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts,
+ fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts );
}
}
@@ -874,7 +878,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
// syntax labels
- if ((sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && !inverseFlag) {
+ if ((sourceSyntaxLabelsFlag || targetSyntacticPreferencesFlag) && !inverseFlag) {
unsigned nNTs = 1;
for(size_t j=0; j<phraseSource->size()-1; ++j) {
if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
@@ -897,20 +901,20 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
<< "}}";
}
}
- // target preference labels
- if (targetPreferenceLabelsFlag) {
- std::string targetPreferenceLabelCounts;
- targetPreferenceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
- targetPreferenceLabelSet,
- targetPreferenceLHSCounts,
- ruleTargetLHSAndTargetPreferenceLHSJointCounts,
- vcbT);
- if ( !targetPreferenceLabelCounts.empty() ) {
+ // target syntactic preferences labels
+ if (targetSyntacticPreferencesFlag) {
+ std::string targetSyntacticPreferencesLabelCounts;
+ targetSyntacticPreferencesLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
+ targetSyntacticPreferencesLabelSet,
+ targetSyntacticPreferencesLHSCounts,
+ ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
+ vcbT);
+ if (!targetSyntacticPreferencesLabelCounts.empty()) {
phraseTableFile << " {{TargetPreferences "
<< nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
<< " "
<< count // rule count
- << targetPreferenceLabelCounts
+ << targetSyntacticPreferencesLabelCounts
<< "}}";
}
}