diff options
Diffstat (limited to 'scripts/training/phrase-extract/extract.cpp')
-rw-r--r-- | scripts/training/phrase-extract/extract.cpp | 1116 |
1 files changed, 1116 insertions, 0 deletions
diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp new file mode 100644 index 000000000..374b521de --- /dev/null +++ b/scripts/training/phrase-extract/extract.cpp @@ -0,0 +1,1116 @@ +// $Id: extract.cpp 2828 2010-02-01 16:07:58Z hieuhoang1972 $ +// vim:tabstop=2 + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include <cstdio> +#include <stdlib.h> +#include <assert.h> +#include <time.h> +#include <cstring> +#include <sstream> +#include "extract.h" +#include "InputFileStream.h" + +#ifdef WIN32 +// Include Visual Leak Detector +#include <vld.h> +#endif + +using namespace std; + +int main(int argc, char* argv[]) +{ + cerr << "Extract v2.0, written by Philipp Koehn\n" + << "rule extraction from an aligned parallel corpus\n"; + time_t starttime = time(NULL); + + Global *global = new Global(); + g_global = global; + + if (argc < 5) { + cerr << "syntax: extract corpus.target corpus.source corpus.align extract " + << " [ --Hierarchical | --Orientation" + << " | --GlueGrammar FILE | --UnknownWordLabel FILE" + << " | --OnlyDirect" + << " | --MaxSpan[" << global->maxSpan << "]" + << " | --MinHoleTarget[" << global->minHoleTarget << "]" + << " | --MinHoleSource[" << global->minHoleSource << "]" + << " | --MinWords[" << global->minWords << "]" + << " | --MaxSymbolsTarget[" << global->maxSymbolsTarget << "]" + << " | --MaxSymbolsSource[" << global->maxSymbolsSource << "]" + << " | --MaxNonTerm[" << global->maxNonTerm << "]" + << " | --SourceSyntax | --TargetSyntax" + << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting ]\n"; + exit(1); + } + char* &fileNameT = argv[1]; + char* &fileNameS = argv[2]; + char* &fileNameA = argv[3]; + string fileNameGlueGrammar; + string fileNameUnknownWordLabel; + string fileNameExtract = string(argv[4]); + + int optionInd = 5; + + for(int i=optionInd;i<argc;i++) { + // maximum span length + if (strcmp(argv[i],"--MaxSpan") == 0) { + global->maxSpan = atoi(argv[++i]); + if (global->maxSpan < 1) { + cerr << "extract error: --maxSpan should be at least 1" << endl; + exit(1); + } + } + else if (strcmp(argv[i],"--MinHoleTarget") == 0) { + global->minHoleTarget = atoi(argv[++i]); + if (global->minHoleTarget < 1) { + cerr << "extract error: --minHoleTarget should be at least 1" << endl; + exit(1); + } + } + else if (strcmp(argv[i],"--MinHoleSource") == 0) { + global->minHoleSource = atoi(argv[++i]); + if (global->minHoleSource < 1) { + cerr << "extract error: --minHoleSource should be at least 1" << endl; + exit(1); + } + } + // maximum number of words in hierarchical phrase + else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) { + global->maxSymbolsTarget = atoi(argv[++i]); + if (global->maxSymbolsTarget < 1) { + cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl; + exit(1); + } + } + // maximum number of words in hierarchical phrase + else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) { + global->maxSymbolsSource = atoi(argv[++i]); + if (global->maxSymbolsSource < 1) { + cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl; + exit(1); + } + } + // minimum number of words in hierarchical phrase + else if (strcmp(argv[i],"--MinWords") == 0) { + global->minWords = atoi(argv[++i]); + if (global->minWords < 0) { + cerr << "extract error: --MinWords should be at least 0" << endl; + exit(1); + } + } + // maximum number of non-terminals + else if (strcmp(argv[i],"--MaxNonTerm") == 0) { + global->maxNonTerm = atoi(argv[++i]); + if (global->maxNonTerm < 1) { + cerr << "extract error: --MaxNonTerm should be at least 1" << endl; + exit(1); + } + } + // allow consecutive non-terminals (X Y | X Y) + else if (strcmp(argv[i],"--TargetSyntax") == 0) { + global->targetSyntax = true; + } + else if (strcmp(argv[i],"--SourceSyntax") == 0) { + global->sourceSyntax = true; + } + else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) { + global->requireAlignedWord = false; + } + else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) { + global->nonTermConsecTarget = false; + } + else if (strcmp(argv[i],"--NonTermConsecSource") == 0) { + global->nonTermConsecSource = true; + } + else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) { + global->nonTermFirstWord = false; + } + else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) { + global->onlyOutputSpanInfo = true; + } + // do not create many part00xx files! + else if (strcmp(argv[i],"--NoFileLimit") == 0) { + // now default + } + else if (strcmp(argv[i],"--OnlyDirect") == 0) { + global->onlyDirectFlag = true; + } + else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { + global->orientationFlag = true; + } + else if (strcmp(argv[i],"--GlueGrammar") == 0) { + global->glueGrammarFlag = true; + if (++i >= argc) + { + cerr << "ERROR: Option --GlueGrammar requires a file name" << endl; + exit(0); + } + fileNameGlueGrammar = string(argv[i]); + cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl; + } + else if (strcmp(argv[i],"--UnknownWordLabel") == 0) { + global->unknownWordLabelFlag = true; + if (++i >= argc) + { + cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl; + exit(0); + } + fileNameUnknownWordLabel = string(argv[i]); + cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl; + } + else if (strcmp(argv[i],"--Hierarchical") == 0) { + cerr << "extracting hierarchical rules" << endl; + global->hierarchicalFlag = true; + } + // TODO: this should be a useful option + //else if (strcmp(argv[i],"--ZipFiles") == 0) { + // zipFiles = true; + //} + // if an source phrase is paired with two target phrases, then count(t|s) = 0.5 + else if (strcmp(argv[i],"--NoFractionalCounting") == 0) { + global->fractionalCounting = false; + } + else if (strcmp(argv[i],"--Mixed") == 0) { + global->mixed = true; + } + else { + cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; + exit(1); + } + } + + // open input files + Moses::InputFileStream tFile(fileNameT); + Moses::InputFileStream sFile(fileNameS); + Moses::InputFileStream aFile(fileNameA); + + istream *tFileP = &tFile; + istream *sFileP = &sFile; + istream *aFileP = &aFile; + + // open output files + string fileNameExtractInv = fileNameExtract + ".inv"; + string fileNameExtractOrientation = fileNameExtract + ".o"; + extractFile.open(fileNameExtract.c_str()); + if (!global->onlyDirectFlag) + extractFileInv.open(fileNameExtractInv.c_str()); + if (global->orientationFlag) + extractFileOrientation.open(fileNameExtractOrientation.c_str()); + + // loop through all sentence pairs + int i=0; + while(true) { + i++; + //if (i%1000 == 0) cerr << "." << flush; + //if (i%10000 == 0) cerr << ":" << flush; + //if (i%100000 == 0) cerr << "!" << flush; + cerr << i << " "; + + char targetString[LINE_MAX_LENGTH]; + char sourceString[LINE_MAX_LENGTH]; + char alignmentString[LINE_MAX_LENGTH]; + SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n'); + if (tFileP->eof()) break; + SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n'); + SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n'); + SentenceAlignment sentence; + //cerr << "read in: " << targetString << " & " << sourceString << " & " << alignmentString << endl; + //az: output src, tgt, and alingment line + if (global->onlyOutputSpanInfo) { + cout << "LOG: SRC: " << sourceString << endl; + cout << "LOG: TGT: " << targetString << endl; + cout << "LOG: ALT: " << alignmentString << endl; + cout << "LOG: PHRASES_BEGIN:" << endl; + } + + if (sentence.create( targetString, sourceString, alignmentString, i, *global )) + { + if (global->unknownWordLabelFlag) + { + collectWordLabelCounts(sentence); + } + extractRules(sentence); + consolidateRules(); + writeRulesToFile(); + extractedRules.clear(); + } + if (global->onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases + } + + tFile.Close(); + sFile.Close(); + aFile.Close(); + // only close if we actually opened it + if (!global->onlyOutputSpanInfo) { + extractFile.close(); + if (!global->onlyDirectFlag) extractFileInv.close(); + if (global->orientationFlag) extractFileOrientation.close(); + } + + if (global->glueGrammarFlag) + writeGlueGrammar(fileNameGlueGrammar); + + if (global->unknownWordLabelFlag) + writeUnknownWordLabel(fileNameUnknownWordLabel); +} + +void extractRules( SentenceAlignment &sentence ) { + int countT = sentence.target.size(); + int countS = sentence.source.size(); + + // phrase repository for creating hiero phrases + RuleExist ruleExist(countT); + + // check alignments for target phrase startT...endT + for(int lengthT=1; + lengthT <= g_global->maxSpan && lengthT <= countT; + lengthT++) { + for(int startT=0; startT < countT-(lengthT-1); startT++) { + + // that's nice to have + int endT = startT + lengthT - 1; + + // if there is target side syntax, there has to be a node + if (g_global->targetSyntax && !sentence.targetTree.HasNode(startT,endT)) + continue; + + // find find aligned source words + // first: find minimum and maximum source word + int minS = 9999; + int maxS = -1; + vector< int > usedS = sentence.alignedCountS; + for(int ti=startT;ti<=endT;ti++) { + for(int i=0;i<sentence.alignedToT[ti].size();i++) { + int si = sentence.alignedToT[ti][i]; + // cerr << "point (" << si << ", " << ti << ")\n"; + if (si<minS) { minS = si; } + if (si>maxS) { maxS = si; } + usedS[ si ]--; + } + } + + // unaligned phrases are not allowed + if( maxS == -1 ) + continue; + + // source phrase has to be within limits + if( maxS-minS >= g_global->maxSpan ) + continue; + + // check if source words are aligned to out of bound target words + bool out_of_bounds = false; + for(int si=minS;si<=maxS && !out_of_bounds;si++) + if (usedS[si]>0) { + out_of_bounds = true; + } + + // if out of bound, you gotta go + if (out_of_bounds) + continue; + + // done with all the checks, lets go over all consistent phrase pairs + // start point of source phrase may retreat over unaligned + for(int startS=minS; + (startS>=0 && + startS>maxS - g_global->maxSpan && // within length limit + (startS==minS || sentence.alignedCountS[startS]==0)); // unaligned + startS--) + { + // end point of source phrase may advance over unaligned + for(int endS=maxS; + (endS<countS && endS<startS + g_global->maxSpan && // within length limit + (endS==maxS || sentence.alignedCountS[endS]==0)); // unaligned + endS++) + { + // if there is source side syntax, there has to be a node + if (g_global->sourceSyntax && !sentence.sourceTree.HasNode(startS,endS)) + continue; + + // TODO: loop over all source and target syntax labels + + // if within length limits, add as fully-lexical phrase pair + if (endT-startT < g_global->maxSymbolsTarget && endS-startS < g_global->maxSymbolsSource) + { + addRule(sentence,startT,endT,startS,endS, ruleExist); + } + + // take note that this is a valid phrase alignment + ruleExist.Add(startT, endT, startS, endS); + + // extract hierarchical rules + if (g_global->hierarchicalFlag) + { + // are rules not allowed to start non-terminals? + int initStartT = g_global->nonTermFirstWord ? startT : startT + 1; + + HoleCollection holeColl; // empty hole collection + addHieroRule(sentence, startT, endT, startS, endS, + ruleExist, holeColl, 0, initStartT, + endT-startT+1, endS-startS+1); + } + } + } + } + } +} + +void preprocessSourceHieroPhrase( SentenceAlignment &sentence + , int startT, int endT, int startS, int endS + , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex) +{ + vector<Hole*>::const_iterator iterHoleList = holeColl.GetSortedSourceHoles().begin(); + assert(iterHoleList != holeColl.GetSortedSourceHoles().end()); + + int outPos = 0; + int holeCount = 0; + int holeTotal = holeColl.GetHoles().size(); + for(int currPos = startS; currPos <= endS; currPos++) + { + bool isHole = false; + if (iterHoleList != holeColl.GetSortedSourceHoles().end()) + { + const Hole &hole = **iterHoleList; + isHole = hole.GetStart(0) == currPos; + } + + if (isHole) + { + Hole &hole = **iterHoleList; + + int labelI = labelIndex[ 2+holeCount+holeTotal ]; + string label = g_global->sourceSyntax ? + sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X"; + hole.SetLabel(label, 0); + + currPos = hole.GetEnd(0); + hole.SetPos(outPos, 0); + ++iterHoleList; + ++holeCount; + } + else + { + indexS[currPos] = outPos; + } + + outPos++; + } + + assert(iterHoleList == holeColl.GetSortedSourceHoles().end()); +} + +bool postprocessSourceHieroPhrase( const SentenceAlignment &sentence + , int startT, int endT, int startS, int endS + , const WordIndex &indexS, const HoleCollection &holeColl, const LabelIndex &labelIndex) +{ // make sure that X aren't by the edges, unless you're doing hiero extraction + if (!g_global->mixed) + return true; + + vector<Hole*>::const_iterator iterHoleList = holeColl.GetSortedSourceHoles().begin(); + assert(iterHoleList != holeColl.GetSortedSourceHoles().end()); + + int outPos = 0; + int holeCount = 0; + const Hole *prevHole = NULL; + for(int currPos = startS; currPos <= endS; currPos++) + { + bool isHole = false; + if (iterHoleList != holeColl.GetSortedSourceHoles().end()) + { + const Hole &hole = **iterHoleList; + isHole = hole.GetStart(0) == currPos; + } + + if (isHole) + { // check that X not on the perimeters, and X not adjacent + const Hole &hole = **iterHoleList; + + if (!hole.IsSyntax()) + { + if (prevHole && !prevHole->IsSyntax()) + { // adjacent [X] [X] + return false; + } + /* + if (hole.GetStart(0) == startS || hole.GetEnd(0) == endS) + { // on the perimeters + return false; + } + */ + } + + currPos = hole.GetEnd(0); + + ++iterHoleList; + ++holeCount; + prevHole = &hole; + } + else + { // not a hole, do nothing + prevHole = NULL; + } + + outPos++; + } + + assert(iterHoleList == holeColl.GetSortedSourceHoles().end()); + + return true; +} + +string createTargetHieroPhrase(SentenceAlignment &sentence + , int startT, int endT, int startS, int endS + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex) +{ + HoleList::iterator iterHoleList = holeColl.GetHoles().begin(); + assert(iterHoleList != holeColl.GetHoles().end()); + + string out = ""; + int outPos = 0; + int holeCount = 0; + for(int currPos = startT; currPos <= endT; currPos++) + { + bool isHole = false; + if (iterHoleList != holeColl.GetHoles().end()) + { + const Hole &hole = *iterHoleList; + isHole = hole.GetStart(1) == currPos; + } + + if (isHole) + { + Hole &hole = *iterHoleList; + + const string &sourceLabel = hole.GetLabel(0); + assert(sourceLabel != ""); + + int labelI = labelIndex[ 2+holeCount ]; + string targetLabel = g_global->targetSyntax ? + sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X"; + hole.SetLabel(targetLabel, 1); + + out += " [" + sourceLabel + "][" + targetLabel + "]"; + + currPos = hole.GetEnd(1); + hole.SetPos(outPos, 1); + ++iterHoleList; + holeCount++; + } + else + { + indexT[currPos] = outPos; + out += " " + sentence.target[currPos]; + } + + outPos++; + } + + assert(iterHoleList == holeColl.GetHoles().end()); + return out.substr(1); +} + + +string createSourceHieroPhrase( SentenceAlignment &sentence + , int startT, int endT, int startS, int endS + , HoleCollection &holeColl, const LabelIndex &labelIndex) +{ + vector<Hole*>::const_iterator iterHoleList = holeColl.GetSortedSourceHoles().begin(); + assert(iterHoleList != holeColl.GetSortedSourceHoles().end()); + + string out = ""; + int outPos = 0; + int holeCount = 0; + for(int currPos = startS; currPos <= endS; currPos++) + { + bool isHole = false; + if (iterHoleList != holeColl.GetSortedSourceHoles().end()) + { + const Hole &hole = **iterHoleList; + isHole = hole.GetStart(0) == currPos; + } + + if (isHole) + { + Hole &hole = **iterHoleList; + + const string &targetLabel = hole.GetLabel(1); + assert(targetLabel != ""); + + const string &sourceLabel = hole.GetLabel(0); + out += " [" + sourceLabel + "][" + targetLabel + "]"; + + currPos = hole.GetEnd(0); + hole.SetPos(outPos, 0); + ++iterHoleList; + ++holeCount; + } + else + { + out += " " + sentence.source[currPos]; + } + + outPos++; + } + + assert(iterHoleList == holeColl.GetSortedSourceHoles().end()); + return out.substr(1); +} + +void createHieroAlignment(SentenceAlignment &sentence + , int startT, int endT, int startS, int endS + , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule) +{ + // print alignment of words + for(int ti=startT;ti<=endT;ti++) { + if (indexT.find(ti) != indexT.end()) { // does word still exist? + for(int i=0;i<sentence.alignedToT[ti].size();i++) { + int si = sentence.alignedToT[ti][i]; + rule.alignment += " " + IntToString(indexS.find(si)->second) + "-" + IntToString(indexT.find(ti)->second); + if (! g_global->onlyDirectFlag) + rule.alignmentInv += " " + IntToString(indexT.find(ti)->second) + "-" + IntToString(indexS.find(si)->second); + } + } + } + + // print alignment of non terminals + HoleList::const_iterator iterHole; + for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) + { + rule.alignment += " " + IntToString(iterHole->GetPos(0)) + "-" + IntToString(iterHole->GetPos(1)); + if (!g_global->onlyDirectFlag) + rule.alignmentInv += " " + IntToString(iterHole->GetPos(1)) + "-" + IntToString(iterHole->GetPos(0)); + } + + rule.alignment = rule.alignment.substr(1); + if (!g_global->onlyDirectFlag) + rule.alignmentInv = rule.alignmentInv.substr(1); +} + +void createHieroPhrase( SentenceAlignment &sentence, int startT, int endT, int startS, int endS + , HoleCollection &holeColl, LabelIndex &labelIndex) +{ + phraseCount++; + WordIndex indexS, indexT; // to keep track of word positions in rule + + ExtractedRule rule( startT, endT, startS, endS ); + + // phrase labels + string targetLabel = g_global->targetSyntax ? + sentence.targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X"; + string sourceLabel = g_global->sourceSyntax ? + sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X"; + //string sourceLabel = "X"; + + // create non-terms on the source side + preprocessSourceHieroPhrase(sentence, startT, endT, startS, endS, indexS, holeColl, labelIndex); + + // target + rule.target = createTargetHieroPhrase(sentence, startT, endT, startS, endS, indexT, holeColl, labelIndex) + + " [" + targetLabel + "]"; + + // source + // holeColl.SortSourceHoles(); + rule.source = createSourceHieroPhrase(sentence, startT, endT, startS, endS, holeColl, labelIndex) + + " [" + sourceLabel + "]"; + + bool allowable = postprocessSourceHieroPhrase(sentence, startT, endT, startS, endS, indexS, holeColl, labelIndex); + rule.allowable = allowable; + + // alignment + createHieroAlignment(sentence, startT, endT, startS, endS, indexS, indexT, holeColl, rule); + +// cerr << "\tLABEL RULE " << rule.source << " ||| " << rule.target << endl; + + addRuleToCollection( rule ); +} + +void createAllHieroPhrases( SentenceAlignment &sentence + , int startT, int endT, int startS, int endS + , HoleCollection &holeColl) +{ + LabelIndex labelIndex,labelCount; + + // number of target head labels + int numLabels = g_global->targetSyntax ? sentence.targetTree.GetNodes(startT,endT).size() : 1; + labelCount.push_back(numLabels); + labelIndex.push_back(0); + + // number of source head labels + numLabels = g_global->sourceSyntax ? sentence.sourceTree.GetNodes(startS,endS).size() : 1; + labelCount.push_back(numLabels); + labelIndex.push_back(0); + + // number of target hole labels + for( HoleList::const_iterator hole = holeColl.GetHoles().begin(); + hole != holeColl.GetHoles().end(); hole++ ) + { + int numLabels = g_global->targetSyntax ? sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ; + labelCount.push_back(numLabels); + labelIndex.push_back(0); + } + + // number of source hole labels + holeColl.SortSourceHoles(); + for( vector<Hole*>::const_iterator i = holeColl.GetSortedSourceHoles().begin(); + i != holeColl.GetSortedSourceHoles().end(); i++ ) + { + const Hole &hole = **i; + int numLabels = g_global->sourceSyntax ? sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ; + labelCount.push_back(numLabels); + labelIndex.push_back(0); + } + + // cerr << "LABEL COUNT: "; +// for(int i=0;i<labelCount.size();i++) +// { +// cerr << " " << labelCount[i]; +// } +// cerr << endl; + + // loop through the holes + bool done = false; + while(!done) + { +// cerr << "\tLABEL INDEX: "; +// for(int i=0;i<labelIndex.size();i++) +// { +// cerr << " " << labelIndex[i]; +// } +// cerr << endl; + createHieroPhrase( sentence, startT, endT, startS, endS, holeColl, labelIndex ); + for(int i=0; i<labelIndex.size(); i++) + { + labelIndex[i]++; + if(labelIndex[i] == labelCount[i]) + { + labelIndex[i] = 0; + if (i == labelIndex.size()-1) + done = true; + } + else + { + break; + } + } + } +} + + +// this function is called recursively +// it pokes a new hole into the phrase pair, and then calls itself for more holes +void addHieroRule( SentenceAlignment &sentence + , int startT, int endT, int startS, int endS + , RuleExist &ruleExist, const HoleCollection &holeColl + , int numHoles, int initStartT, int wordCountT, int wordCountS) +{ + // cerr << "phrase " << startT << "-" << endT << "=>" << startS << "-" << endS << " (" << numHoles << ")" << endl; + // done, if already the maximum number of non-terminals in phrase pair + if (numHoles >= g_global->maxNonTerm) + return; + + // find a hole... + for (int startHoleT = initStartT; startHoleT <= endT; ++startHoleT) + { + //cerr << "phrase " << startT << "-" << endT << "=>" << startS << "-" << endS << " (" << numHoles << ")" << endl; + for (int endHoleT = startHoleT+(g_global->minHoleTarget-1); endHoleT <= endT; ++endHoleT) + { + //cerr << "considering from hole " << startHoleT << "-" << endHoleT << " from phrase " << startT << "-" << endT << "=>" << startS << "-" << endS << " (" << numHoles << ")" << endl; + // if last non-terminal, enforce word count limit + if (numHoles == g_global->maxNonTerm-1 && wordCountT - (endHoleT-startT+1) + (numHoles+1) > g_global->maxSymbolsTarget) + continue; + + // always enforce min word count limit + if (wordCountT - (endHoleT-startHoleT+1) < g_global->minWords) + continue; + + // except the whole span + if (startHoleT == startT && endHoleT == endT) + continue; + + // does a phrase cover this target span? + // if it does, then there should be a list of mapped source phrases + // (multiple possible due to unaligned words) + const HoleList &sourceHoles = ruleExist.GetSourceHoles(startHoleT, endHoleT); + + // loop over sub phrase pairs + HoleList::const_iterator iterSourceHoles; + for (iterSourceHoles = sourceHoles.begin(); iterSourceHoles != sourceHoles.end(); ++iterSourceHoles) + { + const Hole &sourceHole = *iterSourceHoles; + + // cerr << "source" << sourceHole.GetStart() << "-" << sourceHole.GetEnd() << endl; + + // enforce minimum hole size + if (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1 < g_global->minHoleSource) + continue; + + // if last non-terminal, enforce word count limit + if (numHoles == g_global->maxNonTerm-1 && + wordCountS - (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1) + (numHoles+1) > g_global->maxSymbolsSource) + continue; + + // enforce min word count limit + if (wordCountS - (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1) < g_global->minWords) + continue; + + // hole must be subphrase of the source phrase + // (may be violated if subphrase contains additional unaligned source word) + if (startS > sourceHole.GetStart(0) || endS < sourceHole.GetEnd(0)) + continue; + + // make sure target side does not overlap with another hole + if (holeColl.OverlapSource(sourceHole)) + continue; + + // if consecutive non-terminals are not allowed, also check for source + if (!g_global->nonTermConsecSource && holeColl.ConsecSource(sourceHole) ) + continue; + + // require that at least one aligned word is left + if (g_global->requireAlignedWord) + { + HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin(); + bool foundAlignedWord = false; + // loop through all word positions + for(int pos = startT; pos <= endT && !foundAlignedWord; pos++) + { + // new hole? moving on... + if (pos == startHoleT) + { + pos = endHoleT; + } + // covered by hole? moving on... + else if (iterHoleList != holeColl.GetHoles().end() && iterHoleList->GetStart(1) == pos) + { + pos = iterHoleList->GetEnd(1); + ++iterHoleList; + } + // covered by word? check if it is aligned + else + { + if (sentence.alignedToT[pos].size() > 0) + foundAlignedWord = true; + } + } + if (!foundAlignedWord) + continue; + } + + // update list of holes in this phrase pair + HoleCollection copyHoleColl(holeColl); + copyHoleColl.Add(startHoleT, endHoleT, sourceHole.GetStart(0), sourceHole.GetEnd(0)); + + // now some checks that disallow this phrase pair, but not further recursion + bool allowablePhrase = true; + + // maximum words count violation? + if (wordCountS - (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1) + (numHoles+1) > g_global->maxSymbolsSource) + allowablePhrase = false; + + if (wordCountT - (endHoleT-startHoleT+1) + (numHoles+1) > g_global->maxSymbolsTarget) + allowablePhrase = false; + + // passed all checks... + // cerr << "adding hole " << startHoleT << "," << endHoleT << "," << sourceHole.GetStart() << "," << sourceHole.GetEnd() << endl; + if (allowablePhrase) + createAllHieroPhrases(sentence, startT, endT, startS, endS, copyHoleColl); + + // recursively search for next hole + int nextInitStartT = g_global->nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2; + addHieroRule(sentence, startT, endT, startS, endS + , ruleExist, copyHoleColl, numHoles + 1, nextInitStartT + , wordCountT - (endHoleT-startHoleT+1) + , wordCountS - (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1)); + } + } + } +} + +void addRule( SentenceAlignment &sentence, int startT, int endT, int startS, int endS + , RuleExist &ruleExist) +{ + // source + //cerr << "adding ( " << startS << "-" << endS << ", " << startT << "-" << endT << ")\n"; + + if (g_global->onlyOutputSpanInfo) { + cout << startS << " " << endS << " " << startT << " " << endT << endl; + return; + } + phraseCount++; + + ExtractedRule rule(startT, endT, startS, endS); + + // phrase labels + string targetLabel,sourceLabel; + if (g_global->hierarchicalFlag) { + sourceLabel = g_global->sourceSyntax ? + sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X"; + // sourceLabel = "X"; + targetLabel = g_global->targetSyntax ? + sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X"; + } + + // source + rule.source = ""; + for(int si=startS;si<=endS;si++) + rule.source += " " + sentence.source[si]; + if (g_global->hierarchicalFlag) + rule.source += " [" + sourceLabel + "]"; + rule.source = rule.source.substr(1); + + // target + rule.target = ""; + for(int ti=startT;ti<=endT;ti++) + rule.target += " " + sentence.target[ti]; + if (g_global->hierarchicalFlag) + rule.target += " [" + targetLabel + "]"; + rule.target = rule.target.substr(1); + + // alignment + for(int ti=startT;ti<=endT;ti++) + { + for(int i=0;i<sentence.alignedToT[ti].size();i++) + { + int si = sentence.alignedToT[ti][i]; + rule.alignment += " " + IntToString(si-startS) + "-" + IntToString(ti-startT); + if (!g_global->onlyDirectFlag) + rule.alignmentInv += " " + IntToString(ti-startT) + "-" + IntToString(si-startS); + } + } + rule.alignment = rule.alignment.substr(1); + if (!g_global->onlyDirectFlag) + rule.alignmentInv = rule.alignmentInv.substr(1); + + // orientation + if (g_global->orientationFlag) { + // orientation to previous E + bool connectedLeftTop = isAligned( sentence, startS-1, startT-1 ); + bool connectedRightTop = isAligned( sentence, endS+1, startT-1 ); + if ( connectedLeftTop && !connectedRightTop) + rule.orientation = "mono"; + else if (!connectedLeftTop && connectedRightTop) + rule.orientation = "swap"; + else + rule.orientation = "other"; + + // orientation to following E + bool connectedLeftBottom = isAligned( sentence, startS-1, endT+1 ); + bool connectedRightBottom = isAligned( sentence, endS+1, endT+1 ); + if ( connectedLeftBottom && !connectedRightBottom) + rule.orientationForward = "swap"; + else if (!connectedLeftBottom && connectedRightBottom) + rule.orientationForward = "mono"; + else + rule.orientationForward = "other"; + } + addRuleToCollection( rule ); +} + +bool isAligned ( SentenceAlignment &sentence, int si, int ti ) { + if (ti == -1 && si == -1) return true; + if (ti <= -1 || si <= -1) return false; + if (ti == sentence.target.size() && si == sentence.source.size()) return true; + if (ti >= sentence.target.size() || si >= sentence.source.size()) return false; + for(int i=0;i<sentence.alignedToT[ti].size();i++) + if (sentence.alignedToT[ti][i] == si) return true; + return false; +} + +void addRuleToCollection( ExtractedRule &newRule ) { + //cerr << "add ( " << newRule.source << " , " << newRule.target << " )\n"; + + // no double-counting of identical rules from overlapping spans + if (!g_global->duplicateRules) + { + vector<ExtractedRule>::const_iterator rule; + for(rule = extractedRules.begin(); rule != extractedRules.end(); rule++ ) + { + if (rule->source.compare( newRule.source ) == 0 && + rule->target.compare( newRule.target ) == 0 && + !(rule->endT < newRule.startT || rule->startT > newRule.endT)) // overlapping + { + return; + } + } + } + extractedRules.push_back( newRule ); +} + +void consolidateRules() +{ + typedef vector<ExtractedRule>::iterator R; + map<int, map<int, map<int, map<int,int> > > > spanCount; + + // compute number of rules per span + if (g_global->fractionalCounting) + { + for(R rule = extractedRules.begin(); rule != extractedRules.end(); rule++ ) + { + spanCount[ rule->startT ][ rule->endT ][ rule->startS ][ rule->endS ]++; + } + } + + // compute fractional counts + for(R rule = extractedRules.begin(); rule != extractedRules.end(); rule++ ) + { + rule->count = 1.0/(float) (g_global->fractionalCounting ? spanCount[ rule->startT ][ rule->endT ][ rule->startS ][ rule->endS ] : 1.0 ); + } + + // consolidate counts + for(R rule = extractedRules.begin(); rule != extractedRules.end(); rule++ ) + { + if (rule->count == 0) + continue; + for(R r2 = rule+1; r2 != extractedRules.end(); r2++ ) + { + if (rule->source.compare( r2->source ) == 0 && + rule->target.compare( r2->target ) == 0 && + rule->alignment.compare( r2->alignment ) == 0) + { + rule->count += r2->count; + r2->count = 0; + } + } + } +} + +void writeRulesToFile() { + vector<ExtractedRule>::const_iterator ruleIter; + for(ruleIter = extractedRules.begin(); ruleIter != extractedRules.end(); ruleIter++ ) + { + const ExtractedRule rule = *ruleIter; + //cerr << rule.source << endl; + + if (rule.count == 0) + continue; + if (!rule.allowable) + continue; + + extractFile << rule.source << " ||| " + << rule.target << " ||| " + << rule.alignment << " ||| " + << rule.count << "\n"; + + if (!g_global->onlyDirectFlag) + extractFileInv << rule.target << " ||| " + << rule.source << " ||| " + << rule.alignmentInv << " ||| " + << rule.count << "\n"; + + if (g_global->orientationFlag) + extractFileOrientation << rule.source << " ||| " + << rule.target << " ||| " + << rule.orientation << " " << rule.orientationForward << "\n"; + } +} + +void writeGlueGrammar( string fileName ) +{ + ofstream grammarFile; + grammarFile.open(fileName.c_str()); + if (!g_global->targetSyntax) + { + grammarFile << "[X] [S] ||| <s> ||| <s> ||| ||| 1" << endl + << "[X] [S] ||| [X] </s> ||| [S] </s> ||| 0-0 ||| 1" << endl + << "[X] [S] ||| [X] [X] ||| [S] [X] ||| 0-0 1-1 ||| 2.718" << endl; + } + else + { + // chose a top label that is not already a label + string topLabel = "QQQQQQ"; + for( int i=1;i<=topLabel.length(); i++) + { + if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) + { + topLabel = topLabel.substr(0,i); + break; + } + } + // basic rules + grammarFile << "[X] [" << topLabel << "] ||| <s> ||| <s> ||| ||| 1" << endl + << "[X] [" << topLabel << "] ||| [X] </s> ||| [" << topLabel << "] </s> ||| 0-0 ||| 1" << endl; + + // top rules + for( map<string,int>::const_iterator i = targetTopLabelCollection.begin(); + i != targetTopLabelCollection.end(); i++ ) + { + grammarFile << "[X] [" << topLabel << "] ||| <s> [X] </s> ||| <s> [" << i->first << "] </s> ||| 1-1 ||| 1" << endl; + } + + // glue rules + for( set<string>::const_iterator i = targetLabelCollection.begin(); + i != targetLabelCollection.end(); i++ ) + { + grammarFile << "[X] [" << topLabel << "] ||| [X] [X] ||| [" << topLabel << "] [" << *i << "] ||| 0-0 1-1 ||| 2.718" << endl; + } + grammarFile << "[X] [" << topLabel << "] ||| [X] [X] ||| [" << topLabel << "] [X] ||| 0-0 1-1 ||| 2.718" << endl; // glue rule for unknown word... + } + grammarFile.close(); +} + +// collect counts for labels for each word +// ( labels of singleton words are used to estimate +// distribution oflabels for unknown words ) + +map<string,int> wordCount; +map<string,string> wordLabel; +void collectWordLabelCounts( SentenceAlignment &sentence ) +{ + int countT = sentence.target.size(); + for(int ti=0; ti < countT; ti++) + { + string &word = sentence.target[ ti ]; + const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti); + if (labels.size() > 0) + { + wordCount[ word ]++; + wordLabel[ word ] = labels[0]->GetLabel(); + } + } +} + +void writeUnknownWordLabel( string fileName ) +{ + ofstream outFile; + outFile.open(fileName.c_str()); + typedef map<string,int>::const_iterator I; + + map<string,int> count; + int total = 0; + for(I word = wordCount.begin(); word != wordCount.end(); word++) + { + // only consider singletons + if (word->second == 1) + { + count[ wordLabel[ word->first ] ]++; + total++; + } + } + + for(I pos = count.begin(); pos != count.end(); pos++) + { + double ratio = ((double) pos->second / (double) total); + if (ratio > 0.03) + outFile << pos->first << " " << ratio << endl; + } + + outFile.close(); +} |