Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/training/phrase-extract/extract.cpp')
-rw-r--r--scripts/training/phrase-extract/extract.cpp1116
1 files changed, 1116 insertions, 0 deletions
diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp
new file mode 100644
index 000000000..374b521de
--- /dev/null
+++ b/scripts/training/phrase-extract/extract.cpp
@@ -0,0 +1,1116 @@
+// $Id: extract.cpp 2828 2010-02-01 16:07:58Z hieuhoang1972 $
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <cstdio>
+#include <stdlib.h>
+#include <assert.h>
+#include <time.h>
+#include <cstring>
+#include <sstream>
+#include "extract.h"
+#include "InputFileStream.h"
+
+#ifdef WIN32
+// Include Visual Leak Detector
+#include <vld.h>
+#endif
+
+using namespace std;
+
+int main(int argc, char* argv[])
+{
+ cerr << "Extract v2.0, written by Philipp Koehn\n"
+ << "rule extraction from an aligned parallel corpus\n";
+ time_t starttime = time(NULL);
+
+ Global *global = new Global();
+ g_global = global;
+
+ if (argc < 5) {
+ cerr << "syntax: extract corpus.target corpus.source corpus.align extract "
+ << " [ --Hierarchical | --Orientation"
+ << " | --GlueGrammar FILE | --UnknownWordLabel FILE"
+ << " | --OnlyDirect"
+ << " | --MaxSpan[" << global->maxSpan << "]"
+ << " | --MinHoleTarget[" << global->minHoleTarget << "]"
+ << " | --MinHoleSource[" << global->minHoleSource << "]"
+ << " | --MinWords[" << global->minWords << "]"
+ << " | --MaxSymbolsTarget[" << global->maxSymbolsTarget << "]"
+ << " | --MaxSymbolsSource[" << global->maxSymbolsSource << "]"
+ << " | --MaxNonTerm[" << global->maxNonTerm << "]"
+ << " | --SourceSyntax | --TargetSyntax"
+ << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting ]\n";
+ exit(1);
+ }
+ char* &fileNameT = argv[1];
+ char* &fileNameS = argv[2];
+ char* &fileNameA = argv[3];
+ string fileNameGlueGrammar;
+ string fileNameUnknownWordLabel;
+ string fileNameExtract = string(argv[4]);
+
+ int optionInd = 5;
+
+ for(int i=optionInd;i<argc;i++) {
+ // maximum span length
+ if (strcmp(argv[i],"--MaxSpan") == 0) {
+ global->maxSpan = atoi(argv[++i]);
+ if (global->maxSpan < 1) {
+ cerr << "extract error: --maxSpan should be at least 1" << endl;
+ exit(1);
+ }
+ }
+ else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
+ global->minHoleTarget = atoi(argv[++i]);
+ if (global->minHoleTarget < 1) {
+ cerr << "extract error: --minHoleTarget should be at least 1" << endl;
+ exit(1);
+ }
+ }
+ else if (strcmp(argv[i],"--MinHoleSource") == 0) {
+ global->minHoleSource = atoi(argv[++i]);
+ if (global->minHoleSource < 1) {
+ cerr << "extract error: --minHoleSource should be at least 1" << endl;
+ exit(1);
+ }
+ }
+ // maximum number of words in hierarchical phrase
+ else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
+ global->maxSymbolsTarget = atoi(argv[++i]);
+ if (global->maxSymbolsTarget < 1) {
+ cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
+ exit(1);
+ }
+ }
+ // maximum number of words in hierarchical phrase
+ else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
+ global->maxSymbolsSource = atoi(argv[++i]);
+ if (global->maxSymbolsSource < 1) {
+ cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
+ exit(1);
+ }
+ }
+ // minimum number of words in hierarchical phrase
+ else if (strcmp(argv[i],"--MinWords") == 0) {
+ global->minWords = atoi(argv[++i]);
+ if (global->minWords < 0) {
+ cerr << "extract error: --MinWords should be at least 0" << endl;
+ exit(1);
+ }
+ }
+ // maximum number of non-terminals
+ else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
+ global->maxNonTerm = atoi(argv[++i]);
+ if (global->maxNonTerm < 1) {
+ cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
+ exit(1);
+ }
+ }
+ // allow consecutive non-terminals (X Y | X Y)
+ else if (strcmp(argv[i],"--TargetSyntax") == 0) {
+ global->targetSyntax = true;
+ }
+ else if (strcmp(argv[i],"--SourceSyntax") == 0) {
+ global->sourceSyntax = true;
+ }
+ else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
+ global->requireAlignedWord = false;
+ }
+ else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
+ global->nonTermConsecTarget = false;
+ }
+ else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
+ global->nonTermConsecSource = true;
+ }
+ else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
+ global->nonTermFirstWord = false;
+ }
+ else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
+ global->onlyOutputSpanInfo = true;
+ }
+ // do not create many part00xx files!
+ else if (strcmp(argv[i],"--NoFileLimit") == 0) {
+ // now default
+ }
+ else if (strcmp(argv[i],"--OnlyDirect") == 0) {
+ global->onlyDirectFlag = true;
+ }
+ else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
+ global->orientationFlag = true;
+ }
+ else if (strcmp(argv[i],"--GlueGrammar") == 0) {
+ global->glueGrammarFlag = true;
+ if (++i >= argc)
+ {
+ cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
+ exit(0);
+ }
+ fileNameGlueGrammar = string(argv[i]);
+ cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
+ }
+ else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
+ global->unknownWordLabelFlag = true;
+ if (++i >= argc)
+ {
+ cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
+ exit(0);
+ }
+ fileNameUnknownWordLabel = string(argv[i]);
+ cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
+ }
+ else if (strcmp(argv[i],"--Hierarchical") == 0) {
+ cerr << "extracting hierarchical rules" << endl;
+ global->hierarchicalFlag = true;
+ }
+ // TODO: this should be a useful option
+ //else if (strcmp(argv[i],"--ZipFiles") == 0) {
+ // zipFiles = true;
+ //}
+ // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
+ else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
+ global->fractionalCounting = false;
+ }
+ else if (strcmp(argv[i],"--Mixed") == 0) {
+ global->mixed = true;
+ }
+ else {
+ cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
+ exit(1);
+ }
+ }
+
+ // open input files
+ Moses::InputFileStream tFile(fileNameT);
+ Moses::InputFileStream sFile(fileNameS);
+ Moses::InputFileStream aFile(fileNameA);
+
+ istream *tFileP = &tFile;
+ istream *sFileP = &sFile;
+ istream *aFileP = &aFile;
+
+ // open output files
+ string fileNameExtractInv = fileNameExtract + ".inv";
+ string fileNameExtractOrientation = fileNameExtract + ".o";
+ extractFile.open(fileNameExtract.c_str());
+ if (!global->onlyDirectFlag)
+ extractFileInv.open(fileNameExtractInv.c_str());
+ if (global->orientationFlag)
+ extractFileOrientation.open(fileNameExtractOrientation.c_str());
+
+ // loop through all sentence pairs
+ int i=0;
+ while(true) {
+ i++;
+ //if (i%1000 == 0) cerr << "." << flush;
+ //if (i%10000 == 0) cerr << ":" << flush;
+ //if (i%100000 == 0) cerr << "!" << flush;
+ cerr << i << " ";
+
+ char targetString[LINE_MAX_LENGTH];
+ char sourceString[LINE_MAX_LENGTH];
+ char alignmentString[LINE_MAX_LENGTH];
+ SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n');
+ if (tFileP->eof()) break;
+ SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n');
+ SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n');
+ SentenceAlignment sentence;
+ //cerr << "read in: " << targetString << " & " << sourceString << " & " << alignmentString << endl;
+ //az: output src, tgt, and alingment line
+ if (global->onlyOutputSpanInfo) {
+ cout << "LOG: SRC: " << sourceString << endl;
+ cout << "LOG: TGT: " << targetString << endl;
+ cout << "LOG: ALT: " << alignmentString << endl;
+ cout << "LOG: PHRASES_BEGIN:" << endl;
+ }
+
+ if (sentence.create( targetString, sourceString, alignmentString, i, *global ))
+ {
+ if (global->unknownWordLabelFlag)
+ {
+ collectWordLabelCounts(sentence);
+ }
+ extractRules(sentence);
+ consolidateRules();
+ writeRulesToFile();
+ extractedRules.clear();
+ }
+ if (global->onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+ }
+
+ tFile.Close();
+ sFile.Close();
+ aFile.Close();
+ // only close if we actually opened it
+ if (!global->onlyOutputSpanInfo) {
+ extractFile.close();
+ if (!global->onlyDirectFlag) extractFileInv.close();
+ if (global->orientationFlag) extractFileOrientation.close();
+ }
+
+ if (global->glueGrammarFlag)
+ writeGlueGrammar(fileNameGlueGrammar);
+
+ if (global->unknownWordLabelFlag)
+ writeUnknownWordLabel(fileNameUnknownWordLabel);
+}
+
+void extractRules( SentenceAlignment &sentence ) {
+ int countT = sentence.target.size();
+ int countS = sentence.source.size();
+
+ // phrase repository for creating hiero phrases
+ RuleExist ruleExist(countT);
+
+ // check alignments for target phrase startT...endT
+ for(int lengthT=1;
+ lengthT <= g_global->maxSpan && lengthT <= countT;
+ lengthT++) {
+ for(int startT=0; startT < countT-(lengthT-1); startT++) {
+
+ // that's nice to have
+ int endT = startT + lengthT - 1;
+
+ // if there is target side syntax, there has to be a node
+ if (g_global->targetSyntax && !sentence.targetTree.HasNode(startT,endT))
+ continue;
+
+ // find find aligned source words
+ // first: find minimum and maximum source word
+ int minS = 9999;
+ int maxS = -1;
+ vector< int > usedS = sentence.alignedCountS;
+ for(int ti=startT;ti<=endT;ti++) {
+ for(int i=0;i<sentence.alignedToT[ti].size();i++) {
+ int si = sentence.alignedToT[ti][i];
+ // cerr << "point (" << si << ", " << ti << ")\n";
+ if (si<minS) { minS = si; }
+ if (si>maxS) { maxS = si; }
+ usedS[ si ]--;
+ }
+ }
+
+ // unaligned phrases are not allowed
+ if( maxS == -1 )
+ continue;
+
+ // source phrase has to be within limits
+ if( maxS-minS >= g_global->maxSpan )
+ continue;
+
+ // check if source words are aligned to out of bound target words
+ bool out_of_bounds = false;
+ for(int si=minS;si<=maxS && !out_of_bounds;si++)
+ if (usedS[si]>0) {
+ out_of_bounds = true;
+ }
+
+ // if out of bound, you gotta go
+ if (out_of_bounds)
+ continue;
+
+ // done with all the checks, lets go over all consistent phrase pairs
+ // start point of source phrase may retreat over unaligned
+ for(int startS=minS;
+ (startS>=0 &&
+ startS>maxS - g_global->maxSpan && // within length limit
+ (startS==minS || sentence.alignedCountS[startS]==0)); // unaligned
+ startS--)
+ {
+ // end point of source phrase may advance over unaligned
+ for(int endS=maxS;
+ (endS<countS && endS<startS + g_global->maxSpan && // within length limit
+ (endS==maxS || sentence.alignedCountS[endS]==0)); // unaligned
+ endS++)
+ {
+ // if there is source side syntax, there has to be a node
+ if (g_global->sourceSyntax && !sentence.sourceTree.HasNode(startS,endS))
+ continue;
+
+ // TODO: loop over all source and target syntax labels
+
+ // if within length limits, add as fully-lexical phrase pair
+ if (endT-startT < g_global->maxSymbolsTarget && endS-startS < g_global->maxSymbolsSource)
+ {
+ addRule(sentence,startT,endT,startS,endS, ruleExist);
+ }
+
+ // take note that this is a valid phrase alignment
+ ruleExist.Add(startT, endT, startS, endS);
+
+ // extract hierarchical rules
+ if (g_global->hierarchicalFlag)
+ {
+ // are rules not allowed to start non-terminals?
+ int initStartT = g_global->nonTermFirstWord ? startT : startT + 1;
+
+ HoleCollection holeColl; // empty hole collection
+ addHieroRule(sentence, startT, endT, startS, endS,
+ ruleExist, holeColl, 0, initStartT,
+ endT-startT+1, endS-startS+1);
+ }
+ }
+ }
+ }
+ }
+}
+
+void preprocessSourceHieroPhrase( SentenceAlignment &sentence
+ , int startT, int endT, int startS, int endS
+ , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex)
+{
+ vector<Hole*>::const_iterator iterHoleList = holeColl.GetSortedSourceHoles().begin();
+ assert(iterHoleList != holeColl.GetSortedSourceHoles().end());
+
+ int outPos = 0;
+ int holeCount = 0;
+ int holeTotal = holeColl.GetHoles().size();
+ for(int currPos = startS; currPos <= endS; currPos++)
+ {
+ bool isHole = false;
+ if (iterHoleList != holeColl.GetSortedSourceHoles().end())
+ {
+ const Hole &hole = **iterHoleList;
+ isHole = hole.GetStart(0) == currPos;
+ }
+
+ if (isHole)
+ {
+ Hole &hole = **iterHoleList;
+
+ int labelI = labelIndex[ 2+holeCount+holeTotal ];
+ string label = g_global->sourceSyntax ?
+ sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
+ hole.SetLabel(label, 0);
+
+ currPos = hole.GetEnd(0);
+ hole.SetPos(outPos, 0);
+ ++iterHoleList;
+ ++holeCount;
+ }
+ else
+ {
+ indexS[currPos] = outPos;
+ }
+
+ outPos++;
+ }
+
+ assert(iterHoleList == holeColl.GetSortedSourceHoles().end());
+}
+
+bool postprocessSourceHieroPhrase( const SentenceAlignment &sentence
+ , int startT, int endT, int startS, int endS
+ , const WordIndex &indexS, const HoleCollection &holeColl, const LabelIndex &labelIndex)
+{ // make sure that X aren't by the edges, unless you're doing hiero extraction
+ if (!g_global->mixed)
+ return true;
+
+ vector<Hole*>::const_iterator iterHoleList = holeColl.GetSortedSourceHoles().begin();
+ assert(iterHoleList != holeColl.GetSortedSourceHoles().end());
+
+ int outPos = 0;
+ int holeCount = 0;
+ const Hole *prevHole = NULL;
+ for(int currPos = startS; currPos <= endS; currPos++)
+ {
+ bool isHole = false;
+ if (iterHoleList != holeColl.GetSortedSourceHoles().end())
+ {
+ const Hole &hole = **iterHoleList;
+ isHole = hole.GetStart(0) == currPos;
+ }
+
+ if (isHole)
+ { // check that X not on the perimeters, and X not adjacent
+ const Hole &hole = **iterHoleList;
+
+ if (!hole.IsSyntax())
+ {
+ if (prevHole && !prevHole->IsSyntax())
+ { // adjacent [X] [X]
+ return false;
+ }
+ /*
+ if (hole.GetStart(0) == startS || hole.GetEnd(0) == endS)
+ { // on the perimeters
+ return false;
+ }
+ */
+ }
+
+ currPos = hole.GetEnd(0);
+
+ ++iterHoleList;
+ ++holeCount;
+ prevHole = &hole;
+ }
+ else
+ { // not a hole, do nothing
+ prevHole = NULL;
+ }
+
+ outPos++;
+ }
+
+ assert(iterHoleList == holeColl.GetSortedSourceHoles().end());
+
+ return true;
+}
+
+string createTargetHieroPhrase(SentenceAlignment &sentence
+ , int startT, int endT, int startS, int endS
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex)
+{
+ HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
+ assert(iterHoleList != holeColl.GetHoles().end());
+
+ string out = "";
+ int outPos = 0;
+ int holeCount = 0;
+ for(int currPos = startT; currPos <= endT; currPos++)
+ {
+ bool isHole = false;
+ if (iterHoleList != holeColl.GetHoles().end())
+ {
+ const Hole &hole = *iterHoleList;
+ isHole = hole.GetStart(1) == currPos;
+ }
+
+ if (isHole)
+ {
+ Hole &hole = *iterHoleList;
+
+ const string &sourceLabel = hole.GetLabel(0);
+ assert(sourceLabel != "");
+
+ int labelI = labelIndex[ 2+holeCount ];
+ string targetLabel = g_global->targetSyntax ?
+ sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
+ hole.SetLabel(targetLabel, 1);
+
+ out += " [" + sourceLabel + "][" + targetLabel + "]";
+
+ currPos = hole.GetEnd(1);
+ hole.SetPos(outPos, 1);
+ ++iterHoleList;
+ holeCount++;
+ }
+ else
+ {
+ indexT[currPos] = outPos;
+ out += " " + sentence.target[currPos];
+ }
+
+ outPos++;
+ }
+
+ assert(iterHoleList == holeColl.GetHoles().end());
+ return out.substr(1);
+}
+
+
+string createSourceHieroPhrase( SentenceAlignment &sentence
+ , int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, const LabelIndex &labelIndex)
+{
+ vector<Hole*>::const_iterator iterHoleList = holeColl.GetSortedSourceHoles().begin();
+ assert(iterHoleList != holeColl.GetSortedSourceHoles().end());
+
+ string out = "";
+ int outPos = 0;
+ int holeCount = 0;
+ for(int currPos = startS; currPos <= endS; currPos++)
+ {
+ bool isHole = false;
+ if (iterHoleList != holeColl.GetSortedSourceHoles().end())
+ {
+ const Hole &hole = **iterHoleList;
+ isHole = hole.GetStart(0) == currPos;
+ }
+
+ if (isHole)
+ {
+ Hole &hole = **iterHoleList;
+
+ const string &targetLabel = hole.GetLabel(1);
+ assert(targetLabel != "");
+
+ const string &sourceLabel = hole.GetLabel(0);
+ out += " [" + sourceLabel + "][" + targetLabel + "]";
+
+ currPos = hole.GetEnd(0);
+ hole.SetPos(outPos, 0);
+ ++iterHoleList;
+ ++holeCount;
+ }
+ else
+ {
+ out += " " + sentence.source[currPos];
+ }
+
+ outPos++;
+ }
+
+ assert(iterHoleList == holeColl.GetSortedSourceHoles().end());
+ return out.substr(1);
+}
+
+void createHieroAlignment(SentenceAlignment &sentence
+ , int startT, int endT, int startS, int endS
+ , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule)
+{
+ // print alignment of words
+ for(int ti=startT;ti<=endT;ti++) {
+ if (indexT.find(ti) != indexT.end()) { // does word still exist?
+ for(int i=0;i<sentence.alignedToT[ti].size();i++) {
+ int si = sentence.alignedToT[ti][i];
+ rule.alignment += " " + IntToString(indexS.find(si)->second) + "-" + IntToString(indexT.find(ti)->second);
+ if (! g_global->onlyDirectFlag)
+ rule.alignmentInv += " " + IntToString(indexT.find(ti)->second) + "-" + IntToString(indexS.find(si)->second);
+ }
+ }
+ }
+
+ // print alignment of non terminals
+ HoleList::const_iterator iterHole;
+ for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole)
+ {
+ rule.alignment += " " + IntToString(iterHole->GetPos(0)) + "-" + IntToString(iterHole->GetPos(1));
+ if (!g_global->onlyDirectFlag)
+ rule.alignmentInv += " " + IntToString(iterHole->GetPos(1)) + "-" + IntToString(iterHole->GetPos(0));
+ }
+
+ rule.alignment = rule.alignment.substr(1);
+ if (!g_global->onlyDirectFlag)
+ rule.alignmentInv = rule.alignmentInv.substr(1);
+}
+
+void createHieroPhrase( SentenceAlignment &sentence, int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, LabelIndex &labelIndex)
+{
+ phraseCount++;
+ WordIndex indexS, indexT; // to keep track of word positions in rule
+
+ ExtractedRule rule( startT, endT, startS, endS );
+
+ // phrase labels
+ string targetLabel = g_global->targetSyntax ?
+ sentence.targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
+ string sourceLabel = g_global->sourceSyntax ?
+ sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
+ //string sourceLabel = "X";
+
+ // create non-terms on the source side
+ preprocessSourceHieroPhrase(sentence, startT, endT, startS, endS, indexS, holeColl, labelIndex);
+
+ // target
+ rule.target = createTargetHieroPhrase(sentence, startT, endT, startS, endS, indexT, holeColl, labelIndex)
+ + " [" + targetLabel + "]";
+
+ // source
+ // holeColl.SortSourceHoles();
+ rule.source = createSourceHieroPhrase(sentence, startT, endT, startS, endS, holeColl, labelIndex)
+ + " [" + sourceLabel + "]";
+
+ bool allowable = postprocessSourceHieroPhrase(sentence, startT, endT, startS, endS, indexS, holeColl, labelIndex);
+ rule.allowable = allowable;
+
+ // alignment
+ createHieroAlignment(sentence, startT, endT, startS, endS, indexS, indexT, holeColl, rule);
+
+// cerr << "\tLABEL RULE " << rule.source << " ||| " << rule.target << endl;
+
+ addRuleToCollection( rule );
+}
+
+void createAllHieroPhrases( SentenceAlignment &sentence
+ , int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl)
+{
+ LabelIndex labelIndex,labelCount;
+
+ // number of target head labels
+ int numLabels = g_global->targetSyntax ? sentence.targetTree.GetNodes(startT,endT).size() : 1;
+ labelCount.push_back(numLabels);
+ labelIndex.push_back(0);
+
+ // number of source head labels
+ numLabels = g_global->sourceSyntax ? sentence.sourceTree.GetNodes(startS,endS).size() : 1;
+ labelCount.push_back(numLabels);
+ labelIndex.push_back(0);
+
+ // number of target hole labels
+ for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
+ hole != holeColl.GetHoles().end(); hole++ )
+ {
+ int numLabels = g_global->targetSyntax ? sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
+ labelCount.push_back(numLabels);
+ labelIndex.push_back(0);
+ }
+
+ // number of source hole labels
+ holeColl.SortSourceHoles();
+ for( vector<Hole*>::const_iterator i = holeColl.GetSortedSourceHoles().begin();
+ i != holeColl.GetSortedSourceHoles().end(); i++ )
+ {
+ const Hole &hole = **i;
+ int numLabels = g_global->sourceSyntax ? sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
+ labelCount.push_back(numLabels);
+ labelIndex.push_back(0);
+ }
+
+ // cerr << "LABEL COUNT: ";
+// for(int i=0;i<labelCount.size();i++)
+// {
+// cerr << " " << labelCount[i];
+// }
+// cerr << endl;
+
+ // loop through the holes
+ bool done = false;
+ while(!done)
+ {
+// cerr << "\tLABEL INDEX: ";
+// for(int i=0;i<labelIndex.size();i++)
+// {
+// cerr << " " << labelIndex[i];
+// }
+// cerr << endl;
+ createHieroPhrase( sentence, startT, endT, startS, endS, holeColl, labelIndex );
+ for(int i=0; i<labelIndex.size(); i++)
+ {
+ labelIndex[i]++;
+ if(labelIndex[i] == labelCount[i])
+ {
+ labelIndex[i] = 0;
+ if (i == labelIndex.size()-1)
+ done = true;
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+}
+
+
+// this function is called recursively
+// it pokes a new hole into the phrase pair, and then calls itself for more holes
+void addHieroRule( SentenceAlignment &sentence
+ , int startT, int endT, int startS, int endS
+ , RuleExist &ruleExist, const HoleCollection &holeColl
+ , int numHoles, int initStartT, int wordCountT, int wordCountS)
+{
+ // cerr << "phrase " << startT << "-" << endT << "=>" << startS << "-" << endS << " (" << numHoles << ")" << endl;
+ // done, if already the maximum number of non-terminals in phrase pair
+ if (numHoles >= g_global->maxNonTerm)
+ return;
+
+ // find a hole...
+ for (int startHoleT = initStartT; startHoleT <= endT; ++startHoleT)
+ {
+ //cerr << "phrase " << startT << "-" << endT << "=>" << startS << "-" << endS << " (" << numHoles << ")" << endl;
+ for (int endHoleT = startHoleT+(g_global->minHoleTarget-1); endHoleT <= endT; ++endHoleT)
+ {
+ //cerr << "considering from hole " << startHoleT << "-" << endHoleT << " from phrase " << startT << "-" << endT << "=>" << startS << "-" << endS << " (" << numHoles << ")" << endl;
+ // if last non-terminal, enforce word count limit
+ if (numHoles == g_global->maxNonTerm-1 && wordCountT - (endHoleT-startT+1) + (numHoles+1) > g_global->maxSymbolsTarget)
+ continue;
+
+ // always enforce min word count limit
+ if (wordCountT - (endHoleT-startHoleT+1) < g_global->minWords)
+ continue;
+
+ // except the whole span
+ if (startHoleT == startT && endHoleT == endT)
+ continue;
+
+ // does a phrase cover this target span?
+ // if it does, then there should be a list of mapped source phrases
+ // (multiple possible due to unaligned words)
+ const HoleList &sourceHoles = ruleExist.GetSourceHoles(startHoleT, endHoleT);
+
+ // loop over sub phrase pairs
+ HoleList::const_iterator iterSourceHoles;
+ for (iterSourceHoles = sourceHoles.begin(); iterSourceHoles != sourceHoles.end(); ++iterSourceHoles)
+ {
+ const Hole &sourceHole = *iterSourceHoles;
+
+ // cerr << "source" << sourceHole.GetStart() << "-" << sourceHole.GetEnd() << endl;
+
+ // enforce minimum hole size
+ if (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1 < g_global->minHoleSource)
+ continue;
+
+ // if last non-terminal, enforce word count limit
+ if (numHoles == g_global->maxNonTerm-1 &&
+ wordCountS - (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1) + (numHoles+1) > g_global->maxSymbolsSource)
+ continue;
+
+ // enforce min word count limit
+ if (wordCountS - (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1) < g_global->minWords)
+ continue;
+
+ // hole must be subphrase of the source phrase
+ // (may be violated if subphrase contains additional unaligned source word)
+ if (startS > sourceHole.GetStart(0) || endS < sourceHole.GetEnd(0))
+ continue;
+
+ // make sure target side does not overlap with another hole
+ if (holeColl.OverlapSource(sourceHole))
+ continue;
+
+ // if consecutive non-terminals are not allowed, also check for source
+ if (!g_global->nonTermConsecSource && holeColl.ConsecSource(sourceHole) )
+ continue;
+
+ // require that at least one aligned word is left
+ if (g_global->requireAlignedWord)
+ {
+ HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
+ bool foundAlignedWord = false;
+ // loop through all word positions
+ for(int pos = startT; pos <= endT && !foundAlignedWord; pos++)
+ {
+ // new hole? moving on...
+ if (pos == startHoleT)
+ {
+ pos = endHoleT;
+ }
+ // covered by hole? moving on...
+ else if (iterHoleList != holeColl.GetHoles().end() && iterHoleList->GetStart(1) == pos)
+ {
+ pos = iterHoleList->GetEnd(1);
+ ++iterHoleList;
+ }
+ // covered by word? check if it is aligned
+ else
+ {
+ if (sentence.alignedToT[pos].size() > 0)
+ foundAlignedWord = true;
+ }
+ }
+ if (!foundAlignedWord)
+ continue;
+ }
+
+ // update list of holes in this phrase pair
+ HoleCollection copyHoleColl(holeColl);
+ copyHoleColl.Add(startHoleT, endHoleT, sourceHole.GetStart(0), sourceHole.GetEnd(0));
+
+ // now some checks that disallow this phrase pair, but not further recursion
+ bool allowablePhrase = true;
+
+ // maximum words count violation?
+ if (wordCountS - (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1) + (numHoles+1) > g_global->maxSymbolsSource)
+ allowablePhrase = false;
+
+ if (wordCountT - (endHoleT-startHoleT+1) + (numHoles+1) > g_global->maxSymbolsTarget)
+ allowablePhrase = false;
+
+ // passed all checks...
+ // cerr << "adding hole " << startHoleT << "," << endHoleT << "," << sourceHole.GetStart() << "," << sourceHole.GetEnd() << endl;
+ if (allowablePhrase)
+ createAllHieroPhrases(sentence, startT, endT, startS, endS, copyHoleColl);
+
+ // recursively search for next hole
+ int nextInitStartT = g_global->nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
+ addHieroRule(sentence, startT, endT, startS, endS
+ , ruleExist, copyHoleColl, numHoles + 1, nextInitStartT
+ , wordCountT - (endHoleT-startHoleT+1)
+ , wordCountS - (sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1));
+ }
+ }
+ }
+}
+
+void addRule( SentenceAlignment &sentence, int startT, int endT, int startS, int endS
+ , RuleExist &ruleExist)
+{
+ // source
+ //cerr << "adding ( " << startS << "-" << endS << ", " << startT << "-" << endT << ")\n";
+
+ if (g_global->onlyOutputSpanInfo) {
+ cout << startS << " " << endS << " " << startT << " " << endT << endl;
+ return;
+ }
+ phraseCount++;
+
+ ExtractedRule rule(startT, endT, startS, endS);
+
+ // phrase labels
+ string targetLabel,sourceLabel;
+ if (g_global->hierarchicalFlag) {
+ sourceLabel = g_global->sourceSyntax ?
+ sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
+ // sourceLabel = "X";
+ targetLabel = g_global->targetSyntax ?
+ sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+ }
+
+ // source
+ rule.source = "";
+ for(int si=startS;si<=endS;si++)
+ rule.source += " " + sentence.source[si];
+ if (g_global->hierarchicalFlag)
+ rule.source += " [" + sourceLabel + "]";
+ rule.source = rule.source.substr(1);
+
+ // target
+ rule.target = "";
+ for(int ti=startT;ti<=endT;ti++)
+ rule.target += " " + sentence.target[ti];
+ if (g_global->hierarchicalFlag)
+ rule.target += " [" + targetLabel + "]";
+ rule.target = rule.target.substr(1);
+
+ // alignment
+ for(int ti=startT;ti<=endT;ti++)
+ {
+ for(int i=0;i<sentence.alignedToT[ti].size();i++)
+ {
+ int si = sentence.alignedToT[ti][i];
+ rule.alignment += " " + IntToString(si-startS) + "-" + IntToString(ti-startT);
+ if (!g_global->onlyDirectFlag)
+ rule.alignmentInv += " " + IntToString(ti-startT) + "-" + IntToString(si-startS);
+ }
+ }
+ rule.alignment = rule.alignment.substr(1);
+ if (!g_global->onlyDirectFlag)
+ rule.alignmentInv = rule.alignmentInv.substr(1);
+
+ // orientation
+ if (g_global->orientationFlag) {
+ // orientation to previous E
+ bool connectedLeftTop = isAligned( sentence, startS-1, startT-1 );
+ bool connectedRightTop = isAligned( sentence, endS+1, startT-1 );
+ if ( connectedLeftTop && !connectedRightTop)
+ rule.orientation = "mono";
+ else if (!connectedLeftTop && connectedRightTop)
+ rule.orientation = "swap";
+ else
+ rule.orientation = "other";
+
+ // orientation to following E
+ bool connectedLeftBottom = isAligned( sentence, startS-1, endT+1 );
+ bool connectedRightBottom = isAligned( sentence, endS+1, endT+1 );
+ if ( connectedLeftBottom && !connectedRightBottom)
+ rule.orientationForward = "swap";
+ else if (!connectedLeftBottom && connectedRightBottom)
+ rule.orientationForward = "mono";
+ else
+ rule.orientationForward = "other";
+ }
+ addRuleToCollection( rule );
+}
+
+bool isAligned ( SentenceAlignment &sentence, int si, int ti ) {
+ if (ti == -1 && si == -1) return true;
+ if (ti <= -1 || si <= -1) return false;
+ if (ti == sentence.target.size() && si == sentence.source.size()) return true;
+ if (ti >= sentence.target.size() || si >= sentence.source.size()) return false;
+ for(int i=0;i<sentence.alignedToT[ti].size();i++)
+ if (sentence.alignedToT[ti][i] == si) return true;
+ return false;
+}
+
+void addRuleToCollection( ExtractedRule &newRule ) {
+ //cerr << "add ( " << newRule.source << " , " << newRule.target << " )\n";
+
+ // no double-counting of identical rules from overlapping spans
+ if (!g_global->duplicateRules)
+ {
+ vector<ExtractedRule>::const_iterator rule;
+ for(rule = extractedRules.begin(); rule != extractedRules.end(); rule++ )
+ {
+ if (rule->source.compare( newRule.source ) == 0 &&
+ rule->target.compare( newRule.target ) == 0 &&
+ !(rule->endT < newRule.startT || rule->startT > newRule.endT)) // overlapping
+ {
+ return;
+ }
+ }
+ }
+ extractedRules.push_back( newRule );
+}
+
+void consolidateRules()
+{
+ typedef vector<ExtractedRule>::iterator R;
+ map<int, map<int, map<int, map<int,int> > > > spanCount;
+
+ // compute number of rules per span
+ if (g_global->fractionalCounting)
+ {
+ for(R rule = extractedRules.begin(); rule != extractedRules.end(); rule++ )
+ {
+ spanCount[ rule->startT ][ rule->endT ][ rule->startS ][ rule->endS ]++;
+ }
+ }
+
+ // compute fractional counts
+ for(R rule = extractedRules.begin(); rule != extractedRules.end(); rule++ )
+ {
+ rule->count = 1.0/(float) (g_global->fractionalCounting ? spanCount[ rule->startT ][ rule->endT ][ rule->startS ][ rule->endS ] : 1.0 );
+ }
+
+ // consolidate counts
+ for(R rule = extractedRules.begin(); rule != extractedRules.end(); rule++ )
+ {
+ if (rule->count == 0)
+ continue;
+ for(R r2 = rule+1; r2 != extractedRules.end(); r2++ )
+ {
+ if (rule->source.compare( r2->source ) == 0 &&
+ rule->target.compare( r2->target ) == 0 &&
+ rule->alignment.compare( r2->alignment ) == 0)
+ {
+ rule->count += r2->count;
+ r2->count = 0;
+ }
+ }
+ }
+}
+
+void writeRulesToFile() {
+ vector<ExtractedRule>::const_iterator ruleIter;
+ for(ruleIter = extractedRules.begin(); ruleIter != extractedRules.end(); ruleIter++ )
+ {
+ const ExtractedRule rule = *ruleIter;
+ //cerr << rule.source << endl;
+
+ if (rule.count == 0)
+ continue;
+ if (!rule.allowable)
+ continue;
+
+ extractFile << rule.source << " ||| "
+ << rule.target << " ||| "
+ << rule.alignment << " ||| "
+ << rule.count << "\n";
+
+ if (!g_global->onlyDirectFlag)
+ extractFileInv << rule.target << " ||| "
+ << rule.source << " ||| "
+ << rule.alignmentInv << " ||| "
+ << rule.count << "\n";
+
+ if (g_global->orientationFlag)
+ extractFileOrientation << rule.source << " ||| "
+ << rule.target << " ||| "
+ << rule.orientation << " " << rule.orientationForward << "\n";
+ }
+}
+
+void writeGlueGrammar( string fileName )
+{
+ ofstream grammarFile;
+ grammarFile.open(fileName.c_str());
+ if (!g_global->targetSyntax)
+ {
+ grammarFile << "[X] [S] ||| <s> ||| <s> ||| ||| 1" << endl
+ << "[X] [S] ||| [X] </s> ||| [S] </s> ||| 0-0 ||| 1" << endl
+ << "[X] [S] ||| [X] [X] ||| [S] [X] ||| 0-0 1-1 ||| 2.718" << endl;
+ }
+ else
+ {
+ // chose a top label that is not already a label
+ string topLabel = "QQQQQQ";
+ for( int i=1;i<=topLabel.length(); i++)
+ {
+ if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() )
+ {
+ topLabel = topLabel.substr(0,i);
+ break;
+ }
+ }
+ // basic rules
+ grammarFile << "[X] [" << topLabel << "] ||| <s> ||| <s> ||| ||| 1" << endl
+ << "[X] [" << topLabel << "] ||| [X] </s> ||| [" << topLabel << "] </s> ||| 0-0 ||| 1" << endl;
+
+ // top rules
+ for( map<string,int>::const_iterator i = targetTopLabelCollection.begin();
+ i != targetTopLabelCollection.end(); i++ )
+ {
+ grammarFile << "[X] [" << topLabel << "] ||| <s> [X] </s> ||| <s> [" << i->first << "] </s> ||| 1-1 ||| 1" << endl;
+ }
+
+ // glue rules
+ for( set<string>::const_iterator i = targetLabelCollection.begin();
+ i != targetLabelCollection.end(); i++ )
+ {
+ grammarFile << "[X] [" << topLabel << "] ||| [X] [X] ||| [" << topLabel << "] [" << *i << "] ||| 0-0 1-1 ||| 2.718" << endl;
+ }
+ grammarFile << "[X] [" << topLabel << "] ||| [X] [X] ||| [" << topLabel << "] [X] ||| 0-0 1-1 ||| 2.718" << endl; // glue rule for unknown word...
+ }
+ grammarFile.close();
+}
+
+// collect counts for labels for each word
+// ( labels of singleton words are used to estimate
+// distribution oflabels for unknown words )
+
+map<string,int> wordCount;
+map<string,string> wordLabel;
+void collectWordLabelCounts( SentenceAlignment &sentence )
+{
+ int countT = sentence.target.size();
+ for(int ti=0; ti < countT; ti++)
+ {
+ string &word = sentence.target[ ti ];
+ const vector< SyntaxNode* >& labels = sentence.targetTree.GetNodes(ti,ti);
+ if (labels.size() > 0)
+ {
+ wordCount[ word ]++;
+ wordLabel[ word ] = labels[0]->GetLabel();
+ }
+ }
+}
+
+void writeUnknownWordLabel( string fileName )
+{
+ ofstream outFile;
+ outFile.open(fileName.c_str());
+ typedef map<string,int>::const_iterator I;
+
+ map<string,int> count;
+ int total = 0;
+ for(I word = wordCount.begin(); word != wordCount.end(); word++)
+ {
+ // only consider singletons
+ if (word->second == 1)
+ {
+ count[ wordLabel[ word->first ] ]++;
+ total++;
+ }
+ }
+
+ for(I pos = count.begin(); pos != count.end(); pos++)
+ {
+ double ratio = ((double) pos->second / (double) total);
+ if (ratio > 0.03)
+ outFile << pos->first << " " << ratio << endl;
+ }
+
+ outFile.close();
+}