1 files changed, 237 insertions, 0 deletions
diff --git a/scripts/training/phrase-extract.6/extract.cpp b/scripts/training/phrase-extract.6/extract.cpp
new file mode 100644
index 000000000..4eddce6bf
--- /dev/null
+++ b/scripts/training/phrase-extract.6/extract.cpp
@@ -0,0 +1,237 @@
+// $Id: extract.cpp 2828 2010-02-01 16:07:58Z hieuhoang1972 $
+// vim:tabstop=2
+
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2009 University of Edinburgh
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+#include <cstdio>
+#include <stdlib.h>
+#include <assert.h>
+#include <time.h>
+#include <cstring>
+#include <sstream>
+#include "extract.h"
+#include "InputFileStream.h"
+#include "Lattice.h"
+
+#ifdef WIN32
+// Include Visual Leak Detector
+#include <vld.h>
+#endif
+
+using namespace std;
+
+int main(int argc, char* argv[]) 
+{
+  cerr << "Extract v2.0, written by Philipp Koehn\n"
+       << "rule extraction from an aligned parallel corpus\n";
+  //time_t starttime = time(NULL);
+	
+	Global *global = new Global();
+	g_global = global;
+
+		
+	if (argc < 5) {
+		cerr << "syntax: extract corpus.target corpus.source corpus.align extract "
+		     << " [ --Hierarchical | --Orientation"
+				 << " | --GlueGrammar FILE | --UnknownWordLabel FILE"
+				 << " | --OnlyDirect"
+					
+					<< " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
+					<< " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
+					<< " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
+					<< " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"
+
+				<< " | --MaxSymbolsSource[" << global->maxSymbolsSource << "]"
+				 << " | --MaxNonTerm[" << global->maxNonTerm << "]"
+		     << " | --SourceSyntax | --TargetSyntax" 
+				<<	" | --UppermostOnly[" << g_global->uppermostOnly << "]"
+				<< endl;
+		exit(1);
+	}
+  char* &fileNameT = argv[1];
+  char* &fileNameS = argv[2];
+  char* &fileNameA = argv[3];
+	string fileNameGlueGrammar;
+ 	string fileNameUnknownWordLabel;
+	string fileNameExtract = string(argv[4]);
+
+	int optionInd = 5;
+
+  for(int i=optionInd;i<argc;i++) 
+	{
+		if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
+			global->minHoleSpanSourceDefault = atoi(argv[++i]);
+			if (global->minHoleSpanSourceDefault < 1) {
+				cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl;
+				exit(1);
+			}
+		}
+		else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
+			global->maxHoleSpanSourceDefault = atoi(argv[++i]);
+			if (global->maxHoleSpanSourceDefault < 1) {
+				cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl;
+				exit(1);
+			}
+		}
+		else  if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
+			global->minHoleSpanSourceSyntax = atoi(argv[++i]);
+			if (global->minHoleSpanSourceSyntax < 1) {
+				cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl;
+				exit(1);
+			}
+		}
+		else if (strcmp(argv[i],"--UppermostOnly") == 0) {
+			global->uppermostOnly = atoi(argv[++i]);
+		}
+		else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
+			global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
+			if (global->maxHoleSpanSourceSyntax < 1) {
+				cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl;
+				exit(1);
+			}
+		}
+		
+		// maximum number of words in hierarchical phrase
+		else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
+			global->maxSymbolsSource = atoi(argv[++i]);
+			if (global->maxSymbolsSource < 1) {
+				cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
+				exit(1);
+			}
+		}
+		// maximum number of non-terminals
+		else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
+			global->maxNonTerm = atoi(argv[++i]);
+			if (global->maxNonTerm < 1) {
+				cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
+				exit(1);
+			}
+		}		
+		// allow consecutive non-terminals (X Y | X Y)
+    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
+      global->targetSyntax = true;
+    }
+    else if (strcmp(argv[i],"--SourceSyntax") == 0) {
+      global->sourceSyntax = true;
+    }
+		// do not create many part00xx files!
+    else if (strcmp(argv[i],"--NoFileLimit") == 0) {
+      // now default
+    }
+		else if (strcmp(argv[i],"--GlueGrammar") == 0) {
+			global->glueGrammarFlag = true;
+			if (++i >= argc)
+			{
+				cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
+				exit(0);
+			}
+			fileNameGlueGrammar = string(argv[i]);
+			cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
+    }
+		else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
+			global->unknownWordLabelFlag = true;
+			if (++i >= argc)
+			{
+				cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
+				exit(0);
+			}
+			fileNameUnknownWordLabel = string(argv[i]);
+			cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
+		}
+		// TODO: this should be a useful option
+    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
+    //  zipFiles = true;
+    //}
+		// if an source phrase is paired with two target phrases, then count(t|s) = 0.5
+    else if (strcmp(argv[i],"--Mixed") == 0) {
+			global->mixed = true;
+    }
+    else {
+      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
+      exit(1);
+    }
+  }
+
+	// open input files
+	Moses::InputFileStream tFile(fileNameT);
+	Moses::InputFileStream sFile(fileNameS);
+	Moses::InputFileStream aFile(fileNameA);
+
+	// open output files
+  string fileNameExtractInv = fileNameExtract + ".inv";
+  string fileNameExtractOrientation = fileNameExtract + ".o";
+  extractFile.open(fileNameExtract.c_str());
+  
+  
+	// loop through all sentence pairs
+  int i=0;
+  while(true) {
+    i++;
+
+    //if (i%1000 == 0) cerr << "." << flush;
+    //if (i%10000 == 0) cerr << ":" << flush;
+    //if (i%100000 == 0) cerr << "!" << flush;
+    string targetString;
+    string sourceString;
+    string alignmentString;
+		
+		bool ok = getline(tFile, targetString);
+		if (!ok)
+			break;
+		getline(sFile, sourceString);
+		getline(aFile, alignmentString);
+    
+		cerr << i << " ";
+
+		//cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;
+
+		//time_t currTime = time(NULL);
+		//cerr << "A " << (currTime - starttime) << endl;
+
+    SentenceAlignment sentencePair;
+    if (sentencePair.create( targetString, sourceString, alignmentString, i, *global )) 
+		{			
+			//cerr << sentence.sourceTree << endl;
+			//cerr << sentence.targetTree << endl;
+
+			sentencePair.FindAlignedHoles(*g_global);
+			//cerr << "C " << (time(NULL) - starttime) << endl;
+			
+			sentencePair.CreateLattice(*g_global);
+			//cerr << "D " << (time(NULL) - starttime) << endl;
+
+			sentencePair.CreateRules(*g_global);
+			//cerr << "E " << (time(NULL) - starttime) << endl;
+
+			//cerr << sentence.lattice->GetRules().GetSize() << endl;
+			extractFile << sentencePair.lattice->GetRules();
+    }
+
+  }
+	
+  tFile.Close();
+  sFile.Close();
+  aFile.Close();
+	
+	delete global;
+}
+ 
+
+