Merge remote branch 'github/master' into miramerge

Compiles, but not tested. Had to disable relent filter. Strangely, it seems to contain the whole of moses-cmd. Conflicts: Jamroot OnDiskPt/TargetPhrase.cpp moses-cmd/src/Main.cpp moses/src/AlignmentInfo.cpp moses/src/AlignmentInfo.h moses/src/ChartTranslationOptionCollection.cpp moses/src/ChartTranslationOptionCollection.h moses/src/GenerationDictionary.cpp moses/src/Jamfile moses/src/Parameter.cpp moses/src/PhraseDictionary.cpp moses/src/StaticData.cpp moses/src/StaticData.h moses/src/TargetPhrase.h moses/src/TranslationSystem.cpp moses/src/TranslationSystem.h moses/src/Word.cpp phrase-extract/score.cpp regression-testing/Jamfile scripts/ems/experiment.meta scripts/ems/experiment.perl scripts/training/train-model.perl
author: Barry Haddow <barry.haddow@gmail.com> 2012-09-27 01:49:33 +0400
committer: Barry Haddow <barry.haddow@gmail.com> 2012-09-27 01:49:33 +0400
commit: 0a950ee9f4227c8afbbe58d03a854745479ffbc0 (patch)
tree: 3e4515adc6b3323f8742ff5addde2f29da2002c8 /phrase-extract
parent: 1ce788e2b83dc9b359f6132e7e82774f9d0777b1 (diff)
parent: ab60d1ad6f93a78e80e665bc6c7d32b61b7c1c52 (diff)
17 files changed, 1010 insertions, 402 deletions
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
index d834674b8..e4f801089 100644
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@@ -2,6 +2,7 @@ obj InputFileStream.o : InputFileStream.cpp : <include>. ;
 alias InputFileStream : InputFileStream.o ..//z ;
 
 obj tables-core.o : tables-core.cpp : <include>. ;
+obj domain.o : domain.cpp : <include>. ;
 obj AlignmentPhrase.o : AlignmentPhrase.cpp : <include>. ;
 obj SentenceAlignment.o : SentenceAlignment.cpp : <include>. ;
 obj SyntaxTree.o : SyntaxTree.cpp : <include>. ;
@@ -10,13 +11,13 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
 alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
 alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
 
-exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
 
-exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
+exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
 
 exe extract-lex : extract-lex.cpp InputFileStream ;
 
-exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe score : tables-core.o domain.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
 
 exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
 
@@ -24,7 +25,7 @@ exe consolidate-direct : consolidate-direct.cpp OutputFileStream.cpp InputFileSt
 
 exe consolidate-reverse :  consolidate-reverse.cpp tables-core.o InputFileStream ;
 
-exe relax-parse : tables-core.o SyntaxTree.o XmlTree.o relax-parse.cpp ;
+exe relax-parse : tables-core.o SyntaxTree.o XmlTree.o relax-parse.cpp InputFileStream ;
 
 exe statistics : tables-core.o AlignmentPhrase.o statistics.cpp InputFileStream ;
 
diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp
index e432294b9..bdfead082 100644
--- a/phrase-extract/PhraseAlignment.cpp
+++ b/phrase-extract/PhraseAlignment.cpp
@@ -79,12 +79,11 @@ inline void Tokenize( std::vector<T> &output
 }
 
 // read in a phrase pair and store it
-void PhraseAlignment::create( char line[], int lineID )
+void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFlag )
 {
   assert(phraseS.empty());
   assert(phraseT.empty());
 
-  //cerr << "processing " << line;
   vector< string > token = tokenize( line );
   int item = 1;
   for (size_t j=0; j<token.size(); j++) {
@@ -111,12 +110,13 @@ void PhraseAlignment::create( char line[], int lineID )
         alignedToT[t].insert( s );
         alignedToS[s].insert( t );
       }
-    } else if (item == 4) { // count
+    } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
+      sscanf(token[j].c_str(), "%d", &sentenceId);
+    } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
       sscanf(token[j].c_str(), "%f", &count);
-    }
-    else if (item == 5) { // non-term lengths
+    } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // non-term lengths
       addNTLength(token[j]);
-    } else if (item == 6) { // target syntax PCFG score
+    } else if (item + (includeSentenceIdFlag?-1:0) == 6) { // target syntax PCFG score
       float pcfgScore = std::atof(token[j].c_str());
       pcfgSum = pcfgScore * count;
     }
@@ -124,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID )
 
   createAlignVec(phraseS.size(), phraseT.size());
 
-  if (item == 3) {
+  if (item + (includeSentenceIdFlag?-1:0) == 3) {
     count = 1.0;
   }
   if (item < 3 || item > 6) {
diff --git a/phrase-extract/PhraseAlignment.h b/phrase-extract/PhraseAlignment.h
index 9763b7a52..35afb314b 100644
--- a/phrase-extract/PhraseAlignment.h
+++ b/phrase-extract/PhraseAlignment.h
@@ -30,10 +30,13 @@ protected:
 public:
   float pcfgSum;
   float count;
+  int sentenceId;
+  std::string domain;
+
   std::vector< std::set<size_t> > alignedToT;
   std::vector< std::set<size_t> > alignedToS;
 
-  void create( char*, int );
+  void create( char*, int, bool );
   void clear();
   bool equals( const PhraseAlignment& );
   bool match( const PhraseAlignment& );
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h
new file mode 100644
index 000000000..eeec39750
--- /dev/null
+++ b/phrase-extract/PhraseExtractionOptions.h
@@ -0,0 +1,152 @@
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
+
+#pragma once
+#ifndef PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+#define PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+
+namespace MosesTraining
+{
+enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
+enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
+
+
+class PhraseExtractionOptions { // option bundle: one fixed maximum phrase length plus flags reached through init*/is* accessors
+  
+ public: 
+     const int maxPhraseLength; // set once by the constructor; the only public data member
+ private:
+  bool allModelsOutputFlag;
+  bool wordModel;
+  REO_MODEL_TYPE wordType;
+  bool phraseModel;
+  REO_MODEL_TYPE phraseType;
+  bool hierModel;
+  REO_MODEL_TYPE hierType;
+  bool orientationFlag;
+  bool translationFlag;
+  bool sentenceIdFlag; //create extract file with sentence id
+  bool includeSentenceIdFlag; //include sentence id in extract file
+  bool onlyOutputSpanInfo;
+  bool gzOutput;
+
+public:  
+  PhraseExtractionOptions(const int initmaxPhraseLength): // defaults: every flag false except translationFlag (true); every model type REO_MSD
+            maxPhraseLength(initmaxPhraseLength),
+            allModelsOutputFlag(false),
+            wordModel(false),
+            wordType(REO_MSD),
+            phraseModel(false),
+            phraseType(REO_MSD),
+            hierModel(false),
+            hierType(REO_MSD),
+            orientationFlag(false),
+            translationFlag(true),
+            sentenceIdFlag(false),
+            includeSentenceIdFlag(false),
+            onlyOutputSpanInfo(false),
+            gzOutput(false){}
+ 
+    // setters: each init* function overwrites the member named after it
+    void initAllModelsOutputFlag(const bool initallModelsOutputFlag){
+        allModelsOutputFlag=initallModelsOutputFlag;
+    }
+    void initWordModel(const bool initwordModel){
+        wordModel=initwordModel;
+    }
+    void initWordType(REO_MODEL_TYPE initwordType ){
+        wordType=initwordType; 
+    } 
+    void initPhraseModel(const bool initphraseModel ){
+        phraseModel=initphraseModel;  
+    } 
+    void initPhraseType(REO_MODEL_TYPE initphraseType){
+        phraseType=initphraseType;
+    }  
+    void initHierModel(const bool inithierModel){
+        hierModel=inithierModel;
+    }
+    void initHierType(REO_MODEL_TYPE inithierType){
+        hierType=inithierType;
+    }
+    void initOrientationFlag(const bool initorientationFlag){
+        orientationFlag=initorientationFlag;
+    }
+    void initTranslationFlag(const bool inittranslationFlag){
+        translationFlag=inittranslationFlag;
+    }
+    void initSentenceIdFlag(const bool initsentenceIdFlag){
+        sentenceIdFlag=initsentenceIdFlag;
+    }
+    void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag){
+        includeSentenceIdFlag=initincludeSentenceIdFlag;
+    }
+    void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo){
+        onlyOutputSpanInfo= initonlyOutputSpanInfo;
+    } 
+    void initGzOutput (const bool initgzOutput){
+        gzOutput= initgzOutput;
+    } 
+    // getters: each is* function returns the member named after it (the *Type getters return a REO_MODEL_TYPE, not a bool)
+    bool isAllModelsOutputFlag() const {
+        return allModelsOutputFlag;
+    }
+    bool isWordModel() const {
+        return wordModel;
+    }
+    REO_MODEL_TYPE isWordType() const {
+        return wordType; 
+    } 
+    bool isPhraseModel() const {
+        return phraseModel;  
+    } 
+    REO_MODEL_TYPE isPhraseType() const {
+        return phraseType;
+    }  
+    bool isHierModel() const {
+        return hierModel; 
+    }
+    REO_MODEL_TYPE isHierType() const {
+        return hierType;
+    }
+    bool isOrientationFlag() const {
+        return orientationFlag;
+    }
+    bool isTranslationFlag() const {
+        return translationFlag;
+    }
+    bool isSentenceIdFlag() const {
+        return sentenceIdFlag;
+    }
+    bool isIncludeSentenceIdFlag() const {
+        return includeSentenceIdFlag;
+    }
+    bool isOnlyOutputSpanInfo() const {
+        return onlyOutputSpanInfo;
+    } 
+    bool isGzOutput () const {
+        return gzOutput;
+   } 
+};
+
+}
+
+#endif
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index bb2d97580..431be58b0 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -53,6 +53,7 @@ public:
   bool gzOutput;
   bool unpairedExtractFormat;
   bool conditionOnTargetLhs;
+  bool boundaryRules;
   
   RuleExtractionOptions()
     : maxSpan(10)
@@ -85,6 +86,7 @@ public:
     , gzOutput(false)
     , unpairedExtractFormat(false)
     , conditionOnTargetLhs(false)
+    , boundaryRules(false)
   {}
 };
 
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index 8e44bddc4..af1cfa953 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -25,33 +25,45 @@
 
 #include "tables-core.h"
 
+using namespace std;
+
 namespace MosesTraining
 {
 
 SentenceAlignment::~SentenceAlignment() {}
 
-bool SentenceAlignment::processTargetSentence(const char * targetString, int)
+void addBoundaryWords(vector<string> &phrase) // wraps phrase in place: "<s>" in front, "</s>" at the end
+{
+  phrase.insert(phrase.begin(), "<s>"); // start marker takes index 0, so every existing word moves up by one position
+  phrase.push_back("</s>"); // end marker becomes the last element
+}
+
+bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
 {
   target = tokenize(targetString);
+  if (boundaryRules)
+    addBoundaryWords(target);
   return true;
 }
 
-bool SentenceAlignment::processSourceSentence(const char * sourceString, int)
+bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
 {
   source = tokenize(sourceString);
+  if (boundaryRules)
+    addBoundaryWords(source);
   return true;
 }
 
-bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID)
+bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID, bool boundaryRules)
 {
   using namespace std;
   this->sentenceID = sentenceID;
 
   // process sentence strings and store in target and source members.
-  if (!processTargetSentence(targetString, sentenceID)) {
+  if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
     return false;
   }
-  if (!processSourceSentence(sourceString, sentenceID)) {
+  if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
     return false;
   }
 
@@ -81,6 +93,12 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
       cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
       return false;
     }
+    
+    if (boundaryRules) {
+      ++s;
+      ++t;
+    }
+    
     // cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
     if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
       cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
@@ -90,6 +108,16 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
     alignedToT[t].push_back( s );
     alignedCountS[s]++;
   }
+  
+  if (boundaryRules) {
+    alignedToT[0].push_back(0);
+    alignedCountS[0]++;
+    
+    alignedToT.back().push_back(alignedCountS.size() - 1);
+    alignedCountS.back()++;
+    
+  }
+  
   return true;
 }
 
diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h
index b1fb5933a..7c2988780 100644
--- a/phrase-extract/SentenceAlignment.h
+++ b/phrase-extract/SentenceAlignment.h
@@ -38,12 +38,13 @@ public:
 
   virtual ~SentenceAlignment();
 
-  virtual bool processTargetSentence(const char *, int);
+  virtual bool processTargetSentence(const char *, int, bool boundaryRules);
 
-  virtual bool processSourceSentence(const char *, int);
+  virtual bool processSourceSentence(const char *, int, bool boundaryRules);
 
   bool create(char targetString[], char sourceString[],
-              char alignmentString[], int sentenceID);
+              char alignmentString[], int sentenceID, bool boundaryRules);
+  
 };
 
 }
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp
index 83a048757..5d866edfb 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.cpp
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@@ -32,10 +32,10 @@ using namespace std;
 namespace MosesTraining
 {
 
-bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
 {
   if (!m_options.targetSyntax) {
-    return SentenceAlignment::processTargetSentence(targetString, sentenceID);
+    return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
   }
 
   string targetStringCPP(targetString);
@@ -52,10 +52,10 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
   return true;
 }
 
-bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
 {
   if (!m_options.sourceSyntax) {
-    return SentenceAlignment::processSourceSentence(sourceString, sentenceID);
+    return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
   }
 
   string sourceStringCPP(sourceString);
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h
index 38fa77907..28eef57b7 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.h
+++ b/phrase-extract/SentenceAlignmentWithSyntax.h
@@ -59,10 +59,10 @@ public:
   virtual ~SentenceAlignmentWithSyntax() {}
 
   bool
-  processTargetSentence(const char *, int);
+  processTargetSentence(const char *, int, bool boundaryRules);
 
   bool
-  processSourceSentence(const char *, int);
+  processSourceSentence(const char *, int, bool boundaryRules);
 };
 
 }
diff --git a/phrase-extract/consolidate.cpp b/phrase-extract/consolidate.cpp
index 60285e6e7..43b3f32a1 100644
--- a/phrase-extract/consolidate.cpp
+++ b/phrase-extract/consolidate.cpp
@@ -47,9 +47,11 @@ inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
 char line[LINE_MAX_LENGTH];
 void processFiles( char*, char*, char*, char* );
 void loadCountOfCounts( char* );
+void breakdownCoreAndSparse( string combined, string &core, string &sparse );
 bool getLine( istream &fileP, vector< string > &item );
 vector< string > splitLine();
 vector< int > countBin;
+bool sparseCountBinFeatureFlag = false;
 
 int main(int argc, char* argv[])
 {
@@ -94,8 +96,11 @@ int main(int argc, char* argv[])
     } else if (strcmp(argv[i],"--LowCountFeature") == 0) {
       lowCountFlag = true;
       cerr << "including the low count feature\n";
-    } else if (strcmp(argv[i],"--CountBinFeature") == 0) {
-      cerr << "include count bin feature:";
+    } else if (strcmp(argv[i],"--CountBinFeature") == 0 ||
+               strcmp(argv[i],"--SparseCountBinFeature") == 0) {
+      if (strcmp(argv[i],"--SparseCountBinFeature") == 0)
+        sparseCountBinFeatureFlag = true;
+      cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
       int prev = 0;
       while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
         int binCount = atoi(argv[++i]);
@@ -223,10 +228,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     }
 
     // output hierarchical phrase pair (with separated labels)
-    fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
+    fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1] << " |||";
 
     // SCORES ...
-    fileConsolidated << " |||";
+    string directScores, directSparseScores, indirectScores, indirectSparseScores;
+    breakdownCoreAndSparse( itemDirect[2], directScores, directSparseScores );
+    breakdownCoreAndSparse( itemIndirect[2], indirectScores, indirectSparseScores );
+
     vector<string> directCounts = tokenize(itemDirect[4].c_str());
     vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
     float countF = atof(directCounts[0].c_str());
@@ -264,12 +272,12 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     // prob indirect
     if (!onlyDirectFlag) {
       fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
-      fileConsolidated << " " << itemIndirect[2];
+      fileConsolidated << " " << indirectScores; // core part of itemIndirect[2]: must follow the indirect probability, as itemIndirect[2] did
     }
 
     // prob direct
     fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
-    fileConsolidated << " " << itemDirect[2];
+    fileConsolidated << " " << directScores; // core part of itemDirect[2]: must follow the direct probability, as itemDirect[2] did
 
     // phrase count feature
     if (phraseCountFlag) {
     if (phraseCountFlag) {
@@ -281,8 +289,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
       fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
     }
 
-    // count bin feature
-    if (countBin.size()>0) {
+    // count bin feature (as a core feature)
+    if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
       bool foundBin = false;
       for(size_t i=0; i < countBin.size(); i++) {
         if (!foundBin && countEF <= countBin[i]) {
@@ -307,6 +315,35 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
       fileConsolidated << " ||| " << itemDirect[5];
     }
     
+    // count bin feature (as a sparse feature)
+    if (sparseCountBinFeatureFlag || 
+        directSparseScores.compare("") != 0 || 
+        indirectSparseScores.compare("") != 0)
+    {
+      fileConsolidated << " |||";
+      if (directSparseScores.compare("") != 0)
+        fileConsolidated << " " << directSparseScores;
+      if (indirectSparseScores.compare("") != 0)
+        fileConsolidated << " " << indirectSparseScores;
+      if (sparseCountBinFeatureFlag) {
+        bool foundBin = false;
+        for(size_t i=0; i < countBin.size(); i++) {
+          if (!foundBin && countEF <= countBin[i]) {
+            fileConsolidated << " cb_";
+            if (i == 0 && countBin[i] > 1)
+              fileConsolidated << "1_";
+            else if (i > 0 && countBin[i-1]+1 < countBin[i])
+              fileConsolidated << (countBin[i-1]+1) << "_";
+            fileConsolidated << countBin[i] << " 1";
+            foundBin = true;
+          }
+        }
+        if (!foundBin) {
+          fileConsolidated << " cb_max 1";
+        }
+      }
+    }
+
     fileConsolidated << endl;
   }
   fileDirect.Close();
@@ -314,6 +351,22 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
   fileConsolidated.Close();
 }
 
+void breakdownCoreAndSparse( string combined, string &core, string &sparse ) // splits a space-separated score string into plain values (core) and "name value" pairs (sparse)
+{
+  core = "";
+  sparse = "";
+  vector<string> score = tokenize( combined.c_str() );
+  for(size_t i=0; i<score.size(); i++) {
+    if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size()) // core: token starts with a digit, or it is the last token (no value could follow it)
+      core += " " + score[i];
+    else {
+      sparse += " " + score[i]; // anything else is taken as a sparse feature name ...
+      sparse += " " + score[++i]; // ... and the next token is consumed as its value (in range: the last token never reaches this branch)
+    }
+  }
+  if (core.size() > 0 ) core = core.substr(1); // drop the leading separator space
+  if (sparse.size() > 0 ) sparse = sparse.substr(1); // drop the leading separator space
+}
 
 bool getLine( istream &fileP, vector< string > &item )
 {
diff --git a/phrase-extract/domain.cpp b/phrase-extract/domain.cpp
new file mode 100644
index 000000000..aacb7160d
--- /dev/null
+++ b/phrase-extract/domain.cpp
@@ -0,0 +1,52 @@
+// $Id$
+//#include "beammain.h"
+#include "domain.h"
+#include "tables-core.h"
+#include "InputFileStream.h"
+#include "SafeGetline.h"
+
+#define TABLE_LINE_MAX_LENGTH 1000
+
+using namespace std;
+
+namespace MosesTraining
+{
+
+// handling of domain names: load database with sentence-id / domain name info
+void Domain::load( const std::string &domainFileName ) { // fills spec, list and name2id from a file holding one "<number> <domain name>" pair per line
+  Moses::InputFileStream fileS( domainFileName );
+  istream *fileP = &fileS;
+  while(true) {
+    char line[TABLE_LINE_MAX_LENGTH];
+    SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
+    if (fileP->eof()) break; // stop as soon as the stream reports end of file
+    // read: split the line into tokens and parse the first one as an integer
+    vector< string > domainSpecLine = tokenize( line );
+    int lineNumber;
+    if (domainSpecLine.size() != 2 ||
+        ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) { // exactly two tokens are required and the first must begin with an integer
+      cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
+      exit(1); // a malformed line is fatal
+    }
+    // store: keep the (number, name) pair in file order and register names not seen before
+    string &name = domainSpecLine[1];
+    spec.push_back( make_pair( lineNumber, name ));
+    if (name2id.find( name ) == name2id.end()) {
+      name2id[ name ] = list.size(); // ids are handed out in order of first appearance, matching the index in list
+      list.push_back( name );
+    }
+  }
+}
+
+// get domain name based on sentence number
+string Domain::getDomainOfSentence( int sentenceId ) { // returns the name of the first spec entry whose number is >= sentenceId
+  for(size_t i=0; i<spec.size(); i++) { // linear scan in load order
+    if (sentenceId <= spec[i].first) { // NOTE(review): presumably each number is the last sentence id of its domain and entries ascend - confirm against the domain file format
+      return spec[i].second;
+    }
+  }
+  return "undefined"; // no entry has a number >= sentenceId
+}
+
+}
+
diff --git a/phrase-extract/domain.h b/phrase-extract/domain.h
new file mode 100644
index 000000000..cf675c17e
--- /dev/null
+++ b/phrase-extract/domain.h
@@ -0,0 +1,32 @@
+// $Id$
+
+#ifndef _DOMAIN_H
+#define _DOMAIN_H
+
+#include <iostream>
+#include <fstream>
+#include <assert.h>
+#include <stdlib.h>
+#include <string>
+#include <queue>
+#include <map>
+#include <cmath>
+
+extern std::vector<std::string> tokenize( const char*);
+
+namespace MosesTraining
+{
+
+class Domain // maps sentence ids to domain names, using a specification file read by load()
+{
+public:
+  std::vector< std::pair< int, std::string > > spec; // (number, domain name) pairs in the order they appear in the file
+  std::vector< std::string > list; // distinct domain names in order of first appearance
+  std::map< std::string, int > name2id; // domain name -> its index in list
+  void load( const std::string &fileName ); // reads the specification file; exits the program on a malformed line
+  std::string getDomainOfSentence( int sentenceId ); // name of the first spec entry with number >= sentenceId, or "undefined"
+};
+
+}
+
+#endif
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index c333040f6..5c308fd9b 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -46,8 +46,6 @@
 #include "XmlTree.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
-#include "../moses/src/ThreadPool.h"
-#include "../moses/src/OutputCollector.h"
 
 #define LINE_MAX_LENGTH 500000
 
@@ -57,55 +55,53 @@ using namespace MosesTraining;
 typedef vector< int > LabelIndex;
 typedef map< int, int > WordIndex;
 
-class ExtractTask : public Moses::Task {
+class ExtractTask 
+{
 private:
-  size_t m_id;
-  SentenceAlignmentWithSyntax *m_sentence;
-  RuleExtractionOptions &m_options;
-  Moses::OutputCollector* m_extractCollector;
-  Moses::OutputCollector* m_extractCollectorInv;
+  SentenceAlignmentWithSyntax &m_sentence;
+  const RuleExtractionOptions &m_options;
+  Moses::OutputFileStream& m_extractFile;
+  Moses::OutputFileStream& m_extractFileInv;
+
+  vector< ExtractedRule > m_extractedRules;
+  
+  // main functions
+  void extractRules();
+  void addRuleToCollection(ExtractedRule &rule);
+  void consolidateRules();
+  void writeRulesToFile();
+  
+  // subs
+  void addRule( int, int, int, int, int, RuleExist &ruleExist);
+  void addHieroRule( int startT, int endT, int startS, int endS
+                    , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
+  void printHieroPhrase( int startT, int endT, int startS, int endS
+                        , HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
+  string printTargetHieroPhrase(  int startT, int endT, int startS, int endS
+                                , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
+  string printSourceHieroPhrase( int startT, int endT, int startS, int endS
+                                , HoleCollection &holeColl, const LabelIndex &labelIndex);
+  void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
+                                   , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
+  void printHieroAlignment(  int startT, int endT, int startS, int endS
+                           , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
+  void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
+  
+  inline string IntToString( int i )
+  {
+    stringstream out;
+    out << i;
+    return out.str();
+  }
 
 public:
-  ExtractTask(size_t id, SentenceAlignmentWithSyntax *sentence, RuleExtractionOptions &options, Moses::OutputCollector* extractCollector, Moses::OutputCollector* extractCollectorInv):
-    m_id(id),
+  ExtractTask(SentenceAlignmentWithSyntax &sentence, const RuleExtractionOptions &options, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv):
     m_sentence(sentence),
     m_options(options),
-    m_extractCollector(extractCollector),
-    m_extractCollectorInv(extractCollectorInv) {}
-  ~ExtractTask() { delete m_sentence; }
+    m_extractFile(extractFile),
+    m_extractFileInv(extractFileInv) {}
   void Run();
 
-private:
-vector< ExtractedRule > m_extractedRules;
-
-// main functions
-void extractRules();
-void addRuleToCollection(ExtractedRule &rule);
-void consolidateRules();
-void writeRulesToFile();
-
-// subs
-void addRule( int, int, int, int, RuleExist &ruleExist);
-void addHieroRule( int startT, int endT, int startS, int endS
-                   , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
-void printHieroPhrase( int startT, int endT, int startS, int endS
-                       , HoleCollection &holeColl, LabelIndex &labelIndex);
-string printTargetHieroPhrase(  int startT, int endT, int startS, int endS
-                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
-string printSourceHieroPhrase( int startT, int endT, int startS, int endS
-                               , HoleCollection &holeColl, const LabelIndex &labelIndex);
-void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
-                                  , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
-void printHieroAlignment(  int startT, int endT, int startS, int endS
-                         , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
-void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
-
-inline string IntToString( int i )
-{
-  stringstream out;
-  out << i;
-  return out.str();
-}
 };
 
 // stats for glue grammar and unknown word label probabilities
@@ -120,15 +116,14 @@ int main(int argc, char* argv[])
        << "rule extraction from an aligned parallel corpus\n";
 
   RuleExtractionOptions options;
+  int sentenceOffset = 0;
 #ifdef WITH_THREADS
   int thread_count = 1;
 #endif
   if (argc < 5) {
     cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
-#ifdef WITH_THREADS
-         << " --threads NUM |"
-#endif
-         << " --GlueGrammar FILE"
+
+    << " --GlueGrammar FILE"
          << " | --UnknownWordLabel FILE"
          << " | --OnlyDirect"
          << " | --OutputNTLengths"
@@ -143,7 +138,9 @@ int main(int argc, char* argv[])
          << " | --SourceSyntax | --TargetSyntax"
          << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource |  --NoNonTermFirstWord | --NoFractionalCounting"
          << " | --UnpairedExtractFormat"
-         << " | --ConditionOnTargetLHS ]\n";
+         << " | --ConditionOnTargetLHS"
+         << " | --BoundaryRules[" << options.boundaryRules << "] ]\n"; // [..] shows the default; the final " ]" closes the option list and the newline ends the usage line
+    
     exit(1);
   }
   char* &fileNameT = argv[1];
@@ -268,12 +265,23 @@ int main(int argc, char* argv[])
       options.unpairedExtractFormat = true;
     } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
       options.conditionOnTargetLhs = true;
-#ifdef WITH_THREADS
     } else if (strcmp(argv[i],"-threads") == 0 || 
                strcmp(argv[i],"--threads") == 0 ||
                strcmp(argv[i],"--Threads") == 0) {
+#ifdef WITH_THREADS
       thread_count = atoi(argv[++i]);
+#else
+      cerr << "thread support not compiled in." << '\n';
+      exit(1);
 #endif
+    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
+      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
+        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
+        exit(1);
+      }
+      sentenceOffset = atoi(argv[++i]);
+    } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
+      options.boundaryRules = true;
     } else {
       cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
       exit(1);
@@ -299,27 +307,17 @@ int main(int argc, char* argv[])
   if (!options.onlyDirectFlag)
     extractFileInv.Open(fileNameExtractInv.c_str());
 
-  // output into file
-  Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
-  Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);
 
   // stats on labels for glue grammar and unknown word label probabilities
   set< string > targetLabelCollection, sourceLabelCollection;
   map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
 
-#ifdef WITH_THREADS
-  // set up thread pool
-  Moses::ThreadPool pool(thread_count);
-  pool.SetQueueLimit(1000);
-#endif
-
   // loop through all sentence pairs
-  size_t i=0;
+  size_t i=sentenceOffset;
   while(true) {
     i++;
-    if (i%1000 == 0) cerr << "." << flush;
-    if (i%10000 == 0) cerr << ":" << flush;
-    if (i%100000 == 0) cerr << "!" << flush;
+    if (i%1000 == 0) cerr << i << " " << flush;
+
     char targetString[LINE_MAX_LENGTH];
     char sourceString[LINE_MAX_LENGTH];
     char alignmentString[LINE_MAX_LENGTH];
@@ -328,7 +326,7 @@ int main(int argc, char* argv[])
     SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
     SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
 
-    SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
+    SentenceAlignmentWithSyntax sentence
       (targetLabelCollection, sourceLabelCollection, 
        targetTopLabelCollection, sourceTopLabelCollection, options);
     //az: output src, tgt, and alingment line
@@ -339,32 +337,17 @@ int main(int argc, char* argv[])
       cout << "LOG: PHRASES_BEGIN:" << endl;
     }
 
-    if (sentence->create(targetString, sourceString, alignmentString, i)) {
+    if (sentence.create(targetString, sourceString, alignmentString, i, options.boundaryRules)) {
       if (options.unknownWordLabelFlag) {
-        collectWordLabelCounts(*sentence);
-      }
-      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
-#ifdef WITH_THREADS
-      if (thread_count == 1) {
-        task->Run();
-        delete task;
+        collectWordLabelCounts(sentence);
       }
-      else {
-        pool.Submit(task);
-      }
-#else
+      ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv);
       task->Run();
       delete task;
-#endif
     }
     if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
   }
 
-#ifdef WITH_THREADS
-  // wait for all threads to finish
-  pool.Stop(true);
-#endif
-
   tFile.Close();
   sFile.Close();
   aFile.Close();
@@ -390,8 +373,8 @@ void ExtractTask::Run() {
 
 void ExtractTask::extractRules()
 {
-  int countT = m_sentence->target.size();
-  int countS = m_sentence->source.size();
+  int countT = m_sentence.target.size();
+  int countS = m_sentence.source.size();
 
   // phrase repository for creating hiero phrases
   RuleExist ruleExist(countT);
@@ -406,17 +389,17 @@ void ExtractTask::extractRules()
       int endT = startT + lengthT - 1;
 
       // if there is target side syntax, there has to be a node
-      if (m_options.targetSyntax && !m_sentence->targetTree.HasNode(startT,endT))
+      if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
         continue;
 
       // find find aligned source words
       // first: find minimum and maximum source word
       int minS = 9999;
       int maxS = -1;
-      vector< int > usedS = m_sentence->alignedCountS;
+      vector< int > usedS = m_sentence.alignedCountS;
       for(int ti=startT; ti<=endT; ti++) {
-        for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
-          int si = m_sentence->alignedToT[ti][i];
+        for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+          int si = m_sentence.alignedToT[ti][i];
           if (si<minS) {
             minS = si;
           }
@@ -451,22 +434,22 @@ void ExtractTask::extractRules()
       for(int startS=minS;
           (startS>=0 &&
            startS>maxS - m_options.maxSpan && // within length limit
-           (startS==minS || m_sentence->alignedCountS[startS]==0)); // unaligned
+           (startS==minS || m_sentence.alignedCountS[startS]==0)); // unaligned
           startS--) {
         // end point of source phrase may advance over unaligned
         for(int endS=maxS;
             (endS<countS && endS<startS + m_options.maxSpan && // within length limit
-             (endS==maxS || m_sentence->alignedCountS[endS]==0)); // unaligned
+             (endS==maxS || m_sentence.alignedCountS[endS]==0)); // unaligned
             endS++) {
           // if there is source side syntax, there has to be a node
-          if (m_options.sourceSyntax && !m_sentence->sourceTree.HasNode(startS,endS))
+          if (m_options.sourceSyntax && !m_sentence.sourceTree.HasNode(startS,endS))
             continue;
 
           // TODO: loop over all source and target syntax labels
 
           // if within length limits, add as fully-lexical phrase pair
           if (endT-startT < m_options.maxSymbolsTarget && endS-startS < m_options.maxSymbolsSource) {
-            addRule(startT,endT,startS,endS, ruleExist);
+            addRule(startT,endT,startS,endS, countS, ruleExist);
           }
 
           // take note that this is a valid phrase alignment
@@ -508,7 +491,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
 
       int labelI = labelIndex[ 2+holeCount+holeTotal ];
       string label = m_options.sourceSyntax ?
-                     m_sentence->sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
+                     m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
       hole.SetLabel(label, 0);
 
       currPos = hole.GetEnd(0);
@@ -526,7 +509,8 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
 }
 
 string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
-                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
+                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
+                              , int countS)
 {
   HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
   assert(iterHoleList != holeColl.GetHoles().end());
@@ -548,8 +532,15 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
       assert(sourceLabel != "");
 
       int labelI = labelIndex[ 2+holeCount ];
-      string targetLabel = m_options.targetSyntax ?
-                           m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
+      string targetLabel;
+      if (m_options.targetSyntax) {
+        targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
+      } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+         targetLabel = "S"; 
+      } else {
+        targetLabel = "X";
+      }
+      
       hole.SetLabel(targetLabel, 1);
 
       if (m_options.unpairedExtractFormat) {
@@ -559,7 +550,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
       }
 
       if (m_options.pcfgScore) {
-        double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+        double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
         logPCFGScore -= score;
       }
 
@@ -569,7 +560,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
       holeCount++;
     } else {
       indexT[currPos] = outPos;
-      out += m_sentence->target[currPos] + " ";
+      out += m_sentence.target[currPos] + " ";
     }
 
     outPos++;
@@ -613,7 +604,7 @@ string ExtractTask::printSourceHieroPhrase( int startT, int endT, int startS, in
       ++iterHoleList;
       ++holeCount;
     } else {
-      out += m_sentence->source[currPos] + " ";
+      out += m_sentence.source[currPos] + " ";
     }
 
     outPos++;
@@ -630,8 +621,8 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
   for(int ti=startT; ti<=endT; ti++) {
     WordIndex::const_iterator p = indexT.find(ti);
     if (p != indexT.end()) { // does word still exist?
-      for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
-        int si = m_sentence->alignedToT[ti][i];
+      for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+        int si = m_sentence.alignedToT[ti][i];
         std::string sourceSymbolIndex = IntToString(indexS.find(si)->second);
         std::string targetSymbolIndex = IntToString(p->second);
         rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -663,30 +654,37 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
 }
 
 void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
-                       , HoleCollection &holeColl, LabelIndex &labelIndex)
+                       , HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
 {
   WordIndex indexS, indexT; // to keep track of word positions in rule
 
   ExtractedRule rule( startT, endT, startS, endS );
 
   // phrase labels
-  string targetLabel = m_options.targetSyntax ?
-                       m_sentence->targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
+  string targetLabel;
+  if (m_options.targetSyntax) {
+    targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
+  } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+    targetLabel = "S";
+  } else {
+    targetLabel = "X";
+  }
+
   string sourceLabel = m_options.sourceSyntax ?
-                       m_sentence->sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
+                       m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
 
   // create non-terms on the source side
   preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
 
   // target
   if (m_options.pcfgScore) {
-    double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
-    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+    double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
                 + " [" + targetLabel + "]";
     rule.pcfgScore = std::exp(logPCFGScore);
   } else {
     double logPCFGScore = 0.0f;
-    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
                 + " [" + targetLabel + "]";
   }
 
@@ -704,24 +702,24 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
   addRuleToCollection( rule );
 }
 
-void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl)
+void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS)
 {
   LabelIndex labelIndex,labelCount;
 
   // number of target head labels
-  int numLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(startT,endT).size() : 1;
+  int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
   labelCount.push_back(numLabels);
   labelIndex.push_back(0);
 
   // number of source head labels
-  numLabels =  m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(startS,endS).size() : 1;
+  numLabels =  m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1;
   labelCount.push_back(numLabels);
   labelIndex.push_back(0);
 
   // number of target hole labels
   for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
        hole != holeColl.GetHoles().end(); hole++ ) {
-    int numLabels =  m_options.targetSyntax ? m_sentence->targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
+    int numLabels =  m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
     labelCount.push_back(numLabels);
     labelIndex.push_back(0);
   }
@@ -731,7 +729,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
   for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
        i != holeColl.GetSortedSourceHoles().end(); i++ ) {
     const Hole &hole = **i;
-    int numLabels =  m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
+    int numLabels =  m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
     labelCount.push_back(numLabels);
     labelIndex.push_back(0);
   }
@@ -739,7 +737,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
   // loop through the holes
   bool done = false;
   while(!done) {
-    printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex );
+    printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS );
     for(unsigned int i=0; i<labelIndex.size(); i++) {
       labelIndex[i]++;
       if(labelIndex[i] == labelCount[i]) {
@@ -843,7 +841,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
             }
             // covered by word? check if it is aligned
             else {
-              if (m_sentence->alignedToT[pos].size() > 0)
+              if (m_sentence.alignedToT[pos].size() > 0)
                 foundAlignedWord = true;
             }
           }
@@ -867,7 +865,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
 
         // passed all checks...
         if (allowablePhrase)
-          printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl);
+          printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl, wordCountS);
 
         // recursively search for next hole
         int nextInitStartT = m_options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
@@ -879,10 +877,15 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
   }
 }
 
-void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist &ruleExist)
+void ExtractTask::addRule( int startT, int endT, int startS, int endS, int countS, RuleExist &ruleExist)
 {
-  // source
-
+  // contains only <s> or </s>. Don't output
+  if (m_options.boundaryRules 
+      && (   (startS == 0         && endS == 0) 
+          || (startS == countS-1  && endS == countS-1))) {
+    return;
+  }
+  
   if (m_options.onlyOutputSpanInfo) {
     cout << startS << " " << endS << " " << startT << " " << endT << endl;
     return;
@@ -893,36 +896,42 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
   // phrase labels
   string targetLabel,sourceLabel;
   if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
-    sourceLabel = targetLabel = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel();
+    sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
   }
   else {
     sourceLabel = m_options.sourceSyntax ?
-                  m_sentence->sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
-    targetLabel = m_options.targetSyntax ?
-                  m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+                  m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
+    
+    if (m_options.targetSyntax) {
+      targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
+    } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+      targetLabel = "S";
+    } else {
+      targetLabel = "X";
+    }
   }
 
   // source
   rule.source = "";
   for(int si=startS; si<=endS; si++)
-    rule.source += m_sentence->source[si] + " ";
+    rule.source += m_sentence.source[si] + " ";
   rule.source += "[" + sourceLabel + "]";
 
   // target
   rule.target = "";
   for(int ti=startT; ti<=endT; ti++)
-    rule.target += m_sentence->target[ti] + " ";
+    rule.target += m_sentence.target[ti] + " ";
   rule.target += "[" + targetLabel + "]";
 
   if (m_options.pcfgScore) {
-    double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+    double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
     rule.pcfgScore = std::exp(logPCFGScore);
   }
 
   // alignment
   for(int ti=startT; ti<=endT; ti++) {
-    for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
-      int si = m_sentence->alignedToT[ti][i];
+    for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+      int si = m_sentence.alignedToT[ti][i];
       std::string sourceSymbolIndex = IntToString(si-startS);
       std::string targetSymbolIndex = IntToString(ti-startT);
       rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -1015,8 +1024,8 @@ void ExtractTask::writeRulesToFile()
              << rule->count << "\n";
     }
   }
-  m_extractCollector->Write( m_id, out.str() );
-  m_extractCollectorInv->Write( m_id, outInv.str() );;
+  m_extractFile << out.str();
+  m_extractFileInv << outInv.str();
 }
 
 void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
index 6583fd077..6a1ee77ab 100644
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@@ -1,6 +1,7 @@
 /*
  * extract.cpp
- *
+ *	Modified by: Rohit Gupta CDAC, Mumbai, India
+ *	on July 15, 2012 to implement parallel processing
  *      Modified by: Nadi Tomeh - LIMSI/CNRS
  *      Machine Translation Marathon 2010, Dublin
  */
@@ -13,7 +14,7 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <cstring>
-
+#include <sstream>
 #include <map>
 #include <set>
 #include <vector>
@@ -23,14 +24,16 @@
 #include "tables-core.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
+#include "PhraseExtractionOptions.h"
 
 using namespace std;
 using namespace MosesTraining;
 
-#define LINE_MAX_LENGTH 500000
+namespace MosesTraining {
+
+
+const long int LINE_MAX_LENGTH = 500000 ;
 
-namespace MosesTraining
-{
 
 // HPhraseVertex represents a point in the alignment matrix
 typedef pair <int, int> HPhraseVertex;
@@ -46,58 +49,64 @@ typedef vector < HPhrase > HPhraseVector;
 // The key of the map is the English index and the value is a set of the source ones
 typedef map <int, set<int> > HSentenceVertices;
 
-enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
-enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
-
-REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+  REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                            int, int, int, int, int, int, int,
                            bool (*)(int, int), bool (*)(int, int));
-REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+  REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                              int, int, int, int, int, int, int,
                              bool (*)(int, int), bool (*)(int, int),
                              const HSentenceVertices &, const HSentenceVertices &);
-REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+  REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                            int, int, int, int, int, int, int,
                            bool (*)(int, int), bool (*)(int, int),
                            const HSentenceVertices &, const HSentenceVertices &,
                            const HSentenceVertices &, const HSentenceVertices &,
                            REO_POS);
 
-void insertVertex(HSentenceVertices &, int, int);
-void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
+  void insertVertex(HSentenceVertices &, int, int);
+  void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
                           int, int, int, int);
-string getOrientString(REO_POS, REO_MODEL_TYPE);
-
-bool ge(int, int);
-bool le(int, int);
-bool lt(int, int);
-
-void extractBase(SentenceAlignment &);
-void extract(SentenceAlignment &);
-void addPhrase(SentenceAlignment &, int, int, int, int, string &);
-bool isAligned (SentenceAlignment &, int, int);
-
-bool allModelsOutputFlag = false;
-
-bool wordModel = false;
-REO_MODEL_TYPE wordType = REO_MSD;
-bool phraseModel = false;
-REO_MODEL_TYPE phraseType = REO_MSD;
-bool hierModel = false;
-REO_MODEL_TYPE hierType = REO_MSD;
-
-
-Moses::OutputFileStream extractFile;
-Moses::OutputFileStream extractFileInv;
-Moses::OutputFileStream extractFileOrientation;
-Moses::OutputFileStream extractFileSentenceId;
-int maxPhraseLength;
-bool orientationFlag = false;
-bool translationFlag = true;
-bool sentenceIdFlag = false; //create extract file with sentence id
-bool onlyOutputSpanInfo = false;
-bool gzOutput = false;
+  string getOrientString(REO_POS, REO_MODEL_TYPE);
+
+  bool ge(int, int);
+  bool le(int, int);
+  bool lt(int, int);
+
+  bool isAligned (SentenceAlignment &, int, int);
+  int sentenceOffset = 0;
+
+}
 
+namespace MosesTraining{
+
+class ExtractTask 
+{
+public:
+  ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation,Moses::OutputFileStream &extractFileSentenceId  ):
+    m_sentence(sentence),
+    m_options(initoptions),
+    m_extractFile(extractFile),
+    m_extractFileInv(extractFileInv),
+    m_extractFileOrientation(extractFileOrientation),
+    m_extractFileSentenceId(extractFileSentenceId) {}
+void Run();
+private:
+  vector< string > m_extractedPhrases;
+  vector< string > m_extractedPhrasesInv;
+  vector< string > m_extractedPhrasesOri;
+  vector< string > m_extractedPhrasesSid;
+  void extractBase(SentenceAlignment &);
+  void extract(SentenceAlignment &);
+  void addPhrase(SentenceAlignment &, int, int, int, int, string &);
+  void writePhrasesToFile();
+  
+  SentenceAlignment &m_sentence;
+  const PhraseExtractionOptions &m_options;
+  Moses::OutputFileStream &m_extractFile;
+  Moses::OutputFileStream &m_extractFileInv;
+  Moses::OutputFileStream &m_extractFileOrientation;
+  Moses::OutputFileStream &m_extractFileSentenceId;
+};
 }
 
 int main(int argc, char* argv[])
@@ -105,70 +114,84 @@ int main(int argc, char* argv[])
   cerr	<< "PhraseExtract v1.4, written by Philipp Koehn\n"
         << "phrase extraction from an aligned parallel corpus\n";
 
-  if (argc < 6) {
-    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
+ if (argc < 6) {
+    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
+    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --GZOutput | --IncludeSentenceId | --SentenceOffset n ]\n";
     exit(1);
   }
-  char* &fileNameE = argv[1];
-  char* &fileNameF = argv[2];
-  char* &fileNameA = argv[3];
-  string fileNameExtract = string(argv[4]);
-  maxPhraseLength = atoi(argv[5]);
+
+  Moses::OutputFileStream extractFile;
+  Moses::OutputFileStream extractFileInv;
+  Moses::OutputFileStream extractFileOrientation;
+  Moses::OutputFileStream extractFileSentenceId;
+  const char* const &fileNameE = argv[1];
+  const char* const &fileNameF = argv[2];
+  const char* const &fileNameA = argv[3];
+  const string fileNameExtract = string(argv[4]);
+  PhraseExtractionOptions options(atoi(argv[5]));
 
   for(int i=6; i<argc; i++) {
     if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
-      onlyOutputSpanInfo = true;
+      options.initOnlyOutputSpanInfo(true);
     } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
-      orientationFlag = true;
+      options.initOrientationFlag(true);
     } else if (strcmp(argv[i],"--NoTTable") == 0) {
-      translationFlag = false;
+      options.initTranslationFlag(false);
     } else if (strcmp(argv[i], "--SentenceId") == 0) {
-      sentenceIdFlag = true;  
+      options.initSentenceIdFlag(true);  
+    } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
+      options.initIncludeSentenceIdFlag(true);  
+    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
+      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
+        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
+        exit(1);
+      }
+      sentenceOffset = atoi(argv[++i]);
     } else if (strcmp(argv[i], "--GZOutput") == 0) {
-      gzOutput = true;  
+      options.initGzOutput(true);  
     } else if(strcmp(argv[i],"--model") == 0) {
       if (i+1 >= argc) {
         cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
         exit(1);
       }
-      char* modelParams = argv[++i];
-      char* modelName = strtok(modelParams, "-");
-      char* modelType = strtok(NULL, "-");
+      char*  modelParams = argv[++i];
+      char*  modelName = strtok(modelParams, "-");
+      char*  modelType = strtok(NULL, "-");
 
-      REO_MODEL_TYPE intModelType;
+      // REO_MODEL_TYPE intModelType;
 
       if(strcmp(modelName, "wbe") == 0) {
-        wordModel = true;
+        options.initWordModel(true);
         if(strcmp(modelType, "msd") == 0)
-          wordType = REO_MSD;
+          options.initWordType(REO_MSD);
         else if(strcmp(modelType, "mslr") == 0)
-          wordType = REO_MSLR;
+          options.initWordType(REO_MSLR);
         else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          wordType = REO_MONO;
+          options.initWordType(REO_MONO);
         else {
           cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
           exit(1);
         }
       } else if(strcmp(modelName, "phrase") == 0) {
-        phraseModel = true;
+        options.initPhraseModel(true);
         if(strcmp(modelType, "msd") == 0)
-          phraseType = REO_MSD;
+          options.initPhraseType(REO_MSD);
         else if(strcmp(modelType, "mslr") == 0)
-          phraseType = REO_MSLR;
+          options.initPhraseType(REO_MSLR);
         else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          phraseType = REO_MONO;
+          options.initPhraseType(REO_MONO);
         else {
           cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
           exit(1);
         }
       } else if(strcmp(modelName, "hier") == 0) {
-        hierModel = true;
+        options.initHierModel(true);
         if(strcmp(modelType, "msd") == 0)
-          hierType = REO_MSD;
+          options.initHierType(REO_MSD);
         else if(strcmp(modelType, "mslr") == 0)
-          hierType = REO_MSLR;
+          options.initHierType(REO_MSLR);
         else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          hierType = REO_MONO;
+          options.initHierType(REO_MONO);
         else {
           cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
           exit(1);
@@ -178,7 +201,8 @@ int main(int argc, char* argv[])
         exit(1);
       }
 
-      allModelsOutputFlag = true;
+      options.initAllModelsOutputFlag(true);
+
     } else {
       cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
       exit(1);
@@ -187,9 +211,9 @@ int main(int argc, char* argv[])
 
   // default reordering model if no model selected
   // allows for the old syntax to be used
-  if(orientationFlag && !allModelsOutputFlag) {
-    wordModel = true;
-    wordType = REO_MSD;
+  if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
+    options.initWordModel(true);
+    options.initWordType(REO_MSD);
   }
 
   // open input files
@@ -202,22 +226,22 @@ int main(int argc, char* argv[])
   istream *aFileP = &aFile;
 
   // open output files
-  if (translationFlag) {
-    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
-    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
+  if (options.isTranslationFlag()) {
+    string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
+    extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
     extractFileInv.Open(fileNameExtractInv.c_str());
   }
-  if (orientationFlag) {
-    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
+  if (options.isOrientationFlag()) {
+    string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
     extractFileOrientation.Open(fileNameExtractOrientation.c_str());
   }
 
-  if (sentenceIdFlag) {
-    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
+  if (options.isSentenceIdFlag()) {
+    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (options.isGzOutput()?".gz":"");
     extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
   }
 
-  int i=0;
+  int i = sentenceOffset;
   while(true) {
     i++;
     if (i%10000 == 0) cerr << "." << flush;
@@ -229,31 +253,38 @@ int main(int argc, char* argv[])
     SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
     SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
     SentenceAlignment sentence;
-    // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
+	// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
     //az: output src, tgt, and alingment line
-    if (onlyOutputSpanInfo) {
+    if (options.isOnlyOutputSpanInfo()) {
       cout << "LOG: SRC: " << foreignString << endl;
       cout << "LOG: TGT: " << englishString << endl;
       cout << "LOG: ALT: " << alignmentString << endl;
       cout << "LOG: PHRASES_BEGIN:" << endl;
     }
+	if (sentence.create( englishString, foreignString, alignmentString, i, false)) {
+   	ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileSentenceId);
+      task->Run();
+      delete task;
 
-    if (sentence.create( englishString, foreignString, alignmentString, i)) {
-      extract(sentence);
     }
-    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+    if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
   }
+
   eFile.Close();
   fFile.Close();
   aFile.Close();
+
   //az: only close if we actually opened it
-  if (!onlyOutputSpanInfo) {
-    if (translationFlag) {
+  if (!options.isOnlyOutputSpanInfo()) {
+    if (options.isTranslationFlag()) {
       extractFile.Close();
       extractFileInv.Close();
+      
     }
-    if (orientationFlag) extractFileOrientation.Close();
-    if (sentenceIdFlag) {
+    if (options.isOrientationFlag()){ 
+	extractFileOrientation.Close();
+	}
+    if (options.isSentenceIdFlag()) {
       extractFileSentenceId.Close();
     }
   }
@@ -261,8 +292,17 @@ int main(int argc, char* argv[])
 
 namespace MosesTraining
 {
+void ExtractTask::Run() { // handles the task's sentence pair: extract, write out, then reset the buffers
+  extract(m_sentence); // extract() reaches addPhrase(), which appends to the m_extractedPhrases* buffers
+  writePhrasesToFile(); // streams the buffered phrase lines to the m_extractFile* outputs
+  m_extractedPhrases.clear(); // all four buffers are emptied once their content has been written
+  m_extractedPhrasesInv.clear();
+  m_extractedPhrasesOri.clear();
+  m_extractedPhrasesSid.clear();
+
+}
 
-void extract(SentenceAlignment &sentence)
+void ExtractTask::extract(SentenceAlignment &sentence)
 {
   int countE = sentence.target.size();
   int countF = sentence.source.size();
@@ -281,14 +321,14 @@ void extract(SentenceAlignment &sentence)
 
   HSentenceVertices::const_iterator it;
 
-  bool relaxLimit = hierModel;
-  bool buildExtraStructure = phraseModel || hierModel;
+  bool relaxLimit = m_options.isHierModel();
+  bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
 
   // check alignments for target phrase startE...endE
   // loop over extracted phrases which are compatible with the word-alignments
   for(int startE=0; startE<countE; startE++) {
     for(int endE=startE;
-        (endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
+        (endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
         endE++) {
 
       int minF = 9999;
@@ -308,7 +348,7 @@ void extract(SentenceAlignment &sentence)
       }
 
       if (maxF >= 0 && // aligned to any source words at all
-          (relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
+          (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
 
         // check if source words are aligned to out of bound target words
         bool out_of_bounds = false;
@@ -323,17 +363,17 @@ void extract(SentenceAlignment &sentence)
           // start point of source phrase may retreat over unaligned
           for(int startF=minF;
               (startF>=0 &&
-               (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
+               (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
                (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
               startF--)
             // end point of source phrase may advance over unaligned
             for(int endF=maxF;
                 (endF<countF &&
-                 (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
+                 (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
                  (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
                 endF++) { // at this point we have extracted a phrase
               if(buildExtraStructure) { // phrase || hier
-                if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
+                if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
                   inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
                                                    HPhraseVertex(endF,endE)));
                   insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
@@ -343,16 +383,16 @@ void extract(SentenceAlignment &sentence)
                                        startF, startE, endF, endE);
               } else {
                 string orientationInfo = "";
-                if(wordModel) {
+                if(m_options.isWordModel()) {
                   REO_POS wordPrevOrient, wordNextOrient;
                   bool connectedLeftTopP  = isAligned( sentence, startF-1, startE-1 );
                   bool connectedRightTopP = isAligned( sentence, endF+1,   startE-1 );
                   bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
                   bool connectedRightTopN = isAligned( sentence, startF-1,   endE+1 );
-                  wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
-                  wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
-                  orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
-                  if(allModelsOutputFlag)
+                  wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
+                  wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
+                  orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
+                  if(m_options.isAllModelsOutputFlag())
                     " | | ";
                 }
                 addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
@@ -378,38 +418,38 @@ void extract(SentenceAlignment &sentence)
       bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
       bool connectedRightTopN = isAligned( sentence, startF-1,   endE+1 );
 
-      if(wordModel) {
-        wordPrevOrient = getOrientWordModel(sentence, wordType,
+      if(m_options.isWordModel()) {
+        wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
                                             connectedLeftTopP, connectedRightTopP,
                                             startF, endF, startE, endE, countF, 0, 1,
                                             &ge, &lt);
-        wordNextOrient = getOrientWordModel(sentence, wordType,
+        wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
                                             connectedLeftTopN, connectedRightTopN,
                                             endF, startF, endE, startE, 0, countF, -1,
                                             &lt, &ge);
       }
-      if (phraseModel) {
-        phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
+      if (m_options.isPhraseModel()) {
+        phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
                                                 connectedLeftTopP, connectedRightTopP,
                                                 startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
-        phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
+        phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
                                                 connectedLeftTopN, connectedRightTopN,
                                                 endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
       } else {
         phrasePrevOrient = phraseNextOrient = UNKNOWN;
       }
-      if(hierModel) {
-        hierPrevOrient = getOrientHierModel(sentence, hierType,
+      if(m_options.isHierModel()) {
+        hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
                                             connectedLeftTopP, connectedRightTopP,
                                             startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
-        hierNextOrient = getOrientHierModel(sentence, hierType,
+        hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
                                             connectedLeftTopN, connectedRightTopN,
                                             endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
       }
 
-      orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
-                        ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
-                        ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
+      orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
+                        ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
+                        ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
 
       addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
     }
@@ -617,95 +657,141 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
   return "";
 }
 
-void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
+void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
 {
   // source
-  // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+  //   // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+  	ostringstream outextractstr;
+  	ostringstream outextractstrInv;
+  	ostringstream outextractstrOrientation;
+  	ostringstream outextractstrSentenceId;
 
-  if (onlyOutputSpanInfo) {
+  if (m_options.isOnlyOutputSpanInfo()) {
     cout << startF << " " << endF << " " << startE << " " << endE << endl;
     return;
   }
 
-  for(int fi=startF; fi<=endF; fi++) {
-    if (translationFlag) extractFile << sentence.source[fi] << " ";
-    if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
-    if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
+for(int fi=startF; fi<=endF; fi++) {
+    if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
+    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
+    if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.source[fi] << " ";
   }
-  if (translationFlag) extractFile << "||| ";
-  if (orientationFlag) extractFileOrientation << "||| ";
-  if (sentenceIdFlag) extractFileSentenceId << "||| ";
+  if (m_options.isTranslationFlag()) outextractstr << "||| ";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
 
   // target
   for(int ei=startE; ei<=endE; ei++) {
-    if (translationFlag) extractFile << sentence.target[ei] << " ";
-    if (translationFlag) extractFileInv << sentence.target[ei] << " ";
-    if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
-    if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
+    if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
+    if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
+    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
+    if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.target[ei] << " ";
   }
-  if (translationFlag) extractFile << "|||";
-  if (translationFlag) extractFileInv << "||| ";
-  if (orientationFlag) extractFileOrientation << "||| ";
-  if (sentenceIdFlag) extractFileSentenceId << "||| ";
+  if (m_options.isTranslationFlag()) outextractstr << "|||";
+  if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
 
   // source (for inverse)
-  if (translationFlag) {
+
+ if (m_options.isTranslationFlag()) {
     for(int fi=startF; fi<=endF; fi++)
-      extractFileInv << sentence.source[fi] << " ";
-    extractFileInv << "|||";
+      outextractstrInv << sentence.source[fi] << " ";
+    outextractstrInv << "|||";
   }
-
   // alignment
-  if (translationFlag) {
+ if (m_options.isTranslationFlag()) {
     for(int ei=startE; ei<=endE; ei++) {
-      for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
+      for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
         int fi = sentence.alignedToT[ei][i];
-        extractFile << " " << fi-startF << "-" << ei-startE;
-        extractFileInv << " " << ei-startE << "-" << fi-startF;
+        outextractstr << " " << fi-startF << "-" << ei-startE;
+        outextractstrInv << " " << ei-startE << "-" << fi-startF;
       }
     }
   }
 
-  if (orientationFlag)
-    extractFileOrientation << orientationInfo;
+  if (m_options.isOrientationFlag())
+    outextractstrOrientation << orientationInfo;
 
-  if (sentenceIdFlag) {
-    extractFileSentenceId << sentence.sentenceID;
+  if (m_options.isSentenceIdFlag()) {
+    outextractstrSentenceId << sentence.sentenceID;
+  }
+  if (m_options.isIncludeSentenceIdFlag()) {
+    outextractstr << " ||| " << sentence.sentenceID;
   }
 
-  if (translationFlag) extractFile << "\n";
-  if (translationFlag) extractFileInv << "\n";
-  if (orientationFlag) extractFileOrientation << "\n";
-  if (sentenceIdFlag) extractFileSentenceId << "\n";
+  if (m_options.isTranslationFlag()) outextractstr << "\n";
+  if (m_options.isTranslationFlag()) outextractstrInv << "\n";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "\n";
+
+
+    m_extractedPhrases.push_back(outextractstr.str());
+    m_extractedPhrasesInv.push_back(outextractstrInv.str());
+    m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
+    m_extractedPhrasesSid.push_back(outextractstrSentenceId.str());
+}
+
+
+void ExtractTask::writePhrasesToFile(){ // concatenates each phrase buffer and writes it to its output stream
+
+    ostringstream outextractFile; // one local accumulator per output; each is written in a single insertion below
+    ostringstream outextractFileInv;
+    ostringstream outextractFileOrientation;
+    ostringstream outextractFileSentenceId;
+
+    for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){
+        outextractFile<<phrase->data(); // NOTE(review): inserted as a C string (stops at the first NUL); `*phrase` would insert the std::string itself
+    }
+    for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){
+        outextractFileInv<<phrase->data();
+    }
+    for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){
+        outextractFileOrientation<<phrase->data();
+    }
+    for(vector<string>::const_iterator phrase=m_extractedPhrasesSid.begin();phrase!=m_extractedPhrasesSid.end();phrase++){
+        outextractFileSentenceId<<phrase->data();
+    }
+
+      m_extractFile << outextractFile.str(); // no option flag is checked here; addPhrase() applies the flags while building the lines
+      m_extractFileInv  << outextractFileInv.str();
+      m_extractFileOrientation << outextractFileOrientation.str();
+      m_extractFileSentenceId << outextractFileSentenceId.str();
 }
 
 // if proper conditioning, we need the number of times a source phrase occured
-void extractBase( SentenceAlignment &sentence )
+
+void ExtractTask::extractBase( SentenceAlignment &sentence )
 {
+    ostringstream outextractFile;
+    ostringstream outextractFileInv;
+
   int countF = sentence.source.size();
   for(int startF=0; startF<countF; startF++) {
     for(int endF=startF;
-        (endF<countF && endF<startF+maxPhraseLength);
+        (endF<countF && endF<startF+m_options.maxPhraseLength);
         endF++) {
       for(int fi=startF; fi<=endF; fi++) {
-        extractFile << sentence.source[fi] << " ";
-      }
-      extractFile << "|||" << endl;
+         outextractFile << sentence.source[fi] << " ";
+	}
+      outextractFile << "|||" << endl;
     }
   }
 
   int countE = sentence.target.size();
   for(int startE=0; startE<countE; startE++) {
     for(int endE=startE;
-        (endE<countE && endE<startE+maxPhraseLength);
+        (endE<countE && endE<startE+m_options.maxPhraseLength);
         endE++) {
       for(int ei=startE; ei<=endE; ei++) {
-        extractFileInv << sentence.target[ei] << " ";
+        outextractFileInv << sentence.target[ei] << " ";
       }
-      extractFileInv << "|||" << endl;
+      outextractFileInv << "|||" << endl;
     }
   }
-}
+    m_extractFile << outextractFile.str();
+    m_extractFileInv << outextractFileInv.str();
 
 }
 
+}
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index e3025ac08..f764beef7 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -30,6 +30,7 @@
 
 #include "SafeGetline.h"
 #include "tables-core.h"
+#include "domain.h"
 #include "PhraseAlignment.h"
 #include "score.h"
 #include "InputFileStream.h"
@@ -54,13 +55,22 @@ bool kneserNeyFlag = false;
 #define COC_MAX 10
 bool logProbFlag = false;
 int negLogProb = 1;
+inline float maybeLogProb( float a ) { return logProbFlag ? negLogProb*log(a) : a; } // negLogProb*log(a) when logProbFlag is set, otherwise a unchanged
 bool lexFlag = true;
 bool unalignedFlag = false;
 bool unalignedFWFlag = false;
 bool outputNTLengths = false;
+bool singletonFeature = false;
+bool crossedNonTerm = false;
 int countOfCounts[COC_MAX+1];
 int totalDistinct = 0;
 float minCountHierarchical = 0;
+bool domainFlag = false;
+bool domainRatioFlag = false;
+bool domainSubsetFlag = false;
+bool domainSparseFlag = false;
+Domain *domain;
+bool includeSentenceIdFlag = false;
 
 Vocabulary vcbT;
 Vocabulary vcbS;
@@ -70,14 +80,14 @@ Vocabulary vcbS;
 vector<string> tokenize( const char [] );
 
 void writeCountOfCounts( const string &fileNameCountOfCounts );
-void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
-double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton);
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton );
+double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
+double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
 set<string> functionWordList;
-void loadFunctionWords( const char* fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+void loadFunctionWords( const string &fileNameFunctionWords );
+double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
 void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
                       , map<size_t, map<size_t, float> > &sourceProb
                       , map<size_t, map<size_t, float> > &targetProb);
@@ -90,14 +100,15 @@ int main(int argc, char* argv[])
        << "scoring methods for extracted rules\n";
 
   if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS]\n";
+    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]] [--Singleton] [--CrossedNonTerm] \n";
     exit(1);
   }
-  char* fileNameExtract = argv[1];
-  char* fileNameLex = argv[2];
-  char* fileNamePhraseTable = argv[3];
+  string fileNameExtract = argv[1];
+  string fileNameLex = argv[2];
+  string fileNamePhraseTable = argv[3];
   string fileNameCountOfCounts;
-  char* fileNameFunctionWords;
+  char* fileNameFunctionWords = NULL;
+  char* fileNameDomain = NULL;
 
   for(int i=4; i<argc; i++) {
     if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
@@ -140,6 +151,22 @@ int main(int argc, char* argv[])
       }
       fileNameFunctionWords = argv[++i];
       cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
+    } else if (strcmp(argv[i],"--SparseDomainIndicator") == 0 ||
+               strcmp(argv[i],"--SparseDomainRatio") == 0 ||
+               strcmp(argv[i],"--SparseDomainSubset") == 0 ||
+               strcmp(argv[i],"--DomainIndicator") == 0 ||
+               strcmp(argv[i],"--DomainRatio") == 0 ||
+               strcmp(argv[i],"--DomainSubset") == 0) {
+      includeSentenceIdFlag = true;
+      domainFlag = true;
+      domainSparseFlag = strstr( argv[i], "Sparse" );
+      domainRatioFlag = strstr( argv[i], "Ratio" );
+      domainSubsetFlag = strstr( argv[i], "Subset" );
+      if (i+1==argc) {
+        cerr << "ERROR: specify domain info file with " << argv[i] << endl;
+        exit(1);
+      }
+      fileNameDomain = argv[++i];
     } else if (strcmp(argv[i],"--LogProb") == 0) {
       logProbFlag = true;
       cerr << "using log-probabilities\n";
@@ -153,6 +180,12 @@ int main(int argc, char* argv[])
       minCountHierarchical -= 0.00001; // account for rounding
     } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
       outputNTLengths = true;
+    } else if (strcmp(argv[i],"--Singleton") == 0) {
+      singletonFeature = true;
+      cerr << "binary singleton feature\n";
+    } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
+      crossedNonTerm = true;
+      cerr << "crossed non-term reordering feature\n";
     } else {
       cerr << "ERROR: unknown option " << argv[i] << endl;
       exit(1);
@@ -167,6 +200,18 @@ int main(int argc, char* argv[])
   if (unalignedFWFlag)
     loadFunctionWords( fileNameFunctionWords );
 
+  // load domain information
+  if (domainFlag) {
+    if (inverseFlag) {
+      domainFlag = false;
+      includeSentenceIdFlag = false;
+    }
+    else {
+      domain = new Domain;
+      domain->load( fileNameDomain );
+    }
+  }
+
   // compute count of counts for Good Turing discounting
   if (goodTuringFlag || kneserNeyFlag) {
     for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
@@ -184,7 +229,7 @@ int main(int argc, char* argv[])
   // output file: phrase translation table
 	ostream *phraseTableFile;
 
-	if (strcmp(fileNamePhraseTable, "-") == 0) {
+	if (fileNamePhraseTable == "-") {
 		phraseTableFile = &cout;
 	}
 	else {
@@ -202,6 +247,7 @@ int main(int argc, char* argv[])
   float lastCount = 0.0f;
   float lastPcfgSum = 0.0f;
   vector< PhraseAlignment > phrasePairsWithSameF;
+  bool isSingleton = true;
   int i=0;
   char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
   lastLine[0] = '\0';
@@ -222,30 +268,40 @@ int main(int argc, char* argv[])
 
     // create new phrase pair
     PhraseAlignment phrasePair;
-    phrasePair.create( line, i );
+    phrasePair.create( line, i, includeSentenceIdFlag );
     lastCount = phrasePair.count;
     lastPcfgSum = phrasePair.pcfgSum;
 
     // only differs in count? just add count
-    if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
+    if (lastPhrasePair != NULL 
+	&& lastPhrasePair->equals( phrasePair )
+	&& (!domainFlag
+	    || domain->getDomainOfSentence( lastPhrasePair->sentenceId )
+	    == domain->getDomainOfSentence( phrasePair.sentenceId ) )) {
       lastPhrasePair->count += phrasePair.count;
       lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
       continue;
     }
-
+    
     // if new source phrase, process last batch
     if (lastPhrasePair != NULL &&
         lastPhrasePair->GetSource() != phrasePair.GetSource()) {
-      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
+      
       phrasePairsWithSameF.clear();
+      isSingleton = false;
       lastPhrasePair = NULL;
     }
+    else
+    {
+      isSingleton = true;
+    }
 
     // add phrase pairs to list, it's now the last one
     phrasePairsWithSameF.push_back( phrasePair );
     lastPhrasePair = &phrasePairsWithSameF.back();
   }
-  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
 	
 	phraseTableFile->flush();
 	if (phraseTableFile != &cout) {
@@ -279,7 +335,7 @@ void writeCountOfCounts( const string &fileNameCountOfCounts )
 	countOfCountsFile.Close();
 }
 
-void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
+void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile, bool isSingleton )
 {
   if (phrasePair.size() == 0) return;
 
@@ -320,16 +376,15 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
   for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) 
   {
     const PhraseAlignmentCollection &group = **iter;
-    outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile );
-
+    outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile, isSingleton );
   }
   
 }
 
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
 {
   float bestAlignmentCount = -1;
-  PhraseAlignment* bestAlignment;
+  PhraseAlignment* bestAlignment = NULL;
   
   for(size_t i=0; i<phrasePair.size(); i++) {
     size_t alignInd;
@@ -347,7 +402,7 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
     }
   }    
 
-  return bestAlignment;
+  return *bestAlignment;
 }
 
 
@@ -438,11 +493,65 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t,
 
 }
 
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
+bool calcCrossedNonTerm(int sourcePos, int targetPos, const std::vector< std::set<size_t> > &alignedToS) // true if an alignment point at another source index crosses (sourcePos, targetPos)
+{
+  for (int currSource = 0; currSource < alignedToS.size(); ++currSource) // NOTE(review): int index compared against a size_t bound
+  {
+    if (currSource == sourcePos)
+    { // skip
+    }
+    else 
+    {
+      const std::set<size_t> &targetSet = alignedToS[currSource]; // set stored for this source index; its elements are compared with targetPos below
+      std::set<size_t>::const_iterator iter;
+      for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+      {
+        size_t currTarget = *iter;
+        
+        if ((currSource < sourcePos && currTarget > targetPos) // lower source index paired with a higher target index (NOTE(review): size_t vs int comparison)
+            || (currSource > sourcePos && currTarget < targetPos) // higher source index paired with a lower target index
+          )
+        {
+          return true;
+        }
+      }
+      
+    }
+  }
+  
+  return false; // no crossing point found
+}
+
+int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment) // 1 if the alignment point of any source non-terminal is crossed by another point, else 0
+{
+  const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
+  
+  for (int sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos)
+  {
+    const std::set<size_t> &targetSet = alignedToS[sourcePos];
+    
+    WORD_ID wordId = phraseS[sourcePos]; // assumes phraseS has an entry for every index of alignedToS -- TODO confirm
+    const WORD &word = vcbS.getWord(wordId);
+    bool isNonTerm = isNonTerminal(word);
+    
+    if (isNonTerm)
+    {
+      assert(targetSet.size() == 1); // NOTE(review): with NDEBUG this check is compiled out and begin() below is dereferenced regardless
+      int targetPos = *targetSet.begin();
+      bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS); // bool overload: scans the other source indices for a crossing point
+      if (ret)
+        return 1;
+    }
+  }
+  
+  return 0;
+}
+
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton )
 {
   if (phrasePair.size() == 0) return;
 
-  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
+  const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
     
   // compute count
   float count = 0;
@@ -450,6 +559,18 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
     count += phrasePair[i]->count;
   }
 
+  // compute domain counts
+  map< string, float > domainCount;
+  if (domainFlag) {
+    for(size_t i=0; i<phrasePair.size(); i++) {
+      string d = domain->getDomainOfSentence( phrasePair[i]->sentenceId );
+      if (domainCount.find( d ) == domainCount.end())
+        domainCount[ d ] = phrasePair[i]->count;
+      else
+        domainCount[ d ] += phrasePair[i]->count;
+    }
+  }
+
   // collect count of count statistics
   if (goodTuringFlag || kneserNeyFlag) {
     totalDistinct++;
@@ -459,7 +580,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   }
 
   // compute PCFG score
-  float pcfgScore;
+  float pcfgScore = 0;
   if (pcfgFlag && !inverseFlag) {
     float pcfgSum = 0;
     for(size_t i=0; i<phrasePair.size(); ++i) {
@@ -482,41 +603,109 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
 
   // source phrase (unless inverse)
   if (! inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
     phraseTableFile << " ||| ";
   }
 
   // target phrase
-  printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+  printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
   phraseTableFile << " ||| ";
 
   // source phrase (if inverse)
   if (inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
     phraseTableFile << " ||| ";
   }
 
   // lexical translation probability
   if (lexFlag) {
     double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
-    phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
+    phraseTableFile << maybeLogProb( lexScore );
   }
 
   // unaligned word penalty
   if (unalignedFlag) {
     double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
-    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+    phraseTableFile << " " << maybeLogProb( penalty );
   }
 
   // unaligned function word penalty
   if (unalignedFWFlag) {
     double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
-    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+    phraseTableFile << " " << maybeLogProb( penalty );
   }
 
+  if (singletonFeature) {
+    phraseTableFile << " " << (isSingleton ? 1 : 0);
+  }
+  
+  if (crossedNonTerm && !inverseFlag) {
+    phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
+  }
+  
   // target-side PCFG score
   if (pcfgFlag && !inverseFlag) {
-    phraseTableFile << " " << pcfgScore;
+    phraseTableFile << " " << maybeLogProb( pcfgScore );
+  }
+
+  // domain count features
+  if (domainFlag) {
+    if (domainSparseFlag) {
+      // sparse, subset
+      if (domainSubsetFlag) {
+        typedef vector< string >::const_iterator I;
+        phraseTableFile << " doms";
+        for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
+          if (domainCount.find( *i ) != domainCount.end() ) {
+            phraseTableFile << "_" << *i;
+          }
+        }
+        phraseTableFile << " 1";
+      }
+      // sparse, indicator or ratio
+      else {
+        typedef map< string, float >::const_iterator I;
+        for (I i=domainCount.begin(); i != domainCount.end(); i++) {
+          if (domainRatioFlag) {
+            phraseTableFile << " domr_" << i->first << " " << (i->second / count);
+          }
+          else {
+            phraseTableFile << " dom_" << i->first << " 1";
+          }
+        }
+      }
+    }
+    // core, subset
+    else if (domainSubsetFlag) {
+      if (domain->list.size() > 6) {
+        cerr << "ERROR: too many domains for core domain subset features\n";
+        exit(1);
+      }
+      size_t bitmap = 0;
+      for(size_t bit = 0; bit < domain->list.size(); bit++) {
+        if (domainCount.find( domain->list[ bit ] ) != domainCount.end()) {
+          bitmap += 1 << bit;
+        }
+      }
+      for(size_t i = 1; i < (1 << domain->list.size()); i++) {
+        phraseTableFile << " " << maybeLogProb( (bitmap == i) ? 2.718 : 1 );
+      }
+    }
+    // core, indicator or ratio
+    else {
+      typedef vector< string >::const_iterator I;
+      for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
+        if (domainCount.find( *i ) == domainCount.end() ) {
+          phraseTableFile << " " << maybeLogProb( 1 );
+        }
+        else if (domainRatioFlag) {
+          phraseTableFile << " " << maybeLogProb( exp( domainCount[ *i ] / count ) );
+        }
+        else {
+          phraseTableFile << " " << maybeLogProb( 2.718 );
+        }
+      }
+    }
   }
 
   phraseTableFile << " ||| ";
@@ -526,41 +715,40 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
     if (hierarchicalFlag) {
       // always output alignment if hiero style, but only for non-terms 
       // (eh: output all alignments, needed for some feature functions) 
-      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
+      assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
       std::vector<std::string> alignment;
       for(size_t j = 0; j < phraseT.size() - 1; j++) {
         if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
-          if (bestAlignment->alignedToT[ j ].size() != 1) {
+          if (bestAlignment.alignedToT[ j ].size() != 1) {
             cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
             phraseTableFile.flush();
-            assert(bestAlignment->alignedToT[ j ].size() == 1);
+            assert(bestAlignment.alignedToT[ j ].size() == 1);
           }
-          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
+          int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
           //phraseTableFile << sourcePos << "-" << j << " ";
-	  std::stringstream point;
-	  point << sourcePos << "-" << j;
-	  alignment.push_back(point.str());
-	}
-	else {
+          std::stringstream point;
+          point << sourcePos << "-" << j;
+          alignment.push_back(point.str());
+        } else {
           set<size_t>::iterator setIter;
-          for(setIter = (bestAlignment->alignedToT[j]).begin(); setIter != (bestAlignment->alignedToT[j]).end(); setIter++) {
+          for(setIter = (bestAlignment.alignedToT[j]).begin(); setIter != (bestAlignment.alignedToT[j]).end(); setIter++) {
             int sourcePos = *setIter;
             //phraseTableFile << sourcePos << "-" << j << " ";
-	    std::stringstream point;
-	    point << sourcePos << "-" << j;
-	    alignment.push_back(point.str());
-          }
-        }
-      }
-      // now print all alignments, sorted by source index
-      sort(alignment.begin(), alignment.end());
-      for (size_t i = 0; i < alignment.size(); ++i) {
-	phraseTableFile << alignment[i] << " ";
-      }
-    } else if (wordAlignmentFlag) {
+            std::stringstream point;
+            point << sourcePos << "-" << j;
+            alignment.push_back(point.str());
+           }
+         }
+       }
+       // now print all alignments, sorted by source index
+       sort(alignment.begin(), alignment.end());
+       for (size_t i = 0; i < alignment.size(); ++i) {
+          phraseTableFile << alignment[i] << " ";
+       }
+     } else if (wordAlignmentFlag) {
       // alignment info in pb model
-      for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
-        const set< size_t > &aligned = bestAlignment->alignedToT[j];
+      for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
+        const set< size_t > &aligned = bestAlignment.alignedToT[j];
         for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
           phraseTableFile << *p << "-" << j << " ";
         }
@@ -568,6 +756,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
     }
   }
 
+
   // counts
   
   phraseTableFile << " ||| " << totalCount << " " << count;
@@ -594,13 +783,13 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   phraseTableFile << endl;
 }
 
-double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
   // unaligned word counter
   double unaligned = 1.0;
   // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
     if (srcIndices.empty()) {
       unaligned *= 2.718;
     }
@@ -608,13 +797,13 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, Ph
   return unaligned;
 }
 
-double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
   // unaligned word counter
   double unaligned = 1.0;
   // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
     if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
       unaligned *= 2.718;
     }
@@ -622,11 +811,11 @@ double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT,
   return unaligned;
 }
 
-void loadFunctionWords( const char *fileName )
+void loadFunctionWords( const string &fileName )
 {
   cerr << "Loading function word list from " << fileName;
   ifstream inFile;
-  inFile.open(fileName);
+  inFile.open(fileName.c_str());
   if (inFile.fail()) {
     cerr << " - ERROR: could not open file\n";
     exit(1);
@@ -647,14 +836,14 @@ void loadFunctionWords( const char *fileName )
   inFile.close();
 }
 
-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
   // lexical translation probability
   double lexScore = 1.0;
   int null = vcbS.getWordID("NULL");
   // all target words have to be explained
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
     if (srcIndices.empty()) {
       // explain unaligned word by NULL
       lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );
@@ -670,11 +859,11 @@ double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT,
   return lexScore;
 }
 
-void LexicalTable::load( char *fileName )
+void LexicalTable::load( const string &fileName )
 {
   cerr << "Loading lexical translation table from " << fileName;
   ifstream inFile;
-  inFile.open(fileName);
+  inFile.open(fileName.c_str());
   if (inFile.fail()) {
     cerr << " - ERROR: could not open file\n";
     exit(1);
diff --git a/phrase-extract/score.h b/phrase-extract/score.h
index ed9adc18c..f720a32d2 100644
--- a/phrase-extract/score.h
+++ b/phrase-extract/score.h
@@ -65,7 +65,7 @@ class LexicalTable
 {
 public:
   std::map< WORD_ID, std::map< WORD_ID, double > > ltable;
-  void load( char[] );
+  void load( const std::string &filePath );
   double permissiveLookup( WORD_ID wordS, WORD_ID wordT ) {
     // cout << endl << vcbS.getWord( wordS ) << "-" << vcbT.getWord( wordT ) << ":";
     if (ltable.find( wordS ) == ltable.end()) return 1.0;
diff --git a/phrase-extract/statistics.cpp b/phrase-extract/statistics.cpp
index d39a05d3b..67373ec93 100644
--- a/phrase-extract/statistics.cpp
+++ b/phrase-extract/statistics.cpp
@@ -40,7 +40,7 @@ class LexicalTable
 {
 public:
   map< WORD_ID, map< WORD_ID, double > > ltable;
-  void load( char[] );
+  void load( const string &);
 };
 
 }
@@ -310,11 +310,11 @@ bool PhraseAlignment::equals( const PhraseAlignment& other )
   return true;
 }
 
-void LexicalTable::load( char *fileName )
+void LexicalTable::load( const string &filePath )
 {
-  cerr << "Loading lexical translation table from " << fileName;
+  cerr << "Loading lexical translation table from " << filePath;
   ifstream inFile;
-  inFile.open(fileName);
+  inFile.open(filePath.c_str());
   if (inFile.fail()) {
     cerr << " - ERROR: could not open file\n";
     exit(1);
@@ -332,7 +332,7 @@ void LexicalTable::load( char *fileName )
 
     vector<string> token = tokenize( line );
     if (token.size() != 3) {
-      cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
+      cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
            token.size() << " " << token[0] << " " << line << endl;
       continue;
     }
author	Barry Haddow <barry.haddow@gmail.com>	2012-09-27 01:49:33 +0400
committer	Barry Haddow <barry.haddow@gmail.com>	2012-09-27 01:49:33 +0400
commit	0a950ee9f4227c8afbbe58d03a854745479ffbc0 (patch)
tree	3e4515adc6b3323f8742ff5addde2f29da2002c8 /phrase-extract
parent	1ce788e2b83dc9b359f6132e7e82774f9d0777b1 (diff)
parent	ab60d1ad6f93a78e80e665bc6c7d32b61b7c1c52 (diff)