a lot of changes

author: phikoehn <pkoehn@inf.ed.ac.uk> 2012-08-19 02:48:26 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2012-08-19 02:48:26 +0400
commit: 4a1a995878ed069dd4d77e0ac6c1727dc223ebe6 (patch)
tree: 6f774d4ddc8c6a6fc3d5b5b619c356a833b03c90 /phrase-extract
parent: 366ab93f8aa53b7b065fe8366201bd59dafc51ba (diff)
parent: b317522563feb4ca7ff978a0de661ec2189934ea (diff)
7 files changed, 518 insertions, 257 deletions
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
index 50f03a739..e4f801089 100644
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@@ -11,9 +11,9 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
 alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
 alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
 
-exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
 
-exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
+exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
 
 exe extract-lex : extract-lex.cpp InputFileStream ;
 
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h
new file mode 100644
index 000000000..d541144b7
--- /dev/null
+++ b/phrase-extract/PhraseExtractionOptions.h
@@ -0,0 +1,146 @@
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
+
+#pragma once
+#ifndef PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+#define PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+
+namespace MosesTraining
+{
+enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO}; // reordering model variants; the --model parsing in extract.cpp maps "msd", "mslr" and "mono"/"monotonicity" to these
+enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN}; // orientation value; return type of the getOrient*Model() functions declared in extract.cpp
+
+
+class PhraseExtractionOptions {
+  // Option set for phrase extraction: a fixed maximum phrase length plus flags that start at the constructor defaults and are changed through the init*() setters.
+ public: 
+     const int maxPhraseLength; // fixed at construction; public and const, so it has no setter or getter
+ private:
+  bool allModelsOutputFlag; // default false
+  bool wordModel; // default false; extract.cpp sets it for "--model wbe-..."
+  REO_MODEL_TYPE wordType; // type used with wordModel; default REO_MSD
+  bool phraseModel; // default false; extract.cpp sets it for "--model phrase-..."
+  REO_MODEL_TYPE phraseType; // type used with phraseModel; default REO_MSD
+  bool hierModel; // default false; presumably set for "--model hier-..." (that branch is not visible here, confirm in extract.cpp)
+  REO_MODEL_TYPE hierType; // type used with hierModel; default REO_MSD
+  bool orientationFlag; // default false; extract.cpp sets it for "orientation" or "--Orientation"
+  bool translationFlag; // default true; extract.cpp clears it for "--NoTTable"
+  bool sentenceIdFlag; // create extract file with sentence id; default false, set for "--SentenceId"
+  bool onlyOutputSpanInfo; // default false; extract.cpp sets it for "--OnlyOutputSpanInfo"
+  bool gzOutput; // default false; extract.cpp sets it for "--GZOutput"
+
+public:  
+  PhraseExtractionOptions(const int initmaxPhraseLength): // stores initmaxPhraseLength in maxPhraseLength; every other option gets the default listed below
+            maxPhraseLength(initmaxPhraseLength),
+            allModelsOutputFlag(false),
+            wordModel(false),
+            wordType(REO_MSD),
+            phraseModel(false),
+            phraseType(REO_MSD),
+            hierModel(false),
+            hierType(REO_MSD),
+            orientationFlag(false),
+            translationFlag(true),
+            sentenceIdFlag(false),
+            onlyOutputSpanInfo(false),
+            gzOutput(false){}
+
+
+ 
+    // setters: one per option, each a plain assignment of its argument to the corresponding member
+    void initAllModelsOutputFlag(const bool initallModelsOutputFlag){
+        allModelsOutputFlag=initallModelsOutputFlag;
+    }
+    void initWordModel(const bool initwordModel){
+        wordModel=initwordModel;
+    }
+    void initWordType(REO_MODEL_TYPE initwordType ){
+        wordType=initwordType; 
+    } 
+    void initPhraseModel(const bool initphraseModel ){
+        phraseModel=initphraseModel;  
+    } 
+    void initPhraseType(REO_MODEL_TYPE initphraseType){
+        phraseType=initphraseType;
+    }  
+    void initHierModel(const bool inithierModel){
+        hierModel=inithierModel;
+    }
+    void initHierType(REO_MODEL_TYPE inithierType){
+        hierType=inithierType;
+    }
+    void initOrientationFlag(const bool initorientationFlag){
+        orientationFlag=initorientationFlag;
+    }
+    void initTranslationFlag(const bool inittranslationFlag){
+        translationFlag=inittranslationFlag;
+    }
+    void initSentenceIdFlag(const bool initsentenceIdFlag){
+        sentenceIdFlag=initsentenceIdFlag;
+    }
+    void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo){
+        onlyOutputSpanInfo= initonlyOutputSpanInfo;
+    } 
+    void initGzOutput (const bool initgzOutput){
+        gzOutput= initgzOutput;
+    } 
+    // getters: const accessors returning the current value of each option; note that the is*Type() ones return a REO_MODEL_TYPE, not a bool
+    bool isAllModelsOutputFlag() const {
+        return allModelsOutputFlag;
+    }
+    bool isWordModel() const {
+        return wordModel;
+    }
+    REO_MODEL_TYPE isWordType() const {
+        return wordType; 
+    } 
+    bool isPhraseModel() const {
+        return phraseModel;  
+    } 
+    REO_MODEL_TYPE isPhraseType() const {
+        return phraseType;
+    }  
+    bool isHierModel() const {
+        return hierModel; 
+    }
+    REO_MODEL_TYPE isHierType() const {
+        return hierType;
+    }
+    bool isOrientationFlag() const {
+        return orientationFlag;
+    }
+    bool isTranslationFlag() const {
+        return translationFlag;
+    }
+    bool isSentenceIdFlag() const {
+        return sentenceIdFlag;
+    }
+    bool isOnlyOutputSpanInfo() const {
+        return onlyOutputSpanInfo;
+    } 
+    bool isGzOutput () const {
+        return gzOutput;
+   } 
+};
+
+}
+
+#endif
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index 0abf548c3..252547557 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -46,8 +46,6 @@
 #include "XmlTree.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
-#include "../moses/src/ThreadPool.h"
-#include "../moses/src/OutputCollector.h"
 
 #define LINE_MAX_LENGTH 500000
 
@@ -57,55 +55,53 @@ using namespace MosesTraining;
 typedef vector< int > LabelIndex;
 typedef map< int, int > WordIndex;
 
-class ExtractTask : public Moses::Task {
+class ExtractTask // rule extraction job bound to one SentenceAlignmentWithSyntax; this version no longer derives from Moses::Task
+{
 private:
-  size_t m_id;
-  SentenceAlignmentWithSyntax *m_sentence;
-  RuleExtractionOptions &m_options;
-  Moses::OutputCollector* m_extractCollector;
-  Moses::OutputCollector* m_extractCollectorInv;
+  SentenceAlignmentWithSyntax &m_sentence; // held by reference, not owned (the removed destructor used to delete the old pointer member)
+  const RuleExtractionOptions &m_options; // read-only options
+  Moses::OutputFileStream& m_extractFile; // written directly by writeRulesToFile()
+  Moses::OutputFileStream& m_extractFileInv; // written directly by writeRulesToFile()
+
+  vector< ExtractedRule > m_extractedRules; // rule buffer of this task
+  
+  // main functions
+  void extractRules();
+  void addRuleToCollection(ExtractedRule &rule);
+  void consolidateRules();
+  void writeRulesToFile();
+  
+  // subs
+  void addRule( int, int, int, int, RuleExist &ruleExist);
+  void addHieroRule( int startT, int endT, int startS, int endS
+                    , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
+  void printHieroPhrase( int startT, int endT, int startS, int endS
+                        , HoleCollection &holeColl, LabelIndex &labelIndex);
+  string printTargetHieroPhrase(  int startT, int endT, int startS, int endS
+                                , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
+  string printSourceHieroPhrase( int startT, int endT, int startS, int endS
+                                , HoleCollection &holeColl, const LabelIndex &labelIndex);
+  void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
+                                   , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
+  void printHieroAlignment(  int startT, int endT, int startS, int endS
+                           , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
+  void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
+  
+  inline string IntToString( int i ) // text form of i, produced by streaming it into a stringstream
+  {
+    stringstream out;
+    out << i;
+    return out.str();
+  }
 
 public:
-  ExtractTask(size_t id, SentenceAlignmentWithSyntax *sentence, RuleExtractionOptions &options, Moses::OutputCollector* extractCollector, Moses::OutputCollector* extractCollectorInv):
-    m_id(id),
+  ExtractTask(SentenceAlignmentWithSyntax &sentence, const RuleExtractionOptions &options, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv): // binds the four reference members; all arguments must outlive the task
     m_sentence(sentence),
     m_options(options),
-    m_extractCollector(extractCollector),
-    m_extractCollectorInv(extractCollectorInv) {}
-  ~ExtractTask() { delete m_sentence; }
+    m_extractFile(extractFile),
+    m_extractFileInv(extractFileInv) {}
   void Run();
 
-private:
-vector< ExtractedRule > m_extractedRules;
-
-// main functions
-void extractRules();
-void addRuleToCollection(ExtractedRule &rule);
-void consolidateRules();
-void writeRulesToFile();
-
-// subs
-void addRule( int, int, int, int, RuleExist &ruleExist);
-void addHieroRule( int startT, int endT, int startS, int endS
-                   , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
-void printHieroPhrase( int startT, int endT, int startS, int endS
-                       , HoleCollection &holeColl, LabelIndex &labelIndex);
-string printTargetHieroPhrase(  int startT, int endT, int startS, int endS
-                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
-string printSourceHieroPhrase( int startT, int endT, int startS, int endS
-                               , HoleCollection &holeColl, const LabelIndex &labelIndex);
-void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
-                                  , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
-void printHieroAlignment(  int startT, int endT, int startS, int endS
-                         , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
-void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
-
-inline string IntToString( int i )
-{
-  stringstream out;
-  out << i;
-  return out.str();
-}
 };
 
 // stats for glue grammar and unknown word label probabilities
@@ -120,16 +116,18 @@ int main(int argc, char* argv[])
        << "rule extraction from an aligned parallel corpus\n";
 
   RuleExtractionOptions options;
+<<<<<<< HEAD
   int sentenceOffset = 0;
 #ifdef WITH_THREADS
   int thread_count = 1;
 #endif
+=======
+
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
   if (argc < 5) {
     cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
-#ifdef WITH_THREADS
-         << " --threads NUM |"
-#endif
-         << " --GlueGrammar FILE"
+
+    << " --GlueGrammar FILE"
          << " | --UnknownWordLabel FILE"
          << " | --OnlyDirect"
          << " | --OutputNTLengths"
@@ -269,6 +267,7 @@ int main(int argc, char* argv[])
       options.unpairedExtractFormat = true;
     } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
       options.conditionOnTargetLhs = true;
+<<<<<<< HEAD
 #ifdef WITH_THREADS
     } else if (strcmp(argv[i],"-threads") == 0 || 
                strcmp(argv[i],"--threads") == 0 ||
@@ -281,6 +280,8 @@ int main(int argc, char* argv[])
         exit(1);
       }
       sentenceOffset = atoi(argv[++i]);
+=======
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
     } else {
       cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
       exit(1);
@@ -306,27 +307,17 @@ int main(int argc, char* argv[])
   if (!options.onlyDirectFlag)
     extractFileInv.Open(fileNameExtractInv.c_str());
 
-  // output into file
-  Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
-  Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);
 
   // stats on labels for glue grammar and unknown word label probabilities
   set< string > targetLabelCollection, sourceLabelCollection;
   map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
 
-#ifdef WITH_THREADS
-  // set up thread pool
-  Moses::ThreadPool pool(thread_count);
-  pool.SetQueueLimit(1000);
-#endif
-
   // loop through all sentence pairs
   size_t i=sentenceOffset;
   while(true) {
     i++;
-    if (i%1000 == 0) cerr << "." << flush;
-    if (i%10000 == 0) cerr << ":" << flush;
-    if (i%100000 == 0) cerr << "!" << flush;
+    if (i%1000 == 0) cerr << i << " " << flush;
+
     char targetString[LINE_MAX_LENGTH];
     char sourceString[LINE_MAX_LENGTH];
     char alignmentString[LINE_MAX_LENGTH];
@@ -335,7 +326,7 @@ int main(int argc, char* argv[])
     SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
     SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
 
-    SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
+    SentenceAlignmentWithSyntax sentence
       (targetLabelCollection, sourceLabelCollection, 
        targetTopLabelCollection, sourceTopLabelCollection, options);
     //az: output src, tgt, and alingment line
@@ -346,32 +337,17 @@ int main(int argc, char* argv[])
       cout << "LOG: PHRASES_BEGIN:" << endl;
     }
 
-    if (sentence->create(targetString, sourceString, alignmentString, i)) {
+    if (sentence.create(targetString, sourceString, alignmentString, i)) {
       if (options.unknownWordLabelFlag) {
-        collectWordLabelCounts(*sentence);
-      }
-      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
-#ifdef WITH_THREADS
-      if (thread_count == 1) {
-        task->Run();
-        delete task;
-      }
-      else {
-        pool.Submit(task);
+        collectWordLabelCounts(sentence);
       }
-#else
+      ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv);
       task->Run();
       delete task;
-#endif
     }
     if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
   }
 
-#ifdef WITH_THREADS
-  // wait for all threads to finish
-  pool.Stop(true);
-#endif
-
   tFile.Close();
   sFile.Close();
   aFile.Close();
@@ -397,8 +373,8 @@ void ExtractTask::Run() {
 
 void ExtractTask::extractRules()
 {
-  int countT = m_sentence->target.size();
-  int countS = m_sentence->source.size();
+  int countT = m_sentence.target.size();
+  int countS = m_sentence.source.size();
 
   // phrase repository for creating hiero phrases
   RuleExist ruleExist(countT);
@@ -413,17 +389,17 @@ void ExtractTask::extractRules()
       int endT = startT + lengthT - 1;
 
       // if there is target side syntax, there has to be a node
-      if (m_options.targetSyntax && !m_sentence->targetTree.HasNode(startT,endT))
+      if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
         continue;
 
       // find find aligned source words
       // first: find minimum and maximum source word
       int minS = 9999;
       int maxS = -1;
-      vector< int > usedS = m_sentence->alignedCountS;
+      vector< int > usedS = m_sentence.alignedCountS;
       for(int ti=startT; ti<=endT; ti++) {
-        for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
-          int si = m_sentence->alignedToT[ti][i];
+        for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+          int si = m_sentence.alignedToT[ti][i];
           if (si<minS) {
             minS = si;
           }
@@ -458,15 +434,15 @@ void ExtractTask::extractRules()
       for(int startS=minS;
           (startS>=0 &&
            startS>maxS - m_options.maxSpan && // within length limit
-           (startS==minS || m_sentence->alignedCountS[startS]==0)); // unaligned
+           (startS==minS || m_sentence.alignedCountS[startS]==0)); // unaligned
           startS--) {
         // end point of source phrase may advance over unaligned
         for(int endS=maxS;
             (endS<countS && endS<startS + m_options.maxSpan && // within length limit
-             (endS==maxS || m_sentence->alignedCountS[endS]==0)); // unaligned
+             (endS==maxS || m_sentence.alignedCountS[endS]==0)); // unaligned
             endS++) {
           // if there is source side syntax, there has to be a node
-          if (m_options.sourceSyntax && !m_sentence->sourceTree.HasNode(startS,endS))
+          if (m_options.sourceSyntax && !m_sentence.sourceTree.HasNode(startS,endS))
             continue;
 
           // TODO: loop over all source and target syntax labels
@@ -515,7 +491,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
 
       int labelI = labelIndex[ 2+holeCount+holeTotal ];
       string label = m_options.sourceSyntax ?
-                     m_sentence->sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
+                     m_sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
       hole.SetLabel(label, 0);
 
       currPos = hole.GetEnd(0);
@@ -556,7 +532,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
 
       int labelI = labelIndex[ 2+holeCount ];
       string targetLabel = m_options.targetSyntax ?
-                           m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
+                           m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
       hole.SetLabel(targetLabel, 1);
 
       if (m_options.unpairedExtractFormat) {
@@ -566,7 +542,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
       }
 
       if (m_options.pcfgScore) {
-        double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+        double score = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
         logPCFGScore -= score;
       }
 
@@ -576,7 +552,7 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
       holeCount++;
     } else {
       indexT[currPos] = outPos;
-      out += m_sentence->target[currPos] + " ";
+      out += m_sentence.target[currPos] + " ";
     }
 
     outPos++;
@@ -620,7 +596,7 @@ string ExtractTask::printSourceHieroPhrase( int startT, int endT, int startS, in
       ++iterHoleList;
       ++holeCount;
     } else {
-      out += m_sentence->source[currPos] + " ";
+      out += m_sentence.source[currPos] + " ";
     }
 
     outPos++;
@@ -637,8 +613,8 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
   for(int ti=startT; ti<=endT; ti++) {
     WordIndex::const_iterator p = indexT.find(ti);
     if (p != indexT.end()) { // does word still exist?
-      for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
-        int si = m_sentence->alignedToT[ti][i];
+      for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+        int si = m_sentence.alignedToT[ti][i];
         std::string sourceSymbolIndex = IntToString(indexS.find(si)->second);
         std::string targetSymbolIndex = IntToString(p->second);
         rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -678,16 +654,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
 
   // phrase labels
   string targetLabel = m_options.targetSyntax ?
-                       m_sentence->targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
+                       m_sentence.targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
   string sourceLabel = m_options.sourceSyntax ?
-                       m_sentence->sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
+                       m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
 
   // create non-terms on the source side
   preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
 
   // target
   if (m_options.pcfgScore) {
-    double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+    double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
     rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
                 + " [" + targetLabel + "]";
     rule.pcfgScore = std::exp(logPCFGScore);
@@ -716,19 +692,19 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
   LabelIndex labelIndex,labelCount;
 
   // number of target head labels
-  int numLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(startT,endT).size() : 1;
+  int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
   labelCount.push_back(numLabels);
   labelIndex.push_back(0);
 
   // number of source head labels
-  numLabels =  m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(startS,endS).size() : 1;
+  numLabels =  m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1;
   labelCount.push_back(numLabels);
   labelIndex.push_back(0);
 
   // number of target hole labels
   for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
        hole != holeColl.GetHoles().end(); hole++ ) {
-    int numLabels =  m_options.targetSyntax ? m_sentence->targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
+    int numLabels =  m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
     labelCount.push_back(numLabels);
     labelIndex.push_back(0);
   }
@@ -738,7 +714,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
   for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
        i != holeColl.GetSortedSourceHoles().end(); i++ ) {
     const Hole &hole = **i;
-    int numLabels =  m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
+    int numLabels =  m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
     labelCount.push_back(numLabels);
     labelIndex.push_back(0);
   }
@@ -850,7 +826,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
             }
             // covered by word? check if it is aligned
             else {
-              if (m_sentence->alignedToT[pos].size() > 0)
+              if (m_sentence.alignedToT[pos].size() > 0)
                 foundAlignedWord = true;
             }
           }
@@ -900,36 +876,36 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
   // phrase labels
   string targetLabel,sourceLabel;
   if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
-    sourceLabel = targetLabel = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel();
+    sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
   }
   else {
     sourceLabel = m_options.sourceSyntax ?
-                  m_sentence->sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
+                  m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
     targetLabel = m_options.targetSyntax ?
-                  m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+                  m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
   }
 
   // source
   rule.source = "";
   for(int si=startS; si<=endS; si++)
-    rule.source += m_sentence->source[si] + " ";
+    rule.source += m_sentence.source[si] + " ";
   rule.source += "[" + sourceLabel + "]";
 
   // target
   rule.target = "";
   for(int ti=startT; ti<=endT; ti++)
-    rule.target += m_sentence->target[ti] + " ";
+    rule.target += m_sentence.target[ti] + " ";
   rule.target += "[" + targetLabel + "]";
 
   if (m_options.pcfgScore) {
-    double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+    double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
     rule.pcfgScore = std::exp(logPCFGScore);
   }
 
   // alignment
   for(int ti=startT; ti<=endT; ti++) {
-    for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
-      int si = m_sentence->alignedToT[ti][i];
+    for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
+      int si = m_sentence.alignedToT[ti][i];
       std::string sourceSymbolIndex = IntToString(si-startS);
       std::string targetSymbolIndex = IntToString(ti-startT);
       rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
@@ -1022,8 +998,8 @@ void ExtractTask::writeRulesToFile()
              << rule->count << "\n";
     }
   }
-  m_extractCollector->Write( m_id, out.str() );
-  m_extractCollectorInv->Write( m_id, outInv.str() );;
+  m_extractFile << out.str();
+  m_extractFileInv << outInv.str();
 }
 
 void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
index 89c45a2e6..b6ea97c6e 100644
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@@ -1,6 +1,7 @@
 /*
  * extract.cpp
- *
+ *	Modified by: Rohit Gupta CDAC, Mumbai, India
+ *	on July 15, 2012 to implement parallel processing
  *      Modified by: Nadi Tomeh - LIMSI/CNRS
  *      Machine Translation Marathon 2010, Dublin
  */
@@ -13,7 +14,7 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <cstring>
-
+#include <sstream>
 #include <map>
 #include <set>
 #include <vector>
@@ -23,14 +24,16 @@
 #include "tables-core.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
+#include "PhraseExtractionOptions.h"
 
 using namespace std;
 using namespace MosesTraining;
 
-#define LINE_MAX_LENGTH 500000
+namespace MosesTraining {
+
+
+const long int LINE_MAX_LENGTH = 500000 ;
 
-namespace MosesTraining
-{
 
 // HPhraseVertex represents a point in the alignment matrix
 typedef pair <int, int> HPhraseVertex;
@@ -46,26 +49,24 @@ typedef vector < HPhrase > HPhraseVector;
 // The key of the map is the English index and the value is a set of the source ones
 typedef map <int, set<int> > HSentenceVertices;
 
-enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
-enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
-
-REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+  REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                            int, int, int, int, int, int, int,
                            bool (*)(int, int), bool (*)(int, int));
-REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+  REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                              int, int, int, int, int, int, int,
                              bool (*)(int, int), bool (*)(int, int),
                              const HSentenceVertices &, const HSentenceVertices &);
-REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+  REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                            int, int, int, int, int, int, int,
                            bool (*)(int, int), bool (*)(int, int),
                            const HSentenceVertices &, const HSentenceVertices &,
                            const HSentenceVertices &, const HSentenceVertices &,
                            REO_POS);
 
-void insertVertex(HSentenceVertices &, int, int);
-void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
+  void insertVertex(HSentenceVertices &, int, int);
+  void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
                           int, int, int, int);
+<<<<<<< HEAD
 string getOrientString(REO_POS, REO_MODEL_TYPE);
 
 bool ge(int, int);
@@ -99,7 +100,49 @@ int sentenceOffset = 0;
 bool includeSentenceIdFlag = false; //include sentence id in extract file
 bool onlyOutputSpanInfo = false;
 bool gzOutput = false;
+=======
+  string getOrientString(REO_POS, REO_MODEL_TYPE);
+
+  bool ge(int, int);
+  bool le(int, int);
+  bool lt(int, int);
+
+  bool isAligned (SentenceAlignment &, int, int);
+
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
+
+}
+
+namespace MosesTraining{
 
+class ExtractTask // phrase extraction job bound to one SentenceAlignment, the options and four output streams, all held by reference
+{
+public:
+  ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation,Moses::OutputFileStream &extractFileSentenceId  ): // note: id is accepted but neither stored nor used by this constructor
+    m_sentence(sentence),
+    m_options(initoptions),
+    m_extractFile(extractFile),
+    m_extractFileInv(extractFileInv),
+    m_extractFileOrientation(extractFileOrientation),
+    m_extractFileSentenceId(extractFileSentenceId) {}
+void Run(); // only declared here; the definition is outside this class body
+private:
+  vector< string > m_extractedPhrases; // string buffers; presumably one per output stream and flushed by writePhrasesToFile() (confirm in the definitions)
+  vector< string > m_extractedPhrasesInv;
+  vector< string > m_extractedPhrasesOri;
+  vector< string > m_extractedPhrasesSid;
+  void extractBase(SentenceAlignment &);
+  void extract(SentenceAlignment &);
+  void addPhrase(SentenceAlignment &, int, int, int, int, string &);
+  void writePhrasesToFile();
+  // reference members: the referenced objects are not owned here and must outlive the task
+  SentenceAlignment &m_sentence;
+  const PhraseExtractionOptions &m_options; // read-only view of the options passed to the constructor
+  Moses::OutputFileStream &m_extractFile;
+  Moses::OutputFileStream &m_extractFileInv;
+  Moses::OutputFileStream &m_extractFileOrientation;
+  Moses::OutputFileStream &m_extractFileSentenceId;
+};
 }
 
 int main(int argc, char* argv[])
@@ -107,24 +150,36 @@ int main(int argc, char* argv[])
   cerr	<< "PhraseExtract v1.4, written by Philipp Koehn\n"
         << "phrase extraction from an aligned parallel corpus\n";
 
+<<<<<<< HEAD
   if (argc < 6) {
     cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --IncludeSentenceId | --SentenceOffset n ]\n";
+=======
+ if (argc < 6) {
+    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
+    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --GZOutput ]\n";
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
     exit(1);
   }
-  char* &fileNameE = argv[1];
-  char* &fileNameF = argv[2];
-  char* &fileNameA = argv[3];
-  string fileNameExtract = string(argv[4]);
-  maxPhraseLength = atoi(argv[5]);
+
+  Moses::OutputFileStream extractFile;
+  Moses::OutputFileStream extractFileInv;
+  Moses::OutputFileStream extractFileOrientation;
+  Moses::OutputFileStream extractFileSentenceId;
+  const char* const &fileNameE = argv[1];
+  const char* const &fileNameF = argv[2];
+  const char* const &fileNameA = argv[3];
+  const string fileNameExtract = string(argv[4]);
+  PhraseExtractionOptions options(atoi(argv[5]));
 
   for(int i=6; i<argc; i++) {
     if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
-      onlyOutputSpanInfo = true;
+      options.initOnlyOutputSpanInfo(true);
     } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
-      orientationFlag = true;
+      options.initOrientationFlag(true);
     } else if (strcmp(argv[i],"--NoTTable") == 0) {
-      translationFlag = false;
+      options.initTranslationFlag(false);
     } else if (strcmp(argv[i], "--SentenceId") == 0) {
+<<<<<<< HEAD
       sentenceIdFlag = true;  
     } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
       includeSentenceIdFlag = true;  
@@ -134,51 +189,54 @@ int main(int argc, char* argv[])
         exit(1);
       }
       sentenceOffset = atoi(argv[++i]);
+=======
+      options.initSentenceIdFlag(true);  
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
     } else if (strcmp(argv[i], "--GZOutput") == 0) {
-      gzOutput = true;  
+      options.initGzOutput(true);  
     } else if(strcmp(argv[i],"--model") == 0) {
       if (i+1 >= argc) {
         cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
         exit(1);
       }
-      char* modelParams = argv[++i];
-      char* modelName = strtok(modelParams, "-");
-      char* modelType = strtok(NULL, "-");
+      char*  modelParams = argv[++i];
+      char*  modelName = strtok(modelParams, "-");
+      char*  modelType = strtok(NULL, "-");
 
       // REO_MODEL_TYPE intModelType;
 
       if(strcmp(modelName, "wbe") == 0) {
-        wordModel = true;
+        options.initWordModel(true);
         if(strcmp(modelType, "msd") == 0)
-          wordType = REO_MSD;
+          options.initWordType(REO_MSD);
         else if(strcmp(modelType, "mslr") == 0)
-          wordType = REO_MSLR;
+          options.initWordType(REO_MSLR);
         else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          wordType = REO_MONO;
+          options.initWordType(REO_MONO);
         else {
           cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
           exit(1);
         }
       } else if(strcmp(modelName, "phrase") == 0) {
-        phraseModel = true;
+        options.initPhraseModel(true);
         if(strcmp(modelType, "msd") == 0)
-          phraseType = REO_MSD;
+          options.initPhraseType(REO_MSD);
         else if(strcmp(modelType, "mslr") == 0)
-          phraseType = REO_MSLR;
+          options.initPhraseType(REO_MSLR);
         else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          phraseType = REO_MONO;
+          options.initPhraseType(REO_MONO);
         else {
           cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
           exit(1);
         }
       } else if(strcmp(modelName, "hier") == 0) {
-        hierModel = true;
+        options.initHierModel(true);
         if(strcmp(modelType, "msd") == 0)
-          hierType = REO_MSD;
+          options.initHierType(REO_MSD);
         else if(strcmp(modelType, "mslr") == 0)
-          hierType = REO_MSLR;
+          options.initHierType(REO_MSLR);
         else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          hierType = REO_MONO;
+          options.initHierType(REO_MONO);
         else {
           cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
           exit(1);
@@ -188,7 +246,8 @@ int main(int argc, char* argv[])
         exit(1);
       }
 
-      allModelsOutputFlag = true;
+      options.initAllModelsOutputFlag(true);
+
     } else {
       cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
       exit(1);
@@ -197,9 +256,9 @@ int main(int argc, char* argv[])
 
   // default reordering model if no model selected
   // allows for the old syntax to be used
-  if(orientationFlag && !allModelsOutputFlag) {
-    wordModel = true;
-    wordType = REO_MSD;
+  if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
+    options.initWordModel(true);
+    options.initWordType(REO_MSD);
   }
 
   // open input files
@@ -212,18 +271,18 @@ int main(int argc, char* argv[])
   istream *aFileP = &aFile;
 
   // open output files
-  if (translationFlag) {
-    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
-    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
+  if (options.isTranslationFlag()) {
+    string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
+    extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
     extractFileInv.Open(fileNameExtractInv.c_str());
   }
-  if (orientationFlag) {
-    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
+  if (options.isOrientationFlag()) {
+    string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
     extractFileOrientation.Open(fileNameExtractOrientation.c_str());
   }
 
-  if (sentenceIdFlag) {
-    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
+  if (options.isSentenceIdFlag()) {
+    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (options.isGzOutput()?".gz":"");
     extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
   }
 
@@ -239,31 +298,38 @@ int main(int argc, char* argv[])
     SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
     SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
     SentenceAlignment sentence;
-    // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
+	// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
     //az: output src, tgt, and alingment line
-    if (onlyOutputSpanInfo) {
+    if (options.isOnlyOutputSpanInfo()) {
       cout << "LOG: SRC: " << foreignString << endl;
       cout << "LOG: TGT: " << englishString << endl;
       cout << "LOG: ALT: " << alignmentString << endl;
       cout << "LOG: PHRASES_BEGIN:" << endl;
     }
+	if (sentence.create( englishString, foreignString, alignmentString, i)) {
+   	ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileSentenceId);
+      task->Run();
+      delete task;
 
-    if (sentence.create( englishString, foreignString, alignmentString, i)) {
-      extract(sentence);
     }
-    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+    if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
   }
+
   eFile.Close();
   fFile.Close();
   aFile.Close();
+
   //az: only close if we actually opened it
-  if (!onlyOutputSpanInfo) {
-    if (translationFlag) {
+  if (!options.isOnlyOutputSpanInfo()) {
+    if (options.isTranslationFlag()) {
       extractFile.Close();
       extractFileInv.Close();
+      
     }
-    if (orientationFlag) extractFileOrientation.Close();
-    if (sentenceIdFlag) {
+    if (options.isOrientationFlag()){ 
+	extractFileOrientation.Close();
+	}
+    if (options.isSentenceIdFlag()) {
       extractFileSentenceId.Close();
     }
   }
@@ -271,8 +337,17 @@ int main(int argc, char* argv[])
 
 namespace MosesTraining
 {
+void ExtractTask::Run() {
+  extract(m_sentence);
+  writePhrasesToFile();
+  m_extractedPhrases.clear();
+  m_extractedPhrasesInv.clear();
+  m_extractedPhrasesOri.clear();
+  m_extractedPhrasesSid.clear();
 
-void extract(SentenceAlignment &sentence)
+}
+
+void ExtractTask::extract(SentenceAlignment &sentence)
 {
   int countE = sentence.target.size();
   int countF = sentence.source.size();
@@ -291,14 +366,14 @@ void extract(SentenceAlignment &sentence)
 
   HSentenceVertices::const_iterator it;
 
-  bool relaxLimit = hierModel;
-  bool buildExtraStructure = phraseModel || hierModel;
+  bool relaxLimit = m_options.isHierModel();
+  bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
 
   // check alignments for target phrase startE...endE
   // loop over extracted phrases which are compatible with the word-alignments
   for(int startE=0; startE<countE; startE++) {
     for(int endE=startE;
-        (endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
+        (endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
         endE++) {
 
       int minF = 9999;
@@ -318,7 +393,7 @@ void extract(SentenceAlignment &sentence)
       }
 
       if (maxF >= 0 && // aligned to any source words at all
-          (relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
+          (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
 
         // check if source words are aligned to out of bound target words
         bool out_of_bounds = false;
@@ -333,17 +408,17 @@ void extract(SentenceAlignment &sentence)
           // start point of source phrase may retreat over unaligned
           for(int startF=minF;
               (startF>=0 &&
-               (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
+               (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
                (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
               startF--)
             // end point of source phrase may advance over unaligned
             for(int endF=maxF;
                 (endF<countF &&
-                 (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
+                 (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
                  (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
                 endF++) { // at this point we have extracted a phrase
               if(buildExtraStructure) { // phrase || hier
-                if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
+                if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
                   inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
                                                    HPhraseVertex(endF,endE)));
                   insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
@@ -353,16 +428,16 @@ void extract(SentenceAlignment &sentence)
                                        startF, startE, endF, endE);
               } else {
                 string orientationInfo = "";
-                if(wordModel) {
+                if(m_options.isWordModel()) {
                   REO_POS wordPrevOrient, wordNextOrient;
                   bool connectedLeftTopP  = isAligned( sentence, startF-1, startE-1 );
                   bool connectedRightTopP = isAligned( sentence, endF+1,   startE-1 );
                   bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
                   bool connectedRightTopN = isAligned( sentence, startF-1,   endE+1 );
-                  wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
-                  wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
-                  orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
-                  if(allModelsOutputFlag)
+                  wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
+                  wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
+                  orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
+                  if(m_options.isAllModelsOutputFlag())
                     " | | ";
                 }
                 addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
@@ -388,38 +463,38 @@ void extract(SentenceAlignment &sentence)
       bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
       bool connectedRightTopN = isAligned( sentence, startF-1,   endE+1 );
 
-      if(wordModel) {
-        wordPrevOrient = getOrientWordModel(sentence, wordType,
+      if(m_options.isWordModel()) {
+        wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
                                             connectedLeftTopP, connectedRightTopP,
                                             startF, endF, startE, endE, countF, 0, 1,
                                             &ge, &lt);
-        wordNextOrient = getOrientWordModel(sentence, wordType,
+        wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
                                             connectedLeftTopN, connectedRightTopN,
                                             endF, startF, endE, startE, 0, countF, -1,
                                             &lt, &ge);
       }
-      if (phraseModel) {
-        phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
+      if (m_options.isPhraseModel()) {
+        phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
                                                 connectedLeftTopP, connectedRightTopP,
                                                 startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
-        phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
+        phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
                                                 connectedLeftTopN, connectedRightTopN,
                                                 endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
       } else {
         phrasePrevOrient = phraseNextOrient = UNKNOWN;
       }
-      if(hierModel) {
-        hierPrevOrient = getOrientHierModel(sentence, hierType,
+      if(m_options.isHierModel()) {
+        hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
                                             connectedLeftTopP, connectedRightTopP,
                                             startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
-        hierNextOrient = getOrientHierModel(sentence, hierType,
+        hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
                                             connectedLeftTopN, connectedRightTopN,
                                             endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
       }
 
-      orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
-                        ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
-                        ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
+      orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
+                        ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
+                        ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
 
       addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
     }
@@ -627,96 +702,147 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
   return "";
 }
 
-void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
+void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
 {
   // source
-  // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+  //   // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+  	ostringstream outextractstr;
+  	ostringstream outextractstrInv;
+  	ostringstream outextractstrOrientation;
+  	ostringstream outextractstrSentenceId;
 
-  if (onlyOutputSpanInfo) {
+  if (m_options.isOnlyOutputSpanInfo()) {
     cout << startF << " " << endF << " " << startE << " " << endE << endl;
     return;
   }
 
-  for(int fi=startF; fi<=endF; fi++) {
-    if (translationFlag) extractFile << sentence.source[fi] << " ";
-    if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
-    if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
+for(int fi=startF; fi<=endF; fi++) {
+    if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
+    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
+    if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.source[fi] << " ";
   }
-  if (translationFlag) extractFile << "||| ";
-  if (orientationFlag) extractFileOrientation << "||| ";
-  if (sentenceIdFlag) extractFileSentenceId << "||| ";
+  if (m_options.isTranslationFlag()) outextractstr << "||| ";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
 
   // target
   for(int ei=startE; ei<=endE; ei++) {
-    if (translationFlag) extractFile << sentence.target[ei] << " ";
-    if (translationFlag) extractFileInv << sentence.target[ei] << " ";
-    if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
-    if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
+    if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
+    if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
+    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
+    if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.target[ei] << " ";
   }
-  if (translationFlag) extractFile << "|||";
-  if (translationFlag) extractFileInv << "||| ";
-  if (orientationFlag) extractFileOrientation << "||| ";
-  if (sentenceIdFlag) extractFileSentenceId << "||| ";
+  if (m_options.isTranslationFlag()) outextractstr << "|||";
+  if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
 
   // source (for inverse)
-  if (translationFlag) {
+
+ if (m_options.isTranslationFlag()) {
     for(int fi=startF; fi<=endF; fi++)
-      extractFileInv << sentence.source[fi] << " ";
-    extractFileInv << "|||";
+      outextractstrInv << sentence.source[fi] << " ";
+    outextractstrInv << "|||";
   }
-
   // alignment
-  if (translationFlag) {
+ if (m_options.isTranslationFlag()) {
     for(int ei=startE; ei<=endE; ei++) {
-      for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
+      for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
         int fi = sentence.alignedToT[ei][i];
-        extractFile << " " << fi-startF << "-" << ei-startE;
-        extractFileInv << " " << ei-startE << "-" << fi-startF;
+        outextractstr << " " << fi-startF << "-" << ei-startE;
+        outextractstrInv << " " << ei-startE << "-" << fi-startF;
       }
     }
   }
 
-  if (orientationFlag)
-    extractFileOrientation << orientationInfo;
+  if (m_options.isOrientationFlag())
+    outextractstrOrientation << orientationInfo;
 
+<<<<<<< HEAD
   if (sentenceIdFlag)
     extractFileSentenceId << sentence.sentenceID;
 
   if (includeSentenceIdFlag)
     extractFile << " ||| " << sentence.sentenceID;
+=======
+  if (m_options.isSentenceIdFlag()) {
+    outextractstrSentenceId << sentence.sentenceID;
+  }
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
+
+
+ if (m_options.isTranslationFlag()) outextractstr << "\n";
+  if (m_options.isTranslationFlag()) outextractstrInv << "\n";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "\n";
+
 
-  if (translationFlag) extractFile << "\n";
-  if (translationFlag) extractFileInv << "\n";
-  if (orientationFlag) extractFileOrientation << "\n";
-  if (sentenceIdFlag) extractFileSentenceId << "\n";
+    m_extractedPhrases.push_back(outextractstr.str());
+    m_extractedPhrasesInv.push_back(outextractstrInv.str());
+    m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
+    m_extractedPhrasesSid.push_back(outextractstrSentenceId.str());
+}
+
+
+void ExtractTask::writePhrasesToFile(){
+
+    ostringstream outextractFile;
+    ostringstream outextractFileInv;
+    ostringstream outextractFileOrientation;
+    ostringstream outextractFileSentenceId;
+
+    for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){
+        outextractFile<<phrase->data();
+    }
+    for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){
+        outextractFileInv<<phrase->data();
+    }
+    for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){
+        outextractFileOrientation<<phrase->data();
+    }
+    for(vector<string>::const_iterator phrase=m_extractedPhrasesSid.begin();phrase!=m_extractedPhrasesSid.end();phrase++){
+        outextractFileSentenceId<<phrase->data();
+    }
+
+      m_extractFile << outextractFile.str();
+      m_extractFileInv  << outextractFileInv.str();
+      m_extractFileOrientation << outextractFileOrientation.str();
+      m_extractFileSentenceId << outextractFileSentenceId.str();
 }
 
 // if proper conditioning, we need the number of times a source phrase occured
-void extractBase( SentenceAlignment &sentence )
+
+void ExtractTask::extractBase( SentenceAlignment &sentence )
 {
+    ostringstream outextractFile;
+    ostringstream outextractFileInv;
+
   int countF = sentence.source.size();
   for(int startF=0; startF<countF; startF++) {
     for(int endF=startF;
-        (endF<countF && endF<startF+maxPhraseLength);
+        (endF<countF && endF<startF+m_options.maxPhraseLength);
         endF++) {
       for(int fi=startF; fi<=endF; fi++) {
-        extractFile << sentence.source[fi] << " ";
-      }
-      extractFile << "|||" << endl;
+         outextractFile << sentence.source[fi] << " ";
+	}
+      outextractFile << "|||" << endl;
     }
   }
 
   int countE = sentence.target.size();
   for(int startE=0; startE<countE; startE++) {
     for(int endE=startE;
-        (endE<countE && endE<startE+maxPhraseLength);
+        (endE<countE && endE<startE+m_options.maxPhraseLength);
         endE++) {
       for(int ei=startE; ei<=endE; ei++) {
-        extractFileInv << sentence.target[ei] << " ";
+        outextractFileInv << sentence.target[ei] << " ";
       }
-      extractFileInv << "|||" << endl;
+      outextractFileInv << "|||" << endl;
     }
   }
+    m_extractFile << outextractFile.str();
+    m_extractFileInv << outextractFileInv.str();
+
 }
 
 }
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index f02b6b3b0..9ec976f46 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -83,7 +83,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, o
 double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
 double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
 set<string> functionWordList;
-void loadFunctionWords( const char* fileNameFunctionWords );
+void loadFunctionWords( const string &fileNameFunctionWords );
 double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
 void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
                       , map<size_t, map<size_t, float> > &sourceProb
@@ -100,12 +100,16 @@ int main(int argc, char* argv[])
     cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]\n";
     exit(1);
   }
-  char* fileNameExtract = argv[1];
-  char* fileNameLex = argv[2];
-  char* fileNamePhraseTable = argv[3];
+  string fileNameExtract = argv[1];
+  string fileNameLex = argv[2];
+  string fileNamePhraseTable = argv[3];
   string fileNameCountOfCounts;
+<<<<<<< HEAD
   char* fileNameFunctionWords = NULL;
   char* fileNameDomain = NULL;
+=======
+  string fileNameFunctionWords;
+>>>>>>> b317522563feb4ca7ff978a0de661ec2189934ea
 
   for(int i=4; i<argc; i++) {
     if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
@@ -220,7 +224,7 @@ int main(int argc, char* argv[])
   // output file: phrase translation table
 	ostream *phraseTableFile;
 
-	if (strcmp(fileNamePhraseTable, "-") == 0) {
+	if (fileNamePhraseTable == "-") {
 		phraseTableFile = &cout;
 	}
 	else {
@@ -367,12 +371,21 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
   PhraseAlignment* bestAlignment = NULL;
   
   for(size_t i=0; i<phrasePair.size(); i++) {
-    if (phrasePair[i]->count > bestAlignmentCount) {
-      bestAlignmentCount = phrasePair[i]->count;
-      bestAlignment = phrasePair[i];
+    size_t alignInd;
+    if (inverseFlag) 
+    { // count backwards, so that alignments for ties will be the same for both normal & inverse scores
+      alignInd = phrasePair.size() - i - 1;
     }
-  }
-  
+    else {
+      alignInd = i;
+    }
+    
+    if (phrasePair[alignInd]->count > bestAlignmentCount) {
+      bestAlignmentCount = phrasePair[alignInd]->count;
+      bestAlignment = phrasePair[alignInd];
+    }
+  }    
+
   return bestAlignment;
 }
 
@@ -700,11 +713,11 @@ double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT,
   return unaligned;
 }
 
-void loadFunctionWords( const char *fileName )
+void loadFunctionWords( const string &fileName )
 {
   cerr << "Loading function word list from " << fileName;
   ifstream inFile;
-  inFile.open(fileName);
+  inFile.open(fileName.c_str());
   if (inFile.fail()) {
     cerr << " - ERROR: could not open file\n";
     exit(1);
@@ -748,11 +761,11 @@ double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT,
   return lexScore;
 }
 
-void LexicalTable::load( char *fileName )
+void LexicalTable::load( const string &fileName )
 {
   cerr << "Loading lexical translation table from " << fileName;
   ifstream inFile;
-  inFile.open(fileName);
+  inFile.open(fileName.c_str());
   if (inFile.fail()) {
     cerr << " - ERROR: could not open file\n";
     exit(1);
diff --git a/phrase-extract/score.h b/phrase-extract/score.h
index ed9adc18c..f720a32d2 100644
--- a/phrase-extract/score.h
+++ b/phrase-extract/score.h
@@ -65,7 +65,7 @@ class LexicalTable
 {
 public:
   std::map< WORD_ID, std::map< WORD_ID, double > > ltable;
-  void load( char[] );
+  void load( const std::string &filePath );
   double permissiveLookup( WORD_ID wordS, WORD_ID wordT ) {
     // cout << endl << vcbS.getWord( wordS ) << "-" << vcbT.getWord( wordT ) << ":";
     if (ltable.find( wordS ) == ltable.end()) return 1.0;
diff --git a/phrase-extract/statistics.cpp b/phrase-extract/statistics.cpp
index d39a05d3b..67373ec93 100644
--- a/phrase-extract/statistics.cpp
+++ b/phrase-extract/statistics.cpp
@@ -40,7 +40,7 @@ class LexicalTable
 {
 public:
   map< WORD_ID, map< WORD_ID, double > > ltable;
-  void load( char[] );
+  void load( const string &);
 };
 
 }
@@ -310,11 +310,11 @@ bool PhraseAlignment::equals( const PhraseAlignment& other )
   return true;
 }
 
-void LexicalTable::load( char *fileName )
+void LexicalTable::load( const string &filePath )
 {
-  cerr << "Loading lexical translation table from " << fileName;
+  cerr << "Loading lexical translation table from " << filePath;
   ifstream inFile;
-  inFile.open(fileName);
+  inFile.open(filePath.c_str());
   if (inFile.fail()) {
     cerr << " - ERROR: could not open file\n";
     exit(1);
@@ -332,7 +332,7 @@ void LexicalTable::load( char *fileName )
 
     vector<string> token = tokenize( line );
     if (token.size() != 3) {
-      cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
+      cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
            token.size() << " " << token[0] << " " << line << endl;
       continue;
     }
author	phikoehn <pkoehn@inf.ed.ac.uk>	2012-08-19 02:48:26 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2012-08-19 02:48:26 +0400
commit	4a1a995878ed069dd4d77e0ac6c1727dc223ebe6 (patch)
tree	6f774d4ddc8c6a6fc3d5b5b619c356a833b03c90 /phrase-extract
parent	366ab93f8aa53b7b065fe8366201bd59dafc51ba (diff)
parent	b317522563feb4ca7ff978a0de661ec2189934ea (diff)