Option for target constituent constrained phrase extraction. TargetConstituentAdjacencyFeature.

author: Matthias Huck <mhuck@inf.ed.ac.uk> 2016-02-12 20:46:57 +0300
committer: Matthias Huck <mhuck@inf.ed.ac.uk> 2016-02-12 20:46:57 +0300
commit: 1659d6b4c8c0b2a261678c012b9eab32f8c7b296 (patch)
tree: 6e1cc6b7bf0a17e6b34bf6bbbafd45ad19bd42d3 /phrase-extract
parent: c75f9854e489c14670d3c9ab6e381fc0878d27d9 (diff)
6 files changed, 275 insertions, 39 deletions
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp
index 57821fe44..9a2884858 100644
--- a/phrase-extract/ExtractionPhrasePair.cpp
+++ b/phrase-extract/ExtractionPhrasePair.cpp
@@ -311,12 +311,14 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke
   std::ostringstream oss;
   for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
        iter!=allPropertyValues->end(); ++iter) {
-    if (iter!=allPropertyValues->begin()) {
+    if (!(iter->first).empty()) {
+      if (iter!=allPropertyValues->begin()) {
+        oss << " ";
+      }
+      oss << iter->first;
       oss << " ";
+      oss << iter->second;
     }
-    oss << iter->first;
-    oss << " ";
-    oss << iter->second;
   }
 
   std::string allPropertyValuesString(oss.str());
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h
index 38e570b79..859ab92d7 100644
--- a/phrase-extract/PhraseExtractionOptions.h
+++ b/phrase-extract/PhraseExtractionOptions.h
@@ -50,6 +50,8 @@ private:
   bool onlyOutputSpanInfo;
   bool gzOutput;
   std::string instanceWeightsFile; //weights for each sentence
+  bool targetConstituentConstrainedFlag;
+  bool targetConstituentBoundariesFlag;
   bool flexScoreFlag;
   bool singleWordHeuristicFlag;
 
@@ -73,6 +75,8 @@ public:
     includeSentenceIdFlag(false),
     onlyOutputSpanInfo(false),
     gzOutput(false),
+    targetConstituentConstrainedFlag(false),
+    targetConstituentBoundariesFlag(false),
     flexScoreFlag(false),
     singleWordHeuristicFlag(false),
     debug(false) {
@@ -118,6 +122,12 @@ public:
   void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
     instanceWeightsFile = std::string(initInstanceWeightsFile);
   }
+  void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) {
+    this->targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag; // store the requested setting
+  }
+  void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) {
+    this->targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag; // store the requested setting
+  }
   void initFlexScoreFlag(const bool initflexScoreFlag) {
     flexScoreFlag=initflexScoreFlag;
   }
@@ -165,6 +175,12 @@ public:
   std::string getInstanceWeightsFile() const {
     return instanceWeightsFile;
   }
+  bool isTargetConstituentConstrainedFlag() const {
+    return this->targetConstituentConstrainedFlag; // current value of the stored flag
+  }
+  bool isTargetConstituentBoundariesFlag() const {
+    return this->targetConstituentBoundariesFlag; // current value of the stored flag
+  }
   bool isFlexScoreFlag() const {
     return flexScoreFlag;
   }
diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp
index 70f52317e..2a321f1e2 100644
--- a/phrase-extract/SyntaxNodeCollection.cpp
+++ b/phrase-extract/SyntaxNodeCollection.cpp
@@ -47,6 +47,8 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
   SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
   m_nodes.push_back( newNode );
   m_index[ startPos ][ endPos ].push_back( newNode );
+  m_endPositionsIndex[ endPos ].push_back( newNode );
+  m_startPositionsIndex[ startPos ].push_back( newNode ); // TODO: may not need this: access m_index by startPos and iterate over its InnerNodeIndex (= end positions)?
   m_numWords = std::max(endPos+1, m_numWords);
   return newNode;
 }
@@ -70,6 +72,36 @@ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
   return endIndex->second;
 }
 
+bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const
+{
+  return !GetNodesByStartPosition(startPos).empty(); // does any node begin at startPos?
+}
+
+const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition(
+  int startPos ) const
+{
+  // Nodes are bucketed by start position; when no bucket exists for
+  // startPos, the m_emptyNode member is handed back instead.
+  const InnerNodeIndex::const_iterator bucket = m_startPositionsIndex.find( startPos );
+  const bool found = ( bucket != m_startPositionsIndex.end() );
+  return found ? bucket->second : m_emptyNode;
+}
+
+bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const
+{
+  return !GetNodesByEndPosition(endPos).empty(); // does any node finish at endPos?
+}
+
+const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition(
+  int endPos ) const
+{
+  // Nodes are bucketed by end position; when no bucket exists for
+  // endPos, the m_emptyNode member is handed back instead.
+  const InnerNodeIndex::const_iterator bucket = m_endPositionsIndex.find( endPos );
+  const bool found = ( bucket != m_endPositionsIndex.end() );
+  return found ? bucket->second : m_emptyNode;
+}
+
 std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
 {
   std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h
index ef0989cd0..83aa66bb4 100644
--- a/phrase-extract/SyntaxNodeCollection.h
+++ b/phrase-extract/SyntaxNodeCollection.h
@@ -50,6 +50,11 @@ public:
   //! Lookup the SyntaxNodes for a given span.
   const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
 
+  bool HasNodeStartingAtPosition( int startPos ) const;
+  const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const;
+  bool HasNodeEndingAtPosition( int endPos ) const;
+  const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const;
+
   //! Get a vector of pointers to all SyntaxNodes (unordered).
   const std::vector< SyntaxNode* >& GetAllNodes() {
     return m_nodes;
@@ -78,6 +83,9 @@ private:
   NodeIndex m_index;
   int m_numWords;
   std::vector< SyntaxNode* > m_emptyNode;
+
+  InnerNodeIndex m_endPositionsIndex;
+  InnerNodeIndex m_startPositionsIndex;
 };
 
 }  // namespace MosesTraining
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index 0e77368f4..e4d074e15 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -1,11 +1,3 @@
-/*
- * extract.cpp
- *	Modified by: Rohit Gupta CDAC, Mumbai, India
- *	on July 15, 2012 to implement parallel processing
- *      Modified by: Nadi Tomeh - LIMSI/CNRS
- *      Machine Translation Marathon 2010, Dublin
- */
-
 #include <cstdio>
 #include <iostream>
 #include <fstream>
@@ -20,11 +12,12 @@
 #include <vector>
 #include <limits>
 
-#include "SentenceAlignment.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
 #include "PhraseExtractionOptions.h"
+#include "SentenceAlignmentWithSyntax.h"
+#include "SyntaxNode.h"
 
 using namespace std;
 using namespace MosesTraining;
@@ -46,14 +39,14 @@ typedef vector < HPhrase > HPhraseVector;
 // The key of the map is the English index and the value is a set of the source ones
 typedef map <int, set<int> > HSentenceVertices;
 
-REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+REO_POS getOrientWordModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
                            int, int, int, int, int, int, int,
                            bool (*)(int, int), bool (*)(int, int));
-REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+REO_POS getOrientPhraseModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
                              int, int, int, int, int, int, int,
                              bool (*)(int, int), bool (*)(int, int),
                              const HSentenceVertices &, const HSentenceVertices &);
-REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+REO_POS getOrientHierModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
                            int, int, int, int, int, int, int,
                            bool (*)(int, int), bool (*)(int, int),
                            const HSentenceVertices &, const HSentenceVertices &,
@@ -69,7 +62,7 @@ bool ge(int, int);
 bool le(int, int);
 bool lt(int, int);
 
-bool isAligned (SentenceAlignment &, int, int);
+bool isAligned (SentenceAlignmentWithSyntax &, int, int);
 
 int sentenceOffset = 0;
 
@@ -87,7 +80,7 @@ class ExtractTask
 {
 public:
   ExtractTask(
-    size_t id, SentenceAlignment &sentence,
+    size_t id, SentenceAlignmentWithSyntax &sentence,
     PhraseExtractionOptions &initoptions,
     Moses::OutputFileStream &extractFile,
     Moses::OutputFileStream &extractFileInv,
@@ -109,14 +102,17 @@ private:
   vector< string > m_extractedPhrasesSid;
   vector< string > m_extractedPhrasesContext;
   vector< string > m_extractedPhrasesContextInv;
-  void extractBase(SentenceAlignment &);
-  void extract(SentenceAlignment &);
-  void addPhrase(SentenceAlignment &, int, int, int, int, string &);
+  void extractBase(SentenceAlignmentWithSyntax &);
+  void extract(SentenceAlignmentWithSyntax &);
+  void addPhrase(const SentenceAlignmentWithSyntax &, int, int, int, int, const std::string &, const std::string &);
   void writePhrasesToFile();
-  bool checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF);
+  bool checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF);
   bool isPlaceholder(const string &word);
+  bool checkTargetConstituentBoundaries(const SentenceAlignmentWithSyntax &sentence,
+                                        int startE, int endE, int startF, int endF,
+                                        std::string &phrasePropertiesString);
 
-  SentenceAlignment &m_sentence;
+  SentenceAlignmentWithSyntax &m_sentence;
   const PhraseExtractionOptions &m_options;
   Moses::OutputFileStream &m_extractFile;
   Moses::OutputFileStream &m_extractFileInv;
@@ -133,7 +129,8 @@ int main(int argc, char* argv[])
 
   if (argc < 6) {
     cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
-    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
+    cerr << "| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ";
+    cerr << "| --TargetConstituentConstrained | --TargetConstituentBoundaries ]" << std::endl;
     exit(1);
   }
 
@@ -153,6 +150,10 @@ int main(int argc, char* argv[])
       options.initOnlyOutputSpanInfo(true);
     } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
       options.initOrientationFlag(true);
+    } else if (strcmp(argv[i],"--TargetConstituentConstrained") == 0) {
+      options.initTargetConstituentConstrainedFlag(true);
+    } else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
+      options.initTargetConstituentBoundariesFlag(true);
     } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
       options.initFlexScoreFlag(true);
     } else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) {
@@ -280,6 +281,11 @@ int main(int argc, char* argv[])
     extractFileContextInv.Open(fileNameExtractContextInv.c_str());
   }
 
+  // stats on labels for glue grammar and unknown word label probabilities
+  set< string > targetLabelCollection, sourceLabelCollection;
+  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
+  const bool targetSyntax = true;
+
   int i = sentenceOffset;
 
   string englishString, foreignString, alignmentString, weightString;
@@ -295,7 +301,10 @@ int main(int argc, char* argv[])
       getline(*iwFileP, weightString);
     }
 
-    SentenceAlignment sentence;
+    SentenceAlignmentWithSyntax sentence
+    (targetLabelCollection, sourceLabelCollection,
+     targetTopLabelCollection, sourceTopLabelCollection, 
+     targetSyntax, false);
     // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
     //az: output src, tgt, and alingment line
     if (options.isOnlyOutputSpanInfo()) {
@@ -360,7 +369,7 @@ void ExtractTask::Run()
 
 }
 
-void ExtractTask::extract(SentenceAlignment &sentence)
+void ExtractTask::extract(SentenceAlignmentWithSyntax &sentence)
 {
   int countE = sentence.target.size();
   int countF = sentence.source.size();
@@ -454,7 +463,15 @@ void ExtractTask::extract(SentenceAlignment &sentence)
                   // if(m_options.isAllModelsOutputFlag())
                   // " | | ";
                 }
-                addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
+                std::string phrasePropertiesString;
+                bool doAdd = !m_options.isTargetConstituentBoundariesFlag();
+                if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) {
+                  bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString);
+                  doAdd = doAdd || isTargetConstituentCovered;
+                }
+                if (doAdd) {
+                  addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString);
+                }
               }
             }
         }
@@ -510,12 +527,20 @@ void ExtractTask::extract(SentenceAlignment &sentence)
                         ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
                         ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
 
-      addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
+      std::string phrasePropertiesString;
+      bool doAdd = !m_options.isTargetConstituentBoundariesFlag();
+      if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) {
+        bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString);
+        doAdd = doAdd || isTargetConstituentCovered;
+      }
+      if (doAdd) {
+        addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString);
+      }
     }
   }
 }
 
-REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
+REO_POS getOrientWordModel(SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
                            bool connectedLeftTop, bool connectedRightTop,
                            int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                            bool (*ge)(int, int), bool (*lt)(int, int) )
@@ -541,7 +566,7 @@ REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelTyp
 }
 
 // to be called with countF-1 instead of countF
-REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
+REO_POS getOrientPhraseModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
                               bool connectedLeftTop, bool connectedRightTop,
                               int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                               bool (*ge)(int, int), bool (*lt)(int, int),
@@ -577,7 +602,7 @@ REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE model
 }
 
 // to be called with countF-1 instead of countF
-REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
+REO_POS getOrientHierModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
                             bool connectedLeftTop, bool connectedRightTop,
                             int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                             bool (*ge)(int, int), bool (*lt)(int, int),
@@ -629,7 +654,7 @@ REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelTy
   return UNKNOWN;
 }
 
-bool isAligned ( SentenceAlignment &sentence, int fi, int ei )
+bool isAligned ( SentenceAlignmentWithSyntax &sentence, int fi, int ei )
 {
   if (ei == -1 && fi == -1)
     return true;
@@ -715,8 +740,138 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
   }
   return "";
 }
+  
 
-void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
+bool ExtractTask::checkTargetConstituentBoundaries( const SentenceAlignmentWithSyntax &sentence,
+                                                    int startE, int endE, int startF, int endF,
+                                                    std::string &phrasePropertiesString)
+{ // Checks target span [startE,endE] against sentence.targetTree; startF/endF are not read in this body.
+  ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesLeft;
+  // With the boundaries flag set, the "left" property is opened here and closed further down.
+  if (m_options.isTargetConstituentBoundariesFlag()) {
+    outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "{{TargetConstituentBoundariesLeft ";
+  }
+  // The first flag becomes true once a node starting at startE is found that also ends at endE.
+  bool validTargetConstituentBoundaries = false;
+  bool outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
+  // A span beginning at target position 0 gets the BOS_ marker as first entry of the left property.
+  if (m_options.isTargetConstituentBoundariesFlag()) {
+    if (startE==0) {
+      outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
+      outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "BOS_";
+    }
+  }
+
+  if (!sentence.targetTree.HasNodeStartingAtPosition(startE)) {
+    // No node of the target tree starts at startE, so the span cannot match a node.
+    validTargetConstituentBoundaries = false;
+
+  } else {
+    // Visit the nodes starting at startE in reverse order of the returned vector.
+    const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(startE);
+    for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); iter != startingNodes.rend(); ++iter ) {
+      if ( (*iter)->end == endE ) {
+        validTargetConstituentBoundaries = true;
+        if (!m_options.isTargetConstituentBoundariesFlag()) {
+          break; // no labels are being collected, so the first match is enough
+        }
+      }
+      if (m_options.isTargetConstituentBoundariesFlag()) {
+        if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
+          outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
+        } else {
+          outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<"; // separator between entries
+        }
+        outextractstrPhrasePropertyTargetConstituentBoundariesLeft << (*iter)->label;
+      }
+    }
+  }
+  // If neither BOS_ nor any label was written, a lone "<" is emitted before closing the property.
+  if (m_options.isTargetConstituentBoundariesFlag()) {
+    if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
+      outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<";
+    }
+    outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "}}";
+  }
+
+  // Constrained mode only: a span that matched no node gets a second, relaxed check.
+  if (m_options.isTargetConstituentConstrainedFlag() && !validTargetConstituentBoundaries) {
+    // skip over all boundary punctuation and check again
+    bool relaxedValidTargetConstituentBoundaries = false;
+    int relaxedStartE = startE;
+    int relaxedEndE = endE;
+    const std::string punctuation = ",;.:!?";
+    while ( (relaxedStartE < endE) &&
+            (sentence.target[relaxedStartE].size() == 1) &&
+            (punctuation.find(sentence.target[relaxedStartE].at(0)) != std::string::npos) ) {
+      ++relaxedStartE;
+    }
+    while ( (relaxedEndE > relaxedStartE) &&
+            (sentence.target[relaxedEndE].size() == 1) &&
+            (punctuation.find(sentence.target[relaxedEndE].at(0)) != std::string::npos) ) {
+      --relaxedEndE;
+    }
+    // Look again only if trimming actually moved one of the two boundaries.
+    if ( (relaxedStartE != startE) || (relaxedEndE !=endE) ) {
+      const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(relaxedStartE);
+      for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); 
+            (iter != startingNodes.rend() && !relaxedValidTargetConstituentBoundaries); 
+            ++iter ) {
+        if ( (*iter)->end == relaxedEndE ) {
+          relaxedValidTargetConstituentBoundaries = true;
+        }
+      }
+    }
+    // Rejection happens before anything is appended to phrasePropertiesString.
+    if (!relaxedValidTargetConstituentBoundaries) {
+      return false;
+    }
+  }
+
+  // Right-adjacent property: EOS_ at the last target position, else labels of nodes starting at endE+1.
+  if (m_options.isTargetConstituentBoundariesFlag()) {
+
+    ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent;
+    outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "{{TargetConstituentBoundariesRightAdjacent ";
+    outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
+
+    if (endE==sentence.target.size()-1) { // NOTE(review): int compared with a presumably unsigned size() expression
+
+      outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "EOS_";
+      outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
+
+    } else {
+
+      const std::vector< SyntaxNode* >& adjacentNodes = sentence.targetTree.GetNodesByStartPosition(endE+1);
+      for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = adjacentNodes.rbegin(); iter != adjacentNodes.rend(); ++iter ) {
+        if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
+          outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
+        } else {
+          outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<";
+        }
+        outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << (*iter)->label;
+      }
+    }
+    // Same fallback as for the left property: a lone "<" when nothing was written.
+    if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
+      outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<";
+    }
+    outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "}}";
+    // Both properties are appended to the caller's string, each preceded by one space.
+    phrasePropertiesString += " ";
+    phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesLeft.str();
+    phrasePropertiesString += " ";
+    phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent.str();
+  }
+  // Reached whenever the constrained check did not reject the span (including when that flag is off).
+  return true;
+}
+
+
+void ExtractTask::addPhrase( const SentenceAlignmentWithSyntax &sentence, 
+                             int startE, int endE, int startF, int endF, 
+                             const std::string &orientationInfo,
+                             const std::string &phrasePropertiesString)
 {
   // source
   //   // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
@@ -746,11 +901,18 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
   if (m_options.isTranslationFlag()) outextractstr << "||| ";
   if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
 
+
   // target
   for(int ei=startE; ei<=endE; ei++) {
-    if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
-    if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
-    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
+
+    if (m_options.isTranslationFlag()) {
+      outextractstr << sentence.target[ei] << " ";
+      outextractstrInv << sentence.target[ei] << " ";
+    }
+
+    if (m_options.isOrientationFlag()) {
+      outextractstrOrientation << sentence.target[ei] << " ";
+    }
   }
   if (m_options.isTranslationFlag()) outextractstr << "|||";
   if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
@@ -792,7 +954,7 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
     }
   }
 
-
+  outextractstr << phrasePropertiesString;
 
   // generate two lines for every extracted phrase:
   // once with left, once with right context
@@ -901,7 +1063,7 @@ void ExtractTask::writePhrasesToFile()
 
 // if proper conditioning, we need the number of times a source phrase occured
 
-void ExtractTask::extractBase( SentenceAlignment &sentence )
+void ExtractTask::extractBase( SentenceAlignmentWithSyntax &sentence )
 {
   ostringstream outextractFile;
   ostringstream outextractFileInv;
@@ -935,7 +1097,7 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )
 }
 
 
-bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF)
+bool ExtractTask::checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF)
 {
   for (size_t pos = startF; pos <= endF; ++pos) {
     const string &sourceWord = sentence.source[pos];
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index 9095df01b..081ee8ef1 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -68,6 +68,7 @@ bool spanLength = false;
 bool ruleLength = false;
 bool nonTermContext = false;
 bool nonTermContextTarget = false;
+bool targetConstituentBoundariesFlag = false;
 
 int countOfCounts[COC_MAX+1];
 int totalDistinct = 0;
@@ -286,6 +287,9 @@ int main(int argc, char* argv[])
     } else if (strcmp(argv[i],"--NonTermContextTarget") == 0) {
       nonTermContextTarget = true;
       std::cerr << "non-term context (target)" << std::endl;
+    } else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
+      targetConstituentBoundariesFlag = true;
+      std::cerr << "including target constituent boundaries information" << std::endl;
     } else {
       featureArgs.push_back(argv[i]);
       ++i;
@@ -957,6 +961,18 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
     }
   }
 
+  // target constituent boundaries
+  if (targetConstituentBoundariesFlag && !inverseFlag) {
+    const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft");
+    if (!targetConstituentBoundariesLeftValues.empty()) {
+      phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}";
+    }
+    const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent");
+    if (!targetConstituentBoundariesRightAdjacentValues.empty()) {
+      phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}";
+    }
+  }
+
   phraseTableFile << std::endl;
 }
author	Matthias Huck <mhuck@inf.ed.ac.uk>	2016-02-12 20:46:57 +0300
committer	Matthias Huck <mhuck@inf.ed.ac.uk>	2016-02-12 20:46:57 +0300
commit	1659d6b4c8c0b2a261678c012b9eab32f8c7b296 (patch)
tree	6e1cc6b7bf0a17e6b34bf6bbbafd45ad19bd42d3 /phrase-extract
parent	c75f9854e489c14670d3c9ab6e381fc0878d27d9 (diff)