new version of the `score` tool

which is now capable of dealing with additional properties in an appropriate manner
author: Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2014-01-29 22:37:42 +0400
committer: Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2014-01-29 22:37:42 +0400
commit: 86ee3e15a441aec72eaebdd0389fa925da2316c7 (patch)
tree: 6de24e964968820fb708a2e37a935c40a2a1494e /phrase-extract
parent: ffd62e994ecb88358b5f3aa835f84d441ec58c77 (diff)
13 files changed, 1044 insertions, 819 deletions
diff --git a/phrase-extract/domain.cpp b/phrase-extract/DomainFeature.cpp
index 67b4a13c3..2f99a8709 100644
--- a/phrase-extract/domain.cpp
+++ b/phrase-extract/DomainFeature.cpp
@@ -1,6 +1,5 @@
-// $Id$
-//#include "beammain.h"
-#include "domain.h"
+#include "DomainFeature.h"
+#include "ExtractionPhrasePair.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
 #include "SafeGetline.h"
@@ -26,7 +25,7 @@ void Domain::load( const std::string &domainFileName )
     int lineNumber;
     if (domainSpecLine.size() != 2 ||
         ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
-      cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
+      std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
       exit(1);
     }
     // store
@@ -50,29 +49,34 @@ string Domain::getDomainOfSentence( int sentenceId ) const
   return "undefined";
 }
 
-DomainFeature::DomainFeature(const string& domainFile)
+DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
 {
   //process domain file
   m_domain.load(domainFile);
 }
 
+void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+                                              float count,
+                                              int sentenceId) const
+{
+  // store the domain of the given sentence as a property of the phrase pair, with the given count
+  phrasePair.AddProperty(m_propertyKey, m_domain.getDomainOfSentence(sentenceId), count);
+}
+
 void DomainFeature::add(const ScoreFeatureContext& context,
                         std::vector<float>& denseValues,
                         std::map<std::string,float>& sparseValues)  const
 {
-  map< string, float > domainCount;
-  for(size_t i=0; i<context.phrasePair.size(); i++) {
-    string d = m_domain.getDomainOfSentence(context.phrasePair[i]->sentenceId );
-    if (domainCount.find( d ) == domainCount.end()) {
-      domainCount[d] = context.phrasePair[i]->count;
-    } else {
-      domainCount[d] += context.phrasePair[i]->count;
-    }
-  }
-  add(domainCount, context.count, context.maybeLog, denseValues, sparseValues);
+  const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey); // counts per value stored under m_propertyKey
+  assert( domainCount != NULL ); // GetProperty() returns NULL when nothing is stored under the key
+  add(*domainCount, 
+      context.phrasePair.GetCount(), 
+      context.maybeLog, 
+      denseValues, sparseValues);
 }
 
-void SubsetDomainFeature::add(const map<string,float>& domainCount,float count,
+void SubsetDomainFeature::add(const map<string,float>& domainCount, 
+                              float count,
                               const MaybeLog& maybeLog,
                               std::vector<float>& denseValues,
                               std::map<std::string,float>& sparseValues)  const
@@ -152,7 +156,6 @@ void IndicatorDomainFeature::add(const map<string,float>& domainCount,float coun
       denseValues.push_back(maybeLog(2.718));
     }
   }
-
 }
 
 void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
@@ -166,12 +169,5 @@ void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,floa
   }
 }
 
-bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
-{
-  return m_domain.getDomainOfSentence(lhs.sentenceId) ==
-         m_domain.getDomainOfSentence( rhs.sentenceId);
-}
-
-
 }
 
diff --git a/phrase-extract/domain.h b/phrase-extract/DomainFeature.h
index 279496e01..8ebc599e2 100644
--- a/phrase-extract/domain.h
+++ b/phrase-extract/DomainFeature.h
@@ -34,13 +34,17 @@ class DomainFeature : public ScoreFeature
 public:
 
   DomainFeature(const std::string& domainFile);
-  bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
+
+  void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair, 
+                                 float count, 
+                                 int sentenceId) const;
+
   void add(const ScoreFeatureContext& context,
            std::vector<float>& denseValues,
            std::map<std::string,float>& sparseValues) const;
 
 protected:
-  /** Overriden in subclass */
+  /** Overridden in subclass */
   virtual void add(const std::map<std::string,float>& domainCounts, float count,
                    const MaybeLog& maybeLog,
                    std::vector<float>& denseValues,
@@ -49,6 +53,8 @@ protected:
 
   Domain m_domain;
 
+  const std::string m_propertyKey;
+
 };
 
 class SubsetDomainFeature : public DomainFeature
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp
new file mode 100644
index 000000000..e2814f33c
--- /dev/null
+++ b/phrase-extract/ExtractionPhrasePair.cpp
@@ -0,0 +1,327 @@
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2009 University of Edinburgh
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+#include <sstream>
+#include "ExtractionPhrasePair.h"
+#include "SafeGetline.h"
+#include "tables-core.h"
+#include "score.h"
+#include "moses/Util.h"
+
+#include <cstdlib>
+
+using namespace std;
+
+
+namespace MosesTraining {
+
+
+extern Vocabulary vcbT;
+extern Vocabulary vcbS;
+
+extern bool hierarchicalFlag;
+
+
+ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
+                                            const PHRASE *phraseTarget,
+                                            ALIGNMENT *targetToSourceAlignment,
+                                            float count, float pcfgSum ) :
+    m_phraseSource(phraseSource),
+    m_phraseTarget(phraseTarget),
+    m_count(count),
+    m_pcfgSum(pcfgSum)
+{
+  // the phrases are passed by pointer: check the pointers and the phrases they point to
+  assert(phraseSource != NULL && !phraseSource->empty());
+  assert(phraseTarget != NULL && !phraseTarget->empty());
+
+  // m_count and m_pcfgSum are already set by the initializer list;
+  // store the first alignment together with the count of this first occurrence
+  std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
+      m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
+
+  m_lastTargetToSourceAlignment = insertedAlignment.first;
+  m_lastCount = count;
+  m_lastPcfgSum = pcfgSum;
+
+  m_isValid = true;
+}
+
+
+ExtractionPhrasePair::~ExtractionPhrasePair( ) {
+  Clear(); // frees the phrases, alignments and property maps held by this object
+}
+
+
+// Accumulates another occurrence of this phrase pair with the given alignment.
+// return value: true if the given alignment was seen for the first time and thus will be stored,
+//               false if it was present already (the pointer may thus be deleted)
+bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
+                                float count, float pcfgSum )
+{
+  typedef std::map<ALIGNMENT*,float>::iterator AlignmentIter;
+  m_count += count;
+  m_lastCount = count;
+  m_pcfgSum += pcfgSum;
+  m_lastPcfgSum = pcfgSum;
+
+  // cheap check first: same alignment content as the entry that was used most recently
+  if ( *(m_lastTargetToSourceAlignment->first) == *targetToSourceAlignment ) {
+    m_lastTargetToSourceAlignment->second += count;
+    return false;
+  }
+
+  const std::pair<AlignmentIter,bool> insertion =
+      m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
+  if ( !insertion.second ) {
+    insertion.first->second += count; // the alignment already exists: increment count
+    return false;
+  }
+
+  m_lastTargetToSourceAlignment = insertion.first;
+  return true;
+}
+
+
+void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
+{
+  typedef std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator PropertyIter;
+  // raise the totals and the count of the alignment that was used most recently
+  m_count += count;
+  m_pcfgSum += pcfgSum;
+  m_lastTargetToSourceAlignment->second += count;
+  // properties: raise the count of the value tracked as the last one for every key
+  for ( PropertyIter prop=m_properties.begin(); prop!=m_properties.end(); ++prop ) {
+    (*(prop->second.second))->second += count;
+  }
+
+  m_lastCount = count;
+  m_lastPcfgSum = pcfgSum;
+}
+
+
+// Returns true if the other phrase pair has the same target and source phrase
+// and, checked last via MatchesAlignment(), an equal non-terminal alignment (SCFG rules).
+bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
+                                    const PHRASE *otherPhraseTarget,
+                                    ALIGNMENT *otherTargetToSourceAlignment ) const
+{
+  // lexical comparison: target side first, then source side
+  const bool lexicalMatch = !(*otherPhraseTarget != *m_phraseTarget)
+                            && !(*otherPhraseSource != *m_phraseSource);
+  if (!lexicalMatch) {
+    return false;
+  }
+
+  return MatchesAlignment( otherTargetToSourceAlignment );
+}
+
+// Check for lexical match
+// and in case of SCFG rules for equal non-terminal alignment.
+// Set boolean indicators:
+//   sourceMatch    - the source phrases are equal
+//   targetMatch    - the target phrases are equal
+//   alignmentMatch - MatchesAlignment() holds
+// Returns true only if all three checks succeed.
+// (Note that we check in the order: source - target - alignment
+//  and do not touch the subsequent boolean indicators once a previous one has been set to false.)
+bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
+                                    const PHRASE *otherPhraseTarget,
+                                    ALIGNMENT *otherTargetToSourceAlignment,
+                                    bool &sourceMatch,
+                                    bool &targetMatch,
+                                    bool &alignmentMatch ) const
+{
+  // 1. source side
+  sourceMatch = !(*otherPhraseSource != *m_phraseSource);
+  if (!sourceMatch) {
+    return false;
+  }
+
+  // 2. target side (only reached if the source side matched)
+  targetMatch = !(*otherPhraseTarget != *m_phraseTarget);
+  if (!targetMatch) {
+    return false;
+  }
+
+  // 3. alignment (only reached if both sides matched)
+  alignmentMatch = MatchesAlignment(otherTargetToSourceAlignment);
+  return alignmentMatch;
+}
+
+// Check for equal non-terminal alignment in case of SCFG rules (always true if hierarchicalFlag is not set).
+// Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first
+bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
+{
+  if (!hierarchicalFlag) return true;
+
+  // all or none of the phrasePair's word alignment matrices match, so just pick one
+  const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;
+
+  assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
+  assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());
+
+  // loop over all symbols but the left hand side of the rule (i+1<size cannot wrap around for size 0)
+  for (size_t i=0; i+1<thisTargetToSourceAlignment->size(); ++i) {
+    if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
+      const std::set<size_t> &thisAligned  = thisTargetToSourceAlignment->at(i);
+      const std::set<size_t> &otherAligned = otherTargetToSourceAlignment->at(i);
+      // test the sizes before dereferencing begin(): that is invalid on an empty set
+      if (thisAligned.size() != 1 ||
+          otherAligned.size() != 1 ||
+          *thisAligned.begin() != *otherAligned.begin()) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+void ExtractionPhrasePair::Clear()
+{
+  // free the phrases and reset the pointers: Clear() runs again in the destructor and must not delete them twice
+  delete m_phraseSource;
+  m_phraseSource = NULL;
+  delete m_phraseTarget;
+  m_phraseTarget = NULL;
+  m_count = 0.0f;
+  m_pcfgSum = 0.0f;
+
+  for ( std::map<ALIGNMENT*,float>::iterator iter=m_targetToSourceAlignments.begin();
+        iter!=m_targetToSourceAlignments.end(); ++iter) {
+    delete iter->first;
+  }
+  m_targetToSourceAlignments.clear();
+
+  for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
+        iter!=m_properties.end(); ++iter) {
+    delete (iter->second).second;
+    delete (iter->second).first;
+  }
+  m_properties.clear();
+  m_lastCount = 0.0f;
+  m_lastPcfgSum = 0.0f;
+  m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();
+  m_isValid = false;
+}
+
+
+// Splits the given string at every "{{", cuts each piece before the character preceding its last "}"
+// and adds the remainder, split at the first blank into key and value, with the given count.
+void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count )
+{
+  if (propertiesString.empty()) return;
+
+  vector<std::string> toks;
+  Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
+  for (size_t i = 1; i < toks.size(); ++i) { // toks[0] is whatever precedes the first "{{"
+    std::string &tok = toks[i];
+    if (tok.empty()) continue;
+    // drop the closing braces and anything that follows them
+    size_t endPos = tok.rfind("}");
+    tok = tok.substr(0, endPos - 1);
+
+    vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
+    assert(keyValue.size() == 2);
+    if (keyValue.size() != 2) continue; // asserts may be compiled out: never index past the end
+    AddProperty(keyValue[0], keyValue[1], count);
+  }
+}
+
+
+const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
+{
+  typedef std::map<ALIGNMENT*,float>::const_iterator AlignmentIter;
+
+  // pick the alignment with the highest count; on equal counts the alignment that compares greater wins
+  const AlignmentIter end = m_targetToSourceAlignments.end();
+  AlignmentIter best = end;
+  float bestCount = -1;
+
+  for (AlignmentIter cand=m_targetToSourceAlignments.begin(); cand!=end; ++cand) {
+    const bool moreFrequent = (cand->second > bestCount);
+    const bool tieBreak = !moreFrequent && (cand->second == bestCount)
+                          && (*(cand->first) > *(best->first));
+    if (moreFrequent || tieBreak) {
+      bestCount = cand->second;
+      best = cand;
+    }
+  }
+
+  if ( best == end ) return NULL; // no alignment stored
+  return best->first;
+}
+
+
+const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const
+{
+  // GetProperty() yields NULL if nothing is stored under this key
+  const PROPERTY_VALUES *values = GetProperty( key );
+  if ( values == NULL ) {
+    return NULL;
+  }
+
+  // pick the value with the highest count; on equal counts the value that compares greater wins
+  float bestCount = -1;
+  PROPERTY_VALUES::const_iterator best = values->end();
+
+  for (PROPERTY_VALUES::const_iterator cand=values->begin(); cand!=values->end(); ++cand) {
+    const bool moreFrequent = (cand->second > bestCount);
+    const bool tieBreak = !moreFrequent && (cand->second == bestCount)
+                          && (cand->first > best->first);
+    if (moreFrequent || tieBreak) {
+      bestCount = cand->second;
+      best = cand;
+    }
+  }
+
+  if ( best == values->end() ) {
+    return NULL;
+  }
+  return &(best->first);
+}
+
+
+// Concatenates all values stored for the given property key together with their counts.
+// The values appear in the iteration order of the underlying std::map, i.e. sorted by value.
+// Returns the empty string if nothing is stored under the key.
+std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const
+{
+  const PROPERTY_VALUES *values = GetProperty( key );
+  if ( values == NULL ) {
+    return "";
+  }
+
+  // output format: "value1 count1 value2 count2 ..."
+  std::ostringstream out;
+  // a blank precedes every entry but the first one
+  const char *separator = "";
+  for (PROPERTY_VALUES::const_iterator entry=values->begin(); entry!=values->end(); ++entry) {
+    out << separator
+        << entry->first << " " << entry->second;
+    separator = " ";
+  }
+
+  return out.str();
+}
+
+
+}
+
diff --git a/phrase-extract/ExtractionPhrasePair.h b/phrase-extract/ExtractionPhrasePair.h
new file mode 100644
index 000000000..f04984391
--- /dev/null
+++ b/phrase-extract/ExtractionPhrasePair.h
@@ -0,0 +1,162 @@
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2009 University of Edinburgh
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+#pragma once
+#include "tables-core.h"
+
+#include <vector>
+#include <set>
+#include <map>
+
+namespace MosesTraining {
+
+
+typedef std::vector< std::set<size_t> > ALIGNMENT;
+
+
+class ExtractionPhrasePair {
+  // Collects the counts, alignments and additional properties of one phrase pair.
+protected:
+
+  typedef std::map<std::string,float> PROPERTY_VALUES;               // property value -> count
+  typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE; // entry tracked as the last one used
+
+  // NOTE: the objects held by pointer are deleted in Clear(); copying an instance would delete them twice
+  bool m_isValid; // set in the constructor, reset by Clear()
+
+  const PHRASE *m_phraseSource;
+  const PHRASE *m_phraseTarget;
+
+  float m_count;
+  float m_pcfgSum;
+
+  std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
+  std::map<std::string,
+           std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;
+
+  float m_lastCount;
+  float m_lastPcfgSum;
+  std::map<ALIGNMENT*,float>::iterator m_lastTargetToSourceAlignment;
+
+public:
+
+  ExtractionPhrasePair( const PHRASE *phraseSource,
+                        const PHRASE *phraseTarget,
+                        ALIGNMENT *targetToSourceAlignment,
+                        float count, float pcfgSum );
+
+  ~ExtractionPhrasePair();
+
+  bool Add( ALIGNMENT *targetToSourceAlignment,
+            float count, float pcfgSum );
+
+  void IncrementPrevious( float count, float pcfgSum );
+
+  bool Matches( const PHRASE *otherPhraseSource,
+                const PHRASE *otherPhraseTarget,
+                ALIGNMENT *otherTargetToSourceAlignment ) const;
+
+  bool Matches( const PHRASE *otherPhraseSource,
+                const PHRASE *otherPhraseTarget,
+                ALIGNMENT *otherTargetToSourceAlignment,
+                bool &sourceMatch,
+                bool &targetMatch,
+                bool &alignmentMatch ) const;
+
+  bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const;
+
+  void Clear();
+
+  bool IsValid() const {
+    return m_isValid;
+  }
+
+  // accessors
+  const PHRASE *GetSource() const {
+    return m_phraseSource;
+  }
+
+  const PHRASE *GetTarget() const {
+    return m_phraseTarget;
+  }
+
+  float GetCount() const {
+    return m_count;
+  }
+
+  float GetPcfgScore() const {
+    return m_pcfgSum;
+  }
+
+  size_t GetNumberOfProperties() const {
+    return m_properties.size();
+  }
+
+  const std::map<std::string,float> *GetProperty( const std::string &key ) const {
+    std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::const_iterator iter;
+    iter = m_properties.find(key);
+    if (iter == m_properties.end()) {
+      return NULL;
+    } else {
+      return iter->second.first;
+    }
+  }
+
+  const ALIGNMENT *FindBestAlignmentTargetToSource() const;
+
+  const std::string *FindBestPropertyValue(const std::string &key) const;
+
+  std::string CollectAllPropertyValues(const std::string &key) const;
+
+  void AddProperties( const std::string &str, float count );
+
+  void AddProperty( const std::string &key, const std::string &value, float count )
+  {
+    std::map<std::string,
+             std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
+    if ( iter == m_properties.end() ) {
+      // key not found: insert property key and value
+      PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
+      std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
+      LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
+      m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue);
+    } else {
+      LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
+      if ( (*lastPropertyValue)->first == value ) { // same property key-value pair has been seen right before
+        // property key-value pair exists already: add count
+        (*lastPropertyValue)->second += count;
+      } else { // need to check whether the property key-value pair has appeared before (insert if not)
+        // property key exists, but not in combination with this value:
+        // add new value with count
+        PROPERTY_VALUES *propertyValues = (iter->second).first;
+        std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
+        if ( !insertedProperty.second ) { // property value for this key appeared before: add count
+          insertedProperty.first->second += count;
+        }
+        // from now on this entry is the last one used: retarget the tracked iterator in place
+        // (no need to free and re-allocate the LAST_PROPERTY_VALUE object)
+        *lastPropertyValue = insertedProperty.first;
+      }
+    }
+  }
+
+};
+
+}
+
diff --git a/phrase-extract/InternalStructFeature.cpp b/phrase-extract/InternalStructFeature.cpp
index e0e9fd3e2..3757b0e43 100644
--- a/phrase-extract/InternalStructFeature.cpp
+++ b/phrase-extract/InternalStructFeature.cpp
@@ -1,50 +1,30 @@
 #include "InternalStructFeature.h"
+#include <map>
 
 using namespace std;
 
 namespace MosesTraining
 {
 
-InternalStructFeature::InternalStructFeature()
-	:m_type(0){
-	//cout<<"InternalStructFeature: Construct "<<m_type<<"\n";
-}
-
-bool InternalStructFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const{
-	//cout<<"InternalStructFeature: Equals\n";
-	//don't know what it's used for and what we should compare
-	//-> if the dense score is the same
-	//-> if the sparse feature is set
-	// compare phrases? with the internalStrucutre string?
-	/** Return true if the two phrase pairs are equal from the point of this feature. Assume
-	      that they already compare true according to PhraseAlignment.equals()
-	   **/
-
-/*	if(lhs.ghkmParse==rhs.ghkmParse)
-		return true;
-	else
-		return false;
-*/
-	//return true;
-}
-
 void InternalStructFeature::add(const ScoreFeatureContext& context,
-	                   std::vector<float>& denseValues,
-	                   std::map<std::string,float>& sparseValues) const{
-	for(size_t i=0; i<context.phrasePair.size(); i++) {
-		add(&context.phrasePair[i]->treeFragment, denseValues, sparseValues);
-	}
-
+                                std::vector<float>& denseValues,
+                                std::map<std::string,float>& sparseValues) const {
+  const std::map<std::string,float> *allTrees = context.phrasePair.GetProperty("Tree"); // or would we rather want to take the most frequent one only?
+  if ( allTrees == NULL ) return; // GetProperty() returns NULL if the phrase pair has no "Tree" property
+  for ( std::map<std::string,float>::const_iterator iter=allTrees->begin(); iter!=allTrees->end(); ++iter ) {
+    add(&(iter->first), iter->second, denseValues, sparseValues);
+  }
 }
 
-void InternalStructFeatureDense::add(std::string *internalStruct,
-	                   std::vector<float>& denseValues,
-	                   std::map<std::string,float>& sparseValues) const{
+void InternalStructFeatureDense::add(const std::string *treeFragment,
+                                     float count,
+                                     std::vector<float>& denseValues,
+                                     std::map<std::string,float>& sparseValues) const {
 	//cout<<"Dense: "<<*internalStruct<<endl;
 	size_t start=0;
 	int countNP=0;
-	while((start = internalStruct->find("NP", start)) != string::npos) {
-		countNP++;
+	while((start = treeFragment->find("NP", start)) != string::npos) {
+		countNP += count;
 		start+=2; //length of "NP"
 	}
 	//should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln?
@@ -53,21 +33,21 @@ void InternalStructFeatureDense::add(std::string *internalStruct,
 
 }
 
-void InternalStructFeatureSparse::add(std::string *internalStruct,
-	                   std::vector<float>& denseValues,
-	                   std::map<std::string,float>& sparseValues) const{
-	//cout<<"Sparse: "<<*internalStruct<<endl;
-	if(internalStruct->find("VBZ")!=std::string::npos)
-		sparseValues["NTVBZ"] = 1;
-	if(internalStruct->find("VBD")!=std::string::npos)
-			sparseValues["NTVBD"] = 1;
-	if(internalStruct->find("VBP")!=std::string::npos)
-				sparseValues["NTVBP"] = 1;
-	if(internalStruct->find("PP")!=std::string::npos)
-				sparseValues["NTPP"] = 1;
-	if(internalStruct->find("SBAR")!=std::string::npos)
-				sparseValues["NTSBAR"] = 1;
-
+void InternalStructFeatureSparse::add(const std::string *treeFragment,
+                                      float count,
+                                      std::vector<float>& denseValues,
+                                      std::map<std::string,float>& sparseValues) const {
+  // For each label below that occurs as a substring of the tree fragment, add count to the sparse feature "NT" + label.
+  if(treeFragment->find("VBZ")!=std::string::npos)
+    sparseValues["NTVBZ"] += count;
+  if(treeFragment->find("VBD")!=std::string::npos)
+    sparseValues["NTVBD"] += count;
+  if(treeFragment->find("VBP")!=std::string::npos)
+    sparseValues["NTVBP"] += count;
+  if(treeFragment->find("PP")!=std::string::npos)
+    sparseValues["NTPP"] += count;
+  if(treeFragment->find("SBAR")!=std::string::npos)
+    sparseValues["NTSBAR"] += count;
 }
 
 
diff --git a/phrase-extract/InternalStructFeature.h b/phrase-extract/InternalStructFeature.h
index bd513a715..7a6efec1d 100644
--- a/phrase-extract/InternalStructFeature.h
+++ b/phrase-extract/InternalStructFeature.h
@@ -21,22 +21,19 @@ namespace MosesTraining
 class InternalStructFeature : public ScoreFeature
 {
 public:
-	InternalStructFeature();
-	/** Return true if the two phrase pairs are equal from the point of this feature. Assume
-	      that they already compare true according to PhraseAlignment.equals()
-	   **/
-	bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
+        InternalStructFeature() : m_type(0) {};
 	/** Add the values for this feature function. */
 	void add(const ScoreFeatureContext& context,
-	                   std::vector<float>& denseValues,
-	                   std::map<std::string,float>& sparseValues) const;
+                 std::vector<float>& denseValues,
+                 std::map<std::string,float>& sparseValues) const;
 
 
 protected:
-	/** Overriden in subclass */
-	 virtual void add(std::string *internalStruct,
-	                   std::vector<float>& denseValues,
-	                   std::map<std::string,float>& sparseValues) const = 0;
+	/** Overridden in subclass */
+	 virtual void add(const std::string *treeFragment,
+                          float count,
+                          std::vector<float>& denseValues,
+                          std::map<std::string,float>& sparseValues) const = 0;
 	int m_type;
 
 };
@@ -47,9 +44,10 @@ public:
 	InternalStructFeatureDense()
 		:InternalStructFeature(){m_type=1;} //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
 protected:
-	virtual void add(std::string *internalStruct,
-		             std::vector<float>& denseValues,
-		             std::map<std::string,float>& sparseValues) const;
+	virtual void add(const std::string *treeFragment,
+                         float count,
+                         std::vector<float>& denseValues,
+                         std::map<std::string,float>& sparseValues) const;
 };
 
 class InternalStructFeatureSparse : public InternalStructFeature
@@ -58,9 +56,10 @@ public:
 	InternalStructFeatureSparse()
 		:InternalStructFeature(){m_type=2;}// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
 protected:
-	virtual void add(std::string *internalStruct,
-		             std::vector<float>& denseValues,
-		             std::map<std::string,float>& sparseValues) const;
+	virtual void add(const std::string *treeFragment,
+                         float count,
+                         std::vector<float>& denseValues,
+                         std::map<std::string,float>& sparseValues) const;
 };
 
 }
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
index e66ecb9b3..50fed2973 100644
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@@ -1,19 +1,19 @@
-local most-deps = [ glob *.cpp : PhraseAlignment.cpp *Test.cpp *-main.cpp ] ;
+local most-deps = [ glob *.cpp : ExtractionPhrasePair.cpp *Test.cpp *-main.cpp ] ;
 #Build .o files with include path setting, reused. 
 for local d in $(most-deps) {
   obj $(d:B).o : $(d) ;
 }
 #and stuff them into an alias.
-alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ../moses//ThreadPool ../util//kenutil ;
+alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ../moses//ThreadPool ../moses//Util ../util//kenutil ;
 
-#PhraseAlignment.cpp requires that main define some global variables.  
+#ExtractionPhrasePair.cpp requires that main define some global variables.  
 #Build the mains that do not need these global variables.  
 for local m in [ glob *-main.cpp : score-main.cpp ] {
   exe [ MATCH "(.*)-main.cpp" : $(m) ] : $(m) deps ;
 }
 
-#The side dishes that use PhraseAlignment.cpp
-exe score : PhraseAlignment.cpp score-main.cpp deps ;
+#The side dishes that use ExtractionPhrasePair.cpp
+exe score : ExtractionPhrasePair.cpp score-main.cpp deps ;
 
 import testing ;
-run ScoreFeatureTest.cpp PhraseAlignment.cpp deps ..//boost_unit_test_framework ..//boost_iostreams : : test.domain ;
+run ScoreFeatureTest.cpp ExtractionPhrasePair.cpp deps ..//boost_unit_test_framework ..//boost_iostreams : : test.domain ;
diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp
deleted file mode 100644
index 075e1b18c..000000000
--- a/phrase-extract/PhraseAlignment.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- *  PhraseAlignment.cpp
- *  extract
- *
- *  Created by Hieu Hoang on 28/07/2010.
- *  Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-
-#include <sstream>
-#include "PhraseAlignment.h"
-#include "SafeGetline.h"
-#include "tables-core.h"
-#include "score.h"
-
-#include <cstdlib>
-
-using namespace std;
-
-namespace MosesTraining
-{
-
-extern Vocabulary vcbT;
-extern Vocabulary vcbS;
-
-extern bool hierarchicalFlag;
-
-//! convert string to variable of type T. Used to reading floats, int etc from files
-template<typename T>
-inline T Scan(const std::string &input)
-{
-  std::stringstream stream(input);
-  T ret;
-  stream >> ret;
-  return ret;
-}
-
-
-//! speeded up version of above
-template<typename T>
-inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
-{
-  output.resize(input.size());
-  for (size_t i = 0 ; i < input.size() ; i++) {
-    output[i] = Scan<T>( input[i] );
-  }
-}
-
-
-inline void Tokenize(std::vector<std::string> &output
-                     , const std::string& str
-                     , const std::string& delimiters = " \t")
-{
-  // Skip delimiters at beginning.
-  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
-  // Find first "non-delimiter".
-  std::string::size_type pos     = str.find_first_of(delimiters, lastPos);
-
-  while (std::string::npos != pos || std::string::npos != lastPos) {
-    // Found a token, add it to the vector.
-    output.push_back(str.substr(lastPos, pos - lastPos));
-    // Skip delimiters.  Note the "not_of"
-    lastPos = str.find_first_not_of(delimiters, pos);
-    // Find next "non-delimiter"
-    pos = str.find_first_of(delimiters, lastPos);
-  }
-}
-
-// speeded up version of above
-template<typename T>
-inline void Tokenize( std::vector<T> &output
-                      , const std::string &input
-                      , const std::string& delimiters = " \t")
-{
-  std::vector<std::string> stringVector;
-  Tokenize(stringVector, input, delimiters);
-  return Scan<T>(output, stringVector );
-}
-
-// read in a phrase pair and store it
-void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFlag )
-{
-  assert(phraseS.empty());
-  assert(phraseT.empty());
-  treeFragment.clear();
-
-  vector< string > token = tokenize( line );
-  int item = 1;
-  for (size_t j=0; j<token.size(); j++) {
-    if (token[j] == "|||") item++;
-    else if (item == 1) { // source phrase
-      phraseS.push_back( vcbS.storeIfNew( token[j] ) );
-    }
-
-    else if (item == 2) { // target phrase
-      phraseT.push_back( vcbT.storeIfNew( token[j] ) );
-    } else if (item == 3) { // alignment
-      int s,t;
-      sscanf(token[j].c_str(), "%d-%d", &s, &t);
-      if ((size_t)t >= phraseT.size() || (size_t)s >= phraseS.size()) {
-        cerr << "WARNING: phrase pair " << lineID
-             << " has alignment point (" << s << ", " << t
-             << ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n";
-      } else {
-        // first alignment point? -> initialize
-        createAlignVec(phraseS.size(), phraseT.size());
-
-        // add alignment point
-        alignedToT[t].insert( s );
-        alignedToS[s].insert( t );
-      }
-    } else if ( (item >= 4) && (token[j] == "Tree") ) { // check for information with a key field
-      ++j;
-      while ( (j < token.size() ) && (token[j] != "|||") ) {
-        treeFragment.append(" ");
-        treeFragment.append(token[j]);
-        ++j;
-      }
-      --j;
-    } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
-      sscanf(token[j].c_str(), "%d", &sentenceId);
-    } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
-      sscanf(token[j].c_str(), "%f", &count);
-    } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
-      float pcfgScore = std::atof(token[j].c_str());
-      pcfgSum = pcfgScore * count;
-    }
-  }
-
-  createAlignVec(phraseS.size(), phraseT.size());
-
-  if (item + (includeSentenceIdFlag?-1:0) == 3) {
-    count = 1.0;
-  }
-  if (item < 3 || item > 6) {
-    cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
-  }
-}
-
-void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)
-{
-  // in case of no align info. always need align info, even if blank
-  if (alignedToT.size() == 0) {
-    size_t numTgtSymbols = (hierarchicalFlag ? targetSize-1 : targetSize);
-    alignedToT.resize(numTgtSymbols);
-  }
-
-  if (alignedToS.size() == 0) {
-    size_t numSrcSymbols = (hierarchicalFlag ? sourceSize-1 : sourceSize);
-    alignedToS.resize(numSrcSymbols);
-  }
-}
-
-void PhraseAlignment::clear()
-{
-  phraseS.clear();
-  phraseT.clear();
-  alignedToT.clear();
-  alignedToS.clear();
-}
-
-// check if two word alignments between a phrase pair are the same
-bool PhraseAlignment::equals( const PhraseAlignment& other )
-{
-  if (this == &other) return true;
-  if (other.GetTarget() != GetTarget()) return false;
-  if (other.GetSource() != GetSource()) return false;
-  if (other.alignedToT != alignedToT) return false;
-  if (other.alignedToS != alignedToS) return false;
-  return true;
-}
-
-// check if two word alignments between a phrase pairs "match"
-// i.e. they do not differ in the alignment of non-termimals
-bool PhraseAlignment::match( const PhraseAlignment& other )
-{
-  if (this == &other) return true;
-  if (other.GetTarget() != GetTarget()) return false;
-  if (other.GetSource() != GetSource()) return false;
-  if (!hierarchicalFlag) return true;
-
-  assert(phraseT.size() == alignedToT.size() + 1);
-  assert(alignedToT.size() == other.alignedToT.size());
-
-  // loop over all words (note: 0 = left hand side of rule)
-  for(size_t i=0; i<phraseT.size()-1; i++) {
-    if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
-      if (alignedToT[i].size() != 1 ||
-          other.alignedToT[i].size() != 1 ||
-          *(alignedToT[i].begin()) != *(other.alignedToT[i].begin()))
-        return false;
-    }
-  }
-  return true;
-}
-
-int PhraseAlignment::Compare(const PhraseAlignment &other) const
-{
-  if (this == &other) // comparing with itself
-    return 0;
-
-  if (GetTarget() != other.GetTarget())
-    return ( GetTarget() < other.GetTarget() ) ? -1 : +1;
-
-  if (GetSource() != other.GetSource())
-    return ( GetSource() < other.GetSource() ) ? -1 : +1;
-
-  if (!hierarchicalFlag)
-    return 0;
-
-  // loop over all words (note: 0 = left hand side of rule)
-  for(size_t i=0; i<phraseT.size()-1; i++) {
-    if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
-      size_t thisAlign = *(alignedToT[i].begin());
-      size_t otherAlign = *(other.alignedToT[i].begin());
-
-      if (alignedToT[i].size() != 1 ||
-          other.alignedToT[i].size() != 1 ||
-          thisAlign != otherAlign) {
-        int ret = (thisAlign < otherAlign) ? -1 : +1;
-        return ret;
-      }
-    }
-  }
-  return 0;
-
-}
-
-}
-
diff --git a/phrase-extract/PhraseAlignment.h b/phrase-extract/PhraseAlignment.h
deleted file mode 100644
index 0e854c136..000000000
--- a/phrase-extract/PhraseAlignment.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#pragma once
-/*
- *  PhraseAlignment.h
- *  extract
- *
- *  Created by Hieu Hoang on 28/07/2010.
- *  Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include "tables-core.h"
-
-#include <vector>
-#include <set>
-
-namespace MosesTraining
-{
-
-// data structure for a single phrase pair
-class PhraseAlignment
-{
-protected:
-  PHRASE phraseS;
-  PHRASE phraseT;
-
-  void createAlignVec(size_t sourceSize, size_t targetSize);
-  void addNTLength(const std::string &tok);
-public:
-  float pcfgSum;
-  float count;
-  int sentenceId;
-  std::string domain;
-  std::string treeFragment;
-
-  std::vector< std::set<size_t> > alignedToT;
-  std::vector< std::set<size_t> > alignedToS;
-
-  void create( char*, int, bool );
-  void clear();
-  bool equals( const PhraseAlignment& );
-  bool match( const PhraseAlignment& );
-
-  int Compare(const PhraseAlignment &compare) const;
-  inline bool operator<(const PhraseAlignment &compare) const {
-    return Compare(compare) < 0;
-  }
-
-  const PHRASE &GetSource() const {
-    return phraseS;
-  }
-  const PHRASE &GetTarget() const {
-    return phraseT;
-  }
-};
-
-class PhraseAlignment;
-
-typedef std::vector<PhraseAlignment*>          PhraseAlignmentCollection;
-//typedef std::vector<PhraseAlignmentCollection> PhrasePairGroup;
-
-class PhraseAlignmentCollectionOrderer
-{
-public:
-  bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const {
-    assert(collA.size() > 0);
-    assert(collB.size() > 0);
-
-    const PhraseAlignment &objA = *collA[0];
-    const PhraseAlignment &objB = *collB[0];
-    bool ret = objA < objB;
-
-    return ret;
-  }
-};
-
-
-//typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> PhrasePairGroup;
-
-class PhrasePairGroup
-{
-private:
-  typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> Coll;
-  Coll m_coll;
-
-
-public:
-  typedef Coll::iterator iterator;
-  typedef Coll::const_iterator const_iterator;
-  typedef std::vector<const PhraseAlignmentCollection *> SortedColl;
-
-  std::pair<Coll::iterator,bool> insert ( const PhraseAlignmentCollection& obj );
-
-  const SortedColl &GetSortedColl() const {
-    return m_sortedColl;
-  }
-  size_t GetSize() const {
-    return m_coll.size();
-  }
-
-private:
-  SortedColl m_sortedColl;
-
-};
-
-
-}
-
diff --git a/phrase-extract/ScoreFeature.cpp b/phrase-extract/ScoreFeature.cpp
index f98759755..52157a8cb 100644
--- a/phrase-extract/ScoreFeature.cpp
+++ b/phrase-extract/ScoreFeature.cpp
@@ -18,7 +18,7 @@
  ***********************************************************************/
 
 #include "ScoreFeature.h"
-#include "domain.h"
+#include "DomainFeature.h"
 #include "InternalStructFeature.h"
 
 using namespace std;
@@ -77,10 +77,10 @@ void ScoreFeatureManager::configure(const std::vector<std::string> args)
       }
       sparseDomainAdded = true;
       m_includeSentenceId = true;
-    } else if(args[i] == "--GHKMFeatureSparse"){
+    } else if(args[i] == "--TreeFeatureSparse"){
     	//MARIA
     	m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
-    } else if(args[i] == "--GHKMFeatureDense"){
+    } else if(args[i] == "--TreeFeatureDense"){
     	//MARIA
     	m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
     } else {
@@ -91,12 +91,13 @@ void ScoreFeatureManager::configure(const std::vector<std::string> args)
 
 }
 
-bool ScoreFeatureManager::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
+void ScoreFeatureManager::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair, 
+                                                    float count, 
+                                                    int sentenceId) const
 {
   for (size_t i = 0; i < m_features.size(); ++i) {
-    if (!m_features[i]->equals(lhs,rhs)) return false;
+    m_features[i]->addPropertiesToPhrasePair(phrasePair, count, sentenceId);
   }
-  return true;
 }
 
 void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context,
diff --git a/phrase-extract/ScoreFeature.h b/phrase-extract/ScoreFeature.h
index 5823f21ce..926397e71 100644
--- a/phrase-extract/ScoreFeature.h
+++ b/phrase-extract/ScoreFeature.h
@@ -35,7 +35,7 @@
 
 #include "util/exception.hh"
 
-#include "PhraseAlignment.h"
+#include "ExtractionPhrasePair.h"
 
 namespace MosesTraining
 {
@@ -64,17 +64,14 @@ public:
 /** Passed to each feature to be used to calculate its values */
 struct ScoreFeatureContext {
   ScoreFeatureContext(
-    const PhraseAlignmentCollection &thePhrasePair,
-    float theCount, /* Total counts of all phrase pairs*/
+    const ExtractionPhrasePair &thePhrasePair,
     const MaybeLog& theMaybeLog
   ) :
     phrasePair(thePhrasePair),
-    count(theCount),
     maybeLog(theMaybeLog) {
   }
 
-  const PhraseAlignmentCollection& phrasePair;
-  float count;
+  const ExtractionPhrasePair &phrasePair;
   MaybeLog maybeLog;
 };
 
@@ -85,16 +82,19 @@ struct ScoreFeatureContext {
 class ScoreFeature
 {
 public:
+
+  /** Hook that lets a feature store properties in an ExtractionPhrasePair,
+   *  e.g. external information it loaded that depends on the sentence ID of
+   *  a phrase occurrence. The default implementation does nothing. */
+  virtual void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+                                         float count,
+                                         int sentenceId) const {}
+
   /** Add the values for this feature function. */
   virtual void add(const ScoreFeatureContext& context,
                    std::vector<float>& denseValues,
                    std::map<std::string,float>& sparseValues) const = 0;
 
-  /** Return true if the two phrase pairs are equal from the point of this feature. Assume
-      that they already compare true according to PhraseAlignment.equals()
-   **/
-  virtual bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const = 0;
-
   virtual ~ScoreFeature() {}
 
 };
@@ -112,17 +112,18 @@ public:
   /** Pass the unused command-line arguments to configure the extra features */
   void configure(const std::vector<std::string> args);
 
+  /** Forward to addPropertiesToPhrasePair() of every configured feature, so
+   *  that each feature can store its properties in the ExtractionPhrasePair,
+   *  e.g. external information that depends on the sentence ID */
+  void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair, 
+                                 float count, 
+                                 int sentenceId) const;
+
   /** Add all the features */
   void addFeatures(const ScoreFeatureContext& context,
                    std::vector<float>& denseValues,
                    std::map<std::string,float>& sparseValues) const;
 
-  /**
-    * Used to tell if the PhraseAlignment should be considered the same by all
-    * extended features.
-   **/
-  bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
-
   const std::vector<ScoreFeaturePtr>& getFeatures() const {
     return m_features;
   }
diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp
index 65f1f4437..534ab177b 100644
--- a/phrase-extract/ScoreFeatureTest.cpp
+++ b/phrase-extract/ScoreFeatureTest.cpp
@@ -17,7 +17,7 @@
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  ***********************************************************************/
 
-#include "domain.h"
+#include "DomainFeature.h"
 #include "ScoreFeature.h"
 #include "tables-core.h"
 
@@ -93,18 +93,3 @@ BOOST_AUTO_TEST_CASE(manager_config_domain)
   (boost::assign::list_of("--SparseDomainSubset")("/dev/null"));
 }
 
-
-BOOST_AUTO_TEST_CASE(domain_equals)
-{
-  SubsetDomainFeature feature(DomainFileLocation());
-  PhraseAlignment a1,a2,a3;
-  char buf1[] = "a ||| b ||| 0-0 ||| 1";
-  char buf2[] = "a ||| b ||| 0-0 ||| 2";
-  char buf3[] = "a ||| b ||| 0-0 ||| 3";
-  a1.create(buf1, 0, true); //domain a
-  a2.create(buf2, 1, true); //domain c
-  a3.create(buf3, 2, true); //domain c
-  BOOST_CHECK(feature.equals(a2,a3));
-  BOOST_CHECK(!feature.equals(a1,a3));
-  BOOST_CHECK(!feature.equals(a1,a3));
-}
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index d0305fad2..65a12d176 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -21,22 +21,21 @@
 #include <cstdio>
 #include <iostream>
 #include <fstream>
-#include <vector>
 #include <stdlib.h>
 #include <assert.h>
 #include <cstring>
+#include <map>
 #include <set>
+#include <vector>
 #include <algorithm>
 
 #include "SafeGetline.h"
 #include "ScoreFeature.h"
 #include "tables-core.h"
-#include "domain.h"
-#include "PhraseAlignment.h"
+#include "ExtractionPhrasePair.h"
 #include "score.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
-#include "InternalStructFeature.h"
 
 using namespace std;
 using namespace MosesTraining;
@@ -61,111 +60,120 @@ int negLogProb = 1;
 bool lexFlag = true;
 bool unalignedFlag = false;
 bool unalignedFWFlag = false;
-bool singletonFeature = false;
 bool crossedNonTerm = false;
 int countOfCounts[COC_MAX+1];
 int totalDistinct = 0;
 float minCountHierarchical = 0;
+std::map<std::string,float> sourceLHSCounts;
+std::map<std::string, std::map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
+
+std::set<std::string> sourceLabelSet;
+std::map<std::string,size_t> sourceLabels; 
+std::vector<std::string> sourceLabelsByIndex;
 
 Vocabulary vcbT;
 Vocabulary vcbS;
 
 } // namespace
 
-vector<string> tokenize( const char [] );
-
-void writeCountOfCounts( const string &fileNameCountOfCounts );
-void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLog);
-const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
-const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phrasePair );
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLog );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
-double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
-set<string> functionWordList;
+std::vector<std::string> tokenize( const char [] );
+
+void processLine( std::string line,
+                  int lineID, bool includeSentenceIdFlag, int &sentenceId,  
+                  PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
+                  std::string &additionalPropertiesString,
+                  float &count, float &pcfgSum );
+void writeCountOfCounts( const std::string &fileNameCountOfCounts );
+void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile, 
+                         const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
+void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
+double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
+double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
+set<std::string> functionWordList;
 void loadFunctionWords( const string &fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
-void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
-void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
+double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
+int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
+void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
+void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
+void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment );
+
 
 int main(int argc, char* argv[])
 {
-  cerr << "Score v2.0 written by Philipp Koehn\n"
-       << "scoring methods for extracted rules\n";
+  std::cerr << "Score v2.1 -- " 
+            << "scoring methods for extracted rules" << std::endl;
 
   ScoreFeatureManager featureManager;
   if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
-    cerr << featureManager.usage() << endl;
+    std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
+    std::cerr << featureManager.usage() << std::endl;
     exit(1);
   }
-  string fileNameExtract = argv[1];
-  string fileNameLex = argv[2];
-  string fileNamePhraseTable = argv[3];
-  string fileNameCountOfCounts;
-  char* fileNameFunctionWords = NULL;
-  vector<string> featureArgs; //all unknown args passed to feature manager
+  std::string fileNameExtract = argv[1];
+  std::string fileNameLex = argv[2];
+  std::string fileNamePhraseTable = argv[3];
+  std::string fileNameCountOfCounts;
+  std::string fileNameFunctionWords;
+  std::vector<std::string> featureArgs; // all unknown args passed to feature manager
 
   for(int i=4; i<argc; i++) {
     if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
       inverseFlag = true;
-      cerr << "using inverse mode\n";
+      std::cerr << "using inverse mode" << std::endl;
     } else if (strcmp(argv[i],"--Hierarchical") == 0) {
       hierarchicalFlag = true;
-      cerr << "processing hierarchical rules\n";
+      std::cerr << "processing hierarchical rules" << std::endl;
     } else if (strcmp(argv[i],"--PCFG") == 0) {
       pcfgFlag = true;
-      cerr << "including PCFG scores\n";
+      std::cerr << "including PCFG scores" << std::endl;
     } else if (strcmp(argv[i],"--TreeFragments") == 0) {
       treeFragmentsFlag = true;
-      cerr << "including tree fragments from syntactic parse\n";
+      std::cerr << "including tree fragment information from syntactic parse" << std::endl;
     } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
       unpairedExtractFormatFlag = true;
-      cerr << "processing unpaired extract format\n";
+      std::cerr << "processing unpaired extract format" << std::endl;
     } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
       conditionOnTargetLhsFlag = true;
-      cerr << "processing unpaired extract format\n";
+      std::cerr << "conditioning on target left-hand side" << std::endl;
     } else if (strcmp(argv[i],"--NoWordAlignment") == 0) {
       wordAlignmentFlag = false;
-      cerr << "omitting word alignment" << endl;
+      std::cerr << "omitting word alignment" << std::endl;
     } else if (strcmp(argv[i],"--NoLex") == 0) {
       lexFlag = false;
-      cerr << "not computing lexical translation score\n";
+      std::cerr << "not computing lexical translation score" << std::endl;
     } else if (strcmp(argv[i],"--GoodTuring") == 0) {
       goodTuringFlag = true;
-      fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
-      cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
+      fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
+      std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
     } else if (strcmp(argv[i],"--KneserNey") == 0) {
       kneserNeyFlag = true;
-      fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
-      cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
+      fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
+      std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
     } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
       unalignedFlag = true;
-      cerr << "using unaligned word penalty\n";
+      std::cerr << "using unaligned word penalty" << std::endl;
     } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
       unalignedFWFlag = true;
       if (i+1==argc) {
-        cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
+          std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
         exit(1);
       }
       fileNameFunctionWords = argv[++i];
-      cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
+      std::cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << std::endl;
     }  else if (strcmp(argv[i],"--LogProb") == 0) {
       logProbFlag = true;
-      cerr << "using log-probabilities\n";
+      std::cerr << "using log-probabilities" << std::endl;
     } else if (strcmp(argv[i],"--NegLogProb") == 0) {
       logProbFlag = true;
       negLogProb = -1;
-      cerr << "using negative log-probabilities\n";
+      std::cerr << "using negative log-probabilities" << std::endl;
     } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
       minCountHierarchical = atof(argv[++i]);
-      cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
+      std::cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
       minCountHierarchical -= 0.00001; // account for rounding
-    } else if (strcmp(argv[i],"--Singleton") == 0) {
-      singletonFeature = true;
-      cerr << "binary singleton feature\n";
     } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
       crossedNonTerm = true;
-      cerr << "crossed non-term reordering feature\n";
+      std::cerr << "crossed non-term reordering feature" << std::endl;
     } else {
       featureArgs.push_back(argv[i]);
       ++i;
@@ -178,16 +186,20 @@ int main(int argc, char* argv[])
 
   MaybeLog maybeLogProb(logProbFlag, negLogProb);
 
-  //configure extra features
-  if (!inverseFlag) featureManager.configure(featureArgs);
+  // configure extra features
+  if (!inverseFlag) {
+    featureManager.configure(featureArgs);
+  }
 
   // lexical translation table
-  if (lexFlag)
+  if (lexFlag) {
     lexTable.load( fileNameLex );
+  }
 
   // function word list
-  if (unalignedFWFlag)
+  if (unalignedFWFlag) {
     loadFunctionWords( fileNameFunctionWords );
+  }
 
   // compute count of counts for Good Turing discounting
   if (goodTuringFlag || kneserNeyFlag) {
@@ -198,7 +210,7 @@ int main(int argc, char* argv[])
   Moses::InputFileStream extractFile(fileNameExtract);
 
   if (extractFile.fail()) {
-    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
+    std::cerr << "ERROR: could not open extract file " << fileNameExtract << std::endl;
     exit(1);
   }
   istream &extractFileP = extractFile;
@@ -207,76 +219,160 @@ int main(int argc, char* argv[])
   ostream *phraseTableFile;
 
   if (fileNamePhraseTable == "-") {
-    phraseTableFile = &cout;
+    phraseTableFile = &std::cout;
   } else {
     Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
     bool success = outputFile->Open(fileNamePhraseTable);
     if (!success) {
-      cerr << "ERROR: could not open file phrase table file "
-           << fileNamePhraseTable << endl;
+        std::cerr << "ERROR: could not open file phrase table file "
+                  << fileNamePhraseTable << std::endl;
       exit(1);
     }
     phraseTableFile = outputFile;
   }
 
   // loop through all extracted phrase translations
-  float lastCount = 0.0f;
-  float lastPcfgSum = 0.0f;
-  vector< PhraseAlignment > phrasePairsWithSameF;
-  bool isSingleton = true;
-  int i=0;
-  char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
+  char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH];
   lastLine[0] = '\0';
-  PhraseAlignment *lastPhrasePair = NULL;
-  while(true) {
-    if (extractFileP.eof()) break;
-    if (++i % 100000 == 0) cerr << "." << flush;
-    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (extractFileP.eof())	break;
+  ExtractionPhrasePair *phrasePair = NULL;
+  std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
+  std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
 
-    // identical to last line? just add count
-    if (strcmp(line,lastLine) == 0) {
-      lastPhrasePair->count += lastCount;
-      lastPhrasePair->pcfgSum += lastPcfgSum;
-      continue;
+  int tmpSentenceId = 0; // keeps a defined value even if processLine() leaves it untouched
+  PHRASE *tmpPhraseSource = NULL, *tmpPhraseTarget = NULL; // allocated anew for each parsed line
+  ALIGNMENT *tmpTargetToSourceAlignment = NULL;            // allocated anew for each parsed line
+  std::string tmpAdditionalPropertiesString;
+  float tmpCount=0.0f, tmpPcfgSum=0.0f;
+
+  int i=0;
+  SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
+  if ( !extractFileP.eof() ) {
+    ++i;
+    tmpPhraseSource = new PHRASE();
+    tmpPhraseTarget = new PHRASE();
+    tmpTargetToSourceAlignment = new ALIGNMENT();
+    processLine( std::string(line), 
+                 i, featureManager.includeSentenceId(), tmpSentenceId,
+                 tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, 
+                 tmpAdditionalPropertiesString,
+                 tmpCount, tmpPcfgSum);
+    phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget, 
+                                           tmpTargetToSourceAlignment,
+                                           tmpCount, tmpPcfgSum );
+    phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
+    featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
+    phrasePairsWithSameSource.push_back( phrasePair );
+    if ( hierarchicalFlag ) {
+      phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
     }
     strcpy( lastLine, line );
+    SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
+  }
+
+  while ( !extractFileP.eof() ) {
+
+    if ( ++i % 100000 == 0 ) {
+      std::cerr << "." << std::flush;
+    }
 
-    // create new phrase pair
-    PhraseAlignment phrasePair;
-    phrasePair.create( line, i, featureManager.includeSentenceId());
-    lastCount = phrasePair.count;
-    lastPcfgSum = phrasePair.pcfgSum;
-
-    // only differs in count? just add count
-    if (lastPhrasePair != NULL
-        && lastPhrasePair->equals( phrasePair )
-        && featureManager.equals(*lastPhrasePair, phrasePair)) {
-      lastPhrasePair->count += phrasePair.count;
-      lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
+    // identical to last line? just add count
+    if (strcmp(line,lastLine) == 0) {
+      phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
+      SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
       continue;
+    } else {
+      strcpy( lastLine, line );
     }
 
-    // if new source phrase, process last batch
-    if (lastPhrasePair != NULL &&
-        lastPhrasePair->GetSource() != phrasePair.GetSource()) {
-      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb );
+    tmpPhraseSource = new PHRASE();
+    tmpPhraseTarget = new PHRASE();
+    tmpTargetToSourceAlignment = new ALIGNMENT();
+    tmpAdditionalPropertiesString.clear();
+    processLine( std::string(line), 
+                 i, featureManager.includeSentenceId(), tmpSentenceId,
+                 tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, 
+                 tmpAdditionalPropertiesString,
+                 tmpCount, tmpPcfgSum); 
+
+    bool matchesPrevious = false;
+    bool sourceMatch = true; bool targetMatch = true; bool alignmentMatch = true; // be careful with these:
+    // ExtractionPhrasePair::Matches() checks them in order and stops at the first mismatch,
+    // so the flags after the first one that gets set to false are left as they were
+
+    if ( hierarchicalFlag ) {
+      for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
+            iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) {
+        if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
+                               sourceMatch, targetMatch, alignmentMatch ) ) {
+          matchesPrevious = true;
+          phrasePair = (*iter);
+          break;
+        }
+      }
+    } else {
+      if ( phrasePair->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
+                                sourceMatch, targetMatch, alignmentMatch ) ) {
+        matchesPrevious = true;
+      }
+    }
 
-      phrasePairsWithSameF.clear();
-      isSingleton = false;
-      lastPhrasePair = NULL;
+    if ( matchesPrevious ) {
+      delete tmpPhraseSource;
+      delete tmpPhraseTarget;
+      if ( !phrasePair->Add( tmpTargetToSourceAlignment,
+                             tmpCount, tmpPcfgSum ) ) {
+        delete tmpTargetToSourceAlignment;
+      }
+      phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
+      featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
     } else {
-      isSingleton = true;
+
+      if ( !phrasePairsWithSameSource.empty() &&
+           !sourceMatch ) {
+        processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
+        for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); 
+              iter!=phrasePairsWithSameSource.end(); ++iter) {
+          delete *iter;
+        }
+        phrasePairsWithSameSource.clear();
+        if ( hierarchicalFlag ) {
+          phrasePairsWithSameSourceAndTarget.clear();
+        }
+      }
+
+      if ( hierarchicalFlag ) {
+        if ( !phrasePairsWithSameSourceAndTarget.empty() &&
+             !targetMatch ) {
+          phrasePairsWithSameSourceAndTarget.clear();
+        }
+      }
+
+      phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget, 
+                                             tmpTargetToSourceAlignment, 
+                                             tmpCount, tmpPcfgSum );
+      phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
+      featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
+      phrasePairsWithSameSource.push_back(phrasePair);
+
+      if ( hierarchicalFlag ) {
+        phrasePairsWithSameSourceAndTarget.push_back(phrasePair);
+      }
     }
 
-    // add phrase pairs to list, it's now the last one
-    phrasePairsWithSameF.push_back( phrasePair );
-    lastPhrasePair = &phrasePairsWithSameF.back();
+    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+
   }
-  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb );
+
+  processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
+  for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); 
+        iter!=phrasePairsWithSameSource.end(); ++iter) {
+    delete *iter;
+  }
+  phrasePairsWithSameSource.clear();
+
 
   phraseTableFile->flush();
-  if (phraseTableFile != &cout) {
+  if (phraseTableFile != &std::cout) {
     delete phraseTableFile;
   }
 
@@ -286,177 +382,134 @@ int main(int argc, char* argv[])
   }
 }
 
+// Parses one extract-file line: fields are separated by "|||"; an optional "{{...}}" tail is split off first.
+void processLine( std::string line, // by value: the "{{...}}" tail is cut off this local copy
+                  int lineID, bool includeSentenceIdFlag, int &sentenceId,  // lineID is only used in diagnostics
+                  PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment, // cleared, then refilled
+                  std::string &additionalPropertiesString, // text from the first "{{" onwards, or empty
+                  float &count, float &pcfgSum ) // sentenceId and pcfgSum are written only if their field is present
+{
+  size_t foundAdditionalProperties = line.find("{{"); // additional properties start at the first "{{"
+  if (foundAdditionalProperties != std::string::npos) {
+    additionalPropertiesString = line.substr(foundAdditionalProperties);
+    line = line.substr(0,foundAdditionalProperties);
+  } else {
+    additionalPropertiesString.clear();
+  }
+
+  phraseSource->clear();
+  phraseTarget->clear();
+  targetToSourceAlignment->clear();
+
+  std::vector<std::string> token = tokenize( line.c_str() ); // tokenize() is defined elsewhere; presumably splits on whitespace - TODO confirm
+  int item = 1; // 1-based index of the "|||"-separated field currently being read
+  for ( size_t j=0; j<token.size(); ++j ) {
+    if (token[j] == "|||") {
+      ++item;
+    } else if (item == 1) { // source phrase
+      phraseSource->push_back( vcbS.storeIfNew( token[j] ) );
+    } else if (item == 2) { // target phrase
+      phraseTarget->push_back( vcbT.storeIfNew( token[j] ) );
+    } else if (item == 3) { // alignment
+      int s,t;
+      sscanf(token[j].c_str(), "%d-%d", &s, &t); // NOTE(review): result unchecked; s/t stay uninitialized if the token is not "int-int"
+      if ((size_t)t >= phraseTarget->size() || (size_t)s >= phraseSource->size()) { // negative values convert to huge size_t and land here too
+        std::cerr << "WARNING: phrase pair " << lineID
+                  << " has alignment point (" << s << ", " << t << ")"
+                  << " out of bounds (" << phraseSource->size() << ", " << phraseTarget->size() << ")"
+                  << std::endl;
+      } else {
+        // first alignment point? -> initialize
+        if ( targetToSourceAlignment->size() == 0 ) {
+          size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size()); // hierarchical: one slot fewer than target symbols
+          targetToSourceAlignment->resize(numberOfTargetSymbols);
+        }
+        // add alignment point
+        targetToSourceAlignment->at(t).insert(s); // NOTE(review): with hierarchicalFlag, t == phraseTarget->size()-1 passes the check above but at() throws std::out_of_range
+      }
+    } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
+      sscanf(token[j].c_str(), "%d", &sentenceId);
+    } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count (field 4, or field 5 when a sentence id precedes it)
+      sscanf(token[j].c_str(), "%f", &count);
+    } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score (the field after the count)
+      float pcfgScore = std::atof(token[j].c_str());
+      pcfgSum = pcfgScore * count; // uses the count read from the preceding field
+    }
+  }
+
+  if ( targetToSourceAlignment->size() == 0 ) { // no alignment point was stored: size the (all-empty) alignment here instead
+    size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size()); // NOTE(review): size()-1 wraps around if the target phrase is empty
+    targetToSourceAlignment->resize(numberOfTargetSymbols);
+  }
+
+  if (item + (includeSentenceIdFlag?-1:0) == 3) { // the line has no count field: default to 1
+    count = 1.0;
+  }
+  if (item < 3 || item > 6) { // wrong number of fields: only reported, processing is not aborted
+    std::cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
+  }
+
+}
+
+
 void writeCountOfCounts( const string &fileNameCountOfCounts )
 {
   // open file
   Moses::OutputFileStream countOfCountsFile;
   bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
   if (!success) {
-    cerr << "ERROR: could not open count-of-counts file "
-         << fileNameCountOfCounts << endl;
+    std::cerr << "ERROR: could not open count-of-counts file "
+              << fileNameCountOfCounts << std::endl; // reported but not fatal: the function returns without writing
     return;
   }
 
   // Kneser-Ney needs the total number of phrase pairs
-  countOfCountsFile << totalDistinct << endl;
+  countOfCountsFile << totalDistinct << std::endl; // first line of the file
 
   // write out counts
   for(int i=1; i<=COC_MAX; i++) {
-    countOfCountsFile << countOfCounts[ i ] << endl;
+    countOfCountsFile << countOfCounts[ i ] << std::endl; // one line per index 1..COC_MAX; index 0 is not written
   }
   countOfCountsFile.Close();
 }
 
-void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
-{
-  if (phrasePair.size() == 0) return;
 
-  // group phrase pairs based on alignments that matter
-  // (i.e. that re-arrange non-terminals)
-  PhrasePairGroup phrasePairGroup;
+void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile, 
+                         const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
+{
+  if (phrasePairsWithSameSource.size() == 0) {
+    return;
+  }
 
   float totalSource = 0;
 
-  //cerr << "phrasePair.size() = " << phrasePair.size() << endl;
+  //std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
 
   // loop through phrase pairs
-  for(size_t i=0; i<phrasePair.size(); i++) {
+  for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); 
+        iter!=phrasePairsWithSameSource.end(); ++iter) {
     // add to total count
-    PhraseAlignment &currPhrasePair = phrasePair[i];
-
-    totalSource += phrasePair[i].count;
-
-    // check for matches
-    //cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl;
-
-    PhraseAlignmentCollection phraseAlignColl;
-    phraseAlignColl.push_back(&currPhrasePair);
-    pair<PhrasePairGroup::iterator, bool> retInsert;
-    retInsert = phrasePairGroup.insert(phraseAlignColl);
-    if (!retInsert.second) {
-      // already exist. Add to that collection instead
-      PhraseAlignmentCollection &existingColl = const_cast<PhraseAlignmentCollection&>(*retInsert.first);
-      existingColl.push_back(&currPhrasePair);
-    }
-
+    totalSource += (*iter)->GetCount();
   }
 
   // output the distinct phrase pairs, one at a time
-  const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl();
-  PhrasePairGroup::SortedColl::const_iterator iter;
-
-  for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) {
-    const PhraseAlignmentCollection &group = **iter;
-    outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile, isSingleton, featureManager, maybeLogProb );
-  }
-
-}
-
-const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
-{
-  float bestAlignmentCount = -1;
-  PhraseAlignment* bestAlignment = NULL;
-
-  for(size_t i=0; i<phrasePair.size(); i++) {
-    size_t alignInd;
-    if (inverseFlag) {
-      // count backwards, so that alignments for ties will be the same for both normal & inverse scores
-      alignInd = phrasePair.size() - i - 1;
-    } else {
-      alignInd = i;
-    }
-
-    if (phrasePair[alignInd]->count > bestAlignmentCount) {
-      bestAlignmentCount = phrasePair[alignInd]->count;
-      bestAlignment = phrasePair[alignInd];
-    }
-  }
-
-  return *bestAlignment;
-}
-
-const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phrasePair )
-{
-  float bestTreeFragmentCount = -1;
-  PhraseAlignment *bestTreeFragment = NULL;
-
-  for(size_t i=0; i<phrasePair.size(); i++) {
-    size_t treeFragmentInd;
-    if (inverseFlag) {
-      // count backwards, so that alignments for ties will be the same for both normal & inverse scores
-      treeFragmentInd = phrasePair.size() - i - 1;
-    } else {
-      treeFragmentInd = i;
-    }
-
-    if (phrasePair[treeFragmentInd]->count > bestTreeFragmentCount) {
-      bestTreeFragmentCount = phrasePair[treeFragmentInd]->count;
-      bestTreeFragment = phrasePair[treeFragmentInd];
-    }
-  }
-
-  return bestTreeFragment->treeFragment;
-}
-
-bool calcCrossedNonTerm(size_t sourcePos, size_t targetPos, const std::vector< std::set<size_t> > &alignedToS)
-{
-  for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource) {
-    if (currSource == sourcePos) {
-      // skip
-    } else {
-      const std::set<size_t> &targetSet = alignedToS[currSource];
-      std::set<size_t>::const_iterator iter;
-      for (iter = targetSet.begin(); iter != targetSet.end(); ++iter) {
-        size_t currTarget = *iter;
-
-        if ((currSource < sourcePos && currTarget > targetPos)
-            || (currSource > sourcePos && currTarget < targetPos)
-           ) {
-          return true;
-        }
-      }
-
-    }
-  }
-
-  return false;
-}
-
-int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
-{
-  const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
-
-  for (size_t sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos) {
-    const std::set<size_t> &targetSet = alignedToS[sourcePos];
-
-    WORD_ID wordId = phraseS[sourcePos];
-    const WORD &word = vcbS.getWord(wordId);
-    bool isNonTerm = isNonTerminal(word);
-
-    if (isNonTerm) {
-      assert(targetSet.size() == 1);
-      size_t targetPos = *targetSet.begin();
-      bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS);
-      if (ret)
-        return 1;
-    }
+  for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); 
+        iter!=phrasePairsWithSameSource.end(); ++iter) {
+    // write out this phrase pair; totalSource and the number of pairs are the same for every call
+    outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
   }
-
-  return 0;
 }
 
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager,
+void outputPhrasePair(const ExtractionPhrasePair &phrasePair, 
+                      float totalCount, int distinctCount, 
+                      ostream &phraseTableFile, 
+                      const ScoreFeatureManager& featureManager,
                       const MaybeLog& maybeLogProb )
 {
-  if (phrasePair.size() == 0) return;
+  assert(phrasePair.isValid());
 
-  const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
-
-  // compute count
-  float count = 0;
-  for(size_t i=0; i<phrasePair.size(); i++) {
-    count += phrasePair[i]->count;
-  }
+  const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
+  float count = phrasePair.GetCount();
 
   map< string, float > domainCount;
 
@@ -464,82 +517,74 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   if (goodTuringFlag || kneserNeyFlag) {
     totalDistinct++;
     int countInt = count + 0.99999;
-    if(countInt <= COC_MAX)
+    if (countInt <= COC_MAX)
       countOfCounts[ countInt ]++;
   }
 
   // compute PCFG score
   float pcfgScore = 0;
   if (pcfgFlag && !inverseFlag) {
-    float pcfgSum = 0;
-    for(size_t i=0; i<phrasePair.size(); ++i) {
-      pcfgSum += phrasePair[i]->pcfgSum;
-    }
-    pcfgScore = pcfgSum / count;
+    pcfgScore = phrasePair.GetPcfgScore() / count;
   }
 
   // output phrases
-  const PHRASE &phraseS = phrasePair[0]->GetSource();
-  const PHRASE &phraseT = phrasePair[0]->GetTarget();
+  const PHRASE *phraseSource = phrasePair.GetSource();
+  const PHRASE *phraseTarget = phrasePair.GetTarget();
 
   // do not output if hierarchical and count below threshold
   if (hierarchicalFlag && count < minCountHierarchical) {
-    for(size_t j=0; j<phraseS.size()-1; j++) {
-      if (isNonTerminal(vcbS.getWord( phraseS[j] )))
+    for(size_t j=0; j<phraseSource->size()-1; ++j) {
+      if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
         return;
     }
   }
 
   // source phrase (unless inverse)
-  if (! inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
+  if (!inverseFlag) {
+    printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
     phraseTableFile << " ||| ";
   }
 
   // target phrase
-  printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
+  printTargetPhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
   phraseTableFile << " ||| ";
 
   // source phrase (if inverse)
   if (inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
+    printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
     phraseTableFile << " ||| ";
   }
 
   // lexical translation probability
   if (lexFlag) {
-    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
-    phraseTableFile << maybeLogProb(lexScore );
+    double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S );
+    phraseTableFile << maybeLogProb( lexScore );
   }
 
   // unaligned word penalty
   if (unalignedFlag) {
-    double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
-    phraseTableFile << " " << maybeLogProb(penalty );
+    double penalty = computeUnalignedPenalty( bestAlignmentT2S );
+    phraseTableFile << " " << maybeLogProb( penalty );
   }
 
   // unaligned function word penalty
   if (unalignedFWFlag) {
-    double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
-    phraseTableFile << " " << maybeLogProb(penalty );
-  }
-
-  if (singletonFeature) {
-    phraseTableFile << " " << (isSingleton ? 1 : 0);
+    double penalty = computeUnalignedFWPenalty( phraseTarget, bestAlignmentT2S );
+    phraseTableFile << " " << maybeLogProb( penalty );
   }
 
   if (crossedNonTerm && !inverseFlag) {
-    phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
+    phraseTableFile << " " << calcCrossedNonTerm( phraseTarget, bestAlignmentT2S );
   }
 
   // target-side PCFG score
   if (pcfgFlag && !inverseFlag) {
-    phraseTableFile << " " << maybeLogProb(pcfgScore );
+    phraseTableFile << " " << maybeLogProb( pcfgScore );
   }
 
   // extra features
-  ScoreFeatureContext context(phrasePair, count, maybeLogProb);
-  vector<float> extraDense;
+  ScoreFeatureContext context(phrasePair, maybeLogProb);
+  std::vector<float> extraDense;
   map<string,float> extraSparse;
   featureManager.addFeatures(context, extraDense, extraSparse);
   for (size_t i = 0; i < extraDense.size(); ++i) {
@@ -553,30 +598,28 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
 
   phraseTableFile << " ||| ";
 
-  // alignment info for non-terminals
-  if (! inverseFlag) {
-    if (hierarchicalFlag) {
-      // always output alignment if hiero style, but only for non-terms
-      // (eh: output all alignments, needed for some feature functions)
-      assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
+  // output alignment info
+  if ( !inverseFlag ) {
+    if ( hierarchicalFlag ) {
+      // always output alignment if hiero style
+      assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
       std::vector<std::string> alignment;
-      for(size_t j = 0; j < phraseT.size() - 1; j++) {
-        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
-          if (bestAlignment.alignedToT[ j ].size() != 1) {
-            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
+      for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
+        if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
+          if ( bestAlignmentT2S->at(j).size() != 1 ) {
+            std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
             phraseTableFile.flush();
-            assert(bestAlignment.alignedToT[ j ].size() == 1);
+            assert(bestAlignmentT2S->at(j).size() == 1);
           }
-          int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
+          size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
           //phraseTableFile << sourcePos << "-" << j << " ";
           std::stringstream point;
           point << sourcePos << "-" << j;
           alignment.push_back(point.str());
         } else {
-          set<size_t>::iterator setIter;
-          for(setIter = (bestAlignment.alignedToT[j]).begin(); setIter != (bestAlignment.alignedToT[j]).end(); setIter++) {
-            int sourcePos = *setIter;
-            //phraseTableFile << sourcePos << "-" << j << " ";
+          for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin(); 
+                setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
+            size_t sourcePos = *setIter;
             std::stringstream point;
             point << sourcePos << "-" << j;
             alignment.push_back(point.str());
@@ -590,40 +633,90 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
       }
     } else if (wordAlignmentFlag) {
       // alignment info in pb model
-      for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
-        const set< size_t > &aligned = bestAlignment.alignedToT[j];
-        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
-          phraseTableFile << *p << "-" << j << " ";
+      for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
+        for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin(); 
+              setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
+          size_t sourcePos = *setIter;
+          phraseTableFile << sourcePos << "-" << j << " ";
         }
       }
     }
   }
 
-
   // counts
-
   phraseTableFile << " ||| " << totalCount << " " << count;
   if (kneserNeyFlag)
     phraseTableFile << " " << distinctCount;
 
+  if ((treeFragmentsFlag) && 
+      !inverseFlag) {
+    phraseTableFile << " |||";
+  }
+
   // tree fragments
   if (treeFragmentsFlag && !inverseFlag) {
-    const std::string &bestTreeFragment = findBestTreeFragment( phrasePair );
-    if ( !bestTreeFragment.empty() )
-      phraseTableFile << " ||| {{Tree " << bestTreeFragment << "}}";
+    const std::string *bestTreeFragment = phrasePair.FindBestPropertyValue("Tree");
+    if (bestTreeFragment) {
+      phraseTableFile << " {{Tree " << *bestTreeFragment << "}}";
+    }
   }
 
+  phraseTableFile << std::endl;
+}
+
+
+// Returns true if an alignment point of any other target position crosses the link (targetPos, sourcePos).
+bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource )
+{
+  for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) {
+    if (currTarget == targetPos) {
+      // skip: links of the examined position itself are never compared
+    } else {
+      const std::set<size_t> &sourceSet = alignmentTargetToSource->at(currTarget);
+      for (std::set<size_t>::const_iterator iter = sourceSet.begin(); 
+           iter != sourceSet.end(); ++iter) {
+        size_t currSource = *iter;
+
+        if ((currTarget < targetPos && currSource > sourcePos) // earlier target linked to a later source
+            || (currTarget > targetPos && currSource < sourcePos) // later target linked to an earlier source
+           ) {
+          return true;
+        }
+      }
 
-  phraseTableFile << endl;
+    }
+  }
+
+  return false;
 }
 
-double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
+int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
+{ // Returns 1 if the link of at least one target non-terminal is crossed by another alignment point, else 0.
+  assert(phraseTarget->size() >= alignmentTargetToSource->size() );
+  // only positions that have an alignment slot are examined
+  for (size_t targetPos = 0; targetPos < alignmentTargetToSource->size(); ++targetPos) {
+
+    if ( isNonTerminal(vcbT.getWord( phraseTarget->at(targetPos) ))) { // non-terminal status is decided by isNonTerminal(), defined elsewhere
+      const std::set<size_t> &alignmentPoints = alignmentTargetToSource->at(targetPos);
+      assert( alignmentPoints.size() == 1 ); // NOTE(review): if asserts are compiled out, an empty set makes the dereference below invalid
+      size_t sourcePos = *alignmentPoints.begin();
+      bool ret = calcCrossedNonTerm(targetPos, sourcePos, alignmentTargetToSource); // pairwise overload: checks this one link against all other positions
+      if (ret)
+        return 1;
+    }
+  }
+
+  return 0;
+}
+
+
+double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource )
 {
   // unaligned word counter
   double unaligned = 1.0;
   // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
+  for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
+    const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
     if (srcIndices.empty()) {
       unaligned *= 2.718;
     }
@@ -631,14 +724,15 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, co
   return unaligned;
 }
 
-double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
+
+double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
 {
   // unaligned word counter
   double unaligned = 1.0;
   // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
-    if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
+  for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
+    const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
+    if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) {
       unaligned *= 2.718;
     }
   }
@@ -647,11 +741,11 @@ double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT,
 
 void loadFunctionWords( const string &fileName )
 {
-  cerr << "Loading function word list from " << fileName;
+  std::cerr << "Loading function word list from " << fileName;
   ifstream inFile;
   inFile.open(fileName.c_str());
   if (inFile.fail()) {
-    cerr << " - ERROR: could not open file\n";
+    std::cerr << " - ERROR: could not open file" << std::endl;
     exit(1);
   }
   istream *inFileP = &inFile;
@@ -660,32 +754,32 @@ void loadFunctionWords( const string &fileName )
   while(true) {
     SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
     if (inFileP->eof()) break;
-    vector<string> token = tokenize( line );
+    std::vector<string> token = tokenize( line );
     if (token.size() > 0)
       functionWordList.insert( token[0] );
   }
-  inFile.close();
 
-  cerr << " - read " << functionWordList.size() << " function words\n";
+  std::cerr << " - read " << functionWordList.size() << " function words" << std::endl;
   inFile.close();
 }
 
-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
+
+double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
 {
   // lexical translation probability
   double lexScore = 1.0;
   int null = vcbS.getWordID("NULL");
   // all target words have to be explained
-  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
+  for(size_t ti=0; ti<alignmentTargetToSource->size(); ti++) {
+    const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
     if (srcIndices.empty()) {
       // explain unaligned word by NULL
-      lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );
+      lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) );
     } else {
       // go through all the aligned words to compute average
       double thisWordScore = 0;
       for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
-        thisWordScore += lexTable.permissiveLookup( phraseS[ *p ], phraseT[ ti ] );
+        thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) );
       }
       lexScore *= thisWordScore / (double)srcIndices.size();
     }
@@ -693,13 +787,14 @@ double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT,
   return lexScore;
 }
 
+
 void LexicalTable::load( const string &fileName )
 {
-  cerr << "Loading lexical translation table from " << fileName;
+  std::cerr << "Loading lexical translation table from " << fileName;
   ifstream inFile;
   inFile.open(fileName.c_str());
   if (inFile.fail()) {
-    cerr << " - ERROR: could not open file\n";
+    std::cerr << " - ERROR: could not open file" << std::endl;
     exit(1);
   }
   istream *inFileP = &inFile;
@@ -709,15 +804,15 @@ void LexicalTable::load( const string &fileName )
   int i=0;
   while(true) {
     i++;
-    if (i%100000 == 0) cerr << "." << flush;
+    if (i%100000 == 0) std::cerr << "." << flush;
     SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
     if (inFileP->eof()) break;
 
-    vector<string> token = tokenize( line );
+    std::vector<string> token = tokenize( line );
     if (token.size() != 3) {
-      cerr << "line " << i << " in " << fileName
-           << " has wrong number of tokens, skipping:\n"
-           << token.size() << " " << token[0] << " " << line << endl;
+        std::cerr << "line " << i << " in " << fileName
+           << " has wrong number of tokens, skipping:" << std::endl
+           << token.size() << " " << token[0] << " " << line << std::endl;
       continue;
     }
 
@@ -726,55 +821,60 @@ void LexicalTable::load( const string &fileName )
     WORD_ID wordS = vcbS.storeIfNew( token[1] );
     ltable[ wordS ][ wordT ] = prob;
   }
-  cerr << endl;
+  std::cerr << std::endl;
 }
 
-void printSourcePhrase(const PHRASE &phraseS, const PHRASE &phraseT,
-                       const PhraseAlignment &bestAlignment, ostream &out)
+// Writes the source side to 'out': every symbol but the last followed by a space, then the root symbol.
+void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
+                       const ALIGNMENT *targetToSourceAlignment, ostream &out)
 {
+  // build the source-to-target view of the alignment; it is only read for non-terminals when unpairedExtractFormatFlag is set
+  ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT(); // NOTE(review): leaks if an at() call below throws before the delete; a local ALIGNMENT object would avoid that
+  invertAlignment(phraseSource, phraseTarget, targetToSourceAlignment, sourceToTargetAlignment);
   // output source symbols, except root, in rule table format
-  for (std::size_t i = 0; i < phraseS.size()-1; ++i) {
-    const std::string &word = vcbS.getWord(phraseS[i]);
+  for (std::size_t i = 0; i < phraseSource->size()-1; ++i) { // NOTE(review): size()-1 wraps around for an empty phrase - assumes at least the root symbol is present
+    const std::string &word = vcbS.getWord(phraseSource->at(i));
     if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
       out << word << " ";
       continue;
     }
-    // get corresponding target non-terminal and output pair
-    std::set<std::size_t> alignmentPoints = bestAlignment.alignedToS[i];
+    const std::set<std::size_t> &alignmentPoints = sourceToTargetAlignment->at(i); // target positions linked to source non-terminal i
     assert(alignmentPoints.size() == 1);
-    int j = *(alignmentPoints.begin());
+    size_t j = *(alignmentPoints.begin()); // the single aligned target position
     if (inverseFlag) {
-      out << vcbT.getWord(phraseT[j]) << word << " ";
+      out << vcbT.getWord(phraseTarget->at(j)) << word << " "; // inverse: target symbol is written first
     } else {
-      out << word << vcbT.getWord(phraseT[j]) << " ";
+      out << word << vcbT.getWord(phraseTarget->at(j)) << " "; // direct: source symbol is written first
     }
   }
   // output source root symbol
   if (conditionOnTargetLhsFlag && !inverseFlag) {
     out << "[X]";
   } else {
-    out << vcbS.getWord(phraseS.back());
+    out << vcbS.getWord(phraseSource->back()); // the last source symbol is treated as the root
   }
+  delete sourceToTargetAlignment; // releases the inverted alignment allocated at the top
 }
 
-void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT,
-                       const PhraseAlignment &bestAlignment, ostream &out)
+
+void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
+                       const ALIGNMENT *targetToSourceAlignment, ostream &out)
 {
   // output target symbols, except root, in rule table format
-  for (std::size_t i = 0; i < phraseT.size()-1; ++i) {
-    const std::string &word = vcbT.getWord(phraseT[i]);
+  for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) {
+    const std::string &word = vcbT.getWord(phraseTarget->at(i));
     if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
       out << word << " ";
       continue;
     }
     // get corresponding source non-terminal and output pair
-    std::set<std::size_t> alignmentPoints = bestAlignment.alignedToT[i];
+    std::set<std::size_t> alignmentPoints = targetToSourceAlignment->at(i);
     assert(alignmentPoints.size() == 1);
     int j = *(alignmentPoints.begin());
     if (inverseFlag) {
-      out << word << vcbS.getWord(phraseS[j]) << " ";
+      out << word << vcbS.getWord(phraseSource->at(j)) << " ";
     } else {
-      out << vcbS.getWord(phraseS[j]) << word << " ";
+      out << vcbS.getWord(phraseSource->at(j)) << word << " ";
     }
   }
   // output target root symbol
@@ -782,24 +882,28 @@ void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT,
     if (inverseFlag) {
       out << "[X]";
     } else {
-      out << vcbS.getWord(phraseS.back());
+      out << vcbS.getWord(phraseSource->back());
     }
   } else {
-    out << vcbT.getWord(phraseT.back());
+    out << vcbT.getWord(phraseTarget->back());
   }
 }
 
-std::pair<PhrasePairGroup::Coll::iterator,bool> PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj )
-{
-  std::pair<iterator,bool> ret = m_coll.insert(obj);
 
-  if (ret.second) {
-    // obj inserted. Also add to sorted vector
-    const PhraseAlignmentCollection &insertedObj = *ret.first;
-    m_sortedColl.push_back(&insertedObj);
-  }
+void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget, // phraseTarget is not read in this function
+                     const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment) {
+// ALIGNMENT is noted here as std::vector< std::set<size_t> >; out[s] receives every t for which in[t] contains s.
 
-  return ret;
+  outSourceToTargetAlignment->clear(); // any previous content of the output is discarded
+  size_t numberOfSourceSymbols = (hierarchicalFlag ? phraseSource->size()-1 : phraseSource->size()); // hierarchical: one slot fewer; NOTE(review): wraps around if phraseSource is empty
+  outSourceToTargetAlignment->resize(numberOfSourceSymbols);
+  // add alignment point
+  for (size_t targetPosition = 0; targetPosition < inTargetToSourceAlignment->size(); ++targetPosition) {
+    for ( std::set<size_t>::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin(); 
+          setIter != (inTargetToSourceAlignment->at(targetPosition)).end(); ++setIter ) {
+      size_t sourcePosition = *setIter;
+      outSourceToTargetAlignment->at(sourcePosition).insert(targetPosition); // NOTE(review): throws std::out_of_range if sourcePosition >= numberOfSourceSymbols
+    }
+  }
 }
 
-
author	Matthias Huck <huck@i6.informatik.rwth-aachen.de>	2014-01-29 22:37:42 +0400
committer	Matthias Huck <huck@i6.informatik.rwth-aachen.de>	2014-01-29 22:37:42 +0400
commit	86ee3e15a441aec72eaebdd0389fa925da2316c7 (patch)
tree	6de24e964968820fb708a2e37a935c40a2a1494e /phrase-extract
parent	ffd62e994ecb88358b5f3aa835f84d441ec58c77 (diff)