Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthias Huck <huck@i6.informatik.rwth-aachen.de>2014-01-29 22:37:42 +0400
committerMatthias Huck <huck@i6.informatik.rwth-aachen.de>2014-01-29 22:37:42 +0400
commit86ee3e15a441aec72eaebdd0389fa925da2316c7 (patch)
tree6de24e964968820fb708a2e37a935c40a2a1494e /phrase-extract
parentffd62e994ecb88358b5f3aa835f84d441ec58c77 (diff)
new version of the `score` tool
which is now capable of dealing with additional properties in an appropriate manner
Diffstat (limited to 'phrase-extract')
-rw-r--r--phrase-extract/DomainFeature.cpp (renamed from phrase-extract/domain.cpp)44
-rw-r--r--phrase-extract/DomainFeature.h (renamed from phrase-extract/domain.h)10
-rw-r--r--phrase-extract/ExtractionPhrasePair.cpp327
-rw-r--r--phrase-extract/ExtractionPhrasePair.h162
-rw-r--r--phrase-extract/InternalStructFeature.cpp78
-rw-r--r--phrase-extract/InternalStructFeature.h33
-rw-r--r--phrase-extract/Jamfile12
-rw-r--r--phrase-extract/PhraseAlignment.cpp230
-rw-r--r--phrase-extract/PhraseAlignment.h106
-rw-r--r--phrase-extract/ScoreFeature.cpp13
-rw-r--r--phrase-extract/ScoreFeature.h35
-rw-r--r--phrase-extract/ScoreFeatureTest.cpp17
-rw-r--r--phrase-extract/score-main.cpp796
13 files changed, 1044 insertions, 819 deletions
diff --git a/phrase-extract/domain.cpp b/phrase-extract/DomainFeature.cpp
index 67b4a13c3..2f99a8709 100644
--- a/phrase-extract/domain.cpp
+++ b/phrase-extract/DomainFeature.cpp
@@ -1,6 +1,5 @@
-// $Id$
-//#include "beammain.h"
-#include "domain.h"
+#include "DomainFeature.h"
+#include "ExtractionPhrasePair.h"
#include "tables-core.h"
#include "InputFileStream.h"
#include "SafeGetline.h"
@@ -26,7 +25,7 @@ void Domain::load( const std::string &domainFileName )
int lineNumber;
if (domainSpecLine.size() != 2 ||
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
- cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
+ std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
exit(1);
}
// store
@@ -50,29 +49,34 @@ string Domain::getDomainOfSentence( int sentenceId ) const
return "undefined";
}
-DomainFeature::DomainFeature(const string& domainFile)
+DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
{
//process domain file
m_domain.load(domainFile);
}
+void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+ float count,
+ int sentenceId) const
+{
+ std::string value = m_domain.getDomainOfSentence(sentenceId);
+ phrasePair.AddProperty(m_propertyKey, value, count);
+}
+
void DomainFeature::add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const
{
- map< string, float > domainCount;
- for(size_t i=0; i<context.phrasePair.size(); i++) {
- string d = m_domain.getDomainOfSentence(context.phrasePair[i]->sentenceId );
- if (domainCount.find( d ) == domainCount.end()) {
- domainCount[d] = context.phrasePair[i]->count;
- } else {
- domainCount[d] += context.phrasePair[i]->count;
- }
- }
- add(domainCount, context.count, context.maybeLog, denseValues, sparseValues);
+ const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
+ assert( domainCount != NULL );
+ add(*domainCount,
+ context.phrasePair.GetCount(),
+ context.maybeLog,
+ denseValues, sparseValues);
}
-void SubsetDomainFeature::add(const map<string,float>& domainCount,float count,
+void SubsetDomainFeature::add(const map<string,float>& domainCount,
+ float count,
const MaybeLog& maybeLog,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const
@@ -152,7 +156,6 @@ void IndicatorDomainFeature::add(const map<string,float>& domainCount,float coun
denseValues.push_back(maybeLog(2.718));
}
}
-
}
void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
@@ -166,12 +169,5 @@ void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,floa
}
}
-bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
-{
- return m_domain.getDomainOfSentence(lhs.sentenceId) ==
- m_domain.getDomainOfSentence( rhs.sentenceId);
-}
-
-
}
diff --git a/phrase-extract/domain.h b/phrase-extract/DomainFeature.h
index 279496e01..8ebc599e2 100644
--- a/phrase-extract/domain.h
+++ b/phrase-extract/DomainFeature.h
@@ -34,13 +34,17 @@ class DomainFeature : public ScoreFeature
public:
DomainFeature(const std::string& domainFile);
- bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
+
+ void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+ float count,
+ int sentenceId) const;
+
void add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
protected:
- /** Overriden in subclass */
+ /** Overridden in subclass */
virtual void add(const std::map<std::string,float>& domainCounts, float count,
const MaybeLog& maybeLog,
std::vector<float>& denseValues,
@@ -49,6 +53,8 @@ protected:
Domain m_domain;
+ const std::string m_propertyKey;
+
};
class SubsetDomainFeature : public DomainFeature
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp
new file mode 100644
index 000000000..e2814f33c
--- /dev/null
+++ b/phrase-extract/ExtractionPhrasePair.cpp
@@ -0,0 +1,327 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <sstream>
+#include "ExtractionPhrasePair.h"
+#include "SafeGetline.h"
+#include "tables-core.h"
+#include "score.h"
+#include "moses/Util.h"
+
+#include <cstdlib>
+
+using namespace std;
+
+
+namespace MosesTraining {
+
+
+extern Vocabulary vcbT;
+extern Vocabulary vcbS;
+
+extern bool hierarchicalFlag;
+
+
+// Constructs a phrase pair from its first extracted occurrence.
+// The given phrase and alignment pointers are stored as-is and are released
+// with delete in Clear(), so the caller must not free them afterwards.
+// count and pcfgSum initialize both the running totals and the "last seen" values.
+ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
+                                            const PHRASE *phraseTarget,
+                                            ALIGNMENT *targetToSourceAlignment,
+                                            float count, float pcfgSum ) :
+  m_phraseSource(phraseSource),
+  m_phraseTarget(phraseTarget),
+  m_count(count),
+  m_pcfgSum(pcfgSum)
+{
+  // The phrases arrive as pointers: the precondition that can be checked here
+  // is that they are non-null (they are dereferenced by Matches()).
+  assert(phraseSource != NULL);
+  assert(phraseTarget != NULL);
+
+  std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
+    m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
+
+  // remember the most recent alignment and values for Add() / IncrementPrevious()
+  m_lastTargetToSourceAlignment = insertedAlignment.first;
+  m_lastCount = m_count;
+  m_lastPcfgSum = m_pcfgSum;
+
+  m_isValid = true;
+}
+
+
+ExtractionPhrasePair::~ExtractionPhrasePair( ) {
+ Clear();
+}
+
+
+// return value: true if the given alignment was seen for the first time and thus will be stored,
+// false if it was present already (the pointer may thus be deleted)
+bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
+ float count, float pcfgSum )
+{
+ m_count += count;
+ m_pcfgSum += pcfgSum;
+
+ m_lastCount = count;
+ m_lastPcfgSum = pcfgSum;
+
+ std::map<ALIGNMENT*,float>::iterator iter = m_lastTargetToSourceAlignment;
+ if ( *(iter->first) == *targetToSourceAlignment ) {
+ iter->second += count;
+ return false;
+ } else {
+ std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
+ m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
+ if ( !insertedAlignment.second ) {
+ // the alignment already exists: increment count
+ insertedAlignment.first->second += count;
+ return false;
+ }
+ m_lastTargetToSourceAlignment = insertedAlignment.first;
+ }
+
+ return true;
+}
+
+
+void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
+{
+ m_count += count;
+ m_pcfgSum += pcfgSum;
+ m_lastTargetToSourceAlignment->second += count;
+ // properties
+ for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
+ iter !=m_properties.end(); ++iter ) {
+ LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
+ (*lastPropertyValue)->second += count;
+ }
+
+ m_lastCount = count;
+ m_lastPcfgSum = pcfgSum;
+}
+
+
+// Check for lexical match
+// and in case of SCFG rules for equal non-terminal alignment.
+bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
+ const PHRASE *otherPhraseTarget,
+ ALIGNMENT *otherTargetToSourceAlignment ) const
+{
+ if (*otherPhraseTarget != *m_phraseTarget) {
+ return false;
+ }
+ if (*otherPhraseSource != *m_phraseSource) {
+ return false;
+ }
+
+ return MatchesAlignment( otherTargetToSourceAlignment );
+}
+
+// Check for lexical match
+// and in case of SCFG rules for equal non-terminal alignment.
+// Set boolean indicators.
+// (Note that we check in the order: target - source - alignment
+// and do not touch the subsequent boolean indicators once a previous one has been set to false.)
+bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
+ const PHRASE *otherPhraseTarget,
+ ALIGNMENT *otherTargetToSourceAlignment,
+ bool &sourceMatch,
+ bool &targetMatch,
+ bool &alignmentMatch ) const
+{
+ if (*otherPhraseSource != *m_phraseSource) {
+ sourceMatch = false;
+ return false;
+ } else {
+ sourceMatch = true;
+ }
+ if (*otherPhraseTarget != *m_phraseTarget) {
+ targetMatch = false;
+ return false;
+ } else {
+ targetMatch = true;
+ }
+ if ( !MatchesAlignment(otherTargetToSourceAlignment) ) {
+ alignmentMatch = false;
+ return false;
+ } else {
+ alignmentMatch = true;
+ }
+ return true;
+}
+
+// Check for equal non-terminal alignment in case of SCFG rules.
+// Returns true right away if hierarchicalFlag is not set. Otherwise returns false
+// as soon as a target non-terminal is not aligned to exactly one source position
+// in both alignments, or is aligned to different source positions.
+// Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first
+bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
+{
+  if (!hierarchicalFlag) return true;
+
+  // all or none of the phrasePair's word alignment matrices match, so just pick one
+  const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;
+
+  assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
+  assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());
+
+  // loop over all symbols but the left hand side of the rule:
+  // the alignment vector is already one element shorter than the target phrase
+  // (first assertion above), so all of its entries have to be visited
+  const size_t numAlignedSymbols = thisTargetToSourceAlignment->size();
+  for (size_t i=0; i<numAlignedSymbols; ++i) {
+    if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
+      const std::set<size_t> &thisAligned = thisTargetToSourceAlignment->at(i);
+      const std::set<size_t> &otherAligned = otherTargetToSourceAlignment->at(i);
+
+      // test the sizes first: begin() of an empty set must not be dereferenced
+      if (thisAligned.size() != 1 ||
+          otherAligned.size() != 1 ||
+          *(thisAligned.begin()) != *(otherAligned.begin())) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Releases everything held by this phrase pair (phrases, alignments, property
+// maps), resets all counts to zero and marks the object as invalid.
+// Also called by the destructor, so it has to be safe to run more than once.
+void ExtractionPhrasePair::Clear()
+{
+  // reset the pointers after deletion: the destructor calls Clear() again,
+  // and deleting a dangling pointer a second time would be undefined behavior
+  delete m_phraseSource;
+  m_phraseSource = NULL;
+  delete m_phraseTarget;
+  m_phraseTarget = NULL;
+
+  m_count = 0.0f;
+  m_pcfgSum = 0.0f;
+
+  for ( std::map<ALIGNMENT*,float>::iterator iter=m_targetToSourceAlignments.begin();
+        iter!=m_targetToSourceAlignments.end(); ++iter) {
+    delete iter->first;
+  }
+  m_targetToSourceAlignments.clear();
+
+  for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
+        iter!=m_properties.end(); ++iter) {
+    delete (iter->second).second;
+    delete (iter->second).first;
+  }
+  m_properties.clear();
+
+  m_lastCount = 0.0f;
+  m_lastPcfgSum = 0.0f;
+  // the map is empty at this point, so this is the end iterator
+  m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();
+
+  m_isValid = false;
+}
+
+
+// Splits propertiesString at every "{{", strips the closing braces of each piece,
+// separates the remainder into key and value at the first blank and passes
+// each key-value pair together with count to AddProperty().
+// An empty string is a no-op.
+void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count )
+{
+  if (propertiesString.empty()) {
+    return;
+  }
+
+  vector<std::string> toks;
+  Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
+  // toks[0] is skipped -- presumably the text in front of the first "{{"
+  // (TODO confirm against Moses::TokenizeMultiCharSeparator)
+  for (size_t i = 1; i < toks.size(); ++i) {
+    std::string &tok = toks[i];
+    if (tok.empty()) {
+      continue;
+    }
+    // drop the closing "}}": keep what precedes the character in front of the last '}'
+    size_t endPos = tok.rfind("}");
+    tok = tok.substr(0, endPos - 1);
+
+    vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
+    assert(keyValue.size() == 2);
+    if (keyValue.size() != 2) {
+      // malformed property without a key/value split: when assertions are
+      // compiled out, skip it instead of indexing past the end of keyValue
+      continue;
+    }
+    AddProperty(keyValue[0], keyValue[1], count);
+  }
+}
+
+
+// Returns the stored target-to-source alignment with the highest count.
+// Ties are resolved in favor of the alignment that compares greater.
+// Returns NULL if no alignment qualifies (e.g. if none is stored).
+const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
+{
+  float bestAlignmentCount = -1;
+
+  std::map<ALIGNMENT*,float>::const_iterator bestAlignment = m_targetToSourceAlignments.end();
+
+  for (std::map<ALIGNMENT*,float>::const_iterator iter=m_targetToSourceAlignments.begin();
+       iter!=m_targetToSourceAlignments.end(); ++iter) {
+    const bool isBetter = (iter->second > bestAlignmentCount);
+    // A tie may also occur against the initial value of bestAlignmentCount,
+    // i.e. before any candidate was chosen: bestAlignment is still the end
+    // iterator then and must not be dereferenced.
+    const bool winsTie = (iter->second == bestAlignmentCount) &&
+                         ( (bestAlignment == m_targetToSourceAlignments.end()) ||
+                           (*(iter->first) > *(bestAlignment->first)) );
+    if ( isBetter || winsTie ) {
+      bestAlignmentCount = iter->second;
+      bestAlignment = iter;
+    }
+  }
+
+  if ( bestAlignment == m_targetToSourceAlignments.end()) {
+    return NULL;
+  }
+
+  return bestAlignment->first;
+}
+
+
+// Returns a pointer to the value with the highest count among all values stored
+// for the given property key. Ties are resolved in favor of the value that
+// compares greater. Returns NULL if the key is unknown or no value qualifies.
+// The returned pointer refers to a key inside the property's value map.
+const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const
+{
+  float bestPropertyCount = -1;
+
+  const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
+  if ( allPropertyValues == NULL ) {
+    return NULL;
+  }
+
+  PROPERTY_VALUES::const_iterator bestPropertyValue = allPropertyValues->end();
+
+  for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
+       iter!=allPropertyValues->end(); ++iter) {
+    const bool isBetter = (iter->second > bestPropertyCount);
+    // A tie may also occur against the initial value of bestPropertyCount,
+    // i.e. before any candidate was chosen: bestPropertyValue is still the end
+    // iterator then and must not be dereferenced.
+    const bool winsTie = (iter->second == bestPropertyCount) &&
+                         ( (bestPropertyValue == allPropertyValues->end()) ||
+                           (iter->first > bestPropertyValue->first) );
+    if ( isBetter || winsTie ) {
+      bestPropertyCount = iter->second;
+      bestPropertyValue = iter;
+    }
+  }
+
+  if ( bestPropertyValue == allPropertyValues->end()) {
+    return NULL;
+  }
+
+  return &(bestPropertyValue->first);
+}
+
+
+std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const
+{
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
+
+ if ( allPropertyValues == NULL ) {
+ return "";
+ }
+
+ std::ostringstream oss;
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
+ iter!=allPropertyValues->end(); ++iter) {
+ if (iter!=allPropertyValues->begin()) {
+ oss << " ";
+ }
+ oss << iter->first;
+ oss << " ";
+ oss << iter->second;
+ }
+
+ std::string allPropertyValuesString(oss.str());
+ return allPropertyValuesString;
+}
+
+
+}
+
diff --git a/phrase-extract/ExtractionPhrasePair.h b/phrase-extract/ExtractionPhrasePair.h
new file mode 100644
index 000000000..f04984391
--- /dev/null
+++ b/phrase-extract/ExtractionPhrasePair.h
@@ -0,0 +1,162 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#include "tables-core.h"
+
+#include <vector>
+#include <set>
+#include <map>
+
+namespace MosesTraining {
+
+
+typedef std::vector< std::set<size_t> > ALIGNMENT;
+
+
+class ExtractionPhrasePair {
+
+protected:
+
+ typedef std::map<std::string,float> PROPERTY_VALUES;
+ typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE;
+
+
+ bool m_isValid;
+
+ const PHRASE *m_phraseSource;
+ const PHRASE *m_phraseTarget;
+
+ float m_count;
+ float m_pcfgSum;
+
+ std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
+ std::map<std::string,
+ std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;
+
+ float m_lastCount;
+ float m_lastPcfgSum;
+ std::map<ALIGNMENT*,float>::iterator m_lastTargetToSourceAlignment;
+
+public:
+
+ ExtractionPhrasePair( const PHRASE *phraseSource,
+ const PHRASE *phraseTarget,
+ ALIGNMENT *targetToSourceAlignment,
+ float count, float pcfgSum );
+
+ ~ExtractionPhrasePair();
+
+ bool Add( ALIGNMENT *targetToSourceAlignment,
+ float count, float pcfgSum );
+
+ void IncrementPrevious( float count, float pcfgSum );
+
+ bool Matches( const PHRASE *otherPhraseSource,
+ const PHRASE *otherPhraseTarget,
+ ALIGNMENT *otherTargetToSourceAlignment ) const;
+
+ bool Matches( const PHRASE *otherPhraseSource,
+ const PHRASE *otherPhraseTarget,
+ ALIGNMENT *otherTargetToSourceAlignment,
+ bool &sourceMatch,
+ bool &targetMatch,
+ bool &alignmentMatch ) const;
+
+ bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const;
+
+ void Clear();
+
+ bool IsValid() const {
+ return m_isValid;
+ }
+
+
+ const PHRASE *GetSource() const {
+ return m_phraseSource;
+ }
+
+ const PHRASE *GetTarget() const {
+ return m_phraseTarget;
+ }
+
+ float GetCount() const {
+ return m_count;
+ }
+
+ float GetPcfgScore() const {
+ return m_pcfgSum;
+ }
+
+ const size_t GetNumberOfProperties() const {
+ return m_properties.size();
+ }
+
+ const std::map<std::string,float> *GetProperty( const std::string &key ) const {
+ std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::const_iterator iter;
+ iter = m_properties.find(key);
+ if (iter == m_properties.end()) {
+ return NULL;
+ } else {
+ return iter->second.first;
+ }
+ }
+
+ const ALIGNMENT *FindBestAlignmentTargetToSource() const;
+
+ const std::string *FindBestPropertyValue(const std::string &key) const;
+
+ std::string CollectAllPropertyValues(const std::string &key) const;
+
+ void AddProperties( const std::string &str, float count );
+
+ void AddProperty( const std::string &key, const std::string &value, float count )
+ {
+ std::map<std::string,
+ std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
+ if ( iter == m_properties.end() ) {
+ // key not found: insert property key and value
+ PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
+ std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
+ LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
+ m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue);
+ } else {
+ LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
+ if ( (*lastPropertyValue)->first == value ) { // same property key-value pair has been seen right before
+ // property key-value pair exists already: add count
+ (*lastPropertyValue)->second += count;
+ } else { // need to check whether the property key-value pair has appeared before (insert if not)
+ // property key exists, but not in combination with this value:
+ // add new value with count
+ PROPERTY_VALUES *propertyValues = (iter->second).first;
+ std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
+ if ( !insertedProperty.second ) { // property value for this key appeared before: add count
+ insertedProperty.first->second += count;
+ }
+ LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
+ delete (iter->second).second;
+ (iter->second).second = lastPropertyValue;
+ }
+ }
+ }
+
+};
+
+}
+
diff --git a/phrase-extract/InternalStructFeature.cpp b/phrase-extract/InternalStructFeature.cpp
index e0e9fd3e2..3757b0e43 100644
--- a/phrase-extract/InternalStructFeature.cpp
+++ b/phrase-extract/InternalStructFeature.cpp
@@ -1,50 +1,30 @@
#include "InternalStructFeature.h"
+#include <map>
using namespace std;
namespace MosesTraining
{
-InternalStructFeature::InternalStructFeature()
- :m_type(0){
- //cout<<"InternalStructFeature: Construct "<<m_type<<"\n";
-}
-
-bool InternalStructFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const{
- //cout<<"InternalStructFeature: Equals\n";
- //don't know what it's used for and what we should compare
- //-> if the dense score is the same
- //-> if the sparse feature is set
- // compare phrases? with the internalStrucutre string?
- /** Return true if the two phrase pairs are equal from the point of this feature. Assume
- that they already compare true according to PhraseAlignment.equals()
- **/
-
-/* if(lhs.ghkmParse==rhs.ghkmParse)
- return true;
- else
- return false;
-*/
- //return true;
-}
-
void InternalStructFeature::add(const ScoreFeatureContext& context,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const{
- for(size_t i=0; i<context.phrasePair.size(); i++) {
- add(&context.phrasePair[i]->treeFragment, denseValues, sparseValues);
- }
-
+                                std::vector<float>& denseValues,
+                                std::map<std::string,float>& sparseValues) const {
+  // Feeds every tree fragment stored under the "Tree" property of the phrase pair,
+  // together with its count, to the subclass-specific add().
+  const std::map<std::string,float> *allTrees = context.phrasePair.GetProperty("Tree"); // or would we rather want to take the most frequent one only?
+  // GetProperty() returns NULL for a key that was never added to the phrase pair:
+  // there is nothing to score then.
+  // NOTE(review): confirm that callers tolerate no values being added in this case.
+  if ( allTrees == NULL ) {
+    return;
+  }
+  for ( std::map<std::string,float>::const_iterator iter=allTrees->begin();
+        iter!=allTrees->end(); ++iter ) {
+    add(&(iter->first), iter->second, denseValues, sparseValues);
+  }
}
-void InternalStructFeatureDense::add(std::string *internalStruct,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const{
+void InternalStructFeatureDense::add(const std::string *treeFragment,
+ float count,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const {
//cout<<"Dense: "<<*internalStruct<<endl;
size_t start=0;
int countNP=0;
- while((start = internalStruct->find("NP", start)) != string::npos) {
- countNP++;
+ while((start = treeFragment->find("NP", start)) != string::npos) {
+ countNP += count;
start+=2; //length of "NP"
}
//should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln?
@@ -53,21 +33,21 @@ void InternalStructFeatureDense::add(std::string *internalStruct,
}
-void InternalStructFeatureSparse::add(std::string *internalStruct,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const{
- //cout<<"Sparse: "<<*internalStruct<<endl;
- if(internalStruct->find("VBZ")!=std::string::npos)
- sparseValues["NTVBZ"] = 1;
- if(internalStruct->find("VBD")!=std::string::npos)
- sparseValues["NTVBD"] = 1;
- if(internalStruct->find("VBP")!=std::string::npos)
- sparseValues["NTVBP"] = 1;
- if(internalStruct->find("PP")!=std::string::npos)
- sparseValues["NTPP"] = 1;
- if(internalStruct->find("SBAR")!=std::string::npos)
- sparseValues["NTSBAR"] = 1;
-
+void InternalStructFeatureSparse::add(const std::string *treeFragment,
+ float count,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const {
+ //cout<<"Sparse: "<<*internalStruct<<endl;
+ if(treeFragment->find("VBZ")!=std::string::npos)
+ sparseValues["NTVBZ"] += count;
+ if(treeFragment->find("VBD")!=std::string::npos)
+ sparseValues["NTVBD"] += count;
+ if(treeFragment->find("VBP")!=std::string::npos)
+ sparseValues["NTVBP"] += count;
+ if(treeFragment->find("PP")!=std::string::npos)
+ sparseValues["NTPP"] += count;
+ if(treeFragment->find("SBAR")!=std::string::npos)
+ sparseValues["NTSBAR"] += count;
}
diff --git a/phrase-extract/InternalStructFeature.h b/phrase-extract/InternalStructFeature.h
index bd513a715..7a6efec1d 100644
--- a/phrase-extract/InternalStructFeature.h
+++ b/phrase-extract/InternalStructFeature.h
@@ -21,22 +21,19 @@ namespace MosesTraining
class InternalStructFeature : public ScoreFeature
{
public:
- InternalStructFeature();
- /** Return true if the two phrase pairs are equal from the point of this feature. Assume
- that they already compare true according to PhraseAlignment.equals()
- **/
- bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
+ InternalStructFeature() : m_type(0) {};
/** Add the values for this feature function. */
void add(const ScoreFeatureContext& context,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
protected:
- /** Overriden in subclass */
- virtual void add(std::string *internalStruct,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const = 0;
+ /** Overridden in subclass */
+ virtual void add(const std::string *treeFragment,
+ float count,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const = 0;
int m_type;
};
@@ -47,9 +44,10 @@ public:
InternalStructFeatureDense()
:InternalStructFeature(){m_type=1;} //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
protected:
- virtual void add(std::string *internalStruct,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+ virtual void add(const std::string *treeFragment,
+ float count,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
};
class InternalStructFeatureSparse : public InternalStructFeature
@@ -58,9 +56,10 @@ public:
InternalStructFeatureSparse()
:InternalStructFeature(){m_type=2;}// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
protected:
- virtual void add(std::string *internalStruct,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+ virtual void add(const std::string *treeFragment,
+ float count,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
};
}
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
index e66ecb9b3..50fed2973 100644
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@@ -1,19 +1,19 @@
-local most-deps = [ glob *.cpp : PhraseAlignment.cpp *Test.cpp *-main.cpp ] ;
+local most-deps = [ glob *.cpp : ExtractionPhrasePair.cpp *Test.cpp *-main.cpp ] ;
#Build .o files with include path setting, reused.
for local d in $(most-deps) {
obj $(d:B).o : $(d) ;
}
#and stuff them into an alias.
-alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ../moses//ThreadPool ../util//kenutil ;
+alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ../moses//ThreadPool ../moses//Util ../util//kenutil ;
-#PhraseAlignment.cpp requires that main define some global variables.
+#ExtractionPhrasePair.cpp requires that main define some global variables.
#Build the mains that do not need these global variables.
for local m in [ glob *-main.cpp : score-main.cpp ] {
exe [ MATCH "(.*)-main.cpp" : $(m) ] : $(m) deps ;
}
-#The side dishes that use PhraseAlignment.cpp
-exe score : PhraseAlignment.cpp score-main.cpp deps ;
+#The side dishes that use ExtractionPhrasePair.cpp
+exe score : ExtractionPhrasePair.cpp score-main.cpp deps ;
import testing ;
-run ScoreFeatureTest.cpp PhraseAlignment.cpp deps ..//boost_unit_test_framework ..//boost_iostreams : : test.domain ;
+run ScoreFeatureTest.cpp ExtractionPhrasePair.cpp deps ..//boost_unit_test_framework ..//boost_iostreams : : test.domain ;
diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp
deleted file mode 100644
index 075e1b18c..000000000
--- a/phrase-extract/PhraseAlignment.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * PhraseAlignment.cpp
- * extract
- *
- * Created by Hieu Hoang on 28/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-
-#include <sstream>
-#include "PhraseAlignment.h"
-#include "SafeGetline.h"
-#include "tables-core.h"
-#include "score.h"
-
-#include <cstdlib>
-
-using namespace std;
-
-namespace MosesTraining
-{
-
-extern Vocabulary vcbT;
-extern Vocabulary vcbS;
-
-extern bool hierarchicalFlag;
-
-//! convert string to variable of type T. Used to reading floats, int etc from files
-template<typename T>
-inline T Scan(const std::string &input)
-{
- std::stringstream stream(input);
- T ret;
- stream >> ret;
- return ret;
-}
-
-
-//! speeded up version of above
-template<typename T>
-inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
-{
- output.resize(input.size());
- for (size_t i = 0 ; i < input.size() ; i++) {
- output[i] = Scan<T>( input[i] );
- }
-}
-
-
-inline void Tokenize(std::vector<std::string> &output
- , const std::string& str
- , const std::string& delimiters = " \t")
-{
- // Skip delimiters at beginning.
- std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
- // Find first "non-delimiter".
- std::string::size_type pos = str.find_first_of(delimiters, lastPos);
-
- while (std::string::npos != pos || std::string::npos != lastPos) {
- // Found a token, add it to the vector.
- output.push_back(str.substr(lastPos, pos - lastPos));
- // Skip delimiters. Note the "not_of"
- lastPos = str.find_first_not_of(delimiters, pos);
- // Find next "non-delimiter"
- pos = str.find_first_of(delimiters, lastPos);
- }
-}
-
-// speeded up version of above
-template<typename T>
-inline void Tokenize( std::vector<T> &output
- , const std::string &input
- , const std::string& delimiters = " \t")
-{
- std::vector<std::string> stringVector;
- Tokenize(stringVector, input, delimiters);
- return Scan<T>(output, stringVector );
-}
-
-// read in a phrase pair and store it
-void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFlag )
-{
- assert(phraseS.empty());
- assert(phraseT.empty());
- treeFragment.clear();
-
- vector< string > token = tokenize( line );
- int item = 1;
- for (size_t j=0; j<token.size(); j++) {
- if (token[j] == "|||") item++;
- else if (item == 1) { // source phrase
- phraseS.push_back( vcbS.storeIfNew( token[j] ) );
- }
-
- else if (item == 2) { // target phrase
- phraseT.push_back( vcbT.storeIfNew( token[j] ) );
- } else if (item == 3) { // alignment
- int s,t;
- sscanf(token[j].c_str(), "%d-%d", &s, &t);
- if ((size_t)t >= phraseT.size() || (size_t)s >= phraseS.size()) {
- cerr << "WARNING: phrase pair " << lineID
- << " has alignment point (" << s << ", " << t
- << ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n";
- } else {
- // first alignment point? -> initialize
- createAlignVec(phraseS.size(), phraseT.size());
-
- // add alignment point
- alignedToT[t].insert( s );
- alignedToS[s].insert( t );
- }
- } else if ( (item >= 4) && (token[j] == "Tree") ) { // check for information with a key field
- ++j;
- while ( (j < token.size() ) && (token[j] != "|||") ) {
- treeFragment.append(" ");
- treeFragment.append(token[j]);
- ++j;
- }
- --j;
- } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
- sscanf(token[j].c_str(), "%d", &sentenceId);
- } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
- sscanf(token[j].c_str(), "%f", &count);
- } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
- float pcfgScore = std::atof(token[j].c_str());
- pcfgSum = pcfgScore * count;
- }
- }
-
- createAlignVec(phraseS.size(), phraseT.size());
-
- if (item + (includeSentenceIdFlag?-1:0) == 3) {
- count = 1.0;
- }
- if (item < 3 || item > 6) {
- cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
- }
-}
-
-void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)
-{
- // in case of no align info. always need align info, even if blank
- if (alignedToT.size() == 0) {
- size_t numTgtSymbols = (hierarchicalFlag ? targetSize-1 : targetSize);
- alignedToT.resize(numTgtSymbols);
- }
-
- if (alignedToS.size() == 0) {
- size_t numSrcSymbols = (hierarchicalFlag ? sourceSize-1 : sourceSize);
- alignedToS.resize(numSrcSymbols);
- }
-}
-
-void PhraseAlignment::clear()
-{
- phraseS.clear();
- phraseT.clear();
- alignedToT.clear();
- alignedToS.clear();
-}
-
-// check if two word alignments between a phrase pair are the same
-bool PhraseAlignment::equals( const PhraseAlignment& other )
-{
- if (this == &other) return true;
- if (other.GetTarget() != GetTarget()) return false;
- if (other.GetSource() != GetSource()) return false;
- if (other.alignedToT != alignedToT) return false;
- if (other.alignedToS != alignedToS) return false;
- return true;
-}
-
-// check if two word alignments between a phrase pairs "match"
-// i.e. they do not differ in the alignment of non-termimals
-bool PhraseAlignment::match( const PhraseAlignment& other )
-{
- if (this == &other) return true;
- if (other.GetTarget() != GetTarget()) return false;
- if (other.GetSource() != GetSource()) return false;
- if (!hierarchicalFlag) return true;
-
- assert(phraseT.size() == alignedToT.size() + 1);
- assert(alignedToT.size() == other.alignedToT.size());
-
- // loop over all words (note: 0 = left hand side of rule)
- for(size_t i=0; i<phraseT.size()-1; i++) {
- if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
- if (alignedToT[i].size() != 1 ||
- other.alignedToT[i].size() != 1 ||
- *(alignedToT[i].begin()) != *(other.alignedToT[i].begin()))
- return false;
- }
- }
- return true;
-}
-
-int PhraseAlignment::Compare(const PhraseAlignment &other) const
-{
- if (this == &other) // comparing with itself
- return 0;
-
- if (GetTarget() != other.GetTarget())
- return ( GetTarget() < other.GetTarget() ) ? -1 : +1;
-
- if (GetSource() != other.GetSource())
- return ( GetSource() < other.GetSource() ) ? -1 : +1;
-
- if (!hierarchicalFlag)
- return 0;
-
- // loop over all words (note: 0 = left hand side of rule)
- for(size_t i=0; i<phraseT.size()-1; i++) {
- if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
- size_t thisAlign = *(alignedToT[i].begin());
- size_t otherAlign = *(other.alignedToT[i].begin());
-
- if (alignedToT[i].size() != 1 ||
- other.alignedToT[i].size() != 1 ||
- thisAlign != otherAlign) {
- int ret = (thisAlign < otherAlign) ? -1 : +1;
- return ret;
- }
- }
- }
- return 0;
-
-}
-
-}
-
diff --git a/phrase-extract/PhraseAlignment.h b/phrase-extract/PhraseAlignment.h
deleted file mode 100644
index 0e854c136..000000000
--- a/phrase-extract/PhraseAlignment.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#pragma once
-/*
- * PhraseAlignment.h
- * extract
- *
- * Created by Hieu Hoang on 28/07/2010.
- * Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include "tables-core.h"
-
-#include <vector>
-#include <set>
-
-namespace MosesTraining
-{
-
-// data structure for a single phrase pair
-class PhraseAlignment
-{
-protected:
- PHRASE phraseS;
- PHRASE phraseT;
-
- void createAlignVec(size_t sourceSize, size_t targetSize);
- void addNTLength(const std::string &tok);
-public:
- float pcfgSum;
- float count;
- int sentenceId;
- std::string domain;
- std::string treeFragment;
-
- std::vector< std::set<size_t> > alignedToT;
- std::vector< std::set<size_t> > alignedToS;
-
- void create( char*, int, bool );
- void clear();
- bool equals( const PhraseAlignment& );
- bool match( const PhraseAlignment& );
-
- int Compare(const PhraseAlignment &compare) const;
- inline bool operator<(const PhraseAlignment &compare) const {
- return Compare(compare) < 0;
- }
-
- const PHRASE &GetSource() const {
- return phraseS;
- }
- const PHRASE &GetTarget() const {
- return phraseT;
- }
-};
-
-class PhraseAlignment;
-
-typedef std::vector<PhraseAlignment*> PhraseAlignmentCollection;
-//typedef std::vector<PhraseAlignmentCollection> PhrasePairGroup;
-
-class PhraseAlignmentCollectionOrderer
-{
-public:
- bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const {
- assert(collA.size() > 0);
- assert(collB.size() > 0);
-
- const PhraseAlignment &objA = *collA[0];
- const PhraseAlignment &objB = *collB[0];
- bool ret = objA < objB;
-
- return ret;
- }
-};
-
-
-//typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> PhrasePairGroup;
-
-class PhrasePairGroup
-{
-private:
- typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> Coll;
- Coll m_coll;
-
-
-public:
- typedef Coll::iterator iterator;
- typedef Coll::const_iterator const_iterator;
- typedef std::vector<const PhraseAlignmentCollection *> SortedColl;
-
- std::pair<Coll::iterator,bool> insert ( const PhraseAlignmentCollection& obj );
-
- const SortedColl &GetSortedColl() const {
- return m_sortedColl;
- }
- size_t GetSize() const {
- return m_coll.size();
- }
-
-private:
- SortedColl m_sortedColl;
-
-};
-
-
-}
-
diff --git a/phrase-extract/ScoreFeature.cpp b/phrase-extract/ScoreFeature.cpp
index f98759755..52157a8cb 100644
--- a/phrase-extract/ScoreFeature.cpp
+++ b/phrase-extract/ScoreFeature.cpp
@@ -18,7 +18,7 @@
***********************************************************************/
#include "ScoreFeature.h"
-#include "domain.h"
+#include "DomainFeature.h"
#include "InternalStructFeature.h"
using namespace std;
@@ -77,10 +77,10 @@ void ScoreFeatureManager::configure(const std::vector<std::string> args)
}
sparseDomainAdded = true;
m_includeSentenceId = true;
- } else if(args[i] == "--GHKMFeatureSparse"){
+ } else if(args[i] == "--TreeFeatureSparse"){
//MARIA
m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
- } else if(args[i] == "--GHKMFeatureDense"){
+ } else if(args[i] == "--TreeFeatureDense"){
//MARIA
m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
} else {
@@ -91,12 +91,13 @@ void ScoreFeatureManager::configure(const std::vector<std::string> args)
}
-bool ScoreFeatureManager::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
+void ScoreFeatureManager::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+ float count,
+ int sentenceId) const
{
for (size_t i = 0; i < m_features.size(); ++i) {
- if (!m_features[i]->equals(lhs,rhs)) return false;
+ m_features[i]->addPropertiesToPhrasePair(phrasePair, count, sentenceId);
}
- return true;
}
void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context,
diff --git a/phrase-extract/ScoreFeature.h b/phrase-extract/ScoreFeature.h
index 5823f21ce..926397e71 100644
--- a/phrase-extract/ScoreFeature.h
+++ b/phrase-extract/ScoreFeature.h
@@ -35,7 +35,7 @@
#include "util/exception.hh"
-#include "PhraseAlignment.h"
+#include "ExtractionPhrasePair.h"
namespace MosesTraining
{
@@ -64,17 +64,14 @@ public:
/** Passed to each feature to be used to calculate its values */
struct ScoreFeatureContext {
ScoreFeatureContext(
- const PhraseAlignmentCollection &thePhrasePair,
- float theCount, /* Total counts of all phrase pairs*/
+ const ExtractionPhrasePair &thePhrasePair,
const MaybeLog& theMaybeLog
) :
phrasePair(thePhrasePair),
- count(theCount),
maybeLog(theMaybeLog) {
}
- const PhraseAlignmentCollection& phrasePair;
- float count;
+ const ExtractionPhrasePair &phrasePair;
MaybeLog maybeLog;
};
@@ -85,16 +82,19 @@ struct ScoreFeatureContext {
class ScoreFeature
{
public:
+
+ /** Some features might need to store properties in ExtractionPhrasePair,
+ * e.g. to pass along external information loaded by a feature
+ * which may distinguish several phrase occurrences based on sentence ID */
+ virtual void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+ float count,
+ int sentenceId) const {};
+
/** Add the values for this feature function. */
virtual void add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const = 0;
- /** Return true if the two phrase pairs are equal from the point of this feature. Assume
- that they already compare true according to PhraseAlignment.equals()
- **/
- virtual bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const = 0;
-
virtual ~ScoreFeature() {}
};
@@ -112,17 +112,18 @@ public:
/** Pass the unused command-line arguments to configure the extra features */
void configure(const std::vector<std::string> args);
+ /** Some features might need to store properties in ExtractionPhrasePair,
+ * e.g. to pass along external information loaded by a feature
+ * which may distinguish several phrase occurrences based on sentence ID */
+ void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+ float count,
+ int sentenceId) const;
+
/** Add all the features */
void addFeatures(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const;
- /**
- * Used to tell if the PhraseAlignment should be considered the same by all
- * extended features.
- **/
- bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
-
const std::vector<ScoreFeaturePtr>& getFeatures() const {
return m_features;
}
diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp
index 65f1f4437..534ab177b 100644
--- a/phrase-extract/ScoreFeatureTest.cpp
+++ b/phrase-extract/ScoreFeatureTest.cpp
@@ -17,7 +17,7 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#include "domain.h"
+#include "DomainFeature.h"
#include "ScoreFeature.h"
#include "tables-core.h"
@@ -93,18 +93,3 @@ BOOST_AUTO_TEST_CASE(manager_config_domain)
(boost::assign::list_of("--SparseDomainSubset")("/dev/null"));
}
-
-BOOST_AUTO_TEST_CASE(domain_equals)
-{
- SubsetDomainFeature feature(DomainFileLocation());
- PhraseAlignment a1,a2,a3;
- char buf1[] = "a ||| b ||| 0-0 ||| 1";
- char buf2[] = "a ||| b ||| 0-0 ||| 2";
- char buf3[] = "a ||| b ||| 0-0 ||| 3";
- a1.create(buf1, 0, true); //domain a
- a2.create(buf2, 1, true); //domain c
- a3.create(buf3, 2, true); //domain c
- BOOST_CHECK(feature.equals(a2,a3));
- BOOST_CHECK(!feature.equals(a1,a3));
- BOOST_CHECK(!feature.equals(a1,a3));
-}
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index d0305fad2..65a12d176 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -21,22 +21,21 @@
#include <cstdio>
#include <iostream>
#include <fstream>
-#include <vector>
#include <stdlib.h>
#include <assert.h>
#include <cstring>
+#include <map>
#include <set>
+#include <vector>
#include <algorithm>
#include "SafeGetline.h"
#include "ScoreFeature.h"
#include "tables-core.h"
-#include "domain.h"
-#include "PhraseAlignment.h"
+#include "ExtractionPhrasePair.h"
#include "score.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
-#include "InternalStructFeature.h"
using namespace std;
using namespace MosesTraining;
@@ -61,111 +60,120 @@ int negLogProb = 1;
bool lexFlag = true;
bool unalignedFlag = false;
bool unalignedFWFlag = false;
-bool singletonFeature = false;
bool crossedNonTerm = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
float minCountHierarchical = 0;
+std::map<std::string,float> sourceLHSCounts;
+std::map<std::string, std::map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
+
+std::set<std::string> sourceLabelSet;
+std::map<std::string,size_t> sourceLabels;
+std::vector<std::string> sourceLabelsByIndex;
Vocabulary vcbT;
Vocabulary vcbS;
} // namespace
-vector<string> tokenize( const char [] );
-
-void writeCountOfCounts( const string &fileNameCountOfCounts );
-void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLog);
-const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
-const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phrasePair );
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLog );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
-double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
-set<string> functionWordList;
+std::vector<std::string> tokenize( const char [] );
+
+void processLine( std::string line,
+ int lineID, bool includeSentenceIdFlag, int &sentenceId,
+ PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
+ std::string &additionalPropertiesString,
+ float &count, float &pcfgSum );
+void writeCountOfCounts( const std::string &fileNameCountOfCounts );
+void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
+ const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
+void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
+double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
+double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
+set<std::string> functionWordList;
void loadFunctionWords( const string &fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
-void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
-void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
+double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
+int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
+void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
+void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out );
+void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment );
+
int main(int argc, char* argv[])
{
- cerr << "Score v2.0 written by Philipp Koehn\n"
- << "scoring methods for extracted rules\n";
+ std::cerr << "Score v2.1 -- "
+ << "scoring methods for extracted rules" << std::endl;
ScoreFeatureManager featureManager;
if (argc < 4) {
- cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
- cerr << featureManager.usage() << endl;
+ std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
+ std::cerr << featureManager.usage() << std::endl;
exit(1);
}
- string fileNameExtract = argv[1];
- string fileNameLex = argv[2];
- string fileNamePhraseTable = argv[3];
- string fileNameCountOfCounts;
- char* fileNameFunctionWords = NULL;
- vector<string> featureArgs; //all unknown args passed to feature manager
+ std::string fileNameExtract = argv[1];
+ std::string fileNameLex = argv[2];
+ std::string fileNamePhraseTable = argv[3];
+ std::string fileNameCountOfCounts;
+ std::string fileNameFunctionWords;
+ std::vector<std::string> featureArgs; // all unknown args passed to feature manager
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
inverseFlag = true;
- cerr << "using inverse mode\n";
+ std::cerr << "using inverse mode" << std::endl;
} else if (strcmp(argv[i],"--Hierarchical") == 0) {
hierarchicalFlag = true;
- cerr << "processing hierarchical rules\n";
+ std::cerr << "processing hierarchical rules" << std::endl;
} else if (strcmp(argv[i],"--PCFG") == 0) {
pcfgFlag = true;
- cerr << "including PCFG scores\n";
+ std::cerr << "including PCFG scores" << std::endl;
} else if (strcmp(argv[i],"--TreeFragments") == 0) {
treeFragmentsFlag = true;
- cerr << "including tree fragments from syntactic parse\n";
+ std::cerr << "including tree fragment information from syntactic parse\n";
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
unpairedExtractFormatFlag = true;
- cerr << "processing unpaired extract format\n";
+ std::cerr << "processing unpaired extract format" << std::endl;
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
conditionOnTargetLhsFlag = true;
- cerr << "processing unpaired extract format\n";
+ std::cerr << "processing unpaired extract format" << std::endl;
} else if (strcmp(argv[i],"--NoWordAlignment") == 0) {
wordAlignmentFlag = false;
- cerr << "omitting word alignment" << endl;
+ std::cerr << "omitting word alignment" << std::endl;
} else if (strcmp(argv[i],"--NoLex") == 0) {
lexFlag = false;
- cerr << "not computing lexical translation score\n";
+ std::cerr << "not computing lexical translation score" << std::endl;
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
goodTuringFlag = true;
- fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
- cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
+ fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
+ std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
} else if (strcmp(argv[i],"--KneserNey") == 0) {
kneserNeyFlag = true;
- fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
- cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
+ fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
+ std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
} else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
unalignedFlag = true;
- cerr << "using unaligned word penalty\n";
+ std::cerr << "using unaligned word penalty" << std::endl;
} else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
unalignedFWFlag = true;
if (i+1==argc) {
- cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
+ std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
exit(1);
}
fileNameFunctionWords = argv[++i];
- cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
+ std::cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << std::endl;
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
- cerr << "using log-probabilities\n";
+ std::cerr << "using log-probabilities" << std::endl;
} else if (strcmp(argv[i],"--NegLogProb") == 0) {
logProbFlag = true;
negLogProb = -1;
- cerr << "using negative log-probabilities\n";
+ std::cerr << "using negative log-probabilities" << std::endl;
} else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
minCountHierarchical = atof(argv[++i]);
- cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
+ std::cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
minCountHierarchical -= 0.00001; // account for rounding
- } else if (strcmp(argv[i],"--Singleton") == 0) {
- singletonFeature = true;
- cerr << "binary singleton feature\n";
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
crossedNonTerm = true;
- cerr << "crossed non-term reordering feature\n";
+ std::cerr << "crossed non-term reordering feature" << std::endl;
} else {
featureArgs.push_back(argv[i]);
++i;
@@ -178,16 +186,20 @@ int main(int argc, char* argv[])
MaybeLog maybeLogProb(logProbFlag, negLogProb);
- //configure extra features
- if (!inverseFlag) featureManager.configure(featureArgs);
+ // configure extra features
+ if (!inverseFlag) {
+ featureManager.configure(featureArgs);
+ }
// lexical translation table
- if (lexFlag)
+ if (lexFlag) {
lexTable.load( fileNameLex );
+ }
// function word list
- if (unalignedFWFlag)
+ if (unalignedFWFlag) {
loadFunctionWords( fileNameFunctionWords );
+ }
// compute count of counts for Good Turing discounting
if (goodTuringFlag || kneserNeyFlag) {
@@ -198,7 +210,7 @@ int main(int argc, char* argv[])
Moses::InputFileStream extractFile(fileNameExtract);
if (extractFile.fail()) {
- cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
+ std::cerr << "ERROR: could not open extract file " << fileNameExtract << std::endl;
exit(1);
}
istream &extractFileP = extractFile;
@@ -207,76 +219,160 @@ int main(int argc, char* argv[])
ostream *phraseTableFile;
if (fileNamePhraseTable == "-") {
- phraseTableFile = &cout;
+ phraseTableFile = &std::cout;
} else {
Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
bool success = outputFile->Open(fileNamePhraseTable);
if (!success) {
- cerr << "ERROR: could not open file phrase table file "
- << fileNamePhraseTable << endl;
+ std::cerr << "ERROR: could not open file phrase table file "
+ << fileNamePhraseTable << std::endl;
exit(1);
}
phraseTableFile = outputFile;
}
// loop through all extracted phrase translations
- float lastCount = 0.0f;
- float lastPcfgSum = 0.0f;
- vector< PhraseAlignment > phrasePairsWithSameF;
- bool isSingleton = true;
- int i=0;
- char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
+ char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH];
lastLine[0] = '\0';
- PhraseAlignment *lastPhrasePair = NULL;
- while(true) {
- if (extractFileP.eof()) break;
- if (++i % 100000 == 0) cerr << "." << flush;
- SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
- if (extractFileP.eof()) break;
+ ExtractionPhrasePair *phrasePair = NULL;
+ std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
+ std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
- // identical to last line? just add count
- if (strcmp(line,lastLine) == 0) {
- lastPhrasePair->count += lastCount;
- lastPhrasePair->pcfgSum += lastPcfgSum;
- continue;
+ int tmpSentenceId;
+ PHRASE *tmpPhraseSource, *tmpPhraseTarget;
+ ALIGNMENT *tmpTargetToSourceAlignment;
+ std::string tmpAdditionalPropertiesString;
+ float tmpCount=0.0f, tmpPcfgSum=0.0f;
+
+ int i=0;
+ SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
+ if ( !extractFileP.eof() ) {
+ ++i;
+ tmpPhraseSource = new PHRASE();
+ tmpPhraseTarget = new PHRASE();
+ tmpTargetToSourceAlignment = new ALIGNMENT();
+ processLine( std::string(line),
+ i, featureManager.includeSentenceId(), tmpSentenceId,
+ tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
+ tmpAdditionalPropertiesString,
+ tmpCount, tmpPcfgSum);
+ phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
+ tmpTargetToSourceAlignment,
+ tmpCount, tmpPcfgSum );
+ phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
+ featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
+ phrasePairsWithSameSource.push_back( phrasePair );
+ if ( hierarchicalFlag ) {
+ phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
}
strcpy( lastLine, line );
+ SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
+ }
+
+ while ( !extractFileP.eof() ) {
+
+ if ( ++i % 100000 == 0 ) {
+ std::cerr << "." << std::flush;
+ }
- // create new phrase pair
- PhraseAlignment phrasePair;
- phrasePair.create( line, i, featureManager.includeSentenceId());
- lastCount = phrasePair.count;
- lastPcfgSum = phrasePair.pcfgSum;
-
- // only differs in count? just add count
- if (lastPhrasePair != NULL
- && lastPhrasePair->equals( phrasePair )
- && featureManager.equals(*lastPhrasePair, phrasePair)) {
- lastPhrasePair->count += phrasePair.count;
- lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
+ // identical to last line? just add count
+ if (strcmp(line,lastLine) == 0) {
+ phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
+ SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
continue;
+ } else {
+ strcpy( lastLine, line );
}
- // if new source phrase, process last batch
- if (lastPhrasePair != NULL &&
- lastPhrasePair->GetSource() != phrasePair.GetSource()) {
- processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb );
+ tmpPhraseSource = new PHRASE();
+ tmpPhraseTarget = new PHRASE();
+ tmpTargetToSourceAlignment = new ALIGNMENT();
+ tmpAdditionalPropertiesString.clear();
+ processLine( std::string(line),
+ i, featureManager.includeSentenceId(), tmpSentenceId,
+ tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
+ tmpAdditionalPropertiesString,
+ tmpCount, tmpPcfgSum);
+
+ bool matchesPrevious = false;
+ bool sourceMatch = true; bool targetMatch = true; bool alignmentMatch = true; // be careful with these,
+ // ExtractionPhrasePair::Matches() checks them in order and does not evaluate the remaining ones
+ // once one of them has had to be set to false
+
+ if ( hierarchicalFlag ) {
+ for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
+ iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) {
+ if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
+ sourceMatch, targetMatch, alignmentMatch ) ) {
+ matchesPrevious = true;
+ phrasePair = (*iter);
+ break;
+ }
+ }
+ } else {
+ if ( phrasePair->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
+ sourceMatch, targetMatch, alignmentMatch ) ) {
+ matchesPrevious = true;
+ }
+ }
- phrasePairsWithSameF.clear();
- isSingleton = false;
- lastPhrasePair = NULL;
+ if ( matchesPrevious ) {
+ delete tmpPhraseSource;
+ delete tmpPhraseTarget;
+ if ( !phrasePair->Add( tmpTargetToSourceAlignment,
+ tmpCount, tmpPcfgSum ) ) {
+ delete tmpTargetToSourceAlignment;
+ }
+ phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
+ featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
} else {
- isSingleton = true;
+
+ if ( !phrasePairsWithSameSource.empty() &&
+ !sourceMatch ) {
+ processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
+ for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ iter!=phrasePairsWithSameSource.end(); ++iter) {
+ delete *iter;
+ }
+ phrasePairsWithSameSource.clear();
+ if ( hierarchicalFlag ) {
+ phrasePairsWithSameSourceAndTarget.clear();
+ }
+ }
+
+ if ( hierarchicalFlag ) {
+ if ( !phrasePairsWithSameSourceAndTarget.empty() &&
+ !targetMatch ) {
+ phrasePairsWithSameSourceAndTarget.clear();
+ }
+ }
+
+ phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
+ tmpTargetToSourceAlignment,
+ tmpCount, tmpPcfgSum );
+ phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
+ featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
+ phrasePairsWithSameSource.push_back(phrasePair);
+
+ if ( hierarchicalFlag ) {
+ phrasePairsWithSameSourceAndTarget.push_back(phrasePair);
+ }
}
- // add phrase pairs to list, it's now the last one
- phrasePairsWithSameF.push_back( phrasePair );
- lastPhrasePair = &phrasePairsWithSameF.back();
+ SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+
}
- processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb );
+
+ processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
+ for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ iter!=phrasePairsWithSameSource.end(); ++iter) {
+ delete *iter;
+ }
+ phrasePairsWithSameSource.clear();
+
phraseTableFile->flush();
- if (phraseTableFile != &cout) {
+ if (phraseTableFile != &std::cout) {
delete phraseTableFile;
}
@@ -286,177 +382,134 @@ int main(int argc, char* argv[])
}
}
+
+void processLine( std::string line,
+ int lineID, bool includeSentenceIdFlag, int &sentenceId,
+ PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
+ std::string &additionalPropertiesString,
+ float &count, float &pcfgSum )
+{
+ size_t foundAdditionalProperties = line.find("{{");
+ if (foundAdditionalProperties != std::string::npos) {
+ additionalPropertiesString = line.substr(foundAdditionalProperties);
+ line = line.substr(0,foundAdditionalProperties);
+ } else {
+ additionalPropertiesString.clear();
+ }
+
+ phraseSource->clear();
+ phraseTarget->clear();
+ targetToSourceAlignment->clear();
+
+ std::vector<std::string> token = tokenize( line.c_str() );
+ int item = 1;
+ for ( size_t j=0; j<token.size(); ++j ) {
+ if (token[j] == "|||") {
+ ++item;
+ } else if (item == 1) { // source phrase
+ phraseSource->push_back( vcbS.storeIfNew( token[j] ) );
+ } else if (item == 2) { // target phrase
+ phraseTarget->push_back( vcbT.storeIfNew( token[j] ) );
+ } else if (item == 3) { // alignment
+ int s,t;
+ sscanf(token[j].c_str(), "%d-%d", &s, &t);
+ if ((size_t)t >= phraseTarget->size() || (size_t)s >= phraseSource->size()) {
+ std::cerr << "WARNING: phrase pair " << lineID
+ << " has alignment point (" << s << ", " << t << ")"
+ << " out of bounds (" << phraseSource->size() << ", " << phraseTarget->size() << ")"
+ << std::endl;
+ } else {
+ // first alignment point? -> initialize
+ if ( targetToSourceAlignment->size() == 0 ) {
+ size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
+ targetToSourceAlignment->resize(numberOfTargetSymbols);
+ }
+ // add alignment point
+ targetToSourceAlignment->at(t).insert(s);
+ }
+ } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
+ sscanf(token[j].c_str(), "%d", &sentenceId);
+ } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
+ sscanf(token[j].c_str(), "%f", &count);
+ } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
+ float pcfgScore = std::atof(token[j].c_str());
+ pcfgSum = pcfgScore * count;
+ }
+ }
+
+ if ( targetToSourceAlignment->size() == 0 ) {
+ size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
+ targetToSourceAlignment->resize(numberOfTargetSymbols);
+ }
+
+ if (item + (includeSentenceIdFlag?-1:0) == 3) {
+ count = 1.0;
+ }
+ if (item < 3 || item > 6) {
+ std::cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
+ }
+
+}
+
+
void writeCountOfCounts( const string &fileNameCountOfCounts )
{
// open file
Moses::OutputFileStream countOfCountsFile;
bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
if (!success) {
- cerr << "ERROR: could not open count-of-counts file "
- << fileNameCountOfCounts << endl;
+ std::cerr << "ERROR: could not open count-of-counts file "
+ << fileNameCountOfCounts << std::endl;
return;
}
// Kneser-Ney needs the total number of phrase pairs
- countOfCountsFile << totalDistinct << endl;
+ countOfCountsFile << totalDistinct << std::endl;
// write out counts
for(int i=1; i<=COC_MAX; i++) {
- countOfCountsFile << countOfCounts[ i ] << endl;
+ countOfCountsFile << countOfCounts[ i ] << std::endl;
}
countOfCountsFile.Close();
}
-void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
-{
- if (phrasePair.size() == 0) return;
- // group phrase pairs based on alignments that matter
- // (i.e. that re-arrange non-terminals)
- PhrasePairGroup phrasePairGroup;
+void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
+ const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
+{
+ if (phrasePairsWithSameSource.size() == 0) {
+ return;
+ }
float totalSource = 0;
- //cerr << "phrasePair.size() = " << phrasePair.size() << endl;
+ //std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
// loop through phrase pairs
- for(size_t i=0; i<phrasePair.size(); i++) {
+ for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ iter!=phrasePairsWithSameSource.end(); ++iter) {
// add to total count
- PhraseAlignment &currPhrasePair = phrasePair[i];
-
- totalSource += phrasePair[i].count;
-
- // check for matches
- //cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl;
-
- PhraseAlignmentCollection phraseAlignColl;
- phraseAlignColl.push_back(&currPhrasePair);
- pair<PhrasePairGroup::iterator, bool> retInsert;
- retInsert = phrasePairGroup.insert(phraseAlignColl);
- if (!retInsert.second) {
- // already exist. Add to that collection instead
- PhraseAlignmentCollection &existingColl = const_cast<PhraseAlignmentCollection&>(*retInsert.first);
- existingColl.push_back(&currPhrasePair);
- }
-
+ totalSource += (*iter)->GetCount();
}
// output the distinct phrase pairs, one at a time
- const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl();
- PhrasePairGroup::SortedColl::const_iterator iter;
-
- for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) {
- const PhraseAlignmentCollection &group = **iter;
- outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile, isSingleton, featureManager, maybeLogProb );
- }
-
-}
-
-const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
-{
- float bestAlignmentCount = -1;
- PhraseAlignment* bestAlignment = NULL;
-
- for(size_t i=0; i<phrasePair.size(); i++) {
- size_t alignInd;
- if (inverseFlag) {
- // count backwards, so that alignments for ties will be the same for both normal & inverse scores
- alignInd = phrasePair.size() - i - 1;
- } else {
- alignInd = i;
- }
-
- if (phrasePair[alignInd]->count > bestAlignmentCount) {
- bestAlignmentCount = phrasePair[alignInd]->count;
- bestAlignment = phrasePair[alignInd];
- }
- }
-
- return *bestAlignment;
-}
-
-const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phrasePair )
-{
- float bestTreeFragmentCount = -1;
- PhraseAlignment *bestTreeFragment = NULL;
-
- for(size_t i=0; i<phrasePair.size(); i++) {
- size_t treeFragmentInd;
- if (inverseFlag) {
- // count backwards, so that alignments for ties will be the same for both normal & inverse scores
- treeFragmentInd = phrasePair.size() - i - 1;
- } else {
- treeFragmentInd = i;
- }
-
- if (phrasePair[treeFragmentInd]->count > bestTreeFragmentCount) {
- bestTreeFragmentCount = phrasePair[treeFragmentInd]->count;
- bestTreeFragment = phrasePair[treeFragmentInd];
- }
- }
-
- return bestTreeFragment->treeFragment;
-}
-
-bool calcCrossedNonTerm(size_t sourcePos, size_t targetPos, const std::vector< std::set<size_t> > &alignedToS)
-{
- for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource) {
- if (currSource == sourcePos) {
- // skip
- } else {
- const std::set<size_t> &targetSet = alignedToS[currSource];
- std::set<size_t>::const_iterator iter;
- for (iter = targetSet.begin(); iter != targetSet.end(); ++iter) {
- size_t currTarget = *iter;
-
- if ((currSource < sourcePos && currTarget > targetPos)
- || (currSource > sourcePos && currTarget < targetPos)
- ) {
- return true;
- }
- }
-
- }
- }
-
- return false;
-}
-
-int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
-{
- const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
-
- for (size_t sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos) {
- const std::set<size_t> &targetSet = alignedToS[sourcePos];
-
- WORD_ID wordId = phraseS[sourcePos];
- const WORD &word = vcbS.getWord(wordId);
- bool isNonTerm = isNonTerminal(word);
-
- if (isNonTerm) {
- assert(targetSet.size() == 1);
- size_t targetPos = *targetSet.begin();
- bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS);
- if (ret)
- return 1;
- }
+ for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
+ iter!=phrasePairsWithSameSource.end(); ++iter) {
+ // output this phrase pair, scored against the totals accumulated above
+ outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
}
-
- return 0;
}
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager,
+void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
+ float totalCount, int distinctCount,
+ ostream &phraseTableFile,
+ const ScoreFeatureManager& featureManager,
const MaybeLog& maybeLogProb )
{
- if (phrasePair.size() == 0) return;
+ assert(phrasePair.isValid());
- const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
-
- // compute count
- float count = 0;
- for(size_t i=0; i<phrasePair.size(); i++) {
- count += phrasePair[i]->count;
- }
+ const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
+ float count = phrasePair.GetCount();
map< string, float > domainCount;
@@ -464,82 +517,74 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
if (goodTuringFlag || kneserNeyFlag) {
totalDistinct++;
int countInt = count + 0.99999;
- if(countInt <= COC_MAX)
+ if (countInt <= COC_MAX)
countOfCounts[ countInt ]++;
}
// compute PCFG score
float pcfgScore = 0;
if (pcfgFlag && !inverseFlag) {
- float pcfgSum = 0;
- for(size_t i=0; i<phrasePair.size(); ++i) {
- pcfgSum += phrasePair[i]->pcfgSum;
- }
- pcfgScore = pcfgSum / count;
+ pcfgScore = phrasePair.GetPcfgScore() / count;
}
// output phrases
- const PHRASE &phraseS = phrasePair[0]->GetSource();
- const PHRASE &phraseT = phrasePair[0]->GetTarget();
+ const PHRASE *phraseSource = phrasePair.GetSource();
+ const PHRASE *phraseTarget = phrasePair.GetTarget();
// do not output if hierarchical and count below threshold
if (hierarchicalFlag && count < minCountHierarchical) {
- for(size_t j=0; j<phraseS.size()-1; j++) {
- if (isNonTerminal(vcbS.getWord( phraseS[j] )))
+ for(size_t j=0; j<phraseSource->size()-1; ++j) {
+ if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
return;
}
}
// source phrase (unless inverse)
- if (! inverseFlag) {
- printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
+ if (!inverseFlag) {
+ printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
phraseTableFile << " ||| ";
}
// target phrase
- printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
+ printTargetPhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
phraseTableFile << " ||| ";
// source phrase (if inverse)
if (inverseFlag) {
- printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
+ printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
phraseTableFile << " ||| ";
}
// lexical translation probability
if (lexFlag) {
- double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
- phraseTableFile << maybeLogProb(lexScore );
+ double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S );
+ phraseTableFile << maybeLogProb( lexScore );
}
// unaligned word penalty
if (unalignedFlag) {
- double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
- phraseTableFile << " " << maybeLogProb(penalty );
+ double penalty = computeUnalignedPenalty( bestAlignmentT2S );
+ phraseTableFile << " " << maybeLogProb( penalty );
}
// unaligned function word penalty
if (unalignedFWFlag) {
- double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
- phraseTableFile << " " << maybeLogProb(penalty );
- }
-
- if (singletonFeature) {
- phraseTableFile << " " << (isSingleton ? 1 : 0);
+ double penalty = computeUnalignedFWPenalty( phraseTarget, bestAlignmentT2S );
+ phraseTableFile << " " << maybeLogProb( penalty );
}
if (crossedNonTerm && !inverseFlag) {
- phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
+ phraseTableFile << " " << calcCrossedNonTerm( phraseTarget, bestAlignmentT2S );
}
// target-side PCFG score
if (pcfgFlag && !inverseFlag) {
- phraseTableFile << " " << maybeLogProb(pcfgScore );
+ phraseTableFile << " " << maybeLogProb( pcfgScore );
}
// extra features
- ScoreFeatureContext context(phrasePair, count, maybeLogProb);
- vector<float> extraDense;
+ ScoreFeatureContext context(phrasePair, maybeLogProb);
+ std::vector<float> extraDense;
map<string,float> extraSparse;
featureManager.addFeatures(context, extraDense, extraSparse);
for (size_t i = 0; i < extraDense.size(); ++i) {
@@ -553,30 +598,28 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
phraseTableFile << " ||| ";
- // alignment info for non-terminals
- if (! inverseFlag) {
- if (hierarchicalFlag) {
- // always output alignment if hiero style, but only for non-terms
- // (eh: output all alignments, needed for some feature functions)
- assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
+ // output alignment info
+ if ( !inverseFlag ) {
+ if ( hierarchicalFlag ) {
+ // always output alignment if hiero style
+ assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
std::vector<std::string> alignment;
- for(size_t j = 0; j < phraseT.size() - 1; j++) {
- if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
- if (bestAlignment.alignedToT[ j ].size() != 1) {
- cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
+ for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
+ if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
+ if ( bestAlignmentT2S->at(j).size() != 1 ) {
+ std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
phraseTableFile.flush();
- assert(bestAlignment.alignedToT[ j ].size() == 1);
+ assert(bestAlignmentT2S->at(j).size() == 1);
}
- int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
+ size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
//phraseTableFile << sourcePos << "-" << j << " ";
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
} else {
- set<size_t>::iterator setIter;
- for(setIter = (bestAlignment.alignedToT[j]).begin(); setIter != (bestAlignment.alignedToT[j]).end(); setIter++) {
- int sourcePos = *setIter;
- //phraseTableFile << sourcePos << "-" << j << " ";
+ for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
+ setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
+ size_t sourcePos = *setIter;
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
@@ -590,40 +633,90 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
}
} else if (wordAlignmentFlag) {
// alignment info in pb model
- for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
- const set< size_t > &aligned = bestAlignment.alignedToT[j];
- for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
- phraseTableFile << *p << "-" << j << " ";
+ for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
+ for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
+ setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
+ size_t sourcePos = *setIter;
+ phraseTableFile << sourcePos << "-" << j << " ";
}
}
}
}
-
// counts
-
phraseTableFile << " ||| " << totalCount << " " << count;
if (kneserNeyFlag)
phraseTableFile << " " << distinctCount;
+ if ((treeFragmentsFlag) &&
+ !inverseFlag) {
+ phraseTableFile << " |||";
+ }
+
// tree fragments
if (treeFragmentsFlag && !inverseFlag) {
- const std::string &bestTreeFragment = findBestTreeFragment( phrasePair );
- if ( !bestTreeFragment.empty() )
- phraseTableFile << " ||| {{Tree " << bestTreeFragment << "}}";
+ const std::string *bestTreeFragment = phrasePair.FindBestPropertyValue("Tree");
+ if (bestTreeFragment) {
+ phraseTableFile << " {{Tree " << *bestTreeFragment << "}}";
+ }
}
+ phraseTableFile << std::endl;
+}
+
+
+
+// Checks whether the alignment point (sourcePos, targetPos) is crossed by an
+// alignment point of any other target position, i.e. whether some other point
+// lies before it on the target side but after it on the source side, or the
+// other way round. Returns true on the first crossing point found.
+bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource )
+{
+  const size_t numTargetPositions = alignmentTargetToSource->size();
+  for (size_t otherTarget = 0; otherTarget < numTargetPositions; ++otherTarget) {
+    if (otherTarget == targetPos) {
+      continue; // points of the queried target position itself are ignored
+    }
+    // otherTarget differs from targetPos here, so it is either before or after it
+    const bool isBeforeTarget = otherTarget < targetPos;
+    const std::set<size_t> &alignedSources = alignmentTargetToSource->at(otherTarget);
+    for (std::set<size_t>::const_iterator srcIt = alignedSources.begin();
+         srcIt != alignedSources.end(); ++srcIt) {
+      // crossing: the relative order on the source side is the opposite
+      const bool crosses = isBeforeTarget ? (*srcIt > sourcePos) : (*srcIt < sourcePos);
+      if (crosses) {
+        return true;
+      }
+    }
+  }
-  phraseTableFile << endl;
+
+  return false;
 }
-double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
+// Returns 1 if the alignment point of at least one target-side non-terminal
+// is crossed by another alignment point, 0 otherwise. Only the positions
+// covered by alignmentTargetToSource are inspected; every non-terminal among
+// them is expected (asserted) to have exactly one alignment point.
+int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
+{
+  const size_t alignedLength = alignmentTargetToSource->size();
+  assert(phraseTarget->size() >= alignedLength);
+
+  for (size_t pos = 0; pos < alignedLength; ++pos) {
+    if ( !isNonTerminal(vcbT.getWord( phraseTarget->at(pos) )) ) {
+      continue; // terminals are not checked
+    }
+    const std::set<size_t> &linkedSources = alignmentTargetToSource->at(pos);
+    assert( linkedSources.size() == 1 );
+    const size_t linkedSource = *linkedSources.begin();
+    if ( calcCrossedNonTerm(pos, linkedSource, alignmentTargetToSource) ) {
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+
+double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource )
{
// unaligned word counter
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
- for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
+ for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
+ const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
if (srcIndices.empty()) {
unaligned *= 2.718;
}
@@ -631,14 +724,15 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, co
return unaligned;
}
-double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
+
+double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
{
// unaligned word counter
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
- for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
- if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
+ for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
+ const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
+ if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) {
unaligned *= 2.718;
}
}
@@ -647,11 +741,11 @@ double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT,
void loadFunctionWords( const string &fileName )
{
- cerr << "Loading function word list from " << fileName;
+ std::cerr << "Loading function word list from " << fileName;
ifstream inFile;
inFile.open(fileName.c_str());
if (inFile.fail()) {
- cerr << " - ERROR: could not open file\n";
+ std::cerr << " - ERROR: could not open file" << std::endl;
exit(1);
}
istream *inFileP = &inFile;
@@ -660,32 +754,32 @@ void loadFunctionWords( const string &fileName )
while(true) {
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
- vector<string> token = tokenize( line );
+ std::vector<string> token = tokenize( line );
if (token.size() > 0)
functionWordList.insert( token[0] );
}
- inFile.close();
- cerr << " - read " << functionWordList.size() << " function words\n";
+ std::cerr << " - read " << functionWordList.size() << " function words" << std::endl;
inFile.close();
}
-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
+
+double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
{
// lexical translation probability
double lexScore = 1.0;
int null = vcbS.getWordID("NULL");
// all target words have to be explained
- for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
+ for(size_t ti=0; ti<alignmentTargetToSource->size(); ti++) {
+ const set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
if (srcIndices.empty()) {
// explain unaligned word by NULL
- lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );
+ lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) );
} else {
// go through all the aligned words to compute average
double thisWordScore = 0;
for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
- thisWordScore += lexTable.permissiveLookup( phraseS[ *p ], phraseT[ ti ] );
+ thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) );
}
lexScore *= thisWordScore / (double)srcIndices.size();
}
@@ -693,13 +787,14 @@ double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT,
return lexScore;
}
+
void LexicalTable::load( const string &fileName )
{
- cerr << "Loading lexical translation table from " << fileName;
+ std::cerr << "Loading lexical translation table from " << fileName;
ifstream inFile;
inFile.open(fileName.c_str());
if (inFile.fail()) {
- cerr << " - ERROR: could not open file\n";
+ std::cerr << " - ERROR: could not open file" << std::endl;
exit(1);
}
istream *inFileP = &inFile;
@@ -709,15 +804,15 @@ void LexicalTable::load( const string &fileName )
int i=0;
while(true) {
i++;
- if (i%100000 == 0) cerr << "." << flush;
+ if (i%100000 == 0) std::cerr << "." << flush;
SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (inFileP->eof()) break;
- vector<string> token = tokenize( line );
+ std::vector<string> token = tokenize( line );
if (token.size() != 3) {
- cerr << "line " << i << " in " << fileName
- << " has wrong number of tokens, skipping:\n"
- << token.size() << " " << token[0] << " " << line << endl;
+ std::cerr << "line " << i << " in " << fileName
+ << " has wrong number of tokens, skipping:" << std::endl
+ << token.size() << " " << token[0] << " " << line << std::endl;
continue;
}
@@ -726,55 +821,60 @@ void LexicalTable::load( const string &fileName )
WORD_ID wordS = vcbS.storeIfNew( token[1] );
ltable[ wordS ][ wordT ] = prob;
}
- cerr << endl;
+ std::cerr << std::endl;
}
-void printSourcePhrase(const PHRASE &phraseS, const PHRASE &phraseT,
-                       const PhraseAlignment &bestAlignment, ostream &out)
+
+// Writes the source side of a rule to `out`: every source symbol except the
+// last one, each followed by a single space, then the source root symbol.
+// If unpairedExtractFormatFlag is set, a source non-terminal is written
+// together with the target symbol it is aligned to (target symbol first when
+// inverseFlag is set, source symbol first otherwise). The root is written as
+// "[X]" when conditionOnTargetLhsFlag is set and inverseFlag is not;
+// otherwise the last symbol of phraseSource is written.
+void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
+                       const ALIGNMENT *targetToSourceAlignment, ostream &out)
 {
+  // Source-to-target view of the alignment. It is only read when non-terminals
+  // are written as pairs, so it is only built in that case. It is a local
+  // object, so it is released on every exit path without an explicit delete.
+  ALIGNMENT sourceToTargetAlignment;
+  if (unpairedExtractFormatFlag) {
+    invertAlignment(phraseSource, phraseTarget, targetToSourceAlignment, &sourceToTargetAlignment);
+  }
   // output source symbols, except root, in rule table format
-  for (std::size_t i = 0; i < phraseS.size()-1; ++i) {
-    const std::string &word = vcbS.getWord(phraseS[i]);
+  for (std::size_t i = 0; i < phraseSource->size()-1; ++i) {
+    const std::string &word = vcbS.getWord(phraseSource->at(i));
     if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
       out << word << " ";
       continue;
     }
     // get corresponding target non-terminal and output pair
-    std::set<std::size_t> alignmentPoints = bestAlignment.alignedToS[i];
+    const std::set<std::size_t> &alignmentPoints = sourceToTargetAlignment.at(i);
     assert(alignmentPoints.size() == 1);
-    int j = *(alignmentPoints.begin());
+    size_t j = *(alignmentPoints.begin());
     if (inverseFlag) {
-      out << vcbT.getWord(phraseT[j]) << word << " ";
+      out << vcbT.getWord(phraseTarget->at(j)) << word << " ";
     } else {
-      out << word << vcbT.getWord(phraseT[j]) << " ";
+      out << word << vcbT.getWord(phraseTarget->at(j)) << " ";
     }
   }
   // output source root symbol
   if (conditionOnTargetLhsFlag && !inverseFlag) {
     out << "[X]";
   } else {
-    out << vcbS.getWord(phraseS.back());
+    out << vcbS.getWord(phraseSource->back());
   }
 }
-void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT,
- const PhraseAlignment &bestAlignment, ostream &out)
+
+void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
+ const ALIGNMENT *targetToSourceAlignment, ostream &out)
{
// output target symbols, except root, in rule table format
- for (std::size_t i = 0; i < phraseT.size()-1; ++i) {
- const std::string &word = vcbT.getWord(phraseT[i]);
+ for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) {
+ const std::string &word = vcbT.getWord(phraseTarget->at(i));
if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
out << word << " ";
continue;
}
// get corresponding source non-terminal and output pair
- std::set<std::size_t> alignmentPoints = bestAlignment.alignedToT[i];
+ std::set<std::size_t> alignmentPoints = targetToSourceAlignment->at(i);
assert(alignmentPoints.size() == 1);
int j = *(alignmentPoints.begin());
if (inverseFlag) {
- out << word << vcbS.getWord(phraseS[j]) << " ";
+ out << word << vcbS.getWord(phraseSource->at(j)) << " ";
} else {
- out << vcbS.getWord(phraseS[j]) << word << " ";
+ out << vcbS.getWord(phraseSource->at(j)) << word << " ";
}
}
// output target root symbol
@@ -782,24 +882,28 @@ void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT,
if (inverseFlag) {
out << "[X]";
} else {
- out << vcbS.getWord(phraseS.back());
+ out << vcbS.getWord(phraseSource->back());
}
} else {
- out << vcbT.getWord(phraseT.back());
+ out << vcbT.getWord(phraseTarget->back());
}
}
-std::pair<PhrasePairGroup::Coll::iterator,bool> PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj )
-{
- std::pair<iterator,bool> ret = m_coll.insert(obj);
- if (ret.second) {
- // obj inserted. Also add to sorted vector
- const PhraseAlignmentCollection &insertedObj = *ret.first;
- m_sortedColl.push_back(&insertedObj);
- }
+// Builds the source-to-target view of a target-to-source alignment.
+// outSourceToTargetAlignment is emptied and then given one entry per source
+// symbol: phraseSource->size() entries, or one fewer when hierarchicalFlag is
+// set (presumably because the last symbol is the rule's root -- confirm).
+// Every alignment point (target t, source s) of the input is stored as t in
+// entry s of the output. phraseTarget is not used.
+void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget,
+                     const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment)
+{
-  return ret;
+  // start from one empty set of target positions per source symbol
+  outSourceToTargetAlignment->clear();
+  size_t sourceLength = phraseSource->size();
+  if (hierarchicalFlag) {
+    sourceLength -= 1;
+  }
+  outSourceToTargetAlignment->resize(sourceLength);
+
+  // mirror every alignment point
+  const size_t targetLength = inTargetToSourceAlignment->size();
+  for (size_t tgt = 0; tgt < targetLength; ++tgt) {
+    const std::set<size_t> &alignedSources = inTargetToSourceAlignment->at(tgt);
+    for (std::set<size_t>::const_iterator src = alignedSources.begin();
+         src != alignedSources.end(); ++src) {
+      // at() is kept so that a source position beyond the phrase is range-checked
+      outSourceToTargetAlignment->at(*src).insert(tgt);
+    }
+  }
 }
-