From 86ee3e15a441aec72eaebdd0389fa925da2316c7 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Wed, 29 Jan 2014 18:37:42 +0000 Subject: new version of the `score` tool which is now capable of dealing with additional properties in an appropriate manner --- phrase-extract/DomainFeature.cpp | 173 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 phrase-extract/DomainFeature.cpp (limited to 'phrase-extract/DomainFeature.cpp') diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp new file mode 100644 index 000000000..2f99a8709 --- /dev/null +++ b/phrase-extract/DomainFeature.cpp @@ -0,0 +1,173 @@ +#include "DomainFeature.h" +#include "ExtractionPhrasePair.h" +#include "tables-core.h" +#include "InputFileStream.h" +#include "SafeGetline.h" + +#define TABLE_LINE_MAX_LENGTH 1000 + +using namespace std; + +namespace MosesTraining +{ + +// handling of domain names: load database with sentence-id / domain name info +void Domain::load( const std::string &domainFileName ) +{ + Moses::InputFileStream fileS( domainFileName ); + istream *fileP = &fileS; + while(true) { + char line[TABLE_LINE_MAX_LENGTH]; + SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__); + if (fileP->eof()) break; + // read + vector< string > domainSpecLine = tokenize( line ); + int lineNumber; + if (domainSpecLine.size() != 2 || + ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) { + std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl; + exit(1); + } + // store + string &name = domainSpecLine[1]; + spec.push_back( make_pair( lineNumber, name )); + if (name2id.find( name ) == name2id.end()) { + name2id[ name ] = list.size(); + list.push_back( name ); + } + } +} + +// get domain name based on sentence number +string Domain::getDomainOfSentence( int sentenceId ) const +{ + for(size_t i=0; i& denseValues, + std::map& sparseValues) const +{ + const map *domainCount = context.phrasePair.GetProperty(m_propertyKey); + assert( domainCount != NULL ); + add(*domainCount, + context.phrasePair.GetCount(), + context.maybeLog, + denseValues, sparseValues); +} + +void SubsetDomainFeature::add(const map& domainCount, + float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const +{ + if (m_domain.list.size() > 6) { + UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException, + "too many domains for core domain subset features"); + } + size_t bitmap = 0; + for(size_t bit = 0; bit < m_domain.list.size(); bit++) { + if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) { + bitmap += 1 << bit; + } + } + for(size_t i = 1; i < (1 << m_domain.list.size()); i++) { + denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 )); + } +} + +void SparseSubsetDomainFeature::add(const map& domainCount,float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const +{ + typedef vector::const_iterator I; + ostringstream key; + key << "doms"; + for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) { + if (domainCount.find(*i) != domainCount.end()) { + key << "_" << *i; + } + } + sparseValues[key.str()] = 1; +} + + +void RatioDomainFeature::add(const map& domainCount,float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const +{ + typedef vector< string >::const_iterator I; + for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) { + map::const_iterator dci = domainCount.find(*i); + if (dci == domainCount.end() ) { + denseValues.push_back(maybeLog( 1 )); + } else { + denseValues.push_back(maybeLog(exp( dci->second / count ) )); + } + } +} + + +void SparseRatioDomainFeature::add(const map& domainCount,float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const +{ + typedef map< string, float >::const_iterator I; + for (I i=domainCount.begin(); i != domainCount.end(); i++) { + sparseValues["domr_" + i->first] = (i->second / count); + } +} + + +void IndicatorDomainFeature::add(const map& domainCount,float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const +{ + typedef vector< string >::const_iterator I; + for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) { + map::const_iterator dci = domainCount.find(*i); + if (dci == domainCount.end() ) { + denseValues.push_back(maybeLog( 1 )); + } else { + denseValues.push_back(maybeLog(2.718)); + } + } +} + +void SparseIndicatorDomainFeature::add(const map& domainCount,float count, + const MaybeLog& maybeLog, + std::vector& denseValues, + std::map& sparseValues) const +{ + typedef map< string, float >::const_iterator I; + for (I i=domainCount.begin(); i != domainCount.end(); i++) { + sparseValues["dom_" + i->first] = 1; + } +} + +} + -- cgit v1.2.3