#include "DomainFeature.h" #include "ExtractionPhrasePair.h" #include "tables-core.h" #include "InputFileStream.h" #include "util/tokenize.hh" using namespace std; namespace MosesTraining { // handling of domain names: load database with sentence-id / domain name info void Domain::load( const std::string &domainFileName ) { Moses::InputFileStream fileS( domainFileName ); istream *fileP = &fileS; string line; while(getline(*fileP, line)) { // read const vector< string > domainSpecLine = util::tokenize( line ); int lineNumber; if (domainSpecLine.size() != 2 || ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) { std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl; exit(1); } // store const string &name = domainSpecLine[1]; spec.push_back( make_pair( lineNumber, name )); if (name2id.find( name ) == name2id.end()) { name2id[ name ] = list.size(); list.push_back( name ); } } } // get domain name based on sentence number string Domain::getDomainOfSentence( int sentenceId ) const { for(size_t i=0; i& denseValues, std::map& sparseValues) const { const map *domainCount = context.phrasePair.GetProperty(m_propertyKey); assert( domainCount != NULL ); add(*domainCount, context.phrasePair.GetCount(), context.maybeLog, denseValues, sparseValues); } void SubsetDomainFeature::add(const map& domainCount, float count, const MaybeLog& maybeLog, std::vector& denseValues, std::map& sparseValues) const { if (m_domain.list.size() > 6) { UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException, "too many domains for core domain subset features"); } size_t bitmap = 0; for(size_t bit = 0; bit < m_domain.list.size(); bit++) { if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) { bitmap += 1 << bit; } } for(size_t i = 1; i < (1 << m_domain.list.size()); i++) { denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 )); } } void SparseSubsetDomainFeature::add(const map& domainCount,float count, const MaybeLog& maybeLog, std::vector& denseValues, std::map& sparseValues) const { typedef vector::const_iterator I; ostringstream key; key << "doms"; for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) { if (domainCount.find(*i) != domainCount.end()) { key << "_" << *i; } } sparseValues[key.str()] = 1; } void RatioDomainFeature::add(const map& domainCount,float count, const MaybeLog& maybeLog, std::vector& denseValues, std::map& sparseValues) const { typedef vector< string >::const_iterator I; for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) { map::const_iterator dci = domainCount.find(*i); if (dci == domainCount.end() ) { denseValues.push_back(maybeLog( 1 )); } else { denseValues.push_back(maybeLog(exp( dci->second / count ) )); } } } void SparseRatioDomainFeature::add(const map& domainCount,float count, const MaybeLog& maybeLog, std::vector& denseValues, std::map& sparseValues) const { typedef map< string, float >::const_iterator I; for (I i=domainCount.begin(); i != domainCount.end(); i++) { sparseValues["domr_" + i->first] = (i->second / count); } } void IndicatorDomainFeature::add(const map& domainCount,float count, const MaybeLog& maybeLog, std::vector& denseValues, std::map& sparseValues) const { typedef vector< string >::const_iterator I; for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) { map::const_iterator dci = domainCount.find(*i); if (dci == domainCount.end() ) { denseValues.push_back(maybeLog( 1 )); } else { denseValues.push_back(maybeLog(2.718)); } } } void SparseIndicatorDomainFeature::add(const map& domainCount,float count, const MaybeLog& maybeLog, std::vector& denseValues, std::map& sparseValues) const { typedef map< string, float >::const_iterator I; for (I i=domainCount.begin(); i != domainCount.end(); i++) { sparseValues["dom_" + i->first] = 1; } } }