Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2014-01-29 22:37:42 +0400
committer Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2014-01-29 22:37:42 +0400
commit 86ee3e15a441aec72eaebdd0389fa925da2316c7 (patch)
tree 6de24e964968820fb708a2e37a935c40a2a1494e /phrase-extract/DomainFeature.cpp
parent ffd62e994ecb88358b5f3aa835f84d441ec58c77 (diff)
new version of the `score` tool
which is now capable of dealing with additional properties in an appropriate manner
Diffstat (limited to 'phrase-extract/DomainFeature.cpp')
-rw-r--r-- phrase-extract/DomainFeature.cpp 173
1 files changed, 173 insertions, 0 deletions
diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp
new file mode 100644
index 000000000..2f99a8709
--- /dev/null
+++ b/phrase-extract/DomainFeature.cpp
@@ -0,0 +1,173 @@
+#include "DomainFeature.h"
+#include "ExtractionPhrasePair.h"
+#include "tables-core.h"
+#include "InputFileStream.h"
+#include "SafeGetline.h"
+
+#define TABLE_LINE_MAX_LENGTH 1000
+
+using namespace std;
+
+namespace MosesTraining
+{
+
+// Handling of domain names: load the database with sentence-id / domain-name
+// info from the file domainFileName.
+//
+// Every line must consist of exactly two tokens (as split by tokenize()):
+// an integer sentence id followed by a domain name. Each (id, name) pair is
+// appended to `spec`; a name seen for the first time is additionally
+// appended to `list` and mapped in `name2id` to its index in `list`.
+//
+// If the stream cannot be read at all, or a line is malformed, an error is
+// written to stderr and the process exits with status 1.
+void Domain::load( const std::string &domainFileName )
+{
+  Moses::InputFileStream fileS( domainFileName );
+  istream *fileP = &fileS;
+  // a stream that is already in a failed state can never deliver data;
+  // report that instead of silently loading nothing
+  if (fileP->fail()) {
+    std::cerr << "ERROR: could not read domain file '" << domainFileName << "'" << endl;
+    exit(1);
+  }
+  // std::getline imposes no fixed line-length limit and still delivers a
+  // final line that is not terminated by '\n'
+  string line;
+  while (getline(*fileP, line)) {
+    // read
+    vector< string > domainSpecLine = tokenize( line.c_str() );
+    int lineNumber = 0;
+    // sscanf returns the number of converted items, or EOF (which is
+    // non-zero) on input failure, so demand exactly one conversion
+    if (domainSpecLine.size() != 2 ||
+        sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber) != 1) {
+      std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
+      exit(1);
+    }
+    // store
+    string &name = domainSpecLine[1];
+    spec.push_back( make_pair( lineNumber, name ));
+    if (name2id.find( name ) == name2id.end()) {
+      // a new domain gets the next free index in `list`
+      name2id[ name ] = list.size();
+      list.push_back( name );
+    }
+  }
+}
+
+// Get the domain name for a sentence number: scans `spec` in stored order
+// and returns the name of the first entry whose sentence id is not smaller
+// than sentenceId. Returns "undefined" when no entry qualifies.
+string Domain::getDomainOfSentence( int sentenceId ) const
+{
+  const size_t numEntries = spec.size();
+  size_t pos = 0;
+  // skip every entry that ends before the requested sentence
+  while (pos < numEntries && !(sentenceId <= spec[pos].first)) {
+    ++pos;
+  }
+  if (pos == numEntries) {
+    return "undefined";
+  }
+  return spec[pos].second;
+}
+
+// Set up the feature: the property key is fixed to "domain" and the domain
+// database is loaded from domainFile during construction.
+DomainFeature::DomainFeature(const string& domainFile)
+  : m_propertyKey("domain")
+{
+  m_domain.load(domainFile);
+}
+
+// Attach the domain of sentence sentenceId to phrasePair: the domain name
+// is looked up in m_domain and passed, together with count, to
+// phrasePair.AddProperty() under the key m_propertyKey.
+void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+                                              float count,
+                                              int sentenceId) const
+{
+  std::string domainName( m_domain.getDomainOfSentence(sentenceId) );
+  phrasePair.AddProperty(m_propertyKey, domainName, count);
+}
+
+// Entry point taking a score-feature context: fetches the property stored
+// under m_propertyKey from the context's phrase pair and forwards it, along
+// with the phrase pair count and the context's maybeLog, to the five-argument
+// add() overload. The property is asserted to be present.
+void DomainFeature::add(const ScoreFeatureContext& context,
+                        std::vector<float>& denseValues,
+                        std::map<std::string,float>& sparseValues) const
+{
+  const map<string,float> *domainCount =
+    context.phrasePair.GetProperty(m_propertyKey);
+  assert( domainCount != NULL );
+
+  add(*domainCount, context.phrasePair.GetCount(), context.maybeLog,
+      denseValues, sparseValues);
+}
+
+// Dense subset feature: appends one value to denseValues for every non-empty
+// subset of the domain list, i.e. 2^n - 1 values for n domains.
+//
+// The domains occurring as keys in domainCount are encoded as a bitmap (bit k
+// set <=> m_domain.list[k] is present). The subset whose index equals that
+// bitmap gets maybeLog(2.718), every other subset gets maybeLog(1).
+// (2.718 presumably approximates e -- TODO confirm against MaybeLog.)
+//
+// Throws ScoreFeatureArgumentException (through UTIL_THROW_IF) when more than
+// 6 domains are configured. `count` and `sparseValues` are not used.
+void SubsetDomainFeature::add(const map<string,float>& domainCount,
+                              float count,
+                              const MaybeLog& maybeLog,
+                              std::vector<float>& denseValues,
+                              std::map<std::string,float>& sparseValues) const
+{
+  // UTIL_THROW_IF already tests its condition; no surrounding `if` is needed
+  UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
+                "too many domains for core domain subset features");
+
+  const size_t numDomains = m_domain.list.size();
+  size_t bitmap = 0;
+  for (size_t bit = 0; bit < numDomains; ++bit) {
+    if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {
+      bitmap |= static_cast<size_t>(1) << bit;
+    }
+  }
+
+  // computed once, in size_t, so the loop compares like-signed values
+  const size_t numSubsets = static_cast<size_t>(1) << numDomains;
+  for (size_t i = 1; i < numSubsets; ++i) {
+    denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
+  }
+}
+
+// Sparse subset feature: builds one feature name "doms" followed by "_<name>"
+// for every domain of m_domain.list (in list order) that occurs as a key in
+// domainCount, and sets that entry of sparseValues to 1.
+// `count`, `maybeLog` and `denseValues` are not used.
+void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
+                                    const MaybeLog& maybeLog,
+                                    std::vector<float>& denseValues,
+                                    std::map<std::string,float>& sparseValues) const
+{
+  string featureName("doms");
+  const size_t numDomains = m_domain.list.size();
+  for (size_t d = 0; d < numDomains; ++d) {
+    const string &domainName = m_domain.list[d];
+    if (domainCount.count(domainName) == 0) {
+      continue;  // domain not observed for this phrase pair
+    }
+    featureName += "_";
+    featureName += domainName;
+  }
+  sparseValues[featureName] = 1;
+}
+
+
+// Dense ratio feature: appends one value per domain of m_domain.list, in list
+// order. A domain present in domainCount contributes
+// maybeLog(exp(domainCount[domain] / count)); an absent one contributes
+// maybeLog(1). `sparseValues` is not used.
+void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
+                             const MaybeLog& maybeLog,
+                             std::vector<float>& denseValues,
+                             std::map<std::string,float>& sparseValues) const
+{
+  const map<string,float>::const_iterator notFound = domainCount.end();
+  for (size_t d = 0; d < m_domain.list.size(); ++d) {
+    const map<string,float>::const_iterator hit = domainCount.find(m_domain.list[d]);
+    if (hit != notFound) {
+      denseValues.push_back(maybeLog(exp( hit->second / count ) ));
+    } else {
+      denseValues.push_back(maybeLog( 1 ));
+    }
+  }
+}
+
+
+// Sparse ratio feature: for every (domain, value) entry of domainCount, sets
+// sparseValues["domr_" + domain] to value / count.
+// `maybeLog` and `denseValues` are not used.
+void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
+                                   const MaybeLog& maybeLog,
+                                   std::vector<float>& denseValues,
+                                   std::map<std::string,float>& sparseValues) const
+{
+  map< string, float >::const_iterator entry = domainCount.begin();
+  while (entry != domainCount.end()) {
+    const string featureName = "domr_" + entry->first;
+    sparseValues[featureName] = (entry->second / count);
+    ++entry;
+  }
+}
+
+
+// Dense indicator feature: appends one value per domain of m_domain.list, in
+// list order: maybeLog(2.718) if the domain occurs as a key in domainCount,
+// maybeLog(1) otherwise. `count` and `sparseValues` are not used.
+void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
+                                 const MaybeLog& maybeLog,
+                                 std::vector<float>& denseValues,
+                                 std::map<std::string,float>& sparseValues) const
+{
+  typedef vector< string >::const_iterator DomainIter;
+  const DomainIter last = m_domain.list.end();
+  for (DomainIter dom = m_domain.list.begin(); dom != last; ++dom) {
+    const bool present = domainCount.count(*dom) > 0;
+    if (present) {
+      denseValues.push_back(maybeLog(2.718));
+    } else {
+      denseValues.push_back(maybeLog( 1 ));
+    }
+  }
+}
+
+// Sparse indicator feature: for every domain that occurs as a key in
+// domainCount, sets sparseValues["dom_" + domain] to 1.
+// `count`, `maybeLog` and `denseValues` are not used.
+void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
+                                       const MaybeLog& maybeLog,
+                                       std::vector<float>& denseValues,
+                                       std::map<std::string,float>& sparseValues) const
+{
+  map< string, float >::const_iterator entry = domainCount.begin();
+  while (entry != domainCount.end()) {
+    const string featureName = "dom_" + entry->first;
+    sparseValues[featureName] = 1;
+    ++entry;
+  }
+}
+
+}
+