github.com/moses-smt/mosesdecoder.git
author     Matthias Huck <huck@i6.informatik.rwth-aachen.de>  2015-02-25 03:11:31 +0300
committer  Matthias Huck <huck@i6.informatik.rwth-aachen.de>  2015-02-25 03:11:31 +0300
commit     3c8d48f8af7c65c4cc6cf731e6eee6dd6f8f29db (patch)
tree       72b089ec2d9283d7ff7062bd1a60c67f939f1f64 /moses
parent     28fbf07c37589c1cb118d83842f31c969cf5b228 (diff)
Model1Feature: a simple IBM Model 1 scorer,
source-to-target with global source-sentence context
Diffstat (limited to 'moses')
-rw-r--r--  moses/FF/Factory.cpp          2
-rw-r--r--  moses/FF/Model1Feature.cpp  209
-rw-r--r--  moses/FF/Model1Feature.h    102
3 files changed, 313 insertions, 0 deletions
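
What the new feature computes (a summary of EvaluateWithSourceContext in Model1Feature.cpp below): for every terminal target word t in a phrase, given the source sentence S = s_1 ... s_n, the feature adds the log of the IBM Model 1 translation probability of t given the whole source sentence,

    score(t | S) = log( p(t | NULL) + sum_{j=1..n} p(t | s_j) ) - log(n + 1)

where NULL is the GIZA-style empty word (GIZANULL) and the sentence-boundary markers are skipped in the sum, as noted in the code. TransformScore maps probabilities into the log domain, and lexical probabilities below the table's floor value (1e-7 by default) are clipped to that floor.
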
diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp
index 525808f98..1e8316cad 100644
--- a/moses/FF/Factory.cpp
+++ b/moses/FF/Factory.cpp
@@ -21,6 +21,7 @@
#include "moses/FF/SourceWordDeletionFeature.h"
#include "moses/FF/GlobalLexicalModel.h"
#include "moses/FF/GlobalLexicalModelUnlimited.h"
+#include "moses/FF/Model1Feature.h"
#include "moses/FF/UnknownWordPenaltyProducer.h"
#include "moses/FF/WordTranslationFeature.h"
#include "moses/FF/TargetBigramFeature.h"
@@ -202,6 +203,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(GlobalLexicalModel);
//MOSES_FNAME(GlobalLexicalModelUnlimited); This was commented out in the original
+ MOSES_FNAME(Model1Feature);
MOSES_FNAME(SourceWordDeletionFeature);
MOSES_FNAME(TargetWordInsertionFeature);
MOSES_FNAME(PhraseBoundaryFeature);
diff --git a/moses/FF/Model1Feature.cpp b/moses/FF/Model1Feature.cpp
new file mode 100644
index 000000000..d6b9a0a8b
--- /dev/null
+++ b/moses/FF/Model1Feature.cpp
@@ -0,0 +1,209 @@
+#include <assert.h>
+#include "util/exception.hh"
+#include "Model1Feature.h"
+#include "moses/StaticData.h"
+#include "moses/InputFileStream.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
+#include "moses/ChartManager.h"
+#include "moses/FactorCollection.h"
+
+
+using namespace std;
+
+namespace Moses
+{
+
+const std::string Model1Vocabulary::GIZANULL = "GIZANULL";
+
+Model1Vocabulary::Model1Vocabulary()
+{
+ FactorCollection &factorCollection = FactorCollection::Instance();
+ m_NULL = factorCollection.AddFactor(GIZANULL,false);
+ Store(m_NULL,0);
+}
+
+bool Model1Vocabulary::Store(const Factor* word, const unsigned id)
+{
+ boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
+ if ( iter != m_lookup.end() ) {
+ return false;
+ }
+ m_lookup[ word ] = id;
+ if ( m_vocab.size() <= id ) {
+ m_vocab.resize(id+1);
+ }
+ m_vocab[id] = word;
+ return true;
+}
+
+unsigned Model1Vocabulary::StoreIfNew(const Factor* word)
+{
+ boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
+
+ if ( iter != m_lookup.end() ) {
+ return iter->second;
+ }
+
+ unsigned id = m_vocab.size();
+ m_vocab.push_back( word );
+ m_lookup[ word ] = id;
+ return id;
+}
+
+unsigned Model1Vocabulary::GetWordID(const Factor* word) const
+{
+ boost::unordered_map<const Factor*, unsigned>::const_iterator iter = m_lookup.find( word );
+ if ( iter == m_lookup.end() ) {
+ return INVALID_ID;
+ }
+ return iter->second;
+}
+
+const Factor* Model1Vocabulary::GetWord(unsigned id) const
+{
+ if (id >= m_vocab.size()) {
+ return NULL;
+ }
+ return m_vocab[ id ];
+}
+
+void Model1Vocabulary::Load(const std::string& fileName)
+{
+ InputFileStream inFile(fileName);
+ FactorCollection &factorCollection = FactorCollection::Instance();
+ std::string line;
+
+ unsigned i = 0;
+ while ( getline(inFile, line) )
+ {
+ ++i;
+ std::vector<std::string> tokens = Tokenize(line);
+ UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
+ unsigned id = Scan<unsigned>(tokens[0]);
+ const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is known and filter the model on loading?
+ bool stored = Store(factor, id);
+ UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
+ }
+ inFile.Close();
+}
+
+
+void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT)
+{
+ InputFileStream inFile(fileName);
+ std::string line;
+
+ unsigned i = 0;
+ while ( getline(inFile, line) )
+ {
+ ++i;
+ std::vector<std::string> tokens = Tokenize(line);
+ UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
+ unsigned idS = Scan<unsigned>(tokens[0]);
+ unsigned idT = Scan<unsigned>(tokens[1]);
+ const Factor* wordS = vcbS.GetWord(idS);
+ const Factor* wordT = vcbT.GetWord(idT);
+ float prob = Scan<float>(tokens[2]);
+ if ( (wordS != NULL) && (wordT != NULL) ) {
+ m_ltable[ wordS ][ wordT ] = prob;
+ }
+ UTIL_THROW_IF2((wordS == NULL) || (wordT == NULL), "Line " << i << " in " << fileName << " has unknown vocabulary."); // TODO: can we assume that the vocabulary is known and filter the model on loading? Then remove this line.
+ }
+ inFile.Close();
+}
+
+// p( wordT | wordS )
+float Model1LexicalTable::GetProbability(const Factor* wordS, const Factor* wordT) const
+{
+ float prob = m_floor;
+
+ boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > >::const_iterator iter1 = m_ltable.find( wordS );
+
+ if ( iter1 != m_ltable.end() ) {
+ boost::unordered_map< const Factor*, float >::const_iterator iter2 = iter1->second.find( wordT );
+ if ( iter2 != iter1->second.end() ) {
+ prob = iter2->second;
+ if ( prob < m_floor ) {
+ prob = m_floor;
+ }
+ }
+ }
+ return prob;
+}
+
+
+Model1Feature::Model1Feature(const std::string &line)
+ : StatelessFeatureFunction(1, line)
+{
+ VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
+ ReadParameters();
+ VERBOSE(1, " Done.");
+}
+
+void Model1Feature::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "path") {
+ m_fileNameModel1 = value;
+ } else if (key == "sourceVocabulary") {
+ m_fileNameVcbS = value;
+ } else if (key == "targetVocabulary") {
+ m_fileNameVcbT = value;
+ } else {
+ StatelessFeatureFunction::SetParameter(key, value);
+ }
+}
+
+void Model1Feature::Load()
+{
+ FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading source vocabulary from file " << m_fileNameVcbS << " ...");
+ Model1Vocabulary vcbS;
+ vcbS.Load(m_fileNameVcbS);
+ FEATUREVERBOSE2(2, " Done." << std::endl);
+ FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading target vocabulary from file " << m_fileNameVcbT << " ...");
+ Model1Vocabulary vcbT;
+ vcbT.Load(m_fileNameVcbT);
+ FEATUREVERBOSE2(2, " Done." << std::endl);
+ FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading model 1 lexical translation table from file " << m_fileNameModel1 << " ...");
+ m_model1.Load(m_fileNameModel1,vcbS,vcbT);
+ FEATUREVERBOSE2(2, " Done." << std::endl);
+ FactorCollection &factorCollection = FactorCollection::Instance();
+ m_emptyWord = factorCollection.GetFactor(Model1Vocabulary::GIZANULL,false);
+ UTIL_THROW_IF2(m_emptyWord==NULL, GetScoreProducerDescription()
+ << ": Factor for GIZA empty word does not exist.");
+}
+
+void Model1Feature::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
+{
+ const Sentence& sentence = static_cast<const Sentence&>(input);
+ float score = 0.0;
+ float norm = TransformScore(1+sentence.GetSize());
+
+ for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT)
+ {
+ const Word &wordT = targetPhrase.GetWord(posT);
+ if ( !wordT.IsNonTerminal() )
+ {
+ float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]); // probability conditioned on empty word
+ for (size_t posS=1; posS<sentence.GetSize()-1; ++posS) // ignore <s> and </s>
+ {
+ const Word &wordS = sentence.GetWord(posS);
+ float modelProb = m_model1.GetProbability(wordS[0],wordT[0]);
+ FEATUREVERBOSE(3, "p( " << wordT << " | " << wordS << " ) = " << modelProb << std::endl);
+ thisWordProb += modelProb;
+ }
+ score += TransformScore(thisWordProb) - norm;
+ }
+ }
+
+ scoreBreakdown.PlusEquals(this, score);
+}
+
+}
+
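
A note on the expected file formats (inferred from the two Load() routines above; the commit itself does not document them): all three input files are plain text with three whitespace-separated tokens per line. Model1Vocabulary::Load reads "id word <third column>", where the third column is not used (presumably a frequency, as in GIZA++ .vcb files), and id 0 is already reserved for GIZANULL by the constructor. Model1LexicalTable::Load reads "sourceId targetId probability". Purely illustrative example lines, with hypothetical file names:

    source.vcb:  2 haus 1498
    target.vcb:  7 house 1722
    lex.model1:  2 7 0.8431

Duplicate ids in a vocabulary file, and lexical-table entries whose ids are missing from the vocabularies, both trigger UTIL_THROW_IF2.
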
diff --git a/moses/FF/Model1Feature.h b/moses/FF/Model1Feature.h
new file mode 100644
index 000000000..7df941e7a
--- /dev/null
+++ b/moses/FF/Model1Feature.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <string>
+#include <limits>
+#include <boost/unordered_map.hpp>
+#include "StatelessFeatureFunction.h"
+#include "FFState.h"
+#include "moses/Factor.h"
+
+namespace Moses
+{
+
+class Model1Vocabulary
+{
+public:
+
+ #define INVALID_ID std::numeric_limits<unsigned>::max() // UINT_MAX
+ static const std::string GIZANULL;
+
+ Model1Vocabulary();
+ bool Store(const Factor* word, const unsigned id);
+ unsigned StoreIfNew(const Factor* word);
+ unsigned GetWordID(const Factor* word) const;
+ const Factor* GetWord(unsigned id) const;
+ void Load(const std::string& fileName);
+
+protected:
+ boost::unordered_map<const Factor*, unsigned> m_lookup;
+ std::vector< const Factor* > m_vocab;
+ const Factor* m_NULL;
+};
+
+
+class Model1LexicalTable
+{
+public:
+ Model1LexicalTable(float floor=1e-7) : m_floor(floor)
+ {}
+
+ void Load(const std::string& fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT);
+
+ // p( wordT | wordS )
+ float GetProbability(const Factor* wordS, const Factor* wordT) const;
+
+protected:
+ boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > > m_ltable;
+ const float m_floor;
+};
+
+
+
+class Model1Feature : public StatelessFeatureFunction
+{
+public:
+ Model1Feature(const std::string &line);
+
+ bool IsUseable(const FactorMask &mask) const {
+ return true;
+ }
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+ {};
+
+ void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+ , const TranslationOptionList &translationOptionList) const
+ {}
+
+ void EvaluateWhenApplied(
+ const Hypothesis& cur_hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ void EvaluateWhenApplied(
+ const ChartHypothesis& cur_hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+private:
+ std::string m_fileNameVcbS;
+ std::string m_fileNameVcbT;
+ std::string m_fileNameModel1;
+ Model1LexicalTable m_model1;
+ const Factor* m_emptyWord;
+
+ void Load();
+};
+
+
+}
+
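
For completeness, a minimal sketch of how the feature might be switched on in moses.ini, assuming it is registered under the name Model1Feature (via MOSES_FNAME above) and using the keys accepted by SetParameter; the file names and the weight are placeholders:

    [feature]
    Model1Feature name=Model1_0 path=lex.model1 sourceVocabulary=source.vcb targetVocabulary=target.vcb

    [weight]
    Model1_0= 0.05

Since the constructor passes 1 to StatelessFeatureFunction, the feature produces a single dense score and therefore needs exactly one weight.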