github.com/moses-smt/mosesdecoder.git
author     Matthias Huck <huck@i6.informatik.rwth-aachen.de>  2015-02-25 03:11:31 +0300
committer  Matthias Huck <huck@i6.informatik.rwth-aachen.de>  2015-02-25 03:11:31 +0300
commit     3c8d48f8af7c65c4cc6cf731e6eee6dd6f8f29db (patch)
tree       72b089ec2d9283d7ff7062bd1a60c67f939f1f64 /moses
parent     28fbf07c37589c1cb118d83842f31c969cf5b228 (diff)
Model1Feature: a simple IBM Model 1 scorer,
source-to-target with global source-sentence context
Diffstat (limited to 'moses')
-rw-r--r--  moses/FF/Factory.cpp          2
-rw-r--r--  moses/FF/Model1Feature.cpp  209
-rw-r--r--  moses/FF/Model1Feature.h    102
3 files changed, 313 insertions, 0 deletions
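
What the new feature computes (a summary of EvaluateWithSourceContext in Model1Feature.cpp below): for every terminal target word t in a phrase, given the source sentence S = s_1 ... s_n, the feature adds the log of the IBM Model 1 translation probability of t given the whole source sentence,

    score(t | S) = log( p(t | NULL) + sum_{j=1..n} p(t | s_j) ) - log(n + 1)

where NULL is the GIZA-style empty word (GIZANULL) and the sentence-boundary markers are skipped in the sum, as noted in the code. TransformScore maps probabilities into the log domain, and lexical probabilities below the table's floor value (1e-7 by default) are clipped to that floor.
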
diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp
index 525808f98..1e8316cad 100644
--- a/moses/FF/Factory.cpp
+++ b/moses/FF/Factory.cpp
@@ -21,6 +21,7 @@
#include "moses/FF/SourceWordDeletionFeature.h"
#include "moses/FF/GlobalLexicalModel.h"
#include "moses/FF/GlobalLexicalModelUnlimited.h"
+#include "moses/FF/Model1Feature.h"
#include "moses/FF/UnknownWordPenaltyProducer.h"
#include "moses/FF/WordTranslationFeature.h"
#include "moses/FF/TargetBigramFeature.h"
@@ -202,6 +203,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(GlobalLexicalModel);
//MOSES_FNAME(GlobalLexicalModelUnlimited); This was commented out in the original
+ MOSES_FNAME(Model1Feature);
MOSES_FNAME(SourceWordDeletionFeature);
MOSES_FNAME(TargetWordInsertionFeature);
MOSES_FNAME(PhraseBoundaryFeature);
diff --git a/moses/FF/Model1Feature.cpp b/moses/FF/Model1Feature.cpp
new file mode 100644
index 000000000..d6b9a0a8b
--- /dev/null
+++ b/moses/FF/Model1Feature.cpp
@@ -0,0 +1,209 @@
+#include <assert.h>
+#include "util/exception.hh"
+#include "Model1Feature.h"
+#include "moses/StaticData.h"
+#include "moses/InputFileStream.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
+#include "moses/ChartManager.h"
+#include "moses/FactorCollection.h"
+
+
+using namespace std;
+
+namespace Moses
+{
+
+const std::string Model1Vocabulary::GIZANULL = "GIZANULL";
+
+Model1Vocabulary::Model1Vocabulary()
+{
+ FactorCollection &factorCollection = FactorCollection::Instance();
+ m_NULL = factorCollection.AddFactor(GIZANULL,false);
+ Store(m_NULL,0);
+}
+
+bool Model1Vocabulary::Store(const Factor* word, const unsigned id)
+{
+ boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
+ if ( iter != m_lookup.end() ) {
+ return false;
+ }
+ m_lookup[ word ] = id;
+ if ( m_vocab.size() <= id ) {
+ m_vocab.resize(id+1);
+ }
+ m_vocab[id] = word;
+ return true;
+}
+
+unsigned Model1Vocabulary::StoreIfNew(const Factor* word)
+{
+ boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
+
+ if ( iter != m_lookup.end() ) {
+ return iter->second;
+ }
+
+ unsigned id = m_vocab.size();
+ m_vocab.push_back( word );
+ m_lookup[ word ] = id;
+ return id;
+}
+
+unsigned Model1Vocabulary::GetWordID(const Factor* word) const
+{
+ boost::unordered_map<const Factor*, unsigned>::const_iterator iter = m_lookup.find( word );
+ if ( iter == m_lookup.end() ) {
+ return INVALID_ID;
+ }
+ return iter->second;
+}
+
+const Factor* Model1Vocabulary::GetWord(unsigned id) const
+{
+ if (id >= m_vocab.size()) {
+ return NULL;
+ }
+ return m_vocab[ id ];
+}
+
+void Model1Vocabulary::Load(const std::string& fileName)
+{
+ InputFileStream inFile(fileName);
+ FactorCollection &factorCollection = FactorCollection::Instance();
+ std::string line;
+
+ unsigned i = 0;
+ while ( getline(inFile, line) )
+ {
+ ++i;
+ std::vector<std::string> tokens = Tokenize(line);
+ UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
+ unsigned id = Scan<unsigned>(tokens[0]);
+ const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is known and filter the model on loading?
+ bool stored = Store(factor, id);
+ UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
+ }
+ inFile.Close();
+}
+
+
+void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT)
+{
+ InputFileStream inFile(fileName);
+ std::string line;
+
+ unsigned i = 0;
+ while ( getline(inFile, line) )
+ {
+ ++i;
+ std::vector<std::string> tokens = Tokenize(line);
+ UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
+ unsigned idS = Scan<unsigned>(tokens[0]);
+ unsigned idT = Scan<unsigned>(tokens[1]);
+ const Factor* wordS = vcbS.GetWord(idS);
+ const Factor* wordT = vcbT.GetWord(idT);
+ float prob = Scan<float>(tokens[2]);
+ if ( (wordS != NULL) && (wordT != NULL) ) {
+ m_ltable[ wordS ][ wordT ] = prob;
+ }
+ UTIL_THROW_IF2((wordS == NULL) || (wordT == NULL), "Line " << i << " in " << fileName << " has unknown vocabulary."); // TODO: can we assume that the vocabulary is known and filter the model on loading? Then remove this line.
+ }
+ inFile.Close();
+}
+
+// p( wordT | wordS )
+float Model1LexicalTable::GetProbability(const Factor* wordS, const Factor* wordT) const
+{
+ float prob = m_floor;
+
+ boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > >::const_iterator iter1 = m_ltable.find( wordS );
+
+ if ( iter1 != m_ltable.end() ) {
+ boost::unordered_map< const Factor*, float >::const_iterator iter2 = iter1->second.find( wordT );
+ if ( iter2 != iter1->second.end() ) {
+ prob = iter2->second;
+ if ( prob < m_floor ) {
+ prob = m_floor;
+ }
+ }
+ }
+ return prob;
+}
+
+
+Model1Feature::Model1Feature(const std::string &line)
+ : StatelessFeatureFunction(1, line)
+{
+ VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
+ ReadParameters();
+ VERBOSE(1, " Done.");
+}
+
+void Model1Feature::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "path") {
+ m_fileNameModel1 = value;
+ } else if (key == "sourceVocabulary") {
+ m_fileNameVcbS = value;
+ } else if (key == "targetVocabulary") {
+ m_fileNameVcbT = value;
+ } else {
+ StatelessFeatureFunction::SetParameter(key, value);
+ }
+}
+
+void Model1Feature::Load()
+{
+ FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading source vocabulary from file " << m_fileNameVcbS << " ...");
+ Model1Vocabulary vcbS;
+ vcbS.Load(m_fileNameVcbS);
+ FEATUREVERBOSE2(2, " Done." << std::endl);
+ FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading target vocabulary from file " << m_fileNameVcbT << " ...");
+ Model1Vocabulary vcbT;
+ vcbT.Load(m_fileNameVcbT);
+ FEATUREVERBOSE2(2, " Done." << std::endl);
+ FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading model 1 lexical translation table from file " << m_fileNameModel1 << " ...");
+ m_model1.Load(m_fileNameModel1,vcbS,vcbT);
+ FEATUREVERBOSE2(2, " Done." << std::endl);
+ FactorCollection &factorCollection = FactorCollection::Instance();
+ m_emptyWord = factorCollection.GetFactor(Model1Vocabulary::GIZANULL,false);
+ UTIL_THROW_IF2(m_emptyWord==NULL, GetScoreProducerDescription()
+ << ": Factor for GIZA empty word does not exist.");
+}
+
+void Model1Feature::EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore) const
+{
+ const Sentence& sentence = static_cast<const Sentence&>(input);
+ float score = 0.0;
+ float norm = TransformScore(1+sentence.GetSize());
+
+ for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT)
+ {
+ const Word &wordT = targetPhrase.GetWord(posT);
+ if ( !wordT.IsNonTerminal() )
+ {
+ float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]); // probability conditioned on empty word
+ for (size_t posS=1; posS<sentence.GetSize()-1; ++posS) // ignore <s> and </s>
+ {
+ const Word &wordS = sentence.GetWord(posS);
+ float modelProb = m_model1.GetProbability(wordS[0],wordT[0]);
+ FEATUREVERBOSE(3, "p( " << wordT << " | " << wordS << " ) = " << modelProb << std::endl);
+ thisWordProb += modelProb;
+ }
+ score += TransformScore(thisWordProb) - norm;
+ }
+ }
+
+ scoreBreakdown.PlusEquals(this, score);
+}
+
+}
+
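
A note on the expected file formats (inferred from the two Load() routines above; the commit itself does not document them): all three input files are plain text with three whitespace-separated tokens per line. Model1Vocabulary::Load reads "id word <third column>", where the third column is not used (presumably a frequency, as in GIZA++ .vcb files), and id 0 is already reserved for GIZANULL by the constructor. Model1LexicalTable::Load reads "sourceId targetId probability". Purely illustrative example lines, with hypothetical file names:

    source.vcb:  2 haus 1498
    target.vcb:  7 house 1722
    lex.model1:  2 7 0.8431

Duplicate ids in a vocabulary file, and lexical-table entries whose ids are missing from the vocabularies, both trigger UTIL_THROW_IF2.
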
diff --git a/moses/FF/Model1Feature.h b/moses/FF/Model1Feature.h
new file mode 100644
index 000000000..7df941e7a
--- /dev/null
+++ b/moses/FF/Model1Feature.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <string>
+#include <limits>
+#include <boost/unordered_map.hpp>
+#include "StatelessFeatureFunction.h"
+#include "FFState.h"
+#include "moses/Factor.h"
+
+namespace Moses
+{
+
+class Model1Vocabulary
+{
+public:
+
+ #define INVALID_ID std::numeric_limits<unsigned>::max() // UINT_MAX
+ static const std::string GIZANULL;
+
+ Model1Vocabulary();
+ bool Store(const Factor* word, const unsigned id);
+ unsigned StoreIfNew(const Factor* word);
+ unsigned GetWordID(const Factor* word) const;
+ const Factor* GetWord(unsigned id) const;
+ void Load(const std::string& fileName);
+
+protected:
+ boost::unordered_map<const Factor*, unsigned> m_lookup;
+ std::vector< const Factor* > m_vocab;
+ const Factor* m_NULL;
+};
+
+
+class Model1LexicalTable
+{
+public:
+ Model1LexicalTable(float floor=1e-7) : m_floor(floor)
+ {}
+
+ void Load(const std::string& fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT);
+
+ // p( wordT | wordS )
+ float GetProbability(const Factor* wordS, const Factor* wordT) const;
+
+protected:
+ boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > > m_ltable;
+ const float m_floor;
+};
+
+
+
+class Model1Feature : public StatelessFeatureFunction
+{
+public:
+ Model1Feature(const std::string &line);
+
+ bool IsUseable(const FactorMask &mask) const {
+ return true;
+ }
+
+ void SetParameter(const std::string& key, const std::string& value);
+
+ void EvaluateInIsolation(const Phrase &source
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
+ {};
+
+ void EvaluateWithSourceContext(const InputType &input
+ , const InputPath &inputPath
+ , const TargetPhrase &targetPhrase
+ , const StackVec *stackVec
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+ void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+ , const TranslationOptionList &translationOptionList) const
+ {}
+
+ void EvaluateWhenApplied(
+ const Hypothesis& cur_hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+ void EvaluateWhenApplied(
+ const ChartHypothesis& cur_hypo,
+ ScoreComponentCollection* accumulator) const
+ {}
+
+private:
+ std::string m_fileNameVcbS;
+ std::string m_fileNameVcbT;
+ std::string m_fileNameModel1;
+ Model1LexicalTable m_model1;
+ const Factor* m_emptyWord;
+
+ void Load();
+};
+
+
+}
+
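
For completeness, a minimal sketch of how the feature might be switched on in moses.ini, assuming it is registered under the name Model1Feature (via MOSES_FNAME above) and using the keys accepted by SetParameter; the file names and the weight are placeholders:

    [feature]
    Model1Feature name=Model1_0 path=lex.model1 sourceVocabulary=source.vcb targetVocabulary=target.vcb

    [weight]
    Model1_0= 0.05

Since the constructor passes 1 to StatelessFeatureFunction, the feature produces a single dense score and therefore needs exactly one weight.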