Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/moses
diff options
context:
space:
mode:
authorheafield <heafield@1f5c12ca-751b-0410-a591-d2e778427230>2011-10-13 16:33:05 +0400
committerheafield <heafield@1f5c12ca-751b-0410-a591-d2e778427230>2011-10-13 16:33:05 +0400
commitf08424840530575aaf349fc01397a07678b8cbf5 (patch)
tree92e05ed9bb3473d542b4a798738dc0179c3d25f8 /moses
parent7d9bc523a6a5f3151254d8bf95d99e3307394173 (diff)
Cut the middle men out of the language model interface.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4348 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'moses')
-rw-r--r--moses/src/Hypothesis.h1
-rw-r--r--moses/src/LanguageModel.cpp56
-rw-r--r--moses/src/LanguageModel.h90
-rw-r--r--moses/src/LanguageModelFactory.cpp23
-rw-r--r--moses/src/LanguageModelImplementation.h71
-rw-r--r--moses/src/LanguageModelKen.cpp307
-rw-r--r--moses/src/LanguageModelKen.h30
-rw-r--r--moses/src/StaticData.cpp2
8 files changed, 242 insertions, 338 deletions
diff --git a/moses/src/Hypothesis.h b/moses/src/Hypothesis.h
index 66a7fe86a..d05a17b8d 100644
--- a/moses/src/Hypothesis.h
+++ b/moses/src/Hypothesis.h
@@ -32,7 +32,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Phrase.h"
#include "PhraseDictionaryMemory.h"
#include "GenerationDictionary.h"
-#include "LanguageModelSingleFactor.h"
#include "ScoreComponentCollection.h"
#include "InputType.h"
#include "ObjectPool.h"
diff --git a/moses/src/LanguageModel.cpp b/moses/src/LanguageModel.cpp
index 336d214fc..d3b031268 100644
--- a/moses/src/LanguageModel.cpp
+++ b/moses/src/LanguageModel.cpp
@@ -19,15 +19,7 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#include <cassert>
-#include <limits>
-#include <iostream>
-#include <memory>
-#include <sstream>
-
-#include "FFState.h"
#include "LanguageModel.h"
-#include "LanguageModelImplementation.h"
#include "TypeDef.h"
#include "Util.h"
#include "Manager.h"
@@ -38,41 +30,20 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
-namespace Moses
-{
-LanguageModel::LanguageModel(ScoreIndexManager &scoreIndexManager, LanguageModelImplementation *implementation) :
- m_implementation(implementation)
-{
+namespace Moses {
+
+LanguageModel::LanguageModel() {
m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
- scoreIndexManager.AddScoreProducer(this);
-#ifndef WITH_THREADS
- // ref counting handled by boost otherwise
- m_implementation->IncrementReferenceCount();
-#endif
}
-LanguageModel::LanguageModel(ScoreIndexManager &scoreIndexManager, LanguageModel *loadedLM) :
- m_implementation(loadedLM->m_implementation)
-{
- m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
+void LanguageModel::Init(ScoreIndexManager &scoreIndexManager) {
scoreIndexManager.AddScoreProducer(this);
-#ifndef WITH_THREADS
- // ref counting handled by boost otherwise
- m_implementation->IncrementReferenceCount();
-#endif
}
-LanguageModel::~LanguageModel()
-{
-#ifndef WITH_THREADS
- if(m_implementation->DecrementReferenceCount() == 0)
- delete m_implementation;
-#endif
-}
+LanguageModel::~LanguageModel() {}
// don't inline virtual funcs...
-size_t LanguageModel::GetNumScoreComponents() const
-{
+size_t LanguageModel::GetNumScoreComponents() const {
if (m_enableOOVFeature) {
return 2;
} else {
@@ -80,26 +51,17 @@ size_t LanguageModel::GetNumScoreComponents() const
}
}
-float LanguageModel::GetWeight() const
-{
+float LanguageModel::GetWeight() const {
size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
GetBeginIndex(GetScoreBookkeepingID());
return StaticData::Instance().GetAllWeights()[lmIndex];
}
-float LanguageModel::GetOOVWeight() const
-{
+float LanguageModel::GetOOVWeight() const {
if (!m_enableOOVFeature) return 0;
size_t lmIndex = StaticData::Instance().GetScoreIndexManager().
GetBeginIndex(GetScoreBookkeepingID());
return StaticData::Instance().GetAllWeights()[lmIndex+1];
-
-}
-
-const FFState* LanguageModel::EmptyHypothesisState(const InputType &/*input*/) const
-{
- // This is actually correct. The empty _hypothesis_ has <s> in it. Phrases use m_emptyContextState.
- return m_implementation->NewState(m_implementation->GetBeginSentenceState());
}
-}
+} // namespace Moses
diff --git a/moses/src/LanguageModel.h b/moses/src/LanguageModel.h
index db42d1896..41472a3b8 100644
--- a/moses/src/LanguageModel.h
+++ b/moses/src/LanguageModel.h
@@ -29,11 +29,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Util.h"
#include "FeatureFunction.h"
#include "Word.h"
-#include "LanguageModelImplementation.h"
-
-#ifdef WITH_THREADS
-#include <boost/shared_ptr.hpp>
-#endif
namespace Moses
{
@@ -43,56 +38,24 @@ class Factor;
class Phrase;
//! Abstract base class which represent a language model on a contiguous phrase
-class LanguageModel : public StatefulFeatureFunction
-{
+class LanguageModel : public StatefulFeatureFunction {
protected:
-#ifdef WITH_THREADS
- // if we have threads, we also have boost and can let it handle thread-safe reference counting
- boost::shared_ptr<LanguageModelImplementation> m_implementation;
-#else
- LanguageModelImplementation *m_implementation;
-#endif
+ LanguageModel();
+
+  // This can't be in the constructor for virtual function dispatch reasons
+ void Init(ScoreIndexManager &scoreIndexManager);
+
bool m_enableOOVFeature;
public:
- /**
- * Create a new language model
- */
- LanguageModel(ScoreIndexManager &scoreIndexManager, LanguageModelImplementation *implementation);
-
- /**
- * Create a new language model reusing an already loaded implementation
- */
- LanguageModel(ScoreIndexManager &scoreIndexManager, LanguageModel *implementation);
-
virtual ~LanguageModel();
+ // Make another feature without copying the underlying model data.
+ virtual LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const = 0;
+
//! see ScoreProducer.h
size_t GetNumScoreComponents() const;
- /* whether this LM can be used on a particular phrase.
- * Should return false if phrase size = 0 or factor types required don't exists
- */
- bool Useable(const Phrase &phrase) const {
- return m_implementation->Useable(phrase);
- }
-
- /* calc total unweighted LM score of this phrase and return score via arguments.
- * Return scores should always be in natural log, regardless of representation with LM implementation.
- * Uses GetValue() of inherited class.
- * Useable() should be called beforehand on the phrase
- * \param fullScore scores of all unigram, bigram... of contiguous n-gram of the phrase
- * \param ngramScore score of only n-gram of order m_nGramOrder
- * \param oovCount number of LM OOVs
- */
- void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
- return m_implementation->CalcScore(phrase, fullScore, ngramScore, oovCount);
- }
-
- virtual std::string GetScoreProducerDescription(unsigned idx=0) const {
- return m_implementation->GetScoreProducerDescription(idx);
- }
-
bool OOVFeatureEnabled() const {
return m_enableOOVFeature;
}
@@ -104,29 +67,26 @@ public:
return "lm";
}
- void InitializeBeforeSentenceProcessing() {
- m_implementation->InitializeBeforeSentenceProcessing();
- }
+ virtual void InitializeBeforeSentenceProcessing() {}
- void CleanUpAfterSentenceProcessing() {
- m_implementation->CleanUpAfterSentenceProcessing();
- }
+ virtual void CleanUpAfterSentenceProcessing() {}
- virtual const FFState* EmptyHypothesisState(const InputType &input) const;
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const = 0;
- FFState* Evaluate(
- const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const {
- return m_implementation->Evaluate(cur_hypo, prev_state, accumulator, this);
- }
+ /* whether this LM can be used on a particular phrase.
+   * Should return false if phrase size = 0 or factor types required don't exist
+ */
+ virtual bool Useable(const Phrase &phrase) const = 0;
- FFState* EvaluateChart(
- const ChartHypothesis& cur_hypo,
- int featureID,
- ScoreComponentCollection* accumulator) const {
- return m_implementation->EvaluateChart(cur_hypo, featureID, accumulator, this);
- }
+ /* calc total unweighted LM score of this phrase and return score via arguments.
+ * Return scores should always be in natural log, regardless of representation with LM implementation.
+ * Uses GetValue() of inherited class.
+ * Useable() should be called beforehand on the phrase
+ * \param fullScore scores of all unigram, bigram... of contiguous n-gram of the phrase
+ * \param ngramScore score of only n-gram of order m_nGramOrder
+ * \param oovCount number of LM OOVs
+ */
+ virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const = 0;
};
}
diff --git a/moses/src/LanguageModelFactory.cpp b/moses/src/LanguageModelFactory.cpp
index 92d5597e8..0087ac3a8 100644
--- a/moses/src/LanguageModelFactory.cpp
+++ b/moses/src/LanguageModelFactory.cpp
@@ -70,6 +70,14 @@ LanguageModel* CreateLanguageModel(LMImplementation lmImplementation
, ScoreIndexManager &scoreIndexManager
, int dub )
{
+ if (lmImplementation == Ken || lmImplementation == LazyKen) {
+#ifdef LM_KEN
+ return ConstructKenLM(languageModelFile, scoreIndexManager, factorTypes[0], lmImplementation == LazyKen);
+#else
+ UserMessage::Add("KenLM isn't compiled in but your config asked for it");
+ return NULL;
+#endif
+ }
LanguageModelImplementation *lm = NULL;
switch (lmImplementation) {
case RandLM:
@@ -105,16 +113,6 @@ LanguageModel* CreateLanguageModel(LMImplementation lmImplementation
lm = new LanguageModelSkip(new LanguageModelInternal());
#endif
break;
- case Ken:
-#ifdef LM_KEN
- lm = ConstructKenLM(languageModelFile, false);
-#endif
- break;
- case LazyKen:
-#ifdef LM_KEN
- lm = ConstructKenLM(languageModelFile, true);
-#endif
- break;
case Joint:
#ifdef LM_SRI
lm = new LanguageModelJoint(new LanguageModelSRI());
@@ -137,10 +135,13 @@ LanguageModel* CreateLanguageModel(LMImplementation lmImplementation
lm = new LanguageModelDMapLM();
#endif
break;
+ default:
+ break;
}
if (lm == NULL) {
UserMessage::Add("Language model type unknown. Probably not compiled into library");
+ return NULL;
} else {
switch (lm->GetLMType()) {
case SingleFactor:
@@ -160,7 +161,7 @@ LanguageModel* CreateLanguageModel(LMImplementation lmImplementation
}
}
- return new LanguageModel(scoreIndexManager, lm);
+ return new LMRefCount(scoreIndexManager, lm);
}
}
diff --git a/moses/src/LanguageModelImplementation.h b/moses/src/LanguageModelImplementation.h
index bd1ea41fd..8af54cc5d 100644
--- a/moses/src/LanguageModelImplementation.h
+++ b/moses/src/LanguageModelImplementation.h
@@ -29,6 +29,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Util.h"
#include "FeatureFunction.h"
#include "Word.h"
+#include "LanguageModel.h"
+
+#ifdef WITH_THREADS
+#include <boost/shared_ptr.hpp>
+#endif
namespace Moses
{
@@ -145,6 +150,72 @@ public:
#endif
};
+class LMRefCount : public LanguageModel {
+ public:
+ LMRefCount(ScoreIndexManager &scoreIndexManager, LanguageModelImplementation *impl) : m_impl(impl) {
+#ifndef WITH_THREADS
+ impl->IncrementReferenceCount();
+#endif
+ Init(scoreIndexManager);
+ }
+
+ ~LMRefCount() {
+#ifndef WITH_THREADS
+ if (!m_impl->DecrementReferenceCount()) delete m_impl;
+#endif
+ }
+
+ LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const {
+ return new LMRefCount(scoreIndexManager, *this);
+ }
+
+ void InitializeBeforeSentenceProcessing() {
+ m_impl->InitializeBeforeSentenceProcessing();
+ }
+
+ void CleanUpAfterSentenceProcessing() {
+ m_impl->CleanUpAfterSentenceProcessing();
+ }
+
+ const FFState* EmptyHypothesisState(const InputType &/*input*/) const {
+ return m_impl->NewState(m_impl->GetBeginSentenceState());
+ }
+
+ bool Useable(const Phrase &phrase) const {
+ return m_impl->Useable(phrase);
+ }
+
+ void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
+ return m_impl->CalcScore(phrase, fullScore, ngramScore, oovCount);
+ }
+
+ FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state, ScoreComponentCollection* accumulator) const {
+ return m_impl->Evaluate(cur_hypo, prev_state, accumulator, this);
+ }
+
+ FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection* accumulator) const {
+ return m_impl->EvaluateChart(cur_hypo, featureID, accumulator, this);
+ }
+
+ std::string GetScoreProducerDescription(unsigned int param) const {
+ return m_impl->GetScoreProducerDescription(param);
+ }
+
+ private:
+ LMRefCount(ScoreIndexManager &scoreIndexManager, const LMRefCount &copy_from) : m_impl(copy_from.m_impl) {
+#ifndef WITH_THREADS
+ m_impl->IncrementReferenceCount();
+#endif
+ Init(scoreIndexManager);
+ }
+
+#ifdef WITH_THREADS
+ boost::shared_ptr<LanguageModelImplementation> m_impl;
+#else
+ LanguageModelImplementation *m_impl;
+#endif
+};
+
}
#endif
diff --git a/moses/src/LanguageModelKen.cpp b/moses/src/LanguageModelKen.cpp
index d3548df15..596378f17 100644
--- a/moses/src/LanguageModelKen.cpp
+++ b/moses/src/LanguageModelKen.cpp
@@ -26,9 +26,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <stdlib.h>
#include "lm/binary_format.hh"
#include "lm/enumerate_vocab.hh"
+#include "lm/left.hh"
#include "lm/model.hh"
#include "LanguageModelKen.h"
+#include "LanguageModel.h"
#include "FFState.h"
#include "TypeDef.h"
#include "Util.h"
@@ -38,12 +40,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "StaticData.h"
#include "ChartHypothesis.h"
+#ifdef WITH_THREADS
+#include <boost/scoped_ptr.hpp>
+#endif
+
using namespace std;
namespace Moses {
-
-LanguageModelKenBase::~LanguageModelKenBase() {}
-
namespace {
struct KenLMState : public FFState {
@@ -59,56 +62,51 @@ struct KenLMState : public FFState {
/*
* An implementation of single factor LM using Ken's code.
*/
-template <class Model> class LanguageModelKen : public LanguageModelKenBase {
+template <class Model> class LanguageModelKen : public LanguageModel {
public:
- LanguageModelKen(bool lazy);
- ~LanguageModelKen();
-
- bool Load(const std::string &filePath, FactorType factorType, size_t nGramOrder);
+ LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);
- LMResult GetValueGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const {
- return GetKenFullScoreGivenState(contextFactor, state);
+ ~LanguageModelKen() {
+#ifndef WITH_THREADS
+ if (!--*m_refcount) {
+ delete m_ngram;
+ delete m_refcount;
+ }
+#endif
}
- LMKenResult GetKenFullScoreGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const;
- LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const {
- return GetKenFullScoreForgotState(contextFactor, outState);
- }
- LMKenResult GetKenFullScoreForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
+ LanguageModel *Duplicate(ScoreIndexManager &scoreIndexManager) const;
- void GetState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
+ bool Useable(const Phrase &phrase) const {
+ return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
+ }
- const FFState *GetNullContextState() const;
- const FFState *GetBeginSentenceState() const;
- FFState *NewState(const FFState *from = NULL) const;
+ std::string GetScoreProducerDescription(unsigned) const {
+ std::ostringstream oss;
+ oss << "LM_" << m_ngram->Order() << "gram";
+ return oss.str();
+ }
- void CleanUpAfterSentenceProcessing() {}
- void InitializeBeforeSentenceProcessing() {}
+ const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
+ KenLMState *ret = new KenLMState();
+ ret->state = m_ngram->BeginSentenceState();
+ return ret;
+ }
- FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out, const LanguageModel *feature) const;
+ void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
- FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator, const LanguageModel *feature) const;
+ FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
- void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
+ FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
private:
- Model *m_ngram;
- std::vector<lm::WordIndex> m_lmIdLookup;
- bool m_lazy;
- KenLMState m_nullContextState;
- KenLMState m_beginSentenceState;
+ LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from);
lm::WordIndex TranslateID(const Word &word) const {
- std::size_t factor = word.GetFactor(GetFactorType())->GetId();
+ std::size_t factor = word.GetFactor(m_factorType)->GetId();
return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
}
- void TranslateIDs(const std::vector<const Word*> &contextFactor, lm::WordIndex *indices) const {
- for (size_t i = 0 ; i < contextFactor.size(); i++) {
- indices[contextFactor.size() - 1 - i] = TranslateID(*contextFactor[i]);
- }
- }
-
// Convert last words of hypothesis into vocab ids, returning an end pointer.
lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
lm::WordIndex *index = indices;
@@ -123,17 +121,19 @@ template <class Model> class LanguageModelKen : public LanguageModelKenBase {
*index = TranslateID(hypo.GetWord(position));
}
}
-};
-template <class Model> LanguageModelKen<Model>::LanguageModelKen(bool lazy)
- :m_ngram(NULL), m_lazy(lazy)
-{
-}
+#ifdef WITH_THREADS
+ boost::shared_ptr<Model> m_ngram;
+#else
+ Model *m_ngram;
+ mutable unsigned int *m_refcount;
+#endif
+ std::vector<lm::WordIndex> m_lmIdLookup;
-template <class Model> LanguageModelKen<Model>::~LanguageModelKen()
-{
- delete m_ngram;
-}
+ FactorType m_factorType;
+
+ const Factor *m_beginSentenceFactor;
+};
class MappingBuilder : public lm::EnumerateVocab {
public:
@@ -157,119 +157,88 @@ private:
std::string str_;
};
-template <class Model> bool LanguageModelKen<Model>::Load(const std::string &filePath,
- FactorType factorType,
- size_t /*nGramOrder*/)
-{
- m_factorType = factorType;
- m_filePath = filePath;
-
- FactorCollection &factorCollection = FactorCollection::Instance();
- m_sentenceStart = factorCollection.AddFactor(BOS_);
- m_sentenceStartArray[m_factorType] = m_sentenceStart;
- m_sentenceEnd = factorCollection.AddFactor(EOS_);
- m_sentenceEndArray[m_factorType] = m_sentenceEnd;
-
- MappingBuilder builder(factorCollection, m_lmIdLookup);
+template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) : m_factorType(factorType) {
lm::ngram::Config config;
-
IFVERBOSE(1) {
config.messages = &std::cerr;
- }
- else {
+ } else {
config.messages = NULL;
}
-
+ FactorCollection &collection = FactorCollection::Instance();
+ MappingBuilder builder(collection, m_lmIdLookup);
config.enumerate_vocab = &builder;
- config.load_method = m_lazy ? util::LAZY : util::POPULATE_OR_READ;
+ config.load_method = lazy ? util::LAZY : util::POPULATE_OR_READ;
try {
- m_ngram = new Model(filePath.c_str(), config);
+#ifdef WITH_THREADS
+ m_ngram.reset(new Model(file.c_str(), config));
+#else
+ m_ngram = new Model(file.c_str(), config);
+ m_refcount = new unsigned int();
+ *m_refcount = 1;
+#endif
} catch (std::exception &e) {
std::cerr << e.what() << std::endl;
abort();
}
- m_nGramOrder = m_ngram->Order();
- m_nullContextState.state = m_ngram->NullContextState();
- m_beginSentenceState.state = m_ngram->BeginSentenceState();
- return true;
+ m_beginSentenceFactor = collection.AddFactor(BOS_);
+ Init(manager);
}
-template <class Model> LMKenResult LanguageModelKen<Model>::GetKenFullScoreGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const
-{
- LMKenResult result;
- if (contextFactor.empty()) {
- result.score = 0.0;
- result.unknown = false;
- result.ngram_length = 0;
- return result;
- }
- lm::ngram::State &realState = static_cast<KenLMState&>(state).state;
- std::size_t factor = contextFactor.back()->GetFactor(GetFactorType())->GetId();
- lm::WordIndex new_word = (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
- lm::ngram::State copied(realState);
- lm::FullScoreReturn ret(m_ngram->FullScore(copied, new_word, realState));
-
- result.score = TransformLMScore(ret.prob);
- result.unknown = (new_word == 0);
- result.ngram_length = ret.ngram_length;
- return result;
+template <class Model> LanguageModel *LanguageModelKen<Model>::Duplicate(ScoreIndexManager &manager) const {
+ return new LanguageModelKen<Model>(manager, *this);
}
-template <class Model> LMKenResult LanguageModelKen<Model>::GetKenFullScoreForgotState(const vector<const Word*> &contextFactor, FFState &outState) const
-{
- LMKenResult result;
- if (contextFactor.empty()) {
- static_cast<KenLMState&>(outState).state = m_ngram->NullContextState();
- result.score = 0.0;
- result.unknown = false;
- result.ngram_length = 0;
- return result;
- }
-
- lm::WordIndex indices[contextFactor.size()];
- TranslateIDs(contextFactor, indices);
-
- lm::FullScoreReturn ret(m_ngram->FullScoreForgotState(indices + 1, indices + contextFactor.size(), indices[0], static_cast<KenLMState&>(outState).state));
-
- result.score = TransformLMScore(ret.prob);
- result.unknown = (indices[0] == 0);
- result.ngram_length = ret.ngram_length;
- return result;
+template <class Model> LanguageModelKen<Model>::LanguageModelKen(ScoreIndexManager &manager, const LanguageModelKen<Model> &copy_from) :
+ m_ngram(copy_from.m_ngram),
+ // TODO: don't copy this.
+ m_lmIdLookup(copy_from.m_lmIdLookup),
+ m_factorType(copy_from.m_factorType),
+ m_beginSentenceFactor(copy_from.m_beginSentenceFactor) {
+#ifndef WITH_THREADS
+ m_refcount = copy_from.m_refcount;
+ ++*m_refcount;
+#endif
+ Init(manager);
}
-template <class Model> void LanguageModelKen<Model>::GetState(const std::vector<const Word*> &contextFactor, FFState &outState) const
-{
- if (contextFactor.empty()) {
- static_cast<KenLMState&>(outState).state = m_ngram->NullContextState();
- return;
- }
- lm::WordIndex indices[contextFactor.size()];
- TranslateIDs(contextFactor, indices);
- m_ngram->GetState(indices, indices + contextFactor.size(), static_cast<KenLMState&>(outState).state);
-}
+template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
+ fullScore = 0;
+ ngramScore = 0;
+ oovCount = 0;
-template <class Model> const FFState *LanguageModelKen<Model>::GetNullContextState() const
-{
- return &m_nullContextState;
-}
+ if (!phrase.GetSize()) return;
-template <class Model> const FFState *LanguageModelKen<Model>::GetBeginSentenceState() const
-{
- return &m_beginSentenceState;
-}
+ typename Model::State state_backing[2];
+ typename Model::State *state0 = &state_backing[0], *state1 = &state_backing[1];
+ size_t position;
+ if (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)) {
+ *state0 = m_ngram->BeginSentenceState();
+ position = 1;
+ } else {
+ *state0 = m_ngram->NullContextState();
+ position = 0;
+ }
+
+ size_t ngramBoundary = m_ngram->Order() - 1;
-template <class Model> FFState *LanguageModelKen<Model>::NewState(const FFState *from) const
-{
- KenLMState *ret = new KenLMState;
- if (from) {
- ret->state = static_cast<const KenLMState&>(*from).state;
+ for (; position < phrase.GetSize(); ++position) {
+ const Word &word = phrase.GetWord(position);
+ if (word.IsNonTerminal()) {
+ *state0 = m_ngram->NullContextState();
+ } else {
+ lm::WordIndex index = TranslateID(word);
+ float score = TransformLMScore(m_ngram->Score(*state0, index, *state1));
+ std::swap(state0, state1);
+ if (position >= ngramBoundary) ngramScore += score;
+ fullScore += score;
+ if (!index) ++oovCount;
+ }
}
- return ret;
}
-template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out, const LanguageModel *feature) const {
+template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const {
const lm::ngram::State &in_state = static_cast<const KenLMState&>(*ps).state;
std::auto_ptr<KenLMState> ret(new KenLMState());
@@ -312,13 +281,13 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
score = TransformLMScore(score);
- if (feature->OOVFeatureEnabled()) {
+ if (OOVFeatureEnabled()) {
std::vector<float> scores(2);
scores[0] = score;
scores[1] = 0.0;
- out->PlusEquals(feature, scores);
+ out->PlusEquals(this, scores);
} else {
- out->PlusEquals(feature, score);
+ out->PlusEquals(this, score);
}
return ret.release();
@@ -342,11 +311,7 @@ class LanguageModelChartStateKenLM : public FFState {
lm::ngram::ChartState m_state;
};
-template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(
- const ChartHypothesis& hypo,
- int featureID,
- ScoreComponentCollection *accumulator,
- const LanguageModel *feature) const {
+template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const {
LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = hypo.GetCurrTargetPhrase().GetAlignmentInfo().GetNonTermIndexMap();
@@ -356,7 +321,7 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(
// Special cases for first word.
if (size) {
const Word &word = hypo.GetCurrTargetPhrase().GetWord(0);
- if (word == GetSentenceStartArray()) {
+ if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
// Begin of sentence
ruleScore.BeginSentence();
phrasePos++;
@@ -364,7 +329,7 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(
// Non-terminal is first so we can copy instead of rescoring.
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
- ruleScore.BeginNonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(feature)[0]);
+ ruleScore.BeginNonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
phrasePos++;
}
}
@@ -374,76 +339,38 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(
if (word.IsNonTerminal()) {
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
- ruleScore.NonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(feature)[0]);
+ ruleScore.NonTerminal(prevState, prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
} else {
ruleScore.Terminal(TranslateID(word));
}
}
- accumulator->Assign(feature, ruleScore.Finish());
+ accumulator->Assign(this, ruleScore.Finish());
return newState;
}
-template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
- fullScore = 0;
- ngramScore = 0;
- oovCount = 0;
-
- if (!phrase.GetSize()) return;
-
- typename Model::State state_backing[2];
- typename Model::State *state0 = &state_backing[0], *state1 = &state_backing[1];
- size_t position;
- if (phrase.GetWord(0) == GetSentenceStartArray()) {
- *state0 = m_ngram->BeginSentenceState();
- position = 1;
- } else {
- *state0 = m_ngram->NullContextState();
- position = 0;
- }
-
- FactorType factorType = GetFactorType();
- size_t ngramBoundary = m_ngram->Order() - 1;
-
- for (; position < phrase.GetSize(); ++position) {
- const Word &word = phrase.GetWord(position);
- if (word.IsNonTerminal()) {
- *state0 = m_ngram->NullContextState();
- } else {
- std::size_t factor = word.GetFactor(factorType)->GetId();
- lm::WordIndex index = factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor];
- float score = TransformLMScore(m_ngram->Score(*state0, index, *state1));
- std::swap(state0, state1);
- if (position >= ngramBoundary) ngramScore += score;
- fullScore += score;
- if (!index) ++oovCount;
- }
- }
-}
-
} // namespace
-LanguageModelSingleFactor *ConstructKenLM(const std::string &file, bool lazy)
-{
+LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy) {
lm::ngram::ModelType model_type;
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
switch(model_type) {
case lm::ngram::HASH_PROBING:
- return new LanguageModelKen<lm::ngram::ProbingModel>(lazy);
+ return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
case lm::ngram::TRIE_SORTED:
- return new LanguageModelKen<lm::ngram::TrieModel>(lazy);
+ return new LanguageModelKen<lm::ngram::TrieModel>(file, manager, factorType, lazy);
case lm::ngram::QUANT_TRIE_SORTED:
- return new LanguageModelKen<lm::ngram::QuantTrieModel>(lazy);
+ return new LanguageModelKen<lm::ngram::QuantTrieModel>(file, manager, factorType, lazy);
case lm::ngram::ARRAY_TRIE_SORTED:
- return new LanguageModelKen<lm::ngram::ArrayTrieModel>(lazy);
+ return new LanguageModelKen<lm::ngram::ArrayTrieModel>(file, manager, factorType, lazy);
case lm::ngram::QUANT_ARRAY_TRIE_SORTED:
- return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(lazy);
+ return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(file, manager, factorType, lazy);
default:
std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
abort();
}
} else {
- return new LanguageModelKen<lm::ngram::ProbingModel>(lazy);
+ return new LanguageModelKen<lm::ngram::ProbingModel>(file, manager, factorType, lazy);
}
}
diff --git a/moses/src/LanguageModelKen.h b/moses/src/LanguageModelKen.h
index 1b3d8def1..0a4076111 100644
--- a/moses/src/LanguageModelKen.h
+++ b/moses/src/LanguageModelKen.h
@@ -24,32 +24,16 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <string>
-#include "LanguageModelSingleFactor.h"
-#include "kenlm/lm/left.hh"
-
-namespace Moses
-{
-
- // kenlm specific score value
- struct LMKenResult : public LMResult {
- unsigned char ngram_length;
- };
-
- // base-class for the actual LanguageModelKen; only here to provide a specific behaviour without exposing the implementation
- class LanguageModelKenBase : public LanguageModelSingleFactor {
- public:
- virtual ~LanguageModelKenBase();
- // scoring functions which provide more info than the common interface of LanguageModel
- virtual LMKenResult GetKenFullScoreGivenState(const std::vector<const Word*> &contextFactor, FFState &state) const = 0;
- virtual LMKenResult GetKenFullScoreForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const = 0;
-};
+#include "TypeDef.h"
-class ScoreIndexManager;
+namespace Moses {
-// Doesn't actually load; moses wants the Load method for that. It needs the file to autodetect binary format.
-LanguageModelSingleFactor *ConstructKenLM(const std::string &file, bool lazy);
+class ScoreIndexManager;
+class LanguageModel;
-}
+// This will also load.
+LanguageModel *ConstructKenLM(const std::string &file, ScoreIndexManager &manager, FactorType factorType, bool lazy);
+} // namespace Moses
#endif
diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp
index a9a521cb8..52682641c 100644
--- a/moses/src/StaticData.cpp
+++ b/moses/src/StaticData.cpp
@@ -812,7 +812,7 @@ bool StaticData::LoadLanguageModels()
for(size_t i=0; i<lmVector.size(); i++) {
LanguageModel* lm = NULL;
if (languageModelsLoaded.find(lmVector[i]) != languageModelsLoaded.end()) {
- lm = new LanguageModel(m_scoreIndexManager, languageModelsLoaded[lmVector[i]]);
+ lm = languageModelsLoaded[lmVector[i]]->Duplicate(m_scoreIndexManager);
} else {
vector<string> token = Tokenize(lmVector[i]);
if (token.size() != 4 && token.size() != 5 ) {