diff options
author | Ales Tamchyna <a.tamchyna@gmail.com> | 2012-12-19 16:41:11 +0400 |
---|---|---|
committer | Ales Tamchyna <a.tamchyna@gmail.com> | 2012-12-19 16:41:11 +0400 |
commit | a9dcada475b73db20c0355dd28853d04c2a1add1 (patch) | |
tree | c710be8eed5719b9b54e46c850e4988d10381956 | |
parent | cd3fb3b831e5ca0821a735c0d075f5fd6e79296a (diff) |
toward implementing local language models (Monz 2011)
-rw-r--r-- | moses/LM/Factory.cpp | 10 | ||||
-rw-r--r-- | moses/LM/IRST.h | 2 | ||||
-rw-r--r-- | moses/LM/Jamfile | 3 | ||||
-rw-r--r-- | moses/LM/Local.cpp | 169 | ||||
-rw-r--r-- | moses/LM/Local.h | 72 | ||||
-rw-r--r-- | moses/LM/MultiFactor.h | 2 | ||||
-rw-r--r-- | moses/LM/ORLM.h | 2 | ||||
-rw-r--r-- | moses/LM/Rand.cpp | 2 | ||||
-rw-r--r-- | moses/LM/Remote.h | 2 | ||||
-rw-r--r-- | moses/LM/SRI.h | 2 | ||||
-rw-r--r-- | moses/LM/SingleFactor.h | 6 | ||||
-rw-r--r-- | moses/TypeDef.h | 1 |
12 files changed, 261 insertions, 12 deletions
diff --git a/moses/LM/Factory.cpp b/moses/LM/Factory.cpp index a711540dd..49d69e074 100644 --- a/moses/LM/Factory.cpp +++ b/moses/LM/Factory.cpp @@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #ifdef LM_SRI # include "SRI.h" #include "ParallelBackoff.h" +#include "Local.h" #endif #ifdef LM_IRST # include "IRST.h" @@ -101,6 +102,11 @@ LanguageModel* CreateLanguageModel(LMImplementation lmImplementation lm = new LanguageModelJoint(new LanguageModelSRI()); #endif break; + case Local: +#ifdef LM_SRI + lm = new LanguageModelLocal(); +#endif + break; case ParallelBackoff: #ifdef LM_SRI lm = NewParallelBackoff(); @@ -123,14 +129,14 @@ LanguageModel* CreateLanguageModel(LMImplementation lmImplementation } else { switch (lm->GetLMType()) { case SingleFactor: - if (! static_cast<LanguageModelSingleFactor*>(lm)->Load(languageModelFile, factorTypes[0], nGramOrder)) { + if (! dynamic_cast<LanguageModelSingleFactor*>(lm)->Load(languageModelFile, factorTypes[0], nGramOrder)) { cerr << "single factor model failed" << endl; delete lm; lm = NULL; } break; case MultiFactor: - if (! static_cast<LanguageModelMultiFactor*>(lm)->Load(languageModelFile, factorTypes, nGramOrder)) { + if (! dynamic_cast<LanguageModelMultiFactor*>(lm)->Load(languageModelFile, factorTypes, nGramOrder)) { cerr << "multi factor model failed" << endl; delete lm; lm = NULL; diff --git a/moses/LM/IRST.h b/moses/LM/IRST.h index 205455e93..6e7ba7750 100644 --- a/moses/LM/IRST.h +++ b/moses/LM/IRST.h @@ -40,7 +40,7 @@ class Phrase; /** Implementation of single factor LM using IRST's code. * This is available from the same sourceforge repository */ -class LanguageModelIRST : public LanguageModelPointerState +class LanguageModelIRST : public LanguageModelPointerState, public LanguageModelSingleFactor { protected: mutable std::vector<int> m_lmIdLookup; diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile index d7ee23e02..e2d0e8bab 100644 --- a/moses/LM/Jamfile +++ b/moses/LM/Jamfile @@ -60,8 +60,9 @@ if $(with-srilm) { } obj SRI.o : SRI.cpp ..//headers : <include>$(with-srilm)/include <include>$(with-srilm)/include/srilm <warnings>off ; + obj Local.o : Local.cpp ..//headers : <include>$(with-srilm)/include <include>$(with-srilm)/include/srilm <warnings>off ; obj ParallelBackoff.o : ParallelBackoff.cpp ..//headers : <include>$(with-srilm)/include <include>$(with-srilm)/include/srilm <warnings>off ; - alias sri : SRI.o ParallelBackoff.o sri-libs : : : <define>LM_SRI ; + alias sri : SRI.o ParallelBackoff.o Local.o sri-libs : : : <define>LM_SRI ; dependencies += sri ; } diff --git a/moses/LM/Local.cpp b/moses/LM/Local.cpp new file mode 100644 index 000000000..026f22ca6 --- /dev/null +++ b/moses/LM/Local.cpp @@ -0,0 +1,169 @@ +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "util/check.hh" +#include <limits> +#include <iostream> +#include <fstream> + +#include "Local.h" +#include "moses/TypeDef.h" +#include "moses/Util.h" +#include "moses/FactorCollection.h" +#include "moses/Phrase.h" +#include "moses/StaticData.h" + +#include "Vocab.h" +#include "Ngram.h" + +using namespace std; + +namespace Moses +{ +LanguageModelLocal::LanguageModelLocal() + : m_srilmVocab(0) + , m_srilmModel(0) +{ +} + +LanguageModelLocal::~LanguageModelLocal() +{ + delete m_srilmModel; + delete m_srilmVocab; +} + +bool LanguageModelLocal::Load(const std::string &filePath, const std::vector<FactorType> &factors, + size_t nGramOrder) +{ + m_srilmVocab = new ::Vocab(); + m_srilmModel = new Ngram(*m_srilmVocab, nGramOrder); + m_factorTypes = FactorMask(factors); + m_nGramOrder = nGramOrder; + m_filePath = filePath; + + if (factors.size() != 2) { + cerr << "LocalLM needs exactly two factors form|tag" << endl; + abort(); + } + + m_srilmModel->skipOOVs() = false; + + File file( filePath.c_str(), "r" ); + m_srilmModel->read(file); + + // LM can be ok, just outputs warnings + CreateFactors(); + m_unknownId = m_srilmVocab->unkIndex(); + + return true; +} + +void LanguageModelLocal::CreateFactors() +{ + // add factors which have srilm id + FactorCollection &factorCollection = FactorCollection::Instance(); + + VocabString str; + VocabIter iter(*m_srilmVocab); + FactorType formFactor = m_factorTypes[0]; + FactorType tagFactor = m_factorTypes[1]; + while ( (str = iter.next()) != NULL) { + vector<string> factors = Tokenize(str, "|"); + if (factors.size() != 2) { + cerr << "Incorrect format for LocalLM, expected 2 factors in word: " << str << endl; + abort(); + } + VocabIndex lmId = GetLmID(str); + size_t formId = factorCollection.AddFactor(Output, formFactor, factors[0])->GetId(); + size_t tagId = factorCollection.AddFactor(Output, tagFactor, factors[1])->GetId(); + m_lmIdLookup[PairNumbers(formId, tagId)] = lmId; + } + + // sentence markers + for (size_t index = 0 ; index < m_factorTypes.size() ; ++index) { + FactorType factorType = m_factorTypes[index]; + m_sentenceStartArray[factorType] = factorCollection.AddFactor(Output, factorType, BOS_); + m_sentenceEndArray[factorType] = factorCollection.AddFactor(Output, factorType, EOS_); + } + m_lmIdLookup[PairNumbers(m_sentenceStartArray[formFactor]->GetId(), + m_sentenceStartArray[tagFactor]->GetId())] = GetLmID(BOS_); + m_lmIdLookup[PairNumbers(m_sentenceEndArray[formFactor]->GetId(), + m_sentenceEndArray[tagFactor]->GetId())] = GetLmID(EOS_); +} + +VocabIndex LanguageModelLocal::GetLmID( const std::string &str ) const +{ + return m_srilmVocab->getIndex( str.c_str(), m_unknownId ); +} + +VocabIndex LanguageModelLocal::GetLmID( const Factor *form, const Factor *tag ) const +{ + boost::unordered_map<size_t, unsigned int>::const_iterator it; + it = m_lmIdLookup.find(PairNumbers(form->GetId(), tag->GetId())); + return (it == m_lmIdLookup.end()) ? m_unknownId : it->second; +} + +LMResult LanguageModelLocal::GetValue(VocabIndex wordId, VocabIndex *context) const +{ + LMResult ret; + ret.score = FloorScore(TransformLMScore(m_srilmModel->wordProb( wordId, context))); + ret.unknown = (wordId == m_unknownId); + return ret; +} + +LMResult LanguageModelLocal::GetValue(const vector<const Word*> &contextFactor, State* finalState) const +{ + LMResult ret; + FactorType factorType = 0; // XXX + size_t count = contextFactor.size(); + if (count <= 0) { + if(finalState) + *finalState = NULL; + ret.score = 0.0; + ret.unknown = false; + return ret; + } + + // set up context + // + // TODO + // for each head word (i.e. word W in contextFactor, ask about this n-gram: + // contextFactor[0].tag + W.form, ..., "HEAD" + W.form, ..., contextFactor[last].tag + W.form + VocabIndex ngram[count + 1]; + for (size_t i = 0 ; i < count - 1 ; i++) { + ngram[i+1] = GetLmID((*contextFactor[count-2-i])[factorType], 0); // XXX + } + ngram[count] = Vocab_None; + + CHECK((*contextFactor[count-1])[factorType] != NULL); + // call sri lm fn + VocabIndex lmId = GetLmID((*contextFactor[count-1])[factorType], 0); // XXX + ret = GetValue(lmId, ngram+1); + + if (finalState) { + ngram[0] = lmId; + unsigned int dummy; + *finalState = m_srilmModel->contextID(ngram, dummy); + } + return ret; +} + +} diff --git a/moses/LM/Local.h b/moses/LM/Local.h new file mode 100644 index 000000000..9190fd1c6 --- /dev/null +++ b/moses/LM/Local.h @@ -0,0 +1,72 @@ +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_LanguageModelLocal_h +#define moses_LanguageModelLocal_h + +#include <string> +#include <vector> +#include <boost/unordered_map.hpp> +#include "moses/Factor.h" +#include "moses/TypeDef.h" +#include "SingleFactor.h" +#include "MultiFactor.h" + +class Factor; +class Phrase; +class Vocab; +class Ngram; + +namespace Moses +{ + +/** Local language models (Monz 2011) + */ +class LanguageModelLocal : public LanguageModelMultiFactor, public LanguageModelPointerState +{ +protected: + boost::unordered_map<size_t, unsigned int> m_lmIdLookup; + ::Vocab *m_srilmVocab; + Ngram *m_srilmModel; + unsigned int m_unknownId; + + LMResult GetValue(unsigned int wordId, unsigned int *context) const; + void CreateFactors(); + unsigned int GetLmID( const std::string &str ) const; + unsigned int GetLmID( const Factor *form, const Factor *tag ) const; + + // Cantor's pairing function + size_t PairNumbers(size_t a, size_t b) const + { + return (a + b) * (a + b + 1) / 2 + b; + } + +public: + LanguageModelLocal(); + ~LanguageModelLocal(); + bool Load(const std::string &filePath, const std::vector<FactorType> &factors, size_t nGramOrder); + + virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = 0) const; +}; + + +} +#endif diff --git a/moses/LM/MultiFactor.h b/moses/LM/MultiFactor.h index 1d38fbee6..d2211b1c5 100644 --- a/moses/LM/MultiFactor.h +++ b/moses/LM/MultiFactor.h @@ -36,7 +36,7 @@ class Phrase; /* Abstract class for for multi factor LM. Only inherited by the JointLM at the moment. * Could use this when factored LM are implemented */ -class LanguageModelMultiFactor : public LanguageModelImplementation +class LanguageModelMultiFactor : virtual public LanguageModelImplementation { protected: FactorMask m_factorTypes; diff --git a/moses/LM/ORLM.h b/moses/LM/ORLM.h index 442f22bac..87bf44f86 100644 --- a/moses/LM/ORLM.h +++ b/moses/LM/ORLM.h @@ -17,7 +17,7 @@ class Phrase; /** @todo ask ollie */ -class LanguageModelORLM : public LanguageModelPointerState { +class LanguageModelORLM : public LanguageModelPointerState, public LanguageModelSingleFactor { public: typedef count_t T; // type for ORLM filter LanguageModelORLM() diff --git a/moses/LM/Rand.cpp b/moses/LM/Rand.cpp index 4f0718b68..13205d6a9 100644 --- a/moses/LM/Rand.cpp +++ b/moses/LM/Rand.cpp @@ -41,7 +41,7 @@ namespace { using namespace std; -class LanguageModelRandLM : public LanguageModelPointerState +class LanguageModelRandLM : public LanguageModelPointerState, public LanguageModelSingleFactor { public: LanguageModelRandLM() diff --git a/moses/LM/Remote.h b/moses/LM/Remote.h index 7fa4bd0af..f60f17257 100644 --- a/moses/LM/Remote.h +++ b/moses/LM/Remote.h @@ -13,7 +13,7 @@ namespace Moses /** @todo ask miles */ -class LanguageModelRemote : public LanguageModelPointerState +class LanguageModelRemote : public LanguageModelPointerState, public LanguageModelSingleFactor { private: struct Cache { diff --git a/moses/LM/SRI.h b/moses/LM/SRI.h index f88f5947e..2e7bb53f6 100644 --- a/moses/LM/SRI.h +++ b/moses/LM/SRI.h @@ -38,7 +38,7 @@ namespace Moses /** Implementation of single factor LM using IRST's code. */ -class LanguageModelSRI : public LanguageModelPointerState +class LanguageModelSRI : public LanguageModelPointerState, public LanguageModelSingleFactor { protected: std::vector<unsigned int> m_lmIdLookup; diff --git a/moses/LM/SingleFactor.h b/moses/LM/SingleFactor.h index 05828dc9b..6908f4e2d 100644 --- a/moses/LM/SingleFactor.h +++ b/moses/LM/SingleFactor.h @@ -32,7 +32,7 @@ class FactorCollection; class Factor; //! Abstract class for for single factor LM -class LanguageModelSingleFactor : public LanguageModelImplementation +class LanguageModelSingleFactor : virtual public LanguageModelImplementation { protected: const Factor *m_sentenceStart, *m_sentenceEnd; @@ -70,8 +70,8 @@ public: } }; -// Single factor LM that uses a null pointer state. -class LanguageModelPointerState : public LanguageModelSingleFactor +// LM that uses a null pointer state. +class LanguageModelPointerState : virtual public LanguageModelImplementation { private: FFState *m_nullContextState; diff --git a/moses/TypeDef.h b/moses/TypeDef.h index faf98c448..706ae12cb 100644 --- a/moses/TypeDef.h +++ b/moses/TypeDef.h @@ -123,6 +123,7 @@ enum LMImplementation { ,LazyKen = 9 ,ORLM = 10 ,LDHTLM = 11 + ,Local = 12 }; enum PhraseTableImplementation { |