Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAles Tamchyna <a.tamchyna@gmail.com>2012-12-19 16:41:11 +0400
committerAles Tamchyna <a.tamchyna@gmail.com>2012-12-19 16:41:11 +0400
commita9dcada475b73db20c0355dd28853d04c2a1add1 (patch)
treec710be8eed5719b9b54e46c850e4988d10381956
parentcd3fb3b831e5ca0821a735c0d075f5fd6e79296a (diff)
toward implementing local language models (Monz 2011)
-rw-r--r--moses/LM/Factory.cpp10
-rw-r--r--moses/LM/IRST.h2
-rw-r--r--moses/LM/Jamfile3
-rw-r--r--moses/LM/Local.cpp169
-rw-r--r--moses/LM/Local.h72
-rw-r--r--moses/LM/MultiFactor.h2
-rw-r--r--moses/LM/ORLM.h2
-rw-r--r--moses/LM/Rand.cpp2
-rw-r--r--moses/LM/Remote.h2
-rw-r--r--moses/LM/SRI.h2
-rw-r--r--moses/LM/SingleFactor.h6
-rw-r--r--moses/TypeDef.h1
12 files changed, 261 insertions, 12 deletions
diff --git a/moses/LM/Factory.cpp b/moses/LM/Factory.cpp
index a711540dd..49d69e074 100644
--- a/moses/LM/Factory.cpp
+++ b/moses/LM/Factory.cpp
@@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifdef LM_SRI
# include "SRI.h"
#include "ParallelBackoff.h"
+#include "Local.h"
#endif
#ifdef LM_IRST
# include "IRST.h"
@@ -101,6 +102,11 @@ LanguageModel* CreateLanguageModel(LMImplementation lmImplementation
lm = new LanguageModelJoint(new LanguageModelSRI());
#endif
break;
+ case Local:
+#ifdef LM_SRI
+ lm = new LanguageModelLocal();
+#endif
+ break;
case ParallelBackoff:
#ifdef LM_SRI
lm = NewParallelBackoff();
@@ -123,14 +129,14 @@ LanguageModel* CreateLanguageModel(LMImplementation lmImplementation
} else {
switch (lm->GetLMType()) {
case SingleFactor:
- if (! static_cast<LanguageModelSingleFactor*>(lm)->Load(languageModelFile, factorTypes[0], nGramOrder)) {
+ if (! dynamic_cast<LanguageModelSingleFactor*>(lm)->Load(languageModelFile, factorTypes[0], nGramOrder)) {
cerr << "single factor model failed" << endl;
delete lm;
lm = NULL;
}
break;
case MultiFactor:
- if (! static_cast<LanguageModelMultiFactor*>(lm)->Load(languageModelFile, factorTypes, nGramOrder)) {
+ if (! dynamic_cast<LanguageModelMultiFactor*>(lm)->Load(languageModelFile, factorTypes, nGramOrder)) {
cerr << "multi factor model failed" << endl;
delete lm;
lm = NULL;
diff --git a/moses/LM/IRST.h b/moses/LM/IRST.h
index 205455e93..6e7ba7750 100644
--- a/moses/LM/IRST.h
+++ b/moses/LM/IRST.h
@@ -40,7 +40,7 @@ class Phrase;
/** Implementation of single factor LM using IRST's code.
* This is available from the same sourceforge repository
*/
-class LanguageModelIRST : public LanguageModelPointerState
+class LanguageModelIRST : public LanguageModelPointerState, public LanguageModelSingleFactor
{
protected:
mutable std::vector<int> m_lmIdLookup;
diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile
index d7ee23e02..e2d0e8bab 100644
--- a/moses/LM/Jamfile
+++ b/moses/LM/Jamfile
@@ -60,8 +60,9 @@ if $(with-srilm) {
}
obj SRI.o : SRI.cpp ..//headers : <include>$(with-srilm)/include <include>$(with-srilm)/include/srilm <warnings>off ;
+ obj Local.o : Local.cpp ..//headers : <include>$(with-srilm)/include <include>$(with-srilm)/include/srilm <warnings>off ;
obj ParallelBackoff.o : ParallelBackoff.cpp ..//headers : <include>$(with-srilm)/include <include>$(with-srilm)/include/srilm <warnings>off ;
- alias sri : SRI.o ParallelBackoff.o sri-libs : : : <define>LM_SRI ;
+ alias sri : SRI.o ParallelBackoff.o Local.o sri-libs : : : <define>LM_SRI ;
dependencies += sri ;
}
diff --git a/moses/LM/Local.cpp b/moses/LM/Local.cpp
new file mode 100644
index 000000000..026f22ca6
--- /dev/null
+++ b/moses/LM/Local.cpp
@@ -0,0 +1,169 @@
+// $Id$
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "util/check.hh"
+#include <limits>
+#include <iostream>
+#include <fstream>
+
+#include "Local.h"
+#include "moses/TypeDef.h"
+#include "moses/Util.h"
+#include "moses/FactorCollection.h"
+#include "moses/Phrase.h"
+#include "moses/StaticData.h"
+
+#include "Vocab.h"
+#include "Ngram.h"
+
+using namespace std;
+
+namespace Moses
+{
+/** Constructs an unloaded model. No SRILM vocabulary or n-gram model exists
+ *  until Load() is called.
+ */
+LanguageModelLocal::LanguageModelLocal()
+  : m_srilmVocab(0)
+  , m_srilmModel(0)
+  // Give the unknown-word id a defined value: GetLmID() reads it as the
+  // lookup fallback. Vocab_None is the same constant GetValue() uses to
+  // terminate the context array.
+  , m_unknownId(Vocab_None)
+{
+}
+
+/** Frees the SRILM objects allocated by Load(). Both pointers are null after
+ *  construction, so destroying an instance that was never loaded is safe.
+ */
+LanguageModelLocal::~LanguageModelLocal()
+{
+ // The model was constructed from a reference to the vocabulary (see Load()),
+ // so the model is released first.
+ delete m_srilmModel;
+ delete m_srilmVocab;
+}
+
+/** Reads an SRILM n-gram model and builds the (form, tag) -> SRILM id lookup.
+ *
+ *  \param filePath   path of the model file handed to the SRILM reader
+ *  \param factors    exactly two factor types (form, tag); any other count
+ *                    prints an error and aborts the process
+ *  \param nGramOrder order passed to the SRILM Ngram constructor
+ *  \return always true; the result of the SRILM read is not inspected
+ */
+bool LanguageModelLocal::Load(const std::string &filePath, const std::vector<FactorType> &factors,
+                              size_t nGramOrder)
+{
+  // Validate the configuration before allocating anything.
+  if (factors.size() != 2) {
+    cerr << "LocalLM needs exactly two factors form|tag" << endl;
+    abort();
+  }
+
+  m_srilmVocab = new ::Vocab();
+  m_srilmModel = new Ngram(*m_srilmVocab, nGramOrder);
+  m_factorTypes = FactorMask(factors);
+  m_nGramOrder = nGramOrder;
+  m_filePath = filePath;
+
+  m_srilmModel->skipOOVs() = false;
+
+  File file( filePath.c_str(), "r" );
+  m_srilmModel->read(file);
+
+  // LM can be ok, just outputs warnings
+
+  // The unknown-word id has to be set before CreateFactors() runs, because
+  // CreateFactors() calls GetLmID(), which uses m_unknownId as its fallback.
+  m_unknownId = m_srilmVocab->unkIndex();
+  CreateFactors();
+
+  return true;
+}
+
+/** Populates m_lmIdLookup from the SRILM vocabulary.
+ *
+ *  Every vocabulary entry is split on "|" into a form and a tag. Both parts
+ *  are registered as output factors, and the entry's SRILM id is stored under
+ *  the paired factor ids. The sentence start/end markers are then registered
+ *  for each factor type and mapped to the SRILM ids of BOS_ and EOS_.
+ *  An entry that does not split into exactly two parts aborts the process.
+ */
+void LanguageModelLocal::CreateFactors()
+{
+ // add factors which have srilm id
+ FactorCollection &factorCollection = FactorCollection::Instance();
+
+ VocabString str;
+ VocabIter iter(*m_srilmVocab);
+ // NOTE(review): m_factorTypes is a FactorMask built from the factor vector in
+ // Load(); confirm that indexing it yields the configured factor types (and
+ // that size() below is the number of configured factors) rather than mask bits.
+ FactorType formFactor = m_factorTypes[0];
+ FactorType tagFactor = m_factorTypes[1];
+ while ( (str = iter.next()) != NULL) {
+ vector<string> factors = Tokenize(str, "|");
+ if (factors.size() != 2) {
+ cerr << "Incorrect format for LocalLM, expected 2 factors in word: " << str << endl;
+ abort();
+ }
+ VocabIndex lmId = GetLmID(str);
+ size_t formId = factorCollection.AddFactor(Output, formFactor, factors[0])->GetId();
+ size_t tagId = factorCollection.AddFactor(Output, tagFactor, factors[1])->GetId();
+ // key: both factor ids folded into one number by PairNumbers()
+ m_lmIdLookup[PairNumbers(formId, tagId)] = lmId;
+ }
+
+ // sentence markers
+ for (size_t index = 0 ; index < m_factorTypes.size() ; ++index) {
+ FactorType factorType = m_factorTypes[index];
+ m_sentenceStartArray[factorType] = factorCollection.AddFactor(Output, factorType, BOS_);
+ m_sentenceEndArray[factorType] = factorCollection.AddFactor(Output, factorType, EOS_);
+ }
+ // map the (form, tag) pair of the start/end markers to the SRILM ids of BOS_/EOS_
+ m_lmIdLookup[PairNumbers(m_sentenceStartArray[formFactor]->GetId(),
+ m_sentenceStartArray[tagFactor]->GetId())] = GetLmID(BOS_);
+ m_lmIdLookup[PairNumbers(m_sentenceEndArray[formFactor]->GetId(),
+ m_sentenceEndArray[tagFactor]->GetId())] = GetLmID(EOS_);
+}
+
+/** Returns the SRILM vocabulary index for a raw vocabulary string.
+ *  m_unknownId is handed to the SRILM lookup as the index to use for strings
+ *  it does not know.
+ */
+VocabIndex LanguageModelLocal::GetLmID( const std::string &str ) const
+{
+  const char *rawWord = str.c_str();
+  const VocabIndex lmId = m_srilmVocab->getIndex(rawWord, m_unknownId);
+  return lmId;
+}
+
+/** Maps a (form, tag) factor pair to its SRILM vocabulary index.
+ *
+ *  \param form surface-form factor; may be NULL
+ *  \param tag  tag factor; may be NULL
+ *  \return the id stored in m_lmIdLookup for the pair, or m_unknownId when the
+ *          pair is not stored or either factor is missing
+ */
+VocabIndex LanguageModelLocal::GetLmID( const Factor *form, const Factor *tag ) const
+{
+  // A missing factor cannot form a valid pair. Report it as an unknown word
+  // instead of dereferencing a null pointer.
+  if (form == NULL || tag == NULL) {
+    return m_unknownId;
+  }
+
+  boost::unordered_map<size_t, unsigned int>::const_iterator it =
+    m_lmIdLookup.find(PairNumbers(form->GetId(), tag->GetId()));
+  return (it == m_lmIdLookup.end()) ? m_unknownId : it->second;
+}
+
+/** Scores a single word against an SRILM context array.
+ *
+ *  \param wordId  SRILM id of the word being scored
+ *  \param context SRILM context ids, passed through to wordProb()
+ *  \return the floored, transformed SRILM probability, with `unknown` set
+ *          when wordId equals m_unknownId
+ */
+LMResult LanguageModelLocal::GetValue(VocabIndex wordId, VocabIndex *context) const
+{
+  LMResult result;
+  result.unknown = (wordId == m_unknownId);
+  result.score = FloorScore(TransformLMScore(m_srilmModel->wordProb(wordId, context)));
+  return result;
+}
+
+/** Scores the last word of contextFactor given the preceding words.
+ *
+ *  \param contextFactor n-gram to score; the last element is the word being
+ *                       scored, the earlier elements are its history
+ *  \param finalState    if non-null, receives the SRILM context id of the
+ *                       n-gram (NULL for an empty contextFactor)
+ *  \return score 0.0 / unknown false for an empty contextFactor, otherwise the
+ *          result of GetValue(VocabIndex, VocabIndex*) for the last word
+ *
+ *  NOTE(review): the lines marked XXX use a hard-coded factor type 0 and pass a
+ *  null tag to GetLmID(); per the TODO below this is a placeholder until the
+ *  local-LM n-grams are implemented.
+ */
+LMResult LanguageModelLocal::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
+{
+ LMResult ret;
+ FactorType factorType = 0; // XXX
+ size_t count = contextFactor.size();
+ if (count <= 0) {
+ if(finalState)
+ *finalState = NULL;
+ ret.score = 0.0;
+ ret.unknown = false;
+ return ret;
+ }
+
+ // set up context
+ //
+ // TODO
+ // for each head word (i.e. word W in contextFactor, ask about this n-gram:
+ // contextFactor[0].tag + W.form, ..., "HEAD" + W.form, ..., contextFactor[last].tag + W.form
+ //
+ // Layout of ngram: slot 0 is reserved for the scored word (filled in only when
+ // finalState is requested), slots 1..count-1 hold the history in reverse order
+ // (most recent word first), and slot count holds the Vocab_None terminator.
+ // The array length depends on the runtime value of count.
+ VocabIndex ngram[count + 1];
+ for (size_t i = 0 ; i < count - 1 ; i++) {
+ ngram[i+1] = GetLmID((*contextFactor[count-2-i])[factorType], 0); // XXX
+ }
+ ngram[count] = Vocab_None;
+
+ CHECK((*contextFactor[count-1])[factorType] != NULL);
+ // call sri lm fn
+ VocabIndex lmId = GetLmID((*contextFactor[count-1])[factorType], 0); // XXX
+ // ngram+1 skips the reserved slot, so only the history is passed as context
+ ret = GetValue(lmId, ngram+1);
+
+ if (finalState) {
+ // prepend the scored word so the state covers the whole n-gram
+ ngram[0] = lmId;
+ unsigned int dummy;
+ *finalState = m_srilmModel->contextID(ngram, dummy);
+ }
+ return ret;
+}
+
+}
diff --git a/moses/LM/Local.h b/moses/LM/Local.h
new file mode 100644
index 000000000..9190fd1c6
--- /dev/null
+++ b/moses/LM/Local.h
@@ -0,0 +1,72 @@
+// $Id$
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#ifndef moses_LanguageModelLocal_h
+#define moses_LanguageModelLocal_h
+
+#include <string>
+#include <vector>
+#include <boost/unordered_map.hpp>
+#include "moses/Factor.h"
+#include "moses/TypeDef.h"
+#include "SingleFactor.h"
+#include "MultiFactor.h"
+
+class Factor;
+class Phrase;
+class Vocab;
+class Ngram;
+
+namespace Moses
+{
+
+/** Local language models (Monz 2011)
+ *
+ * Multi-factor language model backed by an SRILM n-gram model. Load() expects
+ * exactly two factors (form, tag), and the SRILM vocabulary entries are
+ * strings of the form "form|tag".
+ */
+class LanguageModelLocal : public LanguageModelMultiFactor, public LanguageModelPointerState
+{
+protected:
+ // PairNumbers(form factor id, tag factor id) -> SRILM vocabulary index
+ boost::unordered_map<size_t, unsigned int> m_lmIdLookup;
+ // SRILM vocabulary and model; allocated in Load(), freed in the destructor
+ ::Vocab *m_srilmVocab;
+ Ngram *m_srilmModel;
+ // SRILM index used for words / factor pairs that are not found
+ unsigned int m_unknownId;
+
+ // Scores one SRILM word id given an SRILM context array.
+ LMResult GetValue(unsigned int wordId, unsigned int *context) const;
+ // Fills m_lmIdLookup from the SRILM vocabulary and the sentence markers.
+ void CreateFactors();
+ // SRILM index of a raw vocabulary string.
+ unsigned int GetLmID( const std::string &str ) const;
+ // SRILM index recorded for a (form, tag) factor pair.
+ unsigned int GetLmID( const Factor *form, const Factor *tag ) const;
+
+ // Cantor's pairing function
+ // Folds two ids into the single key used by m_lmIdLookup.
+ // NOTE(review): computed in size_t arithmetic, so very large ids could wrap
+ // and collide — confirm the factor id range makes this impossible.
+ size_t PairNumbers(size_t a, size_t b) const
+ {
+ return (a + b) * (a + b + 1) / 2 + b;
+ }
+
+public:
+ LanguageModelLocal();
+ ~LanguageModelLocal();
+ // Reads the SRILM model at filePath; aborts unless exactly two factors are given.
+ bool Load(const std::string &filePath, const std::vector<FactorType> &factors, size_t nGramOrder);
+
+ // Scores the last word of contextFactor given the preceding words; if
+ // finalState is non-null it receives the resulting SRILM context id.
+ virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = 0) const;
+};
+
+
+}
+#endif
diff --git a/moses/LM/MultiFactor.h b/moses/LM/MultiFactor.h
index 1d38fbee6..d2211b1c5 100644
--- a/moses/LM/MultiFactor.h
+++ b/moses/LM/MultiFactor.h
@@ -36,7 +36,7 @@ class Phrase;
/* Abstract class for multi factor LM. Only inherited by the JointLM at the moment.
* Could use this when factored LM are implemented
*/
-class LanguageModelMultiFactor : public LanguageModelImplementation
+class LanguageModelMultiFactor : virtual public LanguageModelImplementation
{
protected:
FactorMask m_factorTypes;
diff --git a/moses/LM/ORLM.h b/moses/LM/ORLM.h
index 442f22bac..87bf44f86 100644
--- a/moses/LM/ORLM.h
+++ b/moses/LM/ORLM.h
@@ -17,7 +17,7 @@ class Phrase;
/** @todo ask ollie
*/
-class LanguageModelORLM : public LanguageModelPointerState {
+class LanguageModelORLM : public LanguageModelPointerState, public LanguageModelSingleFactor {
public:
typedef count_t T; // type for ORLM filter
LanguageModelORLM()
diff --git a/moses/LM/Rand.cpp b/moses/LM/Rand.cpp
index 4f0718b68..13205d6a9 100644
--- a/moses/LM/Rand.cpp
+++ b/moses/LM/Rand.cpp
@@ -41,7 +41,7 @@ namespace
{
using namespace std;
-class LanguageModelRandLM : public LanguageModelPointerState
+class LanguageModelRandLM : public LanguageModelPointerState, public LanguageModelSingleFactor
{
public:
LanguageModelRandLM()
diff --git a/moses/LM/Remote.h b/moses/LM/Remote.h
index 7fa4bd0af..f60f17257 100644
--- a/moses/LM/Remote.h
+++ b/moses/LM/Remote.h
@@ -13,7 +13,7 @@ namespace Moses
/** @todo ask miles
*/
-class LanguageModelRemote : public LanguageModelPointerState
+class LanguageModelRemote : public LanguageModelPointerState, public LanguageModelSingleFactor
{
private:
struct Cache {
diff --git a/moses/LM/SRI.h b/moses/LM/SRI.h
index f88f5947e..2e7bb53f6 100644
--- a/moses/LM/SRI.h
+++ b/moses/LM/SRI.h
@@ -38,7 +38,7 @@ namespace Moses
/** Implementation of single factor LM using SRI's code (SRILM).
*/
-class LanguageModelSRI : public LanguageModelPointerState
+class LanguageModelSRI : public LanguageModelPointerState, public LanguageModelSingleFactor
{
protected:
std::vector<unsigned int> m_lmIdLookup;
diff --git a/moses/LM/SingleFactor.h b/moses/LM/SingleFactor.h
index 05828dc9b..6908f4e2d 100644
--- a/moses/LM/SingleFactor.h
+++ b/moses/LM/SingleFactor.h
@@ -32,7 +32,7 @@ class FactorCollection;
class Factor;
//! Abstract class for single factor LM
-class LanguageModelSingleFactor : public LanguageModelImplementation
+class LanguageModelSingleFactor : virtual public LanguageModelImplementation
{
protected:
const Factor *m_sentenceStart, *m_sentenceEnd;
@@ -70,8 +70,8 @@ public:
}
};
-// Single factor LM that uses a null pointer state.
-class LanguageModelPointerState : public LanguageModelSingleFactor
+// LM that uses a null pointer state.
+class LanguageModelPointerState : virtual public LanguageModelImplementation
{
private:
FFState *m_nullContextState;
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index faf98c448..706ae12cb 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -123,6 +123,7 @@ enum LMImplementation {
,LazyKen = 9
,ORLM = 10
,LDHTLM = 11
+ ,Local = 12
};
enum PhraseTableImplementation {