// $Id$

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/

// NOTE(review): this copy of the file appears to have lost every "<...>" span:
// the header names of four #include directives, all template arguments, part
// of some string literals, and a stretch of GetValueForgotState. The code
// tokens below are left exactly as found; each affected place is marked with a
// NOTE(review). Restore the missing text from version control before building.

#include "LM/ParallelBackoff.h"

// NOTE(review): header names missing from the next four directives.
#include
#include
#include
#include

#include "LM/MultiFactor.h"
#include "Word.h"
#include "Factor.h"
#include "FactorTypeSet.h"
#include "FactorCollection.h"
#include "Phrase.h"
#include "TypeDef.h"
#include "Util.h"

#include "FNgramSpecs.h"
#include "FNgramStats.h"
#include "FactoredVocab.h"
#include "FNgram.h"
#include "wmatrix.h"
#include "Vocab.h"
#include "File.h"

using namespace std;

namespace Moses
{

namespace
{

/**
 * Multi-factor language model that scores through an FNgram model and a
 * FactoredVocab (held in m_srilmModel / m_srilmVocab).
 *
 * Load() builds the vocabulary and model from a spec file and then calls
 * CreateFactors() to fill lmIdMap, which translates (factor id, factor index)
 * pairs into vocabulary indices. GetValueForgotState() does the scoring.
 *
 * NOTE(review): there is no constructor, so every pointer member is
 * uninitialised until Load() runs, and the destructor releases nothing.
 */
class LanguageModelParallelBackoff : public LanguageModelMultiFactor
{
private:
  // Copy of the factorTypes vector passed to Load(); indexed by factor position.
  // NOTE(review): template argument missing in this copy.
  std::vector m_factorTypesOrdered;

  FactoredVocab		*m_srilmVocab;  // allocated in Load()
  FNgram 				*m_srilmModel;  // the FNgram built in Load()
  VocabIndex	m_unknownId;        // m_srilmVocab->unkIndex(), set in Load()
  // Vocabulary indices of three "W-..." entries, looked up at the end of
  // CreateFactors().
  VocabIndex  m_wtid;
  VocabIndex  m_wtbid;
  VocabIndex  m_wteid;
  FNgramSpecs* fnSpecs;             // built from the model file in Load()
  //std::vector m_lmIdLookup;
  // Maps (factorId * 10 + factor index) to a vocabulary index; allocated and
  // filled in CreateFactors(). NOTE(review): template arguments missing.
  std::map* lmIdMap;
  std::fstream* debugStream;        // not referenced anywhere in the visible code

  // Allocated in Load(). GetValueForgotState() declares a function-local
  // static with the same name and uses that instead of this member.
  WidMatrix *widMatrix;

public:
  ~LanguageModelParallelBackoff();

  // NOTE(review): template argument of std::vector missing in this copy.
  bool Load(const std::string &filePath, const std::vector &factorTypes, size_t nGramOrder);

  VocabIndex GetLmID( const std::string &str ) const;

  // NOTE(review): declared with FactorType but defined below with size_t;
  // this only matches if FactorType is an alias of size_t — confirm.
  VocabIndex GetLmID( const Factor *factor, FactorType ft ) const;

  void CreateFactors();

  // NOTE(review): template argument of std::vector missing in this copy.
  LMResult GetValueForgotState(const std::vector &contextFactor, FFState &outState) const;
  const FFState *GetNullContextState() const;
  const FFState *GetBeginSentenceState() const;
  FFState *NewState(const FFState *from) const;
};

/**
 * Destructor; the body is empty.
 *
 * NOTE(review): widMatrix, m_srilmVocab, fnSpecs, the FNgram stored in
 * m_srilmModel (all allocated with new in Load()) and lmIdMap (allocated in
 * CreateFactors()) are not deleted anywhere in the visible code.
 */
LanguageModelParallelBackoff::~LanguageModelParallelBackoff()
{
  ///
}

/**
 * Reads the factored model described by filePath and prepares this object for
 * scoring.
 *
 * @param filePath    path opened read-only and handed to FNgramSpecs
 * @param factorTypes factor types; stored in m_factorTypes (as a FactorMask)
 *                    and in m_factorTypesOrdered
 * @param nGramOrder  stored in m_nGramOrder
 * @return always true when it returns; if reading the counts or the LM fails
 *         the process is terminated with exit(1) instead of returning false.
 *
 * Progress messages are written to cerr throughout.
 */
bool LanguageModelParallelBackoff::Load(const std::string &filePath, const std::vector &factorTypes, size_t nGramOrder)
{

  cerr << "Loading Language Model Parallel Backoff!!!\n";
  widMatrix = new ::WidMatrix();
  m_factorTypes	= FactorMask(factorTypes);
  m_srilmVocab = new ::FactoredVocab();
  //assert(m_srilmVocab != 0);

  fnSpecs = 0;
  File f(filePath.c_str(),"r");
  fnSpecs = new ::FNgramSpecs(f,*m_srilmVocab, 0/*debug*/);

  cerr << "Loaded fnSpecs!\n";

  m_srilmVocab->unkIsWord() = true;
  m_srilmVocab->nullIsWord() = true;
  m_srilmVocab->toLower() = false;

  // NOTE(review): factoredStats is allocated here and never deleted in the
  // visible code.
  FNgramStats *factoredStats = new FNgramStats(*m_srilmVocab, *fnSpecs);

  factoredStats->debugme(2);

  cerr << "Factored stats\n";

  FNgram* fngramLM = new FNgram(*m_srilmVocab,*fnSpecs);
  assert(fngramLM != 0);

  cerr << "FNgram object created\n";

  fngramLM->skipOOVs = false;

  if (!factoredStats->read()) {
    cerr << "error reading in counts in factor file\n";
    exit(1);
  }

  cerr << "Factored stats read!\n";

  factoredStats->estimateDiscounts();
  factoredStats->computeCardinalityFunctions();
  factoredStats->sumCounts();

  cerr << "Another three operations made!\n";

  if (!fngramLM->read()) {
    cerr << "format error in lm file\n";
    exit(1);
  }

  cerr << "fngramLM reads!\n";

  m_filePath = filePath;
  m_nGramOrder= nGramOrder;

  m_factorTypesOrdered= factorTypes;

  m_unknownId = m_srilmVocab->unkIndex();

  cerr << "m_unknowdId = " << m_unknownId << endl;

  m_srilmModel = fngramLM;

  cerr << "Create factors...\n";

  // Must run after m_factorTypesOrdered, m_srilmVocab and m_unknownId are set:
  // CreateFactors() reads all three.
  CreateFactors();

  cerr << "Factors created! \n";
  //FactorCollection &factorCollection = FactorCollection::Instance();

  /*for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index)
  {
  	FactorType factorType = m_factorTypesOrdered[index];
  	m_sentenceStartArray[factorType] 	= factorCollection.AddFactor(Output, factorType, BOS_);


  	m_sentenceEndArray[factorType] 		= factorCollection.AddFactor(Output, factorType, EOS_);

  	//factorIdStart = m_sentenceStartArray[factorType]->GetId();
  	//factorIdEnd = m_sentenceEndArray[factorType]->GetId();

  	for (size_t i = 0; i < 10; i++)
  	{
  		lmIdMap[factorIdStart * 10 + i] = GetLmID(BOS_);
  		lmIdMap[factorIdEnd * 10 + i] = GetLmID(EOS_);
  	}

  	//(*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_);
  	//(*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_);

  }*/
  return true;
}

/**
 * Looks str up in m_srilmVocab.
 *
 * @return the index returned by getIndex(); m_unknownId is passed as the
 *         second argument (presumably the value returned for strings that are
 *         not in the vocabulary — confirm against the vocabulary API).
 */
VocabIndex LanguageModelParallelBackoff::GetLmID( const std::string &str ) const
{
  return m_srilmVocab->getIndex( str.c_str(), m_unknownId );
}

/**
 * Translates a factor into a vocabulary index through lmIdMap.
 *
 * @param factor factor whose GetId() forms the high part of the map key;
 *               dereferenced unconditionally, so it must not be null
 * @param ft     index added to factorId * 10 to form the key (keys of
 *               different factors collide if ft >= 10)
 * @return the mapped index, or m_unknownId when the key is absent.
 */
VocabIndex LanguageModelParallelBackoff::GetLmID( const Factor *factor, size_t ft ) const
{
  size_t factorId = factor->GetId();

  // NOTE(review): the same key is looked up twice (once to test, once to read).
  if ( lmIdMap->find( factorId * 10 + ft ) != lmIdMap->end() ) {
    return lmIdMap->find( factorId * 10 + ft )->second;
  } else {
    return m_unknownId;
  }
}

/**
 * Fills lmIdMap and the sentence start/end arrays.
 *
 * 1. Walks every string in m_srilmVocab. Entries whose first character is in
 *    'a'..'k' (or is 'W') are registered with the FactorCollection using the
 *    text from the third character onwards, and their vocabulary index is
 *    stored under key (factorId * 10 + factor index).
 * 2. For each factor type, registers the BOS_ / EOS_ factors, stores them in
 *    m_sentenceStartArray / m_sentenceEndArray and maps them as well.
 * 3. Looks up the three "W-..." vocabulary entries into m_wtid, m_wtbid and
 *    m_wteid and prints them.
 */
void LanguageModelParallelBackoff::CreateFactors()
{
  // add factors which have srilm id
  FactorCollection &factorCollection = FactorCollection::Instance();

  // NOTE(review): template arguments of std::map missing in this copy.
  lmIdMap = new std::map();


  VocabString str;
  VocabIter iter(*m_srilmVocab);

  iter.init();

  size_t pomFactorTypeNum = 0;


  while ( (str = iter.next()) != NULL) {
    // Skip entries whose first character is neither in 'a'..'k' nor 'W'.
    if ((str[0] < 'a' || str[0] > 'k') && str[0] != 'W') {
      continue;
    }
    VocabIndex lmId = GetLmID(str);
    // First character selects the factor position: 'a' -> 0, 'b' -> 1, ...
    // NOTE(review): entries starting with 'W' pass the filter above, but
    // 'W' - 'a' is negative in ASCII and wraps when stored in a size_t, so the
    // index into m_factorTypesOrdered below is out of range for them. Letters
    // beyond m_factorTypesOrdered.size() are not range-checked either.
    pomFactorTypeNum = str[0] - 'a';

    // &(str[2]) skips the first two characters (assumes every accepted entry
    // has at least two — TODO confirm).
    size_t factorId = factorCollection.AddFactor(Output, m_factorTypesOrdered[pomFactorTypeNum], &(str[2]) )->GetId();
    (*lmIdMap)[factorId * 10 + pomFactorTypeNum] = lmId;
  }

  size_t factorIdStart;
  size_t factorIdEnd;

  // sentence markers
  for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index) {
    FactorType factorType = m_factorTypesOrdered[index];
    m_sentenceStartArray[index] 	= factorCollection.AddFactor(Output, factorType, BOS_);

    m_sentenceEndArray[index] 		= factorCollection.AddFactor(Output, factorType, EOS_);

    factorIdStart = m_sentenceStartArray[index]->GetId();
    factorIdEnd = m_sentenceEndArray[index]->GetId();

    /*for (size_t i = 0; i < 10; i++)
    {
    	lmIdMap[factorIdStart * 10 + i] = GetLmID(BOS_);
    	lmIdMap[factorIdEnd * 10 + i] = GetLmID(EOS_);
    }*/

    (*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_);
    (*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_);

    cerr << "BOS_:" << GetLmID(BOS_) << ", EOS_:" << GetLmID(EOS_) << endl;

  }

  // NOTE(review): the string literals below look damaged in this copy
  // (m_wtbid and m_wteid are looked up with the same text, and the labels do
  // not line up). Restore the original literals before relying on these ids.
  m_wtid = GetLmID("W-");
  m_wtbid = GetLmID("W-~~");
  m_wteid = GetLmID("W-~~");

  cerr << "W- index: " << m_wtid << endl;
  cerr << "W- ~~index: " << m_wtbid << endl;
  cerr << "W-~~ index: " << m_wteid << endl;


}

/**
 * Scores the last word of contextFactor given the preceding ones.
 *
 * The visible tail of the function calls m_srilmModel->wordProb() on a word-id
 * matrix with position contextFactor.size() - 1 and length
 * contextFactor.size(), passes the result through
 * FloorScore(TransformLMScore(...)), and sets ret.unknown when column 0 of the
 * last matrix row equals m_unknownId (false for an empty context).
 *
 * @param contextFactor words to score (template argument missing in this copy)
 * @param outState      unused; the parameter name is commented out
 * @return the LMResult described above.
 */
LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector &contextFactor, FFState & /*outState */) const
{

  // Function-local static; shadows the widMatrix member and is shared by every
  // call. NOTE(review): check this is safe if the decoder scores from more
  // than one thread.
  static WidMatrix widMatrix;

  // NOTE(review): text is missing from this copy between "i" and "wordProb":
  // the loop condition and body, the code that fills widMatrix, and the
  // declaration of `ret` are all gone. Restore from version control.
  for (int i=0; iwordProb( widMatrix, contextFactor.size() - 1, contextFactor.size() );
  ret.score = FloorScore(TransformLMScore(ret.score));
  ret.unknown = !contextFactor.empty() && (widMatrix[contextFactor.size() - 1][0] == m_unknownId);
  return ret;

  /*if (contextFactor.size() == 0)
  {
  	return 0;
  }

  for (size_t currPos = 0 ; currPos < m_nGramOrder ; ++currPos )
  {
  	const Word &word = *contextFactor[currPos];

  	for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index)
  	{
  		FactorType factorType = m_factorTypesOrdered[index];
  		const Factor *factor = word[factorType];

  		(*widMatrix)[currPos][index] = GetLmID(factor, index);

  	}

  }

  float p = m_srilmModel->wordProb( (*widMatrix), m_nGramOrder - 1, m_nGramOrder );
  return FloorScore(TransformLMScore(p)); */
}

// The old version did not initialize finalState like it should. Technically that makes the behavior undefined, so it's not clear what else to do here.
FFState *LanguageModelParallelBackoff::NewState(const FFState * /*from*/) const
{
  // The argument is ignored and no state object is created: callers always
  // receive a null pointer.
  FFState *const noState = NULL;
  return noState;
}

/** State for an empty context; always a null pointer for this model. */
const FFState *LanguageModelParallelBackoff::GetNullContextState() const
{
  const FFState *const noState = NULL;
  return noState;
}

/** State at the beginning of a sentence; always a null pointer for this model. */
const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
{
  const FFState *const noState = NULL;
  return noState;
}

} // anonymous namespace

/**
 * Factory for the parallel-backoff model.
 *
 * @return a newly allocated LanguageModelParallelBackoff, handed back through
 *         its LanguageModelMultiFactor base pointer.
 */
LanguageModelMultiFactor *NewParallelBackoff()
{
  LanguageModelMultiFactor *const model = new LanguageModelParallelBackoff();
  return model;
}

} // namespace Moses