Welcome to mirror list, hosted at ThFree Co, Russian Federation.

Rand.cpp « LM « src « moses - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 93411b8d3c5cc0908a420487b4db0e1183865c5b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
***********************************************************************/

#include "util/check.hh"
#include <limits>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include "Factor.h"
#include "Util.h"
#include "LM/SingleFactor.h"
#include "RandLM.h"


#include "LM/Rand.h"
#include "FactorCollection.h"
#include "Phrase.h"
#include "InputFileStream.h"
#include "StaticData.h"


namespace Moses
{
namespace 
{
using namespace std;

class LanguageModelRandLM : public LanguageModelPointerState
{
public:
  LanguageModelRandLM()
    : m_lm(0) {}
  bool Load(const std::string &filePath, FactorType factorType, size_t nGramOrder);
  virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL) const;
  ~LanguageModelRandLM() {
    delete m_lm;
  }
  void CleanUpAfterSentenceProcessing() {
    m_lm->clearCaches(); // clear caches
  }
  void InitializeBeforeSentenceProcessing() {
    m_lm->initThreadSpecificData(); // Creates thread specific data iff
                                    // compiled with multithreading.
  }
protected:
  std::vector<randlm::WordID> m_randlm_ids_vec;
  randlm::RandLM* m_lm;
  randlm::WordID m_oov_id;
  void CreateFactors(FactorCollection &factorCollection);
  randlm::WordID GetLmID( const std::string &str ) const;
  randlm::WordID GetLmID( const Factor *factor ) const {
    size_t factorId = factor->GetId();
    return ( factorId >= m_randlm_ids_vec.size()) ? m_oov_id : m_randlm_ids_vec[factorId];
  };

};


bool LanguageModelRandLM::Load(const std::string &filePath, FactorType factorType,
                               size_t nGramOrder)
{
  cerr << "Loading LanguageModelRandLM..." << endl;
  FactorCollection &factorCollection = FactorCollection::Instance();
  m_filePath = filePath;
  m_factorType = factorType;
  m_nGramOrder = nGramOrder;
  int cache_MB = 50; // increase cache size
  m_lm = randlm::RandLM::initRandLM(filePath, nGramOrder, cache_MB);
  CHECK(m_lm != NULL);
  // get special word ids
  m_oov_id = m_lm->getWordID(m_lm->getOOV());
  CreateFactors(factorCollection);
  m_lm->initThreadSpecificData();
  return true;
}

/**
 * Registers every word of the RandLM vocabulary, plus the sentence-start and
 * sentence-end markers, as output factors, and builds m_randlm_ids_vec so
 * that a factor id can be turned into a RandLM word id by plain indexing.
 *
 * Slots of the lookup vector that do not correspond to a vocabulary word
 * hold m_oov_id.
 */
void LanguageModelRandLM::CreateFactors(FactorCollection &factorCollection)
{
  typedef std::map<randlm::Word, randlm::WordID> RandVocab;
  typedef std::map<size_t, randlm::WordID> FactorToRandId;

  // Pass 1: add each vocabulary word as a factor and remember which RandLM
  // id belongs to the factor id that was handed out for it.
  FactorToRandId idByFactor;
  size_t highestFactorId = 0; // determines the size of the lookup vector
  for (RandVocab::const_iterator entry = m_lm->vocabStart();
       entry != m_lm->vocabEnd(); ++entry) {
    const size_t vocabFactorId =
      factorCollection.AddFactor(Output, m_factorType, entry->first)->GetId();
    idByFactor[vocabFactorId] = entry->second;
    if (vocabFactorId > highestFactorId) {
      highestFactorId = vocabFactorId;
    }
  }

  // Sentence-start marker.
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, m_lm->getBOS());
  const size_t bosFactorId = m_sentenceStart->GetId();
  if (bosFactorId > highestFactorId) {
    highestFactorId = bosFactorId;
  }
  m_sentenceStartArray[m_factorType] = m_sentenceStart;

  // Sentence-end marker.
  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, m_lm->getEOS());
  const size_t eosFactorId = m_sentenceEnd->GetId();
  if (eosFactorId > highestFactorId) {
    highestFactorId = eosFactorId;
  }
  m_sentenceEndArray[m_factorType] = m_sentenceEnd;

  // Pass 2: size the lookup vector with every slot preset to the OOV id,
  // then overwrite the slots that belong to known vocabulary words.
  m_randlm_ids_vec.assign(highestFactorId + 1, m_oov_id);
  FactorToRandId::const_iterator known = idByFactor.begin();
  while (known != idByFactor.end()) {
    m_randlm_ids_vec[known->first] = known->second;
    ++known;
  }
}

/// Returns the RandLM word id that the underlying model reports for str.
randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
{
  const randlm::WordID lmId = m_lm->getWordID(str);
  return lmId;
}

/**
 * Scores the n-gram given by contextFactor with the underlying RandLM.
 *
 * @param contextFactor words of the n-gram; must hold at most MAX_NGRAM_SIZE
 *                      entries (enforced with CHECK, because the ids are
 *                      staged in a fixed-size stack buffer).
 * @param finalState    passed through unchanged to RandLM's getProb(); may be
 *                      NULL.
 * @return score: the RandLM probability after TransformLMScore() and
 *         FloorScore(); unknown: true when the context is non-empty and its
 *         last word maps to the OOV id.
 */
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
                                    State* finalState) const
{
  // Guard the fixed-size buffer below against over-long contexts; without
  // this an oversized context would write past the end of ngram[].
  CHECK(contextFactor.size() <= MAX_NGRAM_SIZE);

  const FactorType factorType = GetFactorType();

  // Translate each context word into its RandLM id.
  randlm::WordID ngram[MAX_NGRAM_SIZE];
  const int count = static_cast<int>(contextFactor.size());
  for (int i = 0; i < count; ++i) {
    ngram[i] = GetLmID((*contextFactor[i])[factorType]);
  }

  int found = 0; // out-parameter required by getProb(); not used afterwards
  LMResult ret;
  ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, &found, finalState)));
  ret.unknown = count && (ngram[count - 1] == m_oov_id);
  return ret;
}

}

/// Factory: heap-allocates an unloaded RandLM-backed language model and
/// returns it through the base-class pointer; the caller takes ownership.
LanguageModelPointerState *NewRandLM() {
  LanguageModelPointerState *model = new LanguageModelRandLM();
  return model;
}

}