#include <set>
#include <vector>
#include <boost/functional/hash.hpp>
#include "BilingualLM.h"
#include "moses/ScoreComponentCollection.h"

using namespace std;

namespace Moses
{

////////////////////////////////////////////////////////////////
BilingualLM::BilingualLM(const std::string &line)
  : StatefulFeatureFunction(1, line),
    word_factortype(0)
{
  FactorCollection& factorFactory = FactorCollection::Instance(); //Factor Factory to use for BOS_ and EOS_
  BOS_factor = factorFactory.AddFactor(BOS_);
  BOS_word.SetFactor(0, BOS_factor);
  EOS_factor = factorFactory.AddFactor(EOS_);
  EOS_word.SetFactor(0, EOS_factor);
}

void BilingualLM::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  ReadParameters();
  loadModel();
}

//Populates `words` with `amount` word IDs drawn from the target phrases of
//previous hypotheses, where words[0] is the last word of the previous
//hypothesis, words[1] the second-to-last, and so on. Pads with BOS if the
//hypothesis chain runs out of words.
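//Worked example (hypothetical values): with amount = 3 and a previous
//hypothesis ending in "the black cat", the result is words[0] = id("cat"),
//words[1] = id("black"), words[2] = id("the"). If only "cat" were available
//anywhere in the hypothesis chain, words[1] and words[2] would be id(BOS).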
void BilingualLM::requestPrevTargetNgrams(
  const Hypothesis &cur_hypo, int amount, std::vector<int> &words) const
{
  const Hypothesis * prev_hyp = cur_hypo.GetPrevHypo();
  int found = 0;

  while (prev_hyp && found != amount) {
    const TargetPhrase& currTargetPhrase = prev_hyp->GetCurrTargetPhrase();
    for (int i = currTargetPhrase.GetSize() - 1; i >= 0; i--) {
      if (found != amount) {
        const Word& word = currTargetPhrase.GetWord(i);
        words[found] = getNeuralLMId(word, false);
        found++;
      } else {
        return; //We have gotten everything needed
      }
    }

    prev_hyp = prev_hyp->GetPrevHypo();
  }

  int neuralLM_wordID = getNeuralLMId(BOS_word, false);
  for (int i = found; i < amount; i++) {
    words[i] = neuralLM_wordID;
  }
}

//Populates the words vector with the target_ngrams preceding words plus the
//current word we are looking at (target_ngrams + 1 entries in total).
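//Worked example (hypothetical values): with target_ngrams = 4 and
//current_word_index = 1, additional_needed = -3, so three word IDs come from
//previous hypotheses and two (indices 0 and 1) from the current phrase,
//giving target_ngrams + 1 = 5 entries in total.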
void BilingualLM::getTargetWords(
  const Hypothesis &cur_hypo,
  const TargetPhrase &targetPhrase,
  int current_word_index,
  std::vector<int> &words) const
{
  //Check if we need to look at previous target phrases
  int additional_needed = current_word_index - target_ngrams;
  if (additional_needed < 0) {
    additional_needed = -additional_needed;
    std::vector<int> prev_words(additional_needed);
    requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
    for (int i = additional_needed - 1; i >= 0; i--) {
      words.push_back(prev_words[i]);
    }
  }

  if (words.size() > 0) {
    //We have added some words from previous phrases
    //Just add until we reach current_word_index
    for (int i = 0; i <= current_word_index; i++) {
      const Word& word = targetPhrase.GetWord(i);
      words.push_back(getNeuralLMId(word, false));
    }
  } else {
    //We haven't added any words, proceed as before
    for (int i = current_word_index - target_ngrams; i <= current_word_index; i++) {
      const Word& word = targetPhrase.GetWord(i);
      words.push_back(getNeuralLMId(word, false));
    }
  }
}

//Selects the middle alignment link from a (non-empty) set of source
//positions; for an even-sized set, the element left of the middle is chosen.
size_t BilingualLM::selectMiddleAlignment(
  const set<size_t>& alignment_links) const
{

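  //Worked example (hypothetical values): for links {2, 5, 9} we advance
  //(3 - 1) / 2 = 1 step from the beginning and return 5; for the even-sized
  //set {2, 5} we advance (2 - 1) / 2 = 0 steps and return 2.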
  set<size_t>::iterator it = alignment_links.begin();
  for (size_t i = 0; i < (alignment_links.size() - 1) / 2; ++i) {
    ++it;
  }

  return *it;
}

//Returns source words in the way NeuralLM expects them.
void BilingualLM::getSourceWords(
  const TargetPhrase &targetPhrase,
  int targetWordIdx,
  const Sentence &source_sent,
  const Range &sourceWordRange,
  std::vector<int> &words) const
{
  //Get source context

  //Get alignment for the word we require
  const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();

  // Get the word alignment for targetPhrase.GetWord(targetWordIdx), as described in the paper.
  // Find the closest target word that has alignment links.
  std::set<size_t> last_word_al;
  for (int j = 0; j < targetPhrase.GetSize(); j++) {
    // Find the nearest aligned word with preference for right.
    if ((targetWordIdx + j) < targetPhrase.GetSize()) {
      last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx + j);
      if (!last_word_al.empty()) {
        break;
      }
    }

    // We couldn't find an aligned word on the right; try to the left.
    if ((targetWordIdx - j) >= 0) {
      last_word_al = alignments.GetAlignmentsForTarget(targetWordIdx - j);
      if (!last_word_al.empty()) {
        break;
      }
    }
  }
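  //Worked example (hypothetical values): in a target phrase "a b c" where
  //only "c" carries alignment links and targetWordIdx points at "b", the
  //j = 0 pass finds nothing and the j = 1 pass first checks "c" (to the
  //right) and stops there.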

  //At this point we should have an alignment. If the routine above found none,
  //then no word in the target phrase is aligned to any word in the source phrase.

  // Now we get the source words. First select the middle alignment link.
  //last_word_al should never be empty, but this has happened in practice
  //because of corrupt phrase tables. It is best to have this check here, as
  //it makes debugging the problem a lot easier.
  UTIL_THROW_IF2(last_word_al.size() == 0,
                 "A target phrase with no alignments detected! " << targetPhrase << " Check if there is something wrong with your phrase table.");
  size_t source_center_index = selectMiddleAlignment(last_word_al);
  // We have found the alignment. Now determine how much to shift by to get the actual source word index.
  size_t phrase_start_pos = sourceWordRange.GetStartPos();
  // Account for how far the current word is from the start of the phrase.
  size_t source_word_mid_idx = phrase_start_pos + source_center_index;
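  //Worked example (hypothetical values): if the source phrase starts at
  //sentence position 7 and the selected link points at within-phrase
  //position 2, the absolute middle index is 7 + 2 = 9.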

  appendSourceWordsToVector(source_sent, words, source_word_mid_idx);
}

size_t BilingualLM::getState(const Hypothesis& cur_hypo) const
{
  const TargetPhrase &targetPhrase = cur_hypo.GetCurrTargetPhrase();
  size_t hashCode = 0;

  // Check if we need to look at previous target phrases
  int additional_needed = (int)targetPhrase.GetSize() - (int)target_ngrams; //Cast: GetSize() is unsigned, but the difference must be able to go negative
  if (additional_needed < 0) {
    additional_needed = -additional_needed;
    std::vector<int> prev_words(additional_needed);
    requestPrevTargetNgrams(cur_hypo, additional_needed, prev_words);
    for (int i = additional_needed - 1; i >= 0; i--) {
      boost::hash_combine(hashCode, prev_words[i]);
    }

    // Get the remaining words from the current target phrase.
    for (int i = 0; i < targetPhrase.GetSize(); i++) {
      const Word& word = targetPhrase.GetWord(i);
      int neuralLM_wordID = getNeuralLMId(word, false);
      boost::hash_combine(hashCode, neuralLM_wordID);
    }
  } else {
    // We just need the last target_ngrams words of the current target phrase.
    for (int i = targetPhrase.GetSize() - target_ngrams; i < targetPhrase.GetSize(); i++) {
      const Word& word = targetPhrase.GetWord(i);
      int neuralLM_wordID = getNeuralLMId(word, false);

      boost::hash_combine(hashCode, neuralLM_wordID);
    }
  }

  return hashCode;
}

FFState* BilingualLM::EvaluateWhenApplied(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  Manager& manager = cur_hypo.GetManager();
  const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());

  // Init vectors.
  std::vector<int> source_words;
  source_words.reserve(source_ngrams);
  std::vector<int> target_words;
  target_words.reserve(target_ngrams);

  float value = 0;
  const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();
  const Range& sourceWordRange = cur_hypo.GetCurrSourceWordsRange(); //Source words range to calculate offsets

  // For each word in the current target phrase get its LM score.
  for (int i = 0; i < currTargetPhrase.GetSize(); i++) {
    getSourceWords(
      currTargetPhrase, i, source_sent, sourceWordRange, source_words);
    getTargetWords(cur_hypo, currTargetPhrase, i, target_words);
    value += Score(source_words, target_words);

    // Clear the vectors.
    source_words.clear();
    target_words.clear();
  }

  size_t new_state = getState(cur_hypo);
  accumulator->PlusEquals(this, value);

  return new BilingualLMState(new_state);
}

void BilingualLM::getAllTargetIdsChart(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& wordIds) const
{
  const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); //Reference: avoid copying the phrase

  for (int i = 0; i < targetPhrase.GetSize(); i++) {
    if (targetPhrase.GetWord(i).IsNonTerminal()) { //Nonterminal: take its word IDs from the previous hypothesis' state
      const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
      const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
      const std::vector<int>& prevWordIDs = prev_state->GetWordIdsVector();
      for (std::vector<int>::const_iterator it = prevWordIDs.begin(); it != prevWordIDs.end(); it++) {
        wordIds.push_back(*it);
      }
    } else {
      wordIds.push_back(getNeuralLMId(targetPhrase.GetWord(i), false));
    }
  }
}

void BilingualLM::getAllAlignments(const ChartHypothesis& cur_hypo, size_t featureID, std::vector<int>& word_alignments) const
{
  const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); //Reference: avoid copying the phrase
  int source_word_mid_idx; //The word alignment

  //Get source sent
  const AlignmentInfo& alignments = targetPhrase.GetAlignTerm();

  // get absolute position in source sentence for each source word in rule
  std::vector<int> absolute_source_position (cur_hypo.GetCurrSourceRange().GetNumWordsCovered(), 0); //we actually only need number of source symbols in rule; can we get this number cheaply?

  absolute_source_position[0] = cur_hypo.GetCurrSourceRange().GetStartPos();
  // get last absolute position of each source nonterminal symbol
  for (int i = 0; i < targetPhrase.GetSize(); i++) {
    if (targetPhrase.GetWord(i).IsNonTerminal()) {
      const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
      absolute_source_position[targetPhrase.GetAlignNonTerm().GetNonTermIndexMap2()[i]] = prev_hypo->GetCurrSourceRange().GetEndPos();
    }
  }

  // set absolute position of all source terminal symbols based on absolute position of previous symbol
  for (int i = 0; i != absolute_source_position.size(); i++) {
    if (i && absolute_source_position[i] == 0) {
      absolute_source_position[i] = absolute_source_position[i-1] + 1;
    }
  }
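  //Worked example (hypothetical values): for a rule whose source side is
  //"X1 mag X2" covering source span [3,8], with X1 covering [3,5] and X2
  //covering [7,8]: the nonterminal pass records the end position of each
  //nonterminal's span, and the fill-in pass sets each remaining symbol to one
  //past its predecessor, so the first three entries become {5, 6, 8}. Only as
  //many entries as the rule has source symbols are ever read below.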

  for (int i = 0; i < targetPhrase.GetSize(); i++) {
    //Sometimes we have to traverse more than one target word because of
    //unaligned words. This is O(n^2) in the worst case, but usually closer to O(n).
    if (targetPhrase.GetWord(i).IsNonTerminal()) {
      //For a nonterminal we can take the alignments from the previous state
      const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i]);
      const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
      const std::vector<int>& prevWordAls = prev_state->GetWordAlignmentVector();
      for (std::vector<int>::const_iterator it = prevWordAls.begin(); it != prevWordAls.end(); it++) {
        word_alignments.push_back(*it);
      }
    } else {
      bool resolvedIndexis = false; //If we are aligning to an existing nonterm we don't need to calculate offsets
      std::set<size_t> word_al = alignments.GetAlignmentsForTarget(i);
      if (word_al.empty()) {
        for (int j = 1; j < targetPhrase.GetSize(); j++) {
          //Try to get alignment from the current word and if it is unaligned,
          //try from the first word to the right and then to the left
          if ((i+j) < targetPhrase.GetSize()) {
            //TODO: this will always succeed, even if first word in previous hypo is unaligned. should it?
            if (targetPhrase.GetWord(i + j).IsNonTerminal()) {
              const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i+j]);
              const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
              source_word_mid_idx = prev_state->GetWordAlignmentVector().front(); // The first word on the right of our word
              resolvedIndexis = true;
              break;
            }
            word_al = alignments.GetAlignmentsForTarget(i + j);
            if (!word_al.empty()) {
              break;
            }
          }

          if ((i - j) >= 0) {
            //TODO: this will always succeed, even if last word in previous hypo is unaligned. should it?
            if (targetPhrase.GetWord(i - j).IsNonTerminal()) {
              const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[i-j]);
              const BilingualLMState * prev_state = static_cast<const BilingualLMState *>(prev_hypo->GetFFState(featureID));
              source_word_mid_idx = prev_state->GetWordAlignmentVector().back(); // The first word on the left of our word
              resolvedIndexis = true;
              break;
            }

            word_al = alignments.GetAlignmentsForTarget(i - j);
            if (!word_al.empty()) {
              break;
            }
          }
        }
      }

      if (!resolvedIndexis) {
        //word_al should never be empty here, but this has happened in practice
        //because of corrupt phrase tables. It is best to have this check here,
        //as it makes debugging the problem a lot easier.
        UTIL_THROW_IF2(word_al.size() == 0,
                       "A target phrase with no alignments detected! " << targetPhrase << " Check if there is something wrong with your phrase table.");
        size_t source_center_index = selectMiddleAlignment(word_al);
        // We have found the alignment. Now determine how much to shift by to get the actual source word index.
        source_word_mid_idx = absolute_source_position[source_center_index];
      }
      word_alignments.push_back(source_word_mid_idx);
    }
  }
}

size_t BilingualLM::getStateChart(std::vector<int>& neuralLMids) const
{
  size_t hashCode = 0;
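  //Worked example (hypothetical values): with target_ngrams = 4 and only two
  //word IDs in neuralLMids, i runs from -2 to 1, hashing two BOS IDs followed
  //by the two word IDs; with ten word IDs, i runs from 6 to 9, hashing the
  //last four word IDs.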
  //Cast to int so the index can go negative when the hypothesis is shorter
  //than target_ngrams (unsigned arithmetic would wrap and skip the loop).
  for (int i = (int)neuralLMids.size() - (int)target_ngrams; i < (int)neuralLMids.size(); i++) {
    int neuralLM_wordID;
    if (i < 0) {
      neuralLM_wordID = getNeuralLMId(BOS_word, false);
    } else {
      neuralLM_wordID = neuralLMids[i];
    }
    boost::hash_combine(hashCode, neuralLM_wordID);
  }
  return hashCode;
}

void BilingualLM::getTargetWordsChart(
  std::vector<int>& neuralLMids,
  int current_word_index,
  std::vector<int>& words,
  bool sentence_begin) const
{

  for (int i = current_word_index - target_ngrams; i <= current_word_index; i++) {
    if (i < 0) {
      if (sentence_begin) {
        words.push_back(getNeuralLMId(BOS_word, false));
      } else {
        words.push_back(getNeuralLMId(getNullWord(), false));
      }
    } else {
      words.push_back(neuralLMids[i]);
    }
  }
}

void BilingualLM::appendSourceWordsToVector(const Sentence &source_sent, std::vector<int> &words, int source_word_mid_idx) const
{
  //Define begin and end indexes of the lookup, with separate cases for even
  //and odd source_ngrams. The indexes may fall outside the source sentence;
  //in that case we pad with BOS on the left and EOS on the right.
  int begin_idx;
  int end_idx;

  if (source_ngrams % 2 == 0) {
    begin_idx = source_word_mid_idx - source_ngrams / 2 + 1;
    end_idx = source_word_mid_idx + source_ngrams / 2;
  } else {
    begin_idx = source_word_mid_idx - (source_ngrams - 1) / 2;
    end_idx = source_word_mid_idx + (source_ngrams - 1) / 2;
  }
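  //Worked example (hypothetical values): with source_ngrams = 5 and
  //source_word_mid_idx = 10 the window is [8, 12]; with source_ngrams = 4 it
  //is [9, 12], i.e. the extra word goes to the right of the middle.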

  //Add words to vector
  for (int j = begin_idx; j <= end_idx; j++) {
    int neuralLM_wordID;
    if (j < 0) {
      neuralLM_wordID = getNeuralLMId(BOS_word, true);
    } else if (j >= source_sent.GetSize()) {
      neuralLM_wordID = getNeuralLMId(EOS_word, true);
    } else {
      const Word& word = source_sent.GetWord(j);
      neuralLM_wordID = getNeuralLMId(word, true);
    }
    words.push_back(neuralLM_wordID);
  }
}

FFState* BilingualLM::EvaluateWhenApplied(
  const ChartHypothesis& cur_hypo,
  int featureID, /* - used to index the state in the previous hypotheses */
  ScoreComponentCollection* accumulator) const
{
  //Init vectors
  std::vector<int> source_words;
  source_words.reserve(source_ngrams);
  std::vector<int> target_words;
  target_words.reserve(target_ngrams+1);

  float value = 0; //NeuralLM score
  const TargetPhrase& currTargetPhrase = cur_hypo.GetCurrTargetPhrase();

  std::vector<int> neuralLMids; //More or less equivalent to the whole target phrase: holds all word IDs, but is much cheaper to build
  std::vector<int> alignments;
  //Estimate the size and reserve the vectors to avoid reallocation
  int future_size = currTargetPhrase.GetNumTerminals();
  for (int i = 0; i < currTargetPhrase.GetNumNonTerminals(); i++) {
    const ChartHypothesis * prev_hypo = cur_hypo.GetPrevHypo(i); //Look at the subderivation under the i-th nonterminal
    future_size += prev_hypo->GetCurrTargetPhrase().GetSize();
  }
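  //Worked example (hypothetical values): for a rule with target side
  //"X1 likes X2" (one terminal) whose two subderivations have target phrases
  //of sizes 3 and 2, future_size = 1 + 3 + 2 = 6.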
  neuralLMids.reserve(future_size);
  alignments.reserve(future_size);

  getAllTargetIdsChart(cur_hypo, featureID, neuralLMids);
  getAllAlignments(cur_hypo, featureID, alignments);

  //Check whether this hypothesis' target words sit at the beginning of the sentence
  bool sentence_begin = (neuralLMids[0] == getNeuralLMId(BOS_word, false));

  //Get source sentence
  const ChartManager& manager = cur_hypo.GetManager();
  const Sentence& source_sent = static_cast<const Sentence&>(manager.GetSource());

  for (int i = 0; i < neuralLMids.size(); i++) {
    //The nonterminals are already resolved into neuralLMids, so a simple loop
    //over all word IDs (which grows as nonterminals expand) suffices.
    appendSourceWordsToVector(source_sent, source_words, alignments[i]);
    getTargetWordsChart(neuralLMids, i, target_words, sentence_begin);

    value += Score(source_words, target_words); // Get the score

    //Clear the vectors before the next iteration
    source_words.clear();
    target_words.clear();
  }
  size_t new_state = getStateChart(neuralLMids);

  // We are rescoring the full hypothesis, so we need to subtract the scores already contributed by the previous hypotheses
  for (std::vector<const ChartHypothesis*>::const_iterator iter = cur_hypo.GetPrevHypos().begin(); iter != cur_hypo.GetPrevHypos().end(); ++iter) {
    const ChartHypothesis &prevHypo = **iter;
    value -= (prevHypo.GetScoreBreakdown().GetScoreForProducer(this));
  }

  accumulator->PlusEquals(this, value);

  return new BilingualLMState(new_state, alignments, neuralLMids);
}

void BilingualLM::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "path") {
    m_filePath = value;
  } else {
    StatefulFeatureFunction::SetParameter(key, value);
  }
}

} // namespace Moses