Welcome to mirror list, hosted at ThFree Co, Russian Federation.

merge_probabilities.hh « interpolate « lm - github.com/kpu/kenlm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 33a4c236bd61ab4a5364b4cfc75bc9d907fcaa43 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#ifndef LM_INTERPOLATE_MERGE_PROBABILITIES_H
#define LM_INTERPOLATE_MERGE_PROBABILITIES_H

#include "../common/ngram.hh"
#include "bounded_sequence_encoding.hh"
#include "../../util/fixed_array.hh"
#include "../../util/stream/multi_stream.hh"

#include <stdint.h>

namespace lm {
namespace interpolate {

struct InterpolateInfo;

/**
 * Make the encoding of backoff values for a given order.  This stores values
 * in [PartialProbGamma::FromBegin(), PartialProbGamma::FromEnd())
 *
 * @param info Interpolation setup for the component models (opaque here;
 *  forward-declared above).
 * @param order The ngram order to build the encoder for.
 *
 * NOTE(review): PartialProbGamma below declares FromBegin() but no FromEnd();
 * presumably the range ends EncodedLength() bytes after FromBegin() — confirm
 * against the .cc implementation.
 */
BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order);

/**
 * The first pass for the offline log-linear interpolation algorithm. This
 * reads K **suffix-ordered** streams for each model, for each order, of
 * ngram records (ngram-id, prob, backoff). It further assumes that the
 * ngram-ids have been unified over all of the stream inputs.
 *
 * Its output is records of (ngram-id, prob-prod, backoff-level,
 * backoff-level, ...) where the backoff-levels (of which there are K) are
 * the context length (0 for unigrams) that the corresponding model had to
 * back off to in order to obtain a probability for that ngram-id. Each of
 * these streams is terminated with a record whose ngram-id is all
 * maximum-integers for simplicity in implementation here.
 *
 * @param models_by_order An array of length N (max_i N_i) containing the
 *  ChainPositions for the streams for order (i + 1).
 * Run() is attached to the output chains for each order (of length K).
 */
class MergeProbabilities {
  public:
    // Both arguments are held by reference for the lifetime of this object;
    // the caller must keep them alive until Run() completes.
    //   info            — interpolation configuration shared across passes.
    //   models_by_order — per-order ChainPositions for the K model input
    //                     streams (indexed by order - 1).
    MergeProbabilities(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order)
      : info_(info), models_by_order_(models_by_order) {}

    // Consume the per-model input streams and write merged
    // (ngram-id, prob-prod, backoff-levels) records to the per-order output
    // chains.  Defined in the corresponding .cc file.
    void Run(const util::stream::ChainPositions &outputs);

  private:
    const InterpolateInfo &info_;
    util::FixedArray<util::stream::ChainPositions> &models_by_order_;
};

/**
 * This class represents the output payload for this pass, which consists
 * of an ngram-id, a probability, and then a vector of orders from which
 * each of the component models backed off to for this ngram, encoded
 * using the BoundedSequenceEncoding class.
 */
class PartialProbGamma : public lm::NGramHeader {
public:
  // order: ngram order of this record.
  // backoff_bytes: byte length of the encoded backoff vector, i.e.
  //   BoundedSequenceEncoding::EncodedLength() for this order.
  // The NGramHeader base is constructed with a NULL data pointer;
  // presumably the stream framework points the record at real memory
  // before any accessor below is used — confirm against callers.
  PartialProbGamma(std::size_t order, std::size_t backoff_bytes)
      : lm::NGramHeader(NULL, order), backoff_bytes_(backoff_bytes) {
    // nothing
  }

  // Total record size in bytes: the word ids, the fixed After footer
  // (prob + lower_prob), then the variable-length encoded backoff vector.
  // Relies on the flexible array member contributing 0 to sizeof(After).
  std::size_t TotalSize() const {
    return sizeof(WordIndex) * Order() + sizeof(After) + backoff_bytes_;
  }

  // Same size computation without an instance; rebuilds the encoder just to
  // obtain its EncodedLength().
  // TODO: cache bounded sequence encoding in the pipeline?
  static std::size_t TotalSize(const InterpolateInfo &info, uint8_t order) {
    return sizeof(WordIndex) * order + sizeof(After) + MakeEncoder(info, order).EncodedLength();
  }

  // Interpolated probability product for this ngram (first field of After;
  // backoff_and_normalize depends on it coming first — see note below).
  float &Prob() { return Pay().prob; }
  float Prob() const { return Pay().prob; }

  // Second float of the payload.  NOTE(review): exact semantics (probability
  // from a lower order?) are defined by the consuming pass — confirm there.
  float &LowerProb() { return Pay().lower_prob; }
  float LowerProb() const { return Pay().lower_prob; }

  // Start of the encoded backoff-level vector; backoff_bytes_ bytes long.
  const uint8_t *FromBegin() const { return Pay().from; }
  uint8_t *FromBegin() { return Pay().from; }

private:
  // Payload layout overlaid immediately after the word ids.
  struct After {
    // Note that backoff_and_normalize assumes this comes first.
    float prob;
    float lower_prob;
    // Flexible array member (C99/GNU extension, not standard C++): the
    // BoundedSequenceEncoding-packed backoff levels live here.
    uint8_t from[];
  };
  // Reinterpret the bytes just past the word ids (presumably
  // NGramHeader::end() — one past the last WordIndex) as the payload.
  // The backing buffer must be at least TotalSize() bytes.
  const After &Pay() const { return *reinterpret_cast<const After *>(end()); }
  After &Pay() { return *reinterpret_cast<After*>(end()); }

  std::size_t backoff_bytes_;  // length of the encoded backoff vector
};

}} // namespaces
#endif // LM_INTERPOLATE_MERGE_PROBABILITIES_H