1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
#pragma once
#include "generic.h"
#include "marian.h"
namespace marian {
class FactoredVocab;
// A regular embedding layer.
// Note that this also applies dropout if the option is passed (pass 0 when in inference mode).
// It is best to not use Embedding directly, but rather via getEmbeddingLayer() in
// EncoderDecoderLayerBase, which knows to pass on all required parameters from options.
// Standard (non-ULR) embedding layer: one trainable matrix, with optional
// factored-vocabulary support. Member-function bodies are defined in the .cpp.
class Embedding : public LayerBase, public IEmbeddingLayer {
// The main embedding matrix (lemma embeddings when a factored vocabulary is used).
Expr E_;
Expr FactorEmbMatrix_; // Factors embedding matrix if combining lemma and factors embeddings with concatenation
// Non-null when the vocabulary is factored; presumably selects the factored lookup path — implementation in .cpp.
Ptr<FactoredVocab> factoredVocab_;
// Factored lookup helper; dropProb is presumably word-level dropout (pass 0 for inference) — defined in .cpp, TODO confirm.
Expr multiRows(const Words& data, float dropProb) const;
// Lookup that concatenates lemma and factor embeddings (pairs with FactorEmbMatrix_) — defined in .cpp.
Expr embedWithConcat(const Words& data) const;
// True in inference mode; presumably disables dropout in the lookup helpers.
bool inference_{false};
public:
Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options);
// Embeds a whole sub-batch; also returns the batch mask alongside the embeddings.
std::tuple<Expr /*embeddings*/, Expr /*mask*/> apply(
Ptr<data::SubBatch> subBatch) const override final;
// Embeds a word sequence into the given output shape.
Expr apply(const Words& words, const Shape& shape) const override final;
// Embeds raw vocabulary indices (no factor expansion) into the given shape.
Expr applyIndices(const std::vector<WordIndex>& embIdx, const Shape& shape) const override final;
};
class ULREmbedding : public LayerBase, public IEmbeddingLayer {
std::vector<Expr> ulrEmbeddings_; // @TODO: These could now better be written as 6 named class members
bool inference_{false};
public:
ULREmbedding(Ptr<ExpressionGraph> graph, Ptr<Options> options)
: LayerBase(graph, options), inference_(opt<bool>("inference")) {
std::string name = "url_embed"; // opt<std::string>("prefix");
int dimKeys = opt<int>("dimTgtVoc");
int dimQueries = opt<int>("dimSrcVoc");
int dimEmb = opt<int>("dimEmb");
int dimUlrEmb = opt<int>("dimUlrEmb"); // ULR mono embed size
bool fixed = opt<bool>("fixed", false);
// Embedding layer initialization should depend only on embedding size, hence fanIn=false
auto initFunc = inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true);
std::string queryFile = opt<std::string>("ulrQueryFile");
std::string keyFile = opt<std::string>("ulrKeysFile");
bool trainTrans = opt<bool>("ulrTrainTransform", false);
if(!queryFile.empty() && !keyFile.empty()) {
initFunc = inits::fromWord2vec(queryFile, dimQueries, dimUlrEmb, false);
name = "ulr_query";
fixed = true;
auto query_embed = graph_->param(name, {dimQueries, dimUlrEmb}, initFunc, fixed);
ulrEmbeddings_.push_back(query_embed);
// keys embeds
initFunc = inits::fromWord2vec(keyFile, dimKeys, dimUlrEmb, false);
name = "ulr_keys";
fixed = true;
auto key_embed = graph_->param(name, {dimKeys, dimUlrEmb}, initFunc, fixed);
ulrEmbeddings_.push_back(key_embed);
// actual trainable embedding
initFunc = inits::glorotUniform();
name = "ulr_embed";
fixed = false;
auto ulr_embed = graph_->param(name, {dimKeys, dimEmb}, initFunc, fixed); // note the reverse dim
ulrEmbeddings_.push_back(ulr_embed);
// init trainable src embedding
name = "ulr_src_embed";
auto ulr_src_embed = graph_->param(name, {dimQueries, dimEmb}, initFunc, fixed);
ulrEmbeddings_.push_back(ulr_src_embed);
// ulr transformation matrix
// initFunc = inits::eye(1.f); // identity matrix - is it ok to init wiht identity or shall
// we make this to the fixed case only
if(trainTrans) {
initFunc = inits::glorotUniform();
fixed = false;
} else {
initFunc = inits::eye(); // identity matrix
fixed = true;
}
name = "ulr_transform";
auto ulrTransform = graph_->param(name, {dimUlrEmb, dimUlrEmb}, initFunc, fixed);
ulrEmbeddings_.push_back(ulrTransform);
initFunc = inits::fromValue(
1.f); // TBD: we should read sharable flags here - 1 means all sharable - 0 means no
// universal embeddings - should be zero for top freq only
fixed = true;
name = "ulr_shared";
auto share_embed = graph_->param(name, {dimQueries, 1}, initFunc, fixed);
ulrEmbeddings_.push_back(share_embed);
}
}
std::tuple<Expr /*embeddings*/, Expr /*mask*/> apply(
Ptr<data::SubBatch> subBatch) const override final {
auto queryEmbed = ulrEmbeddings_[0]; // Q : dimQueries*dimUlrEmb
auto keyEmbed = ulrEmbeddings_[1]; // K : dimKeys*dimUlrEmb
auto uniEmbed = ulrEmbeddings_[2]; // E : dimQueries*dimEmb
auto srcEmbed = ulrEmbeddings_[3]; // I : dimQueries*dimEmb
auto ulrTransform = ulrEmbeddings_[4]; // A : dimUlrEmb *dimUlrEmb
auto ulrSharable = ulrEmbeddings_[5]; // alpha : dimQueries*1
int dimBatch = (int)subBatch->batchSize();
int dimEmb = uniEmbed->shape()[-1];
int dimWords = (int)subBatch->batchWidth();
// D = K.A.QT
// dimm(K) = univ_tok_vocab*uni_embed_size
// dim A = uni_embed_size*uni_embed_size
// dim Q: uni_embed_size * total_merged_vocab_size
// dim D = univ_tok_vocab * total_merged_vocab_size
// note all above can be precombuted and serialized if A is not trainiable and during decoding
// (TBD) here we need to handle the mini-batch extract raws corresponding to Xs in this
// minibatch from Q
auto embIdx = toWordIndexVector(subBatch->data());
auto queryEmbeddings = rows(queryEmbed, embIdx);
auto srcEmbeddings = rows(srcEmbed, embIdx); // extract trainable src embeddings
auto alpha = rows(ulrSharable, embIdx); // extract sharable flags
auto qt = dot(queryEmbeddings, ulrTransform, false, false); // A: transform embeddings based on similarity A : dimUlrEmb*dimUlrEmb
auto sqrtDim = std::sqrt((float)queryEmbeddings->shape()[-1]);
qt = qt / sqrtDim; // normalize accordin to embed size to avoid dot prodcut growing large in
// magnitude with larger embeds sizes
auto z = dot(qt, keyEmbed, false, true); // query-key similarity
float dropProb = this->options_->get<float>("ulr-dropout", 0.0f); // default no dropout
if(!inference_)
z = dropout(z, dropProb);
float tau
= this->options_->get<float>("ulr-softmax-temperature", 1.0f); // default no temperature
// temperature in softmax is to control randomness of predictions
// high temperature Softmax outputs are more close to each other
// low temperatures the softmax become more similar to "hardmax"
auto weights = softmax(z / tau); // assume default is dim=-1, what about temprature? - scaler ??
auto chosenEmbeddings = dot(weights, uniEmbed); // AVERAGE
auto chosenEmbeddings_mix = srcEmbeddings + alpha * chosenEmbeddings; // this should be elementwise broadcast
auto batchEmbeddings = reshape(chosenEmbeddings_mix, {dimWords, dimBatch, dimEmb});
auto graph = ulrEmbeddings_.front()->graph();
auto batchMask = graph->constant({dimWords, dimBatch, 1}, inits::fromVector(subBatch->mask()));
if(!inference_)
batchEmbeddings = dropout(batchEmbeddings,
options_->get<float>("dropout-embeddings", 0.0f),
{batchEmbeddings->shape()[-3], 1, 1});
return std::make_tuple(batchEmbeddings, batchMask);
}
Expr apply(const Words& words, const Shape& shape) const override final {
return applyIndices(toWordIndexVector(words), shape);
}
Expr applyIndices(const std::vector<WordIndex>& embIdx, const Shape& shape) const override final {
embIdx;
shape;
ABORT("not implemented"); // @TODO: implement me
}
};
} // namespace marian
|