1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
/* All or part of this file was contributed by Intel under license:
* Copyright (C) 2017-2018 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "translator/nth_element.h"
#include <algorithm>
#include <iterator>
#include <limits>
#include <numeric>
namespace marian {
// CPU implementation of batched top-N selection for beam search.
// For each batch entry, picks the N highest scores (and their flat indices)
// from a score buffer, then masks the picked entries with the lowest float
// so that a repeated call on the same buffer returns disjoint results.
class NthElementCPU {
  std::vector<int> h_res_idx; // flat indices of the selected elements
  std::vector<float> h_res;   // score values of the selected elements
  //size_t lastN_;

public:
  NthElementCPU() {}
  // Non-copyable: scratch buffers carry state between calls.
  // Rule of Five: delete copy assignment alongside the copy constructor.
  NthElementCPU(const NthElementCPU& copy) = delete;
  NthElementCPU& operator=(const NthElementCPU& copy) = delete;

private:
  // Select the top beam-size scores for each batch entry.
  //  scores:                flat score array; selected entries are overwritten
  //                         in place with std::numeric_limits<float>::lowest()
  //  batchFirstElementIdxs: per-batch start offsets into 'scores' (numBatches+1 entries)
  //  cumulativeBeamSizes:   per-batch start offsets into the result buffers
  //                         h_res/h_res_idx (numBatches+1 entries)
  void selectNBest(float* scores,
                   const std::vector<int>& batchFirstElementIdxs,
                   const std::vector<int>& cumulativeBeamSizes) {
    /* For each batch, select the max N elements, where N is the beam size for
     * this batch. Locally record these elements (their current value and index
     * in 'scores') before updating each element to a large negative value, such
     * that they won't be a maximum if we're called again on the same input.
     */
    int numProbs = batchFirstElementIdxs.back();
    std::vector<int> idxs(numProbs);
    std::iota(idxs.begin(), idxs.end(), 0);

    size_t numBatches = batchFirstElementIdxs.size() - 1;
    for(size_t batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
      int pos = cumulativeBeamSizes[batchIdx];
      int beamSize = cumulativeBeamSizes[batchIdx + 1] - pos;

      std::vector<int>::iterator begin = idxs.begin() + batchFirstElementIdxs[batchIdx];
      std::vector<int>::iterator middle = begin + beamSize;
      std::vector<int>::iterator end = idxs.begin() + batchFirstElementIdxs[batchIdx + 1];
      // Only the beamSize largest scores need to be ordered; a full sort is unnecessary.
      std::partial_sort(
          begin, middle, end, [=](int a, int b) { return scores[a] > scores[b]; });

      while(begin != middle) {
        int idx = *begin++;
        h_res_idx[pos] = idx;
        h_res[pos] = scores[idx];
        // Mask the selected entry so a subsequent call cannot pick it again.
        scores[idx] = std::numeric_limits<float>::lowest();
        ++pos;
      }
    }
  }

public:
  // @BUGBUG: This API mixes input and output beam size.
  // Extracts, per batch entry, the N best path scores and their keys
  // (flat indices into the scores tensor), appending them to the output vectors.
  //  scores:        [dimBatch, 1, beamSize, dimVocab or dimShortlist]; mutated in place
  //  N:             output beam size (number of hypotheses to keep per batch entry)
  //  outPathScores: receives the selected scores (appended)
  //  outKeys:       receives the selected flat indices (appended)
  //  isFirst:       true on the first decoding step, where the input beam dim is 1
  void getNBestList(Tensor scores, // [dimBatch, 1, beamSize, dimVocab or dimShortlist]
                    size_t N,
                    std::vector<float>& outPathScores,
                    std::vector<unsigned>& outKeys,
                    const bool isFirst) {
    const auto vocabSize = scores->shape()[-1];
    const auto inputN    = scores->shape()[-2];
    const auto dimBatch  = scores->shape()[-4];
    ABORT_IF(inputN != (isFirst ? 1 : N), "Input tensor has wrong beam dim??");

    const std::vector<size_t> beamSizes(dimBatch, N);
    std::vector<int> cumulativeBeamSizes(beamSizes.size() + 1, 0);
    std::vector<int> batchFirstElementIdxs(beamSizes.size() + 1, 0);

    // Cast the bound once to avoid a signed/unsigned comparison in the loop.
    for(int batchIdx = 0; batchIdx < (int)beamSizes.size(); ++batchIdx) {
      cumulativeBeamSizes[batchIdx + 1] = cumulativeBeamSizes[batchIdx] + (int)beamSizes[batchIdx];
      ABORT_IF(cumulativeBeamSizes[batchIdx + 1] != (batchIdx + 1) * N, "cumulativeBeamSizes wrong??");
      batchFirstElementIdxs[batchIdx + 1]
          += (isFirst ? batchIdx + 1 : cumulativeBeamSizes[batchIdx + 1]) * vocabSize;
      ABORT_IF((isFirst ? batchIdx + 1 : cumulativeBeamSizes[batchIdx + 1]) != (batchIdx + 1) * inputN, "inputN wrong??");
    }

    size_t maxSize = N * dimBatch;
    h_res.resize(maxSize);
    h_res_idx.resize(maxSize);

    selectNBest(scores->data(), batchFirstElementIdxs, cumulativeBeamSizes);
    getPairs(cumulativeBeamSizes.back(), outKeys, outPathScores);
    ABORT_IF(cumulativeBeamSizes.back() != dimBatch * N, "cumulativeBeamSizes.back() wrong??");
  }

private:
  // Append the first 'number' selected (index, score) pairs to the output vectors.
  void getPairs(size_t number,
                std::vector<unsigned>& outKeys,
                std::vector<float>& outValues) {
    std::copy(h_res_idx.begin(), h_res_idx.begin() + number, std::back_inserter(outKeys));
    std::copy(h_res    .begin(), h_res    .begin() + number, std::back_inserter(outValues));
    //lastN_ = number;
  }

  //void getValueByKey(std::vector<float>& out, float* d_in) {
  //  for(size_t i = 0; i < lastN_; ++i) {
  //    out[i] = d_in[h_res_idx[i]];
  //  }
  //}
};
#ifdef CUDA_FOUND
GetNBestListFn createGetNBestListGPUFn(size_t beamSize, size_t dimBatch, DeviceId deviceId); // in .cu file
#endif
// Factory: builds a callable with the same signature as getNBestList().
// When compiled with CUDA and targeting a GPU device, defers to the GPU
// implementation; otherwise a shared CPU selector is captured by the lambda.
GetNBestListFn createGetNBestListFn(size_t beamSize, size_t dimBatch, DeviceId deviceId) {
#ifdef CUDA_FOUND
  if(deviceId.type == DeviceType::gpu)
    return createGetNBestListGPUFn(beamSize, dimBatch, deviceId);
#else
  deviceId; beamSize; dimBatch; // (unused)
#endif
  auto cpuSelector = New<NthElementCPU>();
  // The shared_ptr keeps the selector (and its scratch buffers) alive for the
  // lifetime of the returned function object.
  return [cpuSelector](Tensor scoresTensor,
                       size_t topK,
                       std::vector<float>& pathScoresOut,
                       std::vector<unsigned>& keysOut,
                       const bool firstStep) {
    cpuSelector->getNBestList(scoresTensor, topK, pathScoresOut, keysOut, firstStep);
  };
}
} // namespace marian
|