test/QuantizationHelpers.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 * All rights reserved.
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
#include "QuantizationHelpers.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>

using namespace std;

namespace fbgemm {
/*
 * @brief Make sure we won't have overflows from vpmaddubsw instruction.
 */
template <typename T>
void avoidOverflow(
    int m,
    int n,
    int k,
    const uint8_t* Aint8,
    int lda,
    T* B,
    int ldb) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      for (int kk = 0; kk < k / 2 * 2; kk += 2) {
        int a0 = Aint8[i * lda + kk], a1 = Aint8[i * lda + kk + 1];
        int b0 = B[kk * ldb + j], b1 = B[(kk + 1) * ldb + j];
        int sum_pair = a0 * b0 + a1 * b1;
        if (sum_pair < numeric_limits<int16_t>::lowest()) {
          int b1_adjusted =
              ceil((numeric_limits<int16_t>::lowest() - a0 * b0) / a1);
          b1_adjusted = std::min(std::max(b1_adjusted, -128), 127);

          int new_sum_pair = a0 * b0 + a1 * b1_adjusted;
          assert(
              new_sum_pair >= numeric_limits<int16_t>::lowest() &&
              new_sum_pair <= numeric_limits<int16_t>::max());
          B[(kk + 1) * n + j] = b1_adjusted;
        } else if (sum_pair > numeric_limits<int16_t>::max()) {
          int b1_adjusted =
              floor((numeric_limits<int16_t>::max() - a0 * b0) / a1);
          b1_adjusted = std::min(std::max(b1_adjusted, -128), 127);

          int new_sum_pair = a0 * b0 + a1 * b1_adjusted;
          assert(
              new_sum_pair >= numeric_limits<int16_t>::lowest() &&
              new_sum_pair <= numeric_limits<int16_t>::max());
          B[(kk + 1) * ldb + j] = b1_adjusted;
        }
      }
    } // for each j
  } // for each i
}

template <typename T>
void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B) {
  return avoidOverflow(m, n, k, Aint8, k, B, n);
}

template void avoidOverflow(
    int m,
    int n,
    int k,
    const uint8_t* Aint8,
    int lda,
    int8_t* B,
    int ldb);
template void avoidOverflow(
    int m,
    int n,
    int k,
    const uint8_t* Aint8,
    int lda,
    float* B,
    int ldb);

template void
avoidOverflow(int m, int n, int k, const uint8_t* Aint8, int8_t* B);
template void
avoidOverflow(int m, int n, int k, const uint8_t* Aint8, float* B);
} // namespace fbgemm