1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
#pragma once
#include "types.h"
#include <cstdint>
#include <stdint.h>
#include "multiply.h"
// 8 bit is in ssse3_gemm.h
namespace intgemm {
namespace sse2 {
// Same implementation as AVX512, just width. Grabs 4 32-bit values.
// TODO duplicated function requires the removal of the annonymous namespace
INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
}
INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
class QuantizeTile16 {
public:
typedef __m128i Integer;
INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
// Quantize 8xfloat into 8xint16_t
INTGEMM_SSE2 inline __m128i Consecutive(const float *input) {
__m128i g0 = QuantizerGrab(input, mult_reg_);
__m128i g1 = QuantizerGrab(input + 4, mult_reg_);
return _mm_packs_epi32(g0, g1);
}
INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) {
return Consecutive(input);
}
private:
const __m128 mult_reg_;
};
// Technically only requires SSE
INTGEMM_MAXABSOLUTE(__m128, INTGEMM_SSE2)
} //namespace
// This should be pure INTGEMM_SSE2 (and below).
struct SSE2_16bit {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
INTGEMM_SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 8 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
sse2::QuantizeTile16 q(quant_mult);
const float *end = input + size;
for (; input != end; input += 8, output += 8) {
*reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 8;
static const Index kBTileCol = 8;
INTGEMM_PREPARE_B_16(INTGEMM_SSE2, sse2::QuantizeTile16)
INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
//TODO #DEFINE
sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
}
INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, OnSSE2)
constexpr static const char *const kName = "16-bit INTGEMM_SSE2";
static const CPUType kUses = CPU_SSE2;
};
} // namespace intgemm
|