github.com/marian-nmt/intgemm.git
 CMakeLists.txt        |  14
 avx2_gemm.cc          | 154
 avx2_gemm.h           | 151
 avx512_gemm.cc        | 281
 avx512_gemm.h         | 274
 cops.h                |  19
 intgemm.cc            | 117
 intgemm.h             | 103
 intrinsics.h          |   7
 multiply.h            |  10
 sse2_gemm.cc          |  75
 sse2_gemm.h           |  64
 ssse3_gemm.cc         |  88
 ssse3_gemm.h          |  83
 test/multiply_test.cc |  15
 types.h               |  15
 16 files changed, 667 insertions(+), 803 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9aa674..05e395a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,15 +29,15 @@ else()
set_source_files_properties(intgemm.cc test/quantize_test.cc test/multiply_test.cc benchmark.cc PROPERTIES COMPILE_DEFINITIONS "INTGEMM_NO_AVX512")
endif()
-add_library(intgemm STATIC ${GEMMS} intgemm.cc)
-foreach(exe example benchmark)
- add_executable(${exe} ${exe}.cc)
- target_link_libraries(${exe} intgemm)
-endforeach()
+#add_library(intgemm STATIC ${GEMMS} intgemm.cc)
+#foreach(exe example benchmark)
+# add_executable(${exe} ${exe}.cc)
+# target_link_libraries(${exe} intgemm)
+#endforeach()
include_directories(.)
-add_executable(tests test/multiply_test.cc test/quantize_test.cc)
-target_link_libraries(tests intgemm)
+add_executable(tests test/multiply_test.cc test/quantize_test.cc avx2_gemm.cc) # avx512_gemm.cc)
+#target_link_libraries(tests intgemm)
#CTest integration with Catch2
include(CMake/Catch.cmake)
diff --git a/avx2_gemm.cc b/avx2_gemm.cc
index 955f64c..541b5c2 100644
--- a/avx2_gemm.cc
+++ b/avx2_gemm.cc
@@ -1,155 +1 @@
#include "avx2_gemm.h"
-#include "cops.h"
-#include "interleave.h"
-#include "multiply.h"
-
-#include <cassert>
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <tmmintrin.h>
-#include <xmmintrin.h>
-#include <stdint.h>
-
-namespace intgemm {
-
-// PREPARE A: just quantization in the same memory order.
-
-namespace {
-// Read a vector of floats, multiply them, and cast to 32-bit integer.
-inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
- return _mm256_cvtps_epi32(_mm256_mul_ps(*reinterpret_cast<const __m256*>(input), quant_mult_reg));
-}
-
-class QuantizeTile16 {
- public:
- typedef __m256i Integer;
-
- explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
-
- Integer Consecutive(const float *input) {
- return Tile(input, input + 8);
- }
-
- Integer ForReshape(const float *input, Index cols) {
- // 8 rows in the first 128-bit register, 8 in the second register.
- return Tile(input, input + 8 * cols);
- }
-
- private:
- __m256i Tile(const float *input0, const float *input1) {
- __m256i g0 = QuantizerGrab(input0, mult_);
- __m256i g1 = QuantizerGrab(input1, mult_);
- __m256i packed = _mm256_packs_epi32(g0, g1);
- // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
- // Technically this could be removed if the PrepareB did the same reordering internally.
- return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
- }
-
- const __m256 mult_;
-};
-
-} // namespace
-
-// Just quantize everything in order.
-void AVX2_16bit::Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
- assert(size % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
- QuantizeTile16 q(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 16, output += 16) {
- *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
- }
-}
-
-namespace {
-/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize
- * them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate
- * the result into one register and return it.
- */
-class QuantizeTile8 {
- public:
- typedef __m256i Integer;
-
- explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
-
- inline __m256i Consecutive(const float *input) {
- return Tile(input, input + 8, input + 16, input + 24);
- }
-
- inline __m256i ForReshape(const float *input, Index cols) {
- // Put higher rows in the second half of the register. These will jumble
- // around in the same way then conveniently land in the right place.
- return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
- }
-
- private:
- inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
- // Looking at the assembly, gcc has pulled this outside the loops calling this.
- const __m256i neg127 = _mm256_set1_epi8(-127);
- const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- // Grab 4 registers at a time in 32-bit format.
- __m256i g0 = QuantizerGrab(input0, mult_);
- __m256i g1 = QuantizerGrab(input1, mult_);
- __m256i g2 = QuantizerGrab(input2, mult_);
- __m256i g3 = QuantizerGrab(input3, mult_);
- // Pack 32-bit to 16-bit.
- __m256i packed0 = _mm256_packs_epi32(g0, g1);
- __m256i packed1 = _mm256_packs_epi32(g2, g3);
- // Pack 16-bit to 8-bit.
- __m256i packed = _mm256_packs_epi16(packed0, packed1);
- // Ban -128.
- packed = _mm256_max_epi8(packed, neg127);
- // Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
- // Or as 32-bit integers 0 2 4 6 1 3 5 7
- // Technically this could be removed so long as the rows are bigger than 16
- // and the values are only used for GEMM.
- return _mm256_permutevar8x32_epi32(packed, shuffle_param);
- }
-
- const __m256 mult_;
-};
-} // namespace
-
-// Just quantize everything in order.
-void AVX2_8bit::Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
- assert(size % 32 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
- QuantizeTile8 q(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 32, output += 32) {
- *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
- }
-}
-
-void AVX2_16bit::PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor16(input, output, QuantizeTile16(quant_mult), rows, cols);
-}
-
-void AVX2_16bit::SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
-}
-
-void AVX2_8bit::PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor8(input, output, QuantizeTile8(quant_mult), rows, cols);
-}
-
-void AVX2_8bit::SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
-}
-
-void AVX2_16bit::Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- Multiply16<__m256i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
-}
-
-void AVX2_8bit::Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- Multiply8_SSE2OrAVX2<Multiply8_AVXAVX2, __m256i, __m256>(A, B, C, unquant_mult, A_rows, width, B_cols);
-}
-
-const char *const AVX2_16bit::kName = "16-bit AVX2";
-const char *const AVX2_8bit::kName = "8-bit AVX2";
-
-float AVX2_MaxAbsolute(const float *begin, const float *end) {
- return MaxAbsoluteBackend<__m256>(begin, end);
-}
-
-} // namespace intgemm
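
For reference, the deleted 8-bit quantizer above (now inlined into avx2_gemm.h below) scales, rounds, and saturates each float. A minimal scalar sketch of the per-element effect, not part of the commit, assuming the default round-to-nearest mode that _mm256_cvtps_epi32 uses:

// Scalar sketch of the vectorized 8-bit Quantize: scale, round, saturate.
// The clamp to -127 mirrors _mm256_max_epi8(packed, -127), which bans -128
// so that abs() in the 8-bit Multiply cannot overflow.
#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t QuantizeOneReference(float value, float quant_mult) {
  // _mm256_cvtps_epi32 rounds to nearest under the default MXCSR mode;
  // std::nearbyint mirrors that here.
  int32_t rounded = static_cast<int32_t>(std::nearbyint(value * quant_mult));
  // _mm256_packs_epi32 / _mm256_packs_epi16 saturate to the int8 range.
  rounded = std::max<int32_t>(rounded, -127);
  rounded = std::min<int32_t>(rounded, 127);
  return static_cast<int8_t>(rounded);
}
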
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 4b0b001..32b1a5e 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -3,59 +3,184 @@
#include <cstdint>
#include <stdint.h>
+#include "cops.h"
+#include "interleave.h"
+#include "multiply.h"
+
namespace intgemm {
+// PREPARE A: just quantization in the same memory order.
+
+namespace avx2 {
+// Read a vector of floats, multiply them, and cast to 32-bit integer.
+// EVIL EVIL CODE DUPLICATION, FIX
+AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
+ return _mm256_cvtps_epi32(_mm256_mul_ps(*reinterpret_cast<const __m256*>(input), quant_mult_reg));
+}
+
+class QuantizeTile16 {
+ public:
+ typedef __m256i Integer;
+
+ explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
+
+ AVX2 Integer Consecutive(const float *input) {
+ return Tile(input, input + 8);
+ }
+
+ AVX2 Integer ForReshape(const float *input, Index cols) {
+ // 8 rows in the first 128-bit register, 8 in the second register.
+ return Tile(input, input + 8 * cols);
+ }
+
+ private:
+ AVX2 __m256i Tile(const float *input0, const float *input1) {
+ __m256i g0 = QuantizerGrab(input0, mult_);
+ __m256i g1 = QuantizerGrab(input1, mult_);
+ __m256i packed = _mm256_packs_epi32(g0, g1);
+ // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
+ // Technically this could be removed if the PrepareB did the same reordering internally.
+ return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
+ }
+
+ const __m256 mult_;
+};
+
+} // namespace
+
+
struct AVX2_16bit {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- static void Quantize(const float *input, int16_t *output, float quant_mult, Index size);
+ // Just quantize everything in order.
+ AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+ assert(size % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
+ avx2::QuantizeTile16 q(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 16, output += 16) {
+ *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+ AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
+ }
- static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ AVX2 static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ Multiply16<__m256i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "16-bit AVX2";
static const CPUType kUses = CPU_AVX2;
};
+namespace avx2 {
+/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize
+ * them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate
+ * the result into one register and return it.
+ */
+class QuantizeTile8 {
+ public:
+ typedef __m256i Integer;
+
+ explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
+
+ AVX2 inline __m256i Consecutive(const float *input) {
+ return Tile(input, input + 8, input + 16, input + 24);
+ }
+
+ AVX2 inline __m256i ForReshape(const float *input, Index cols) {
+ // Put higher rows in the second half of the register. These will jumble
+ // around in the same way then conveniently land in the right place.
+ return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
+ }
+
+ private:
+ AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
+ // Looking at the assembly, gcc has pulled this outside the loops calling this.
+ const __m256i neg127 = _mm256_set1_epi8(-127);
+ const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ // Grab 4 registers at a time in 32-bit format.
+ __m256i g0 = avx2::QuantizerGrab(input0, mult_);
+ __m256i g1 = avx2::QuantizerGrab(input1, mult_);
+ __m256i g2 = avx2::QuantizerGrab(input2, mult_);
+ __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+ // Pack 32-bit to 16-bit.
+ __m256i packed0 = _mm256_packs_epi32(g0, g1);
+ __m256i packed1 = _mm256_packs_epi32(g2, g3);
+ // Pack 16-bit to 8-bit.
+ __m256i packed = _mm256_packs_epi16(packed0, packed1);
+ // Ban -128.
+ packed = _mm256_max_epi8(packed, neg127);
+ // Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
+ // Or as 32-bit integers 0 2 4 6 1 3 5 7
+ // Technically this could be removed so long as the rows are bigger than 16
+ // and the values are only used for GEMM.
+ return _mm256_permutevar8x32_epi32(packed, shuffle_param);
+ }
+
+ const __m256 mult_;
+};
+} // namespace
+
struct AVX2_8bit {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- static void Quantize(const float *input, int8_t *output, float quant_mult, Index size);
+ // Just quantize everything in order.
+ AVX2 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+ assert(size % 32 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
+ avx2::QuantizeTile8 q(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 32, output += 32) {
+ *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
+ AVX2 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor8(input, output, avx2::QuantizeTile8(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
+ }
- static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ AVX2 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ Multiply8_SSE2OrAVX2<Multiply8_AVXAVX2, __m256i, __m256>(A, B, C, unquant_mult, A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "8-bit AVX2";
static const CPUType kUses = CPU_AVX2;
};
// Technically only requires AVX
-float AVX2_MaxAbsolute(const float *begin, const float *end);
+AVX2 float AVX2_MaxAbsolute(const float *begin, const float *end) {
+ return MaxAbsoluteBackend<__m256>(begin, end);
+}
} // namespace intgemm
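
The AVX2 tokens prefixed to every function above (and AVX512F in the next file) are not defined in this diff; they read as per-function target-attribute macros so one translation unit can carry code for several instruction sets. A plausible definition, purely an assumption about what types.h provides, might look like:

// Assumption: AVX2 / AVX512F are defined elsewhere in the repository (likely
// types.h); the real definitions may differ. On GCC/Clang, per-function
// target attributes let a single TU hold AVX2 and AVX512 code paths.
#if defined(__GNUC__) || defined(__clang__)
  #define AVX2    __attribute__((target("avx2")))
  #define AVX512F __attribute__((target("avx512f")))
#else
  #define AVX2
  #define AVX512F
#endif
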
diff --git a/avx512_gemm.cc b/avx512_gemm.cc
index ff200bf..59da3e4 100644
--- a/avx512_gemm.cc
+++ b/avx512_gemm.cc
@@ -1,282 +1 @@
#include "avx512_gemm.h"
-#include "interleave.h"
-#include "multiply.h"
-#include "cops.h"
-
-#include <cassert>
-#include <cstddef>
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <tmmintrin.h>
-#include <xmmintrin.h>
-
-namespace intgemm {
-
-namespace {
-
-// Load from memory, multiply, and convert to int32_t.
-inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
- // Multiply each by the quantization factor.
- __m512 val = _mm512_mul_ps(*reinterpret_cast<const __m512*>(input), quant_mult_reg);
- // Cast to 32-bit int
- return _mm512_cvtps_epi32(val);
-}
-
-} // namespace
-
-
-// AVX512 has combined collapse and store instructions:
-// _mm512_mask_cvtsepi32_storeu_epi16
-// _mm512_mask_cvtsepi32_storeu_epi8
-// So conversion in memory uses these, but I also implement a wider version for
-// rearranging B.
-//
-// Convert to 16-bit signed integers.
-void AVX512_16bit::Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
- assert(size % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
- // Fill with the quantization multiplier.
- const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 16, output += 16) {
- // There doesn't seem to be an unmasked version.
- _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, QuantizerGrab(input, quant_mult_reg));
- }
-}
-
-// Convert to 8-bit signed integers.
-void AVX512_8bit::Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
- assert(size % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
- const __m512i neg127 = _mm512_set1_epi32(-127);
- const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
- const float *end = input + size;
- for (; input < end; input += 16, output += 16) {
- __m512i asint = QuantizerGrab(input, quant_mult_reg);
- asint = _mm512_max_epi32(asint, neg127);
- // There doesn't seem to be an unmasked version.
- _mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint);
- }
-}
-
-namespace {
-
-// For PrepareB we want to read 8 columns at a time. When converting 32-bit
-// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes
-// wide so it reads off the edge of the tile. We could expand the tile size
-// but then the memory written to won't be contiguous anyway so we'd be doing a
-// scatter anyway. Easier to just read the 8 columns we wanted as 256 bits
-// concatenate.
-inline __m512 Concat(const __m256 first, const __m256 second) {
- // AVX512DQ but that goes with AVX512BW anyway.
- return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
-}
-
-// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
-inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
- __m512 appended = Concat(*reinterpret_cast<const __m256*>(input0), *reinterpret_cast<const __m256*>(input1));
- appended = _mm512_mul_ps(appended, quant_mult_reg);
- return _mm512_cvtps_epi32(appended);
-}
-
-// These are only used for reshaping due to the AVX512 instructions
-// _mm512_mask_cvtsepi32_storeu_epi16 and _mm512_mask_cvtsepi32_storeu_epi8
-// being used for the quantizer.
-class QuantizeTile16 {
- public:
- typedef __m512i Integer;
-
- explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
- inline __m512i ForReshape(const float *input, Index cols) {
- __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
- __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
- __m512i packed = _mm512_packs_epi32(g0, g1);
- // Permute within 256-bit lanes, so same as AVX2
- return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
- }
-
- private:
- const __m512 mult_reg_;
-};
-
-class QuantizeTile8 {
- public:
- typedef __m512i Integer;
-
- explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
- inline __m512i ForReshape(const float *input, Index cols) {
- // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
- const __m512i neg127 = _mm512_set1_epi8(-127);
- // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
- const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
-
- // 32-bit format.
- __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_);
- __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_);
- __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_);
- __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_);
- // Pack 32-bit to 16-bit.
- __m512i packed0 = _mm512_packs_epi32(g0, g1);
- __m512i packed1 = _mm512_packs_epi32(g2, g3);
- // Pack 16-bit to 8-bit.
- __m512i packed = _mm512_packs_epi16(packed0, packed1);
- // Ban -128.
- packed = _mm512_max_epi8(packed, neg127);
- // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
- return _mm512_permutexvar_epi32(shuffle_param, packed);
- }
-
- private:
- const __m512 mult_reg_;
-};
-
-} // namespace
-
-void AVX512_16bit::PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor16(input, output, QuantizeTile16(quant_mult), rows, cols);
-}
-
-void AVX512_16bit::SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
-}
-
-void AVX512_8bit::PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor8(input, output, QuantizeTile8(quant_mult), rows, cols);
-}
-
-void AVX512_8bit::SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
-}
-
-void AVX512_16bit::Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- // The unquantization is only 256-bit wide because there are 8 results.
- Multiply16<__m512i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
-}
-
-// Special AVX512 implementation due to having 32 registers (so I don't have to
-// allocate registers manually) and no sign instruction.
-void AVX512_8bit::Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- typedef __m512i Integer;
- typedef __m256 Float; // For quantization we only do 8 at a time.
- // This is copy-paste from Multiply8_SSE2OrAVX2.
- assert(width % sizeof(Integer) == 0);
- assert(B_cols % 8 == 0);
- assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0);
- assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0);
- assert(reinterpret_cast<uintptr_t>(C) % sizeof(Integer) == 0);
- Float unquant_reg = set1_ps<Float>(unquant_mult);
- const int simd_width = width / sizeof(Integer);
- const Integer *B0_col = reinterpret_cast<const Integer*>(B);
- // Added for AVX512.
- Integer zeros = setzero_si<Integer>();
- // Go over 8 columns of B at a time.
- for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) {
- // Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
- for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
- // Iterate over shared (inner) dimension.
- const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width);
- const Integer *A_end = A_live + simd_width;
- const Integer *B_live = B0_col;
-
- // Do the first iteration to initialize the sums.
- __m512i a = *A_live;
- __mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
- __m512i a_positive = _mm512_abs_epi8(a);
- // These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.
- Integer sum0 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[0], neg_mask, zeros, B_live[0]));
- Integer sum1 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[1], neg_mask, zeros, B_live[1]));
- Integer sum2 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[2], neg_mask, zeros, B_live[2]));
- Integer sum3 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[3], neg_mask, zeros, B_live[3]));
- Integer sum4 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[4], neg_mask, zeros, B_live[4]));
- Integer sum5 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[5], neg_mask, zeros, B_live[5]));
- Integer sum6 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[6], neg_mask, zeros, B_live[6]));
- Integer sum7 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[7], neg_mask, zeros, B_live[7]));
-
- ++A_live;
- B_live += 8;
-
- // Use A as the loop variable so the add can be done where gcc likes it
- // for branch prediction.
- for (; A_live != A_end; ++A_live, B_live += 8) {
- // Unique code here: can we do an inline function?
- // Retrieve a. We will use this as the unsigned part.
- a = *A_live;
- // Retrieve the conveniently consecutive values of B.
- __m512i b0 = *B_live;
- __m512i b1 = *(B_live + 1);
- __m512i b2 = *(B_live + 2);
- __m512i b3 = *(B_live + 3);
- __m512i b4 = *(B_live + 4);
- __m512i b5 = *(B_live + 5);
- __m512i b6 = *(B_live + 6);
- __m512i b7 = *(B_live + 7);
-
- // Get a mask where a is negative.
- // Didn't seem to make a difference defining sign bits here vs at top
- neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
- a_positive = _mm512_abs_epi8(a);
-
- // Negate by subtracting from zero with a mask.
- b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0);
- b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1);
- b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2);
- b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3);
- b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4);
- b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5);
- b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6);
- b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7);
- // The magic 8-bit multiply then horizontal sum into 16-bit.
- b0 = _mm512_maddubs_epi16(a_positive, b0);
- b1 = _mm512_maddubs_epi16(a_positive, b1);
- b2 = _mm512_maddubs_epi16(a_positive, b2);
- b3 = _mm512_maddubs_epi16(a_positive, b3);
- b4 = _mm512_maddubs_epi16(a_positive, b4);
- b5 = _mm512_maddubs_epi16(a_positive, b5);
- b6 = _mm512_maddubs_epi16(a_positive, b6);
- b7 = _mm512_maddubs_epi16(a_positive, b7);
- // Now we have 16-bit results that are the sum of two multiplies.
- // Choosing to approximate and do adds.
- // Perhaps every so often we could accumulate by upcasting.
- sum0 = _mm512_adds_epi16(sum0, b0);
- sum1 = _mm512_adds_epi16(sum1, b1);
- sum2 = _mm512_adds_epi16(sum2, b2);
- sum3 = _mm512_adds_epi16(sum3, b3);
- sum4 = _mm512_adds_epi16(sum4, b4);
- sum5 = _mm512_adds_epi16(sum5, b5);
- sum6 = _mm512_adds_epi16(sum6, b6);
- sum7 = _mm512_adds_epi16(sum7, b7);
- // Unique code ends: can we do an inline function?
- }
- // Upcast to 32-bit and horizontally add.
- Integer ones = set1_epi16<Integer>(1);
- sum0 = madd_epi16(sum0, ones);
- sum1 = madd_epi16(sum1, ones);
- sum2 = madd_epi16(sum2, ones);
- sum3 = madd_epi16(sum3, ones);
- sum4 = madd_epi16(sum4, ones);
- sum5 = madd_epi16(sum5, ones);
- sum6 = madd_epi16(sum6, ones);
- sum7 = madd_epi16(sum7, ones);
- Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3);
- Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7);
-
- auto total = PermuteSummer(pack0123, pack4567);
- WriteC(C + A_rowidx * B_cols + B0_colidx, total, unquant_reg);
- }
- }
-}
-
-const char *const AVX512_16bit::kName = "16-bit AVX512";
-const char *const AVX512_8bit::kName = "8-bit AVX512";
-
-float AVX512_MaxAbsolute(const float *begin, const float *end) {
- return MaxAbsoluteBackend<__m512>(begin, end);
-}
-
-} // namespace intgemm
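
The 8-bit Multiply above works around VPMADDUBSW taking an unsigned first operand and AVX512 lacking a sign instruction: it takes |a| and, wherever a byte of a was negative, negates the corresponding byte of b with a masked subtract from zero, leaving each product unchanged. A scalar illustration (not from the commit) of that identity:

// a*b == |a| * (a < 0 ? -b : b). Quantize clamps to -127, so neither the
// abs() nor the negation can overflow an int8_t here.
#include <cstdint>
#include <cstdlib>

inline int32_t SignedProductViaUnsigned(int8_t a, int8_t b) {
  uint8_t a_positive = static_cast<uint8_t>(std::abs(static_cast<int>(a)));
  int8_t b_flipped = (a < 0) ? static_cast<int8_t>(-b) : b;
  return static_cast<int32_t>(a_positive) * static_cast<int32_t>(b_flipped);
}
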
diff --git a/avx512_gemm.h b/avx512_gemm.h
index f9b0f81..e226686 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -1,6 +1,15 @@
#pragma once
#include <stdint.h>
#include <cstdint>
+#include <cassert>
+#include <cstddef>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "interleave.h"
+#include "multiply.h"
+#include "cops.h"
#include "types.h"
@@ -15,31 +24,140 @@
namespace intgemm {
+// AVX512 has combined collapse and store instructions:
+// _mm512_mask_cvtsepi32_storeu_epi16
+// _mm512_mask_cvtsepi32_storeu_epi8
+// So conversion in memory uses these, but I also implement a wider version for
+// rearranging B.
+
+// Convert to 16-bit signed integers.
+namespace avx512f {
+
+// Load from memory, multiply, and convert to int32_t.
+AVX512F inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
+ // Multiply each by the quantization factor.
+ __m512 val = _mm512_mul_ps(*reinterpret_cast<const __m512*>(input), quant_mult_reg);
+ // Cast to 32-bit int
+ return _mm512_cvtps_epi32(val);
+}
+
+// For PrepareB we want to read 8 columns at a time. When converting 32-bit
+// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes
+// wide so it reads off the edge of the tile. We could expand the tile size
+// but then the memory written to won't be contiguous anyway so we'd be doing a
+// scatter anyway. Easier to just read the 8 columns we wanted as 256 bits
+// concatenate.
+AVX512F inline __m512 Concat(const __m256 first, const __m256 second) {
+ // AVX512DQ but that goes with AVX512BW anyway.
+ return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
+}
+
+// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
+AVX512F inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
+ __m512 appended = avx512f::Concat(*reinterpret_cast<const __m256*>(input0), *reinterpret_cast<const __m256*>(input1));
+ appended = _mm512_mul_ps(appended, quant_mult_reg);
+ return _mm512_cvtps_epi32(appended);
+}
+
+// These are only used for reshaping due to the AVX512 instructions
+// _mm512_mask_cvtsepi32_storeu_epi16 and _mm512_mask_cvtsepi32_storeu_epi8
+// being used for the quantizer.
+class QuantizeTile16 {
+ public:
+ typedef __m512i Integer;
+
+ explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+
+ AVX512F inline __m512i ForReshape(const float *input, Index cols) {
+ __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
+ __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
+ __m512i packed = _mm512_packs_epi32(g0, g1);
+ // Permute within 256-bit lanes, so same as AVX2
+ return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
+ }
+
+ private:
+ const __m512 mult_reg_;
+};
+
+class QuantizeTile8 {
+ public:
+ typedef __m512i Integer;
+
+ explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+
+ AVX512F inline __m512i ForReshape(const float *input, Index cols) {
+ // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
+ const __m512i neg127 = _mm512_set1_epi8(-127);
+ // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
+ const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+
+ // 32-bit format.
+ __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_);
+ __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_);
+ __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_);
+ __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_);
+ // Pack 32-bit to 16-bit.
+ __m512i packed0 = _mm512_packs_epi32(g0, g1);
+ __m512i packed1 = _mm512_packs_epi32(g2, g3);
+ // Pack 16-bit to 8-bit.
+ __m512i packed = _mm512_packs_epi16(packed0, packed1);
+ // Ban -128.
+ packed = _mm512_max_epi8(packed, neg127);
+ // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
+ return _mm512_permutexvar_epi32(shuffle_param, packed);
+ }
+
+ private:
+ const __m512 mult_reg_;
+};
+
+} // namespace
+
struct AVX512_16bit {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
// rows * cols must be a multiple of 16.
- static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ AVX512F static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// size must be a multiple of 16.
- static void Quantize(const float *input, int16_t *output, float quant_mult, Index size);
+ // Convert to 16-bit signed integers.
+ AVX512F static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+ assert(size % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
+ // Fill with the quantization multiplier.
+ const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 16, output += 16) {
+ // There doesn't seem to be an unmasked version.
+ _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, avx512f::QuantizerGrab(input, quant_mult_reg));
+ }
+ }
+
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+ AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor16(input, output, avx512f::QuantizeTile16(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ AVX512F static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
+ }
- static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ AVX512F static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ // The unquantization is only 256-bit wide because there are 8 results.
+ Multiply16<__m512i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "16-bit AVX512";
static const CPUType kUses = CPU_AVX512BW;
};
@@ -48,29 +166,159 @@ struct AVX512_8bit {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ AVX512F static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
- static void Quantize(const float *input, int8_t *output, float quant_mult, Index size);
+ // Convert to 8-bit signed integers.
+ AVX512F static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+ assert(size % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
+ const __m512i neg127 = _mm512_set1_epi32(-127);
+ const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
+ const float *end = input + size;
+ for (; input < end; input += 16, output += 16) {
+ __m512i asint = avx512f::QuantizerGrab(input, quant_mult_reg);
+ asint = _mm512_max_epi32(asint, neg127);
+ // There doesn't seem to be an unmasked version.
+ _mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 64;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
+ AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor8(input, output, avx512f::QuantizeTile8(quant_mult), rows, cols);
+ }
+
+ AVX512F static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
+ }
+
+ // Special AVX512 implementation due to having 32 registers (so I don't have to
+ // allocate registers manually) and no sign instruction.
+ AVX512F static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ typedef __m512i Integer;
+ typedef __m256 Float; // For quantization we only do 8 at a time.
+ // This is copy-paste from Multiply8_SSE2OrAVX2.
+ assert(width % sizeof(Integer) == 0);
+ assert(B_cols % 8 == 0);
+ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0);
+ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0);
+ assert(reinterpret_cast<uintptr_t>(C) % sizeof(Integer) == 0);
+ Float unquant_reg = set1_ps<Float>(unquant_mult);
+ const int simd_width = width / sizeof(Integer);
+ const Integer *B0_col = reinterpret_cast<const Integer*>(B);
+ // Added for AVX512.
+ Integer zeros = setzero_si<Integer>();
+ // Go over 8 columns of B at a time.
+ for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) {
+ // Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
+ for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
+ // Iterate over shared (inner) dimension.
+ const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width);
+ const Integer *A_end = A_live + simd_width;
+ const Integer *B_live = B0_col;
+
+ // Do the first iteration to initialize the sums.
+ __m512i a = *A_live;
+ __mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
+ __m512i a_positive = _mm512_abs_epi8(a);
+ // These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.
+ Integer sum0 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[0], neg_mask, zeros, B_live[0]));
+ Integer sum1 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[1], neg_mask, zeros, B_live[1]));
+ Integer sum2 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[2], neg_mask, zeros, B_live[2]));
+ Integer sum3 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[3], neg_mask, zeros, B_live[3]));
+ Integer sum4 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[4], neg_mask, zeros, B_live[4]));
+ Integer sum5 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[5], neg_mask, zeros, B_live[5]));
+ Integer sum6 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[6], neg_mask, zeros, B_live[6]));
+ Integer sum7 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[7], neg_mask, zeros, B_live[7]));
- static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ ++A_live;
+ B_live += 8;
- static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ // Use A as the loop variable so the add can be done where gcc likes it
+ // for branch prediction.
+ for (; A_live != A_end; ++A_live, B_live += 8) {
+ // Unique code here: can we do an inline function?
+ // Retrieve a. We will use this as the unsigned part.
+ a = *A_live;
+ // Retrieve the conveniently consecutive values of B.
+ __m512i b0 = *B_live;
+ __m512i b1 = *(B_live + 1);
+ __m512i b2 = *(B_live + 2);
+ __m512i b3 = *(B_live + 3);
+ __m512i b4 = *(B_live + 4);
+ __m512i b5 = *(B_live + 5);
+ __m512i b6 = *(B_live + 6);
+ __m512i b7 = *(B_live + 7);
+
+ // Get a mask where a is negative.
+ // Didn't seem to make a difference defining sign bits here vs at top
+ neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
+ a_positive = _mm512_abs_epi8(a);
+
+ // Negate by subtracting from zero with a mask.
+ b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0);
+ b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1);
+ b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2);
+ b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3);
+ b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4);
+ b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5);
+ b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6);
+ b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7);
+ // The magic 8-bit multiply then horizontal sum into 16-bit.
+ b0 = _mm512_maddubs_epi16(a_positive, b0);
+ b1 = _mm512_maddubs_epi16(a_positive, b1);
+ b2 = _mm512_maddubs_epi16(a_positive, b2);
+ b3 = _mm512_maddubs_epi16(a_positive, b3);
+ b4 = _mm512_maddubs_epi16(a_positive, b4);
+ b5 = _mm512_maddubs_epi16(a_positive, b5);
+ b6 = _mm512_maddubs_epi16(a_positive, b6);
+ b7 = _mm512_maddubs_epi16(a_positive, b7);
+ // Now we have 16-bit results that are the sum of two multiplies.
+ // Choosing to approximate and do adds.
+ // Perhaps every so often we could accumulate by upcasting.
+ sum0 = _mm512_adds_epi16(sum0, b0);
+ sum1 = _mm512_adds_epi16(sum1, b1);
+ sum2 = _mm512_adds_epi16(sum2, b2);
+ sum3 = _mm512_adds_epi16(sum3, b3);
+ sum4 = _mm512_adds_epi16(sum4, b4);
+ sum5 = _mm512_adds_epi16(sum5, b5);
+ sum6 = _mm512_adds_epi16(sum6, b6);
+ sum7 = _mm512_adds_epi16(sum7, b7);
+ // Unique code ends: can we do an inline function?
+ }
+ // Upcast to 32-bit and horizontally add.
+ Integer ones = set1_epi16<Integer>(1);
+ sum0 = madd_epi16(sum0, ones);
+ sum1 = madd_epi16(sum1, ones);
+ sum2 = madd_epi16(sum2, ones);
+ sum3 = madd_epi16(sum3, ones);
+ sum4 = madd_epi16(sum4, ones);
+ sum5 = madd_epi16(sum5, ones);
+ sum6 = madd_epi16(sum6, ones);
+ sum7 = madd_epi16(sum7, ones);
+ Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3);
+ Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7);
+
+ auto total = PermuteSummer(pack0123, pack4567);
+ WriteC(C + A_rowidx * B_cols + B0_colidx, total, unquant_reg);
+ }
+ }
+}
- static const char *const kName;
+ constexpr static const char *const kName = "8-bit AVX512";
static const CPUType kUses = CPU_AVX512BW;
};
-float AVX512_MaxAbsolute(const float *begin_float, const float *end_float);
+AVX512F float AVX512_MaxAbsolute(const float *begin, const float *end) {
+ return MaxAbsoluteBackend<__m512>(begin, end);
+}
} // namespace intgemm
diff --git a/cops.h b/cops.h
index 5fb67bd..b93f4e9 100644
--- a/cops.h
+++ b/cops.h
@@ -1,27 +1,10 @@
+#pragma once
#include "intrinsics.h"
#include <exception>
namespace intgemm {
-// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
-class UnsupportedCPU : public std::exception {
- public:
- UnsupportedCPU();
-
- ~UnsupportedCPU() throw();
-
- const char *what() const throw() override;
-};
-
-UnsupportedCPU::UnsupportedCPU() {}
-
-UnsupportedCPU::~UnsupportedCPU() throw() {}
-
-const char *UnsupportedCPU::what() const throw() {
- return "Integer matrix multiplication has not been efficiently implemented for your CPU.";
-}
-
class JustUnquantizeC {
public:
JustUnquantizeC(float *C, float unquant_mult);
diff --git a/intgemm.cc b/intgemm.cc
index d286c1a..f50a957 100644
--- a/intgemm.cc
+++ b/intgemm.cc
@@ -1,118 +1 @@
#include "intgemm.h"
-
-#include "types.h"
-#include "sse2_gemm.h"
-#include "ssse3_gemm.h"
-#include "avx2_gemm.h"
-#ifndef INTGEMM_NO_AVX512
-#include "avx512_gemm.h"
-#endif
-
-namespace intgemm {
-
-UnsupportedCPU::UnsupportedCPU() {}
-
-UnsupportedCPU::~UnsupportedCPU() throw() {}
-
-const char *UnsupportedCPU::what() const throw() {
- return "Integer matrix multiplication has not been efficiently implemented for your CPU.";
-}
-
-namespace {
-
-struct Unsupported_16bit {
- static void Quantize(const float *, int16_t *, float, Index) {
- throw UnsupportedCPU();
- }
- static void PrepareB(const float *, int16_t *, float, Index, Index) {
- throw UnsupportedCPU();
- }
- static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
- throw UnsupportedCPU();
- }
- static void Multiply(const int16_t *, const int16_t *, float *, float, Index, Index, Index) {
- throw UnsupportedCPU();
- }
- static const char *const kName;
-};
-const char *const Unsupported_16bit::kName = "16-bit Unsupported";
-
-struct Unsupported_8bit {
- static void Quantize(const float *, int8_t *, float, Index) {
- throw UnsupportedCPU();
- }
- static void PrepareB(const float *, int8_t *, float, Index, Index) {
- throw UnsupportedCPU();
- }
- static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) {
- throw UnsupportedCPU();
- }
- static void Multiply(const int8_t *, const int8_t *, float *, float, Index, Index, Index) {
- throw UnsupportedCPU();
- }
- static const char *const kName;
-};
-const char *const Unsupported_8bit::kName = "8-bit Unsupported";
-
-float Unsupported_MaxAbsolute(const float *begin, const float *end) {
- throw UnsupportedCPU();
-}
-
-/* Returns:
- * avx512 if the CPU supports AVX512F (though really it should be AVX512BW, but
- * cloud providers lie). TODO: don't catch Knights processors with this.
- *
- * avx2 if the CPU supports AVX2
- *
- * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
- *
- * sse2 if the CPU supports SSE2
- *
- * unsupported otherwise
- */
-template <class T> T ChooseCPU(T avx512, T avx2, T ssse3, T sse2, T unsupported) {
- // TODO: don't catch Knights processors here!
-#ifndef INTGEMM_NO_AVX512
- if (__builtin_cpu_supports("avx512f")) {
- return avx512;
- }
-#endif
- if (__builtin_cpu_supports("avx2")) {
- return avx2;
- } else if (__builtin_cpu_supports("ssse3")) {
- return ssse3;
- } else if (__builtin_cpu_supports("sse2")) {
- return sse2;
- } else {
- return unsupported;
- }
-}
-
-#ifdef INTGEMM_NO_AVX512
-// These won't ever be called in this capacity, but it does let the code below compile.
-typedef Unsupported_16bit AVX512_16bit;
-typedef Unsupported_8bit AVX512_8bit;
-float AVX512_MaxAbsolute(const float *begin, const float *end) {
- throw UnsupportedCPU();
-}
-#endif
-
-} // namespace
-
-void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_16bit::Quantize, AVX2_16bit::Quantize, SSE2_16bit::Quantize, SSE2_16bit::Quantize, Unsupported_16bit::Quantize);
-void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB);
-void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
-void (*Int16::Multiply)(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_16bit::Multiply, AVX2_16bit::Multiply, SSE2_16bit::Multiply, SSE2_16bit::Multiply, Unsupported_16bit::Multiply);
-const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName);
-
-void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_8bit::Quantize, AVX2_8bit::Quantize, SSSE3_8bit::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
-void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_8bit::PrepareB, AVX2_8bit::PrepareB, SSSE3_8bit::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
-void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
-void (*Int8::Multiply)(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_8bit::Multiply, AVX2_8bit::Multiply, SSSE3_8bit::Multiply, Unsupported_8bit::Multiply, Unsupported_8bit::Multiply);
-const char *const Int8::kName = ChooseCPU(AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-
-const CPUType kCPU = ChooseCPU(CPU_AVX512BW, CPU_AVX2, CPU_SSSE3, CPU_SSE2, CPU_UNSUPPORTED);
-
-float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512_MaxAbsolute, AVX2_MaxAbsolute, SSE2_MaxAbsolute, SSE2_MaxAbsolute, Unsupported_MaxAbsolute);
-
-} // namespace intgemm
diff --git a/intgemm.h b/intgemm.h
index 58fa8cc..5818ce8 100644
--- a/intgemm.h
+++ b/intgemm.h
@@ -43,24 +43,94 @@
// Yes, both headers due to the debacle about int32_t
#include <cstdint>
#include <stdint.h>
-#include <exception>
#include "types.h"
+#include "sse2_gemm.h"
+#include "ssse3_gemm.h"
+#include "avx2_gemm.h"
+#ifndef INTGEMM_NO_AVX512
+#include "avx512_gemm.h"
+#endif
/* Dispatch to functions based on runtime CPUID. This adds one call-by-variable to each call. */
namespace intgemm {
-// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
-class UnsupportedCPU : public std::exception {
- public:
- UnsupportedCPU();
-
- ~UnsupportedCPU() throw();
+struct Unsupported_16bit {
+ static void Quantize(const float *, int16_t *, float, Index) {
+ throw UnsupportedCPU();
+ }
+ static void PrepareB(const float *, int16_t *, float, Index, Index) {
+ throw UnsupportedCPU();
+ }
+ static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
+ throw UnsupportedCPU();
+ }
+ static void Multiply(const int16_t *, const int16_t *, float *, float, Index, Index, Index) {
+ throw UnsupportedCPU();
+ }
+ constexpr static const char *const kName = "16-bit Unsupported";
+};
- const char *what() const throw() override;
+struct Unsupported_8bit {
+ static void Quantize(const float *, int8_t *, float, Index) {
+ throw UnsupportedCPU();
+ }
+ static void PrepareB(const float *, int8_t *, float, Index, Index) {
+ throw UnsupportedCPU();
+ }
+ static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) {
+ throw UnsupportedCPU();
+ }
+ static void Multiply(const int8_t *, const int8_t *, float *, float, Index, Index, Index) {
+ throw UnsupportedCPU();
+ }
+ constexpr static const char *const kName = "8-bit Unsupported";
};
+float Unsupported_MaxAbsolute(const float *begin, const float *end) {
+ throw UnsupportedCPU();
+}
+
+#ifdef INTGEMM_NO_AVX512
+// These won't ever be called in this capacity, but it does let the code below compile.
+typedef Unsupported_16bit AVX512_16bit;
+typedef Unsupported_8bit AVX512_8bit;
+float AVX512_MaxAbsolute(const float *begin, const float *end) {
+ throw UnsupportedCPU();
+}
+#endif
+
+/* Returns:
+ * avx512 if the CPU supports AVX512F (though really it should be AVX512BW, but
+ * cloud providers lie). TODO: don't catch Knights processors with this.
+ *
+ * avx2 if the CPU supports AVX2
+ *
+ * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
+ *
+ * sse2 if the CPU supports SSE2
+ *
+ * unsupported otherwise
+ */
+template <class T> T ChooseCPU(T avx512, T avx2, T ssse3, T sse2, T unsupported) {
+ // TODO: don't catch Knights processors here!
+#ifndef INTGEMM_NO_AVX512
+ if (__builtin_cpu_supports("avx512f")) {
+ return avx512;
+ }
+#endif
+ if (__builtin_cpu_supports("avx2")) {
+ return avx2;
+ } else if (__builtin_cpu_supports("ssse3")) {
+ return ssse3;
+ } else if (__builtin_cpu_supports("sse2")) {
+ return sse2;
+ } else {
+ return unsupported;
+ }
+}
+
/* 16-bit matrix multiplication. */
struct Int16 {
typedef int16_t Integer;
@@ -96,6 +166,13 @@ struct Int16 {
static const char *const kName;
};
+void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_16bit::Quantize, AVX2_16bit::Quantize, SSE2_16bit::Quantize, SSE2_16bit::Quantize, Unsupported_16bit::Quantize);
+void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB);
+void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
+void (*Int16::Multiply)(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_16bit::Multiply, AVX2_16bit::Multiply, SSE2_16bit::Multiply, SSE2_16bit::Multiply, Unsupported_16bit::Multiply);
+const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName);
+
+
/* 8-bit matrix multiplication */
struct Int8 {
typedef int8_t Integer;
@@ -130,7 +207,17 @@ struct Int8 {
static const char *const kName;
};
+void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_8bit::Quantize, AVX2_8bit::Quantize, SSSE3_8bit::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
+void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_8bit::PrepareB, AVX2_8bit::PrepareB, SSSE3_8bit::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
+void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
+void (*Int8::Multiply)(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_8bit::Multiply, AVX2_8bit::Multiply, SSSE3_8bit::Multiply, Unsupported_8bit::Multiply, Unsupported_8bit::Multiply);
+const char *const Int8::kName = ChooseCPU(AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
+
+const CPUType kCPU = ChooseCPU(CPU_AVX512BW, CPU_AVX2, CPU_SSSE3, CPU_SSE2, CPU_UNSUPPORTED);
+
// Get the maximum absolute value of an array of floats. The number of floats must be a multiple of 16 and 64-byte aligned.
extern float (*MaxAbsolute)(const float *begin, const float *end);
+float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512_MaxAbsolute, AVX2_MaxAbsolute, SSE2_MaxAbsolute, SSE2_MaxAbsolute, Unsupported_MaxAbsolute);
+
} // namespace intgemm
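
With the dispatch now in the header, calling code only needs intgemm.h. A hypothetical end-to-end sketch using the 16-bit path; the buffer handling and the 64-byte alignment choice are this example's, not the commit's, and the asserts in the kernels require aligned inputs with sizes that are multiples of the tile dimensions:

// Hypothetical usage sketch. Quantize, PrepareB and Multiply are the
// dispatched function pointers defined above. A and B are row-major floats,
// assumed 64-byte aligned; allocation sizes are assumed to be multiples of 64
// so std::aligned_alloc is valid.
#include <cstdint>
#include <cstdlib>
#include "intgemm.h"

void Example(const float *A, const float *B, float *C,
             intgemm::Index A_rows, intgemm::Index width,
             intgemm::Index B_cols, float quant_mult) {
  int16_t *A_q = static_cast<int16_t*>(
      std::aligned_alloc(64, A_rows * width * sizeof(int16_t)));
  int16_t *B_q = static_cast<int16_t*>(
      std::aligned_alloc(64, width * B_cols * sizeof(int16_t)));
  intgemm::Int16::Quantize(A, A_q, quant_mult, A_rows * width);
  intgemm::Int16::PrepareB(B, B_q, quant_mult, width, B_cols);
  // Both operands were scaled by quant_mult, so unquantize by 1/quant_mult^2.
  intgemm::Int16::Multiply(A_q, B_q, C, 1.0f / (quant_mult * quant_mult),
                           A_rows, width, B_cols);
  std::free(A_q);
  std::free(B_q);
}
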
diff --git a/intrinsics.h b/intrinsics.h
index 7c2cf57..fcc4752 100644
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -33,13 +33,14 @@ template <> SSE2 inline __m128 set1_ps<__m128>(float to) {
SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
return _mm_madd_epi16(first, second);
}
-SSE2 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
+SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
return _mm_maddubs_epi16(first, second);
}
-SSE2 static inline __m128i sign_epi8(__m128i first, __m128i second) {
+SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
return _mm_sign_epi8(first, second);
}
-SSE2 static inline __m128i abs_epi8(__m128i arg) {
+
+SSSE3 static inline __m128i abs_epi8(__m128i arg) {
return _mm_abs_epi8(arg);
}
SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
diff --git a/multiply.h b/multiply.h
index 3e19bc8..9b7bf61 100644
--- a/multiply.h
+++ b/multiply.h
@@ -17,7 +17,7 @@ static inline float MaxFloat32(__m128 a) {
return *reinterpret_cast<float*>(&a);
}
-static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
+SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
// No op for 128 bits: already reduced fully.
MultiplyResult128 ret;
ret.pack0123 = pack0123;
@@ -38,7 +38,7 @@ static inline float MaxFloat32(__m256 a) {
return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
}
-static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
+AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
// This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
__m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
// This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s
@@ -54,7 +54,7 @@ static inline void WriteC(float *to, __m256i total, __m256 unquant_reg) {
#endif
#ifdef __AVX512BW__
-static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
+AVX512F static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
  // Form [0th 128-bit register of pack0123, 0th 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567]
__m512i mix0 = _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6));
// Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567]
@@ -123,8 +123,8 @@ template <class Register> inline Register Pack0123(Register sum0, Register sum1,
// A_rows can be anything non-negative.
// width must be a multiple of the register size.
// B_cols must be a multiple of 8.
-//#define Multiply16(Integer, Annotate) \
- template <class WriteC> Annotate inline void Multiply16(const int16_t *A, const int16_t *B, WriteC functor, Index A_rows, Index width, Index B_cols) {
+//#define Multiply16(Integer, Annotate) \ //fd
+// template <class WriteC> Annotate inline void Multiply16(const int16_t *A, const int16_t *B, WriteC functor, Index A_rows, Index width, Index B_cols) {
//
template <class Integer, class WriteC> inline void Multiply16(const int16_t *A, const int16_t *B, WriteC functor, Index A_rows, Index width, Index B_cols) {
assert(width % (sizeof(Integer) / sizeof(int16_t)) == 0);
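
The assert on the last line above is the concrete form of the shape rule in the comment: width must be a multiple of the number of int16 lanes that fit in one register. A quick check of that arithmetic, assuming the AVX2 instantiation (Integer = __m256i); the sizes are only illustrative values, not taken from the library:

#include <immintrin.h>
#include <cassert>
#include <cstdint>
#include <cstddef>

int main() {
  using Integer = __m256i;
  const std::size_t lanes = sizeof(Integer) / sizeof(int16_t);  // 32 / 2 = 16 lanes
  const std::size_t width = 256, B_cols = 8;                    // A_rows may be anything non-negative
  assert(width % lanes == 0);  // width must be a multiple of the register size
  assert(B_cols % 8 == 0);     // B_cols must be a multiple of 8
  return 0;
}
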
diff --git a/sse2_gemm.cc b/sse2_gemm.cc
index 0ef353a..2fa5596 100644
--- a/sse2_gemm.cc
+++ b/sse2_gemm.cc
@@ -1,77 +1,2 @@
// This is only 16-bit. 8-bit is in ssse3_gemm.cc since it requires SSSE3.
#include "sse2_gemm.h"
-#include "cops.h"
-
-#include "interleave.h"
-#include "multiply.h"
-
-#include <stdint.h>
-#include <cassert>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-namespace intgemm {
-
-namespace {
-// Same implementation as AVX512, just width. Grabs 4 32-bit values.
-inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
- return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
-}
-
-class QuantizeTile16 {
- public:
- typedef __m128i Integer;
-
- explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
- // Quantize 8xfloat into 8xint16_t
- inline __m128i Consecutive(const float *input) {
- __m128i g0 = QuantizerGrab(input, mult_reg_);
- __m128i g1 = QuantizerGrab(input + 4, mult_reg_);
- return _mm_packs_epi32(g0, g1);
- }
-
- inline __m128i ForReshape(const float *input, int) {
- return Consecutive(input);
- }
-
- private:
- const __m128 mult_reg_;
-};
-} // namespace
-
-/* I also tried an implementation based on _mm_cvtps_pi16 but it was slower:
- * For size 1048576, run 10x in seconds on i7-6700:
- * This code: 0.00228409, 0.00204906
- * With _mm_cvtps_pi16 basis: 0.00391884, 0.00390869
- */
-void SSE2_16bit::Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
- assert(size % 8 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
- QuantizeTile16 q(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 8, output += 8) {
- *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
- }
-}
-
-void SSE2_16bit::PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor16(input, output, QuantizeTile16(quant_mult), rows, cols);
-}
-
-void SSE2_16bit::SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
-}
-
-void SSE2_16bit::Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- Multiply16<__m128i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
-}
-
-const char *const SSE2_16bit::kName = "16-bit SSE2";
-
-float SSE2_MaxAbsolute(const float *begin, const float *end) {
- return MaxAbsoluteBackend<__m128>(begin, end);
-}
-
-} // namespace intgemm
diff --git a/sse2_gemm.h b/sse2_gemm.h
index 0f26362..7a5e5ce 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -2,37 +2,87 @@
#include "types.h"
#include <cstdint>
#include <stdint.h>
+#include "cops.h"
+#include "multiply.h"
// 8 bit is in ssse3_gemm.h
namespace intgemm {
+namespace sse2 {
+// Same implementation as AVX512, just a narrower width. Grabs 4 32-bit values.
+// TODO: this function is duplicated; de-duplicating it requires removing the anonymous namespace.
+SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+ return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
+}
+
+class QuantizeTile16 {
+ public:
+ typedef __m128i Integer;
+
+ explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+
+ // Quantize 8xfloat into 8xint16_t
+ SSE2 inline __m128i Consecutive(const float *input) {
+ __m128i g0 = QuantizerGrab(input, mult_reg_);
+ __m128i g1 = QuantizerGrab(input + 4, mult_reg_);
+ return _mm_packs_epi32(g0, g1);
+ }
+
+ SSE2 inline __m128i ForReshape(const float *input, int) {
+ return Consecutive(input);
+ }
+
+ private:
+ const __m128 mult_reg_;
+};
+} // namespace sse2
// This should be pure SSE2 (and below).
struct SSE2_16bit {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- static void Quantize(const float *input, int16_t *output, float quant_mult, Index size);
+ SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+ assert(size % 8 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
+ sse2::QuantizeTile16 q(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 8, output += 8) {
+ *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 8;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+ SSE2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ //TODO #DEFINE
+ PrepareBFor16(input, output, sse2::QuantizeTile16(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ //TODO #DEFINE
+ SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
+ }
- static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ SSE2 static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ //TODO #DEFINE
+ Multiply16<__m128i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "16-bit SSE2";
static const CPUType kUses = CPU_SSE2;
};
// Technically only requires SSE
-float SSE2_MaxAbsolute(const float *begin, const float *end);
+SSE2 float SSE2_MaxAbsolute(const float *begin, const float *end) {
+ return MaxAbsoluteBackend<__m128>(begin, end);
+}
} // namespace intgemm
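
Per element, the Quantize loop moved into this header does what the following scalar sketch spells out: scale by quant_mult, round to the nearest integer (cvtps_epi32 under the default round-to-nearest-even mode), and saturate to int16 (packs_epi32). The helper name is illustrative, not an intgemm symbol:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar model of one lane of SSE2_16bit::Quantize, as a sketch.
static int16_t QuantizeOne(float value, float quant_mult) {
  long rounded = std::lrintf(value * quant_mult);           // cvtps_epi32: round to nearest, ties to even
  rounded = std::max(-32768L, std::min(32767L, rounded));   // packs_epi32: saturate to int16
  return static_cast<int16_t>(rounded);
}

int main() {
  return QuantizeOne(0.75f, 1024.0f) == 768 ? 0 : 1;  // 0.75 * 1024 = 768 exactly
}
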
diff --git a/ssse3_gemm.cc b/ssse3_gemm.cc
index d5de13d..32e5bd2 100644
--- a/ssse3_gemm.cc
+++ b/ssse3_gemm.cc
@@ -1,89 +1 @@
#include "ssse3_gemm.h"
-
-#include "interleave.h"
-#include "multiply.h"
-
-#include <stdint.h>
-#include <cassert>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-namespace intgemm {
-
-namespace {
-// Same implementation as AVX512, just width. Grabs 4 32-bit values.
-inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
- return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
-}
-
-class QuantizeTile8 {
- public:
- typedef __m128i Integer;
-
- explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
- inline __m128i ForReshape(const float *input, Index cols) {
- // Skip a row.
- return Tile(input, input + 2 * cols);
- }
-
- inline __m128i Consecutive(const float *input) {
- return Tile(input, input + 8);
- }
-
- private:
- // Quantize 16xfloat into 16xint8_t
- inline __m128i Tile(const float *input0, const float *input1) {
- const __m128i neg128 = _mm_set1_epi8(-128);
- __m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
- __m128i g2 = QuantizerGrab(input1, mult_reg_);
- __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_);
- __m128i packed0 = _mm_packs_epi32(g0, g1);
- __m128i packed1 = _mm_packs_epi32(g2, g3);
- __m128i packed = _mm_packs_epi16(packed0, packed1);
- /* Ban -128.
- * Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
- * use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
- * The first generates 0xff for fields -128.
- * The second subtracts 0xff from -128 which has the effect of converting
- * to -127.
- */
- // packed = _mm_max_epi8(packed, neg127);
- __m128i evils = _mm_cmpeq_epi8(packed, neg128);
- return _mm_sub_epi8(packed, evils);
- // No permute needed. packs is in order for SSE.
- }
-
- private:
- const __m128 mult_reg_;
-};
-
-} // namespace
-
-void SSSE3_8bit::Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
- assert(size % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
- QuantizeTile8 q(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 16, output += 16) {
- *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
- }
-}
-
-void SSSE3_8bit::PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor8(input, output, QuantizeTile8(quant_mult), rows, cols);
-}
-
-void SSSE3_8bit::SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
-}
-
-void SSSE3_8bit::Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- Multiply8_SSE2OrAVX2<Multiply8_C, __m128i, __m128>(A, B, C, unquant_mult, A_rows, width, B_cols);
-}
-
-const char *const SSSE3_8bit::kName = "8-bit SSSE3";
-
-} // namespace intgemm
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index 4993ef6..69ac298 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -3,32 +3,103 @@
#include <cstdint>
#include <stdint.h>
+#include "interleave.h"
+#include "multiply.h"
+
// 16-bit is in sse2_gemm.h
namespace intgemm {
+namespace ssse3 {
+// Same implementation as AVX512, just a narrower width. Grabs 4 32-bit values.
+// TODO: this function is duplicated; de-duplicating it requires removing the anonymous namespace.
+SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+ return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
+}
+
+class QuantizeTile8 {
+ public:
+ typedef __m128i Integer;
+
+ SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+
+ SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
+ // Skip a row.
+ return Tile(input, input + 2 * cols);
+ }
+
+ SSSE3 inline __m128i Consecutive(const float *input) {
+ return Tile(input, input + 8);
+ }
+
+ private:
+ // Quantize 16xfloat into 16xint8_t
+ SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
+ const __m128i neg128 = _mm_set1_epi8(-128);
+ __m128i g0 = QuantizerGrab(input0, mult_reg_);
+ __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
+ __m128i g2 = QuantizerGrab(input1, mult_reg_);
+ __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_);
+ __m128i packed0 = _mm_packs_epi32(g0, g1);
+ __m128i packed1 = _mm_packs_epi32(g2, g3);
+ __m128i packed = _mm_packs_epi16(packed0, packed1);
+ /* Ban -128.
+ * Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
+ * use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
+     * The first generates 0xff for fields that are -128.
+     * The second subtracts 0xff (i.e. -1) from those fields, which has the
+     * effect of converting -128 to -127.
+ */
+ // packed = _mm_max_epi8(packed, neg127);
+ __m128i evils = _mm_cmpeq_epi8(packed, neg128);
+ return _mm_sub_epi8(packed, evils);
+ // No permute needed. packs is in order for SSE.
+ }
+
+ private:
+ const __m128 mult_reg_;
+};
+
+} // namespace ssse3
+
+
// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
struct SSSE3_8bit {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- static void Quantize(const float *input, int8_t *output, float quant_mult, Index size);
+ SSSE3 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+ assert(size % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
+ ssse3::QuantizeTile8 q(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 16, output += 16) {
+ *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
+ SSSE3 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor8(input, output, ssse3::QuantizeTile8(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
+ }
- static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ SSSE3 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ Multiply8_SSE2OrAVX2<Multiply8_C, __m128i, __m128>(A, B, C, unquant_mult, A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "8-bit SSSE3";
static const CPUType kUses = CPU_SSSE3;
};
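
The Tile() comment above describes the SSE2-only way of banning -128, and the byte-wise arithmetic it relies on is easy to check in scalar form. A per-lane sketch (the function name is illustrative): cmpeq yields 0xff (-1) only where a byte equals -128, and subtracting -1 bumps exactly those bytes to -127 while leaving every other value untouched.

#include <cstdint>
#include <cstdio>

// Per-byte model of the _mm_cmpeq_epi8 / _mm_sub_epi8 pair in Tile().
static int8_t BanNeg128(int8_t packed) {
  const int8_t evils = (packed == -128) ? -1 : 0;  // _mm_cmpeq_epi8, one lane
  return static_cast<int8_t>(packed - evils);      // _mm_sub_epi8, one lane
}

int main() {
  std::printf("%d %d %d\n", BanNeg128(-128), BanNeg128(-127), BanNeg128(42));  // -127 -127 42
}
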
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 301254a..8d2f50d 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -1,4 +1,4 @@
-#include "avx512_gemm.h"
+//#include "avx512_gemm.h"
#include "avx2_gemm.h"
#include "ssse3_gemm.h"
#include "sse2_gemm.h"
@@ -73,7 +73,7 @@ TEST_CASE("Transpose 16", "[transpose]") {
}
}
-TEST_CASE("Transpose 8", "[transpose]") {
+SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
if (kCPU < CPU_SSSE3) return;
AlignedVector<int8_t> input(16 * 16);
for (int i = 0; i < 16 * 16; ++i) {
@@ -127,7 +127,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
"Quantized Input" << '\n' << PrintMatrix(quantized.get(), rows, cols) << "Reference" << '\n' <<
PrintMatrix(reference.get(), rows, cols) << "Routine" << '\n' << PrintMatrix(test.get(), rows, cols));
}
-
+/*
TEST_CASE("Prepare AVX512", "[prepare]") {
if (kCPU < CPU_AVX512BW) return;
#ifndef INTGEMM_NO_AVX512
@@ -137,7 +137,7 @@ TEST_CASE("Prepare AVX512", "[prepare]") {
TestPrepare<AVX512_16bit>(256, 32);
#endif
}
-
+*/
TEST_CASE("Prepare AVX2", "[prepare]") {
if (kCPU < CPU_AVX2) return;
TestPrepare<AVX2_8bit>(64, 32);
@@ -192,7 +192,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
CHECK_MESSAGE(memcmp(ref.get(), test.get(), sizeof(Integer) * rows * kSelectCols) == 0, "Reference:\n" <<
PrintMatrix(ref.get(), rows, kSelectCols) << PrintMatrix(test.get(), rows, kSelectCols));
}
-
+/*
TEST_CASE("SelectColumnsB AVX512", "[select]") {
if (kCPU < CPU_AVX512BW) return;
#ifndef INTGEMM_NO_AVX512
@@ -200,7 +200,7 @@ TEST_CASE("SelectColumnsB AVX512", "[select]") {
TestSelectColumnsB<AVX512_16bit>(256, 256);
#endif
}
-
+*/
TEST_CASE("SelectColumnsB AVX2", "[select]") {
if (kCPU < CPU_AVX2) return;
TestSelectColumnsB<AVX2_8bit>(256, 256);
@@ -411,7 +411,7 @@ TEST_CASE ("Multiply AVX2 16bit", "[multiply]") {
TestMultiply<AVX2_16bit>(248, 256, 256, .1, 1, 0.01);
TestMultiply<AVX2_16bit>(200, 256, 256, .1, 1, 0.01);
}
-
+/*
#ifndef INTGEMM_NO_AVX512
TEST_CASE ("Multiply AVX512 8bit", "[multiply]") {
if (kCPU < CPU_AVX512BW) return;
@@ -433,6 +433,7 @@ TEST_CASE ("Multiply AVX2 16bit", "[multiply]") {
TestMultiply<AVX512_16bit>(200, 256, 256, .1, 1, 0.01);
}
#endif
+ */
} // namespace intgemm
int main(int argc, char ** argv) {
diff --git a/types.h b/types.h
index a38d3c3..6ec5085 100644
--- a/types.h
+++ b/types.h
@@ -1,4 +1,5 @@
#pragma once
+#include <exception>
#define DEFAULT __attribute__ ((target ("default")))
#define SSE2 __attribute__ ((target ("sse2")))
@@ -6,10 +7,22 @@
#define SSSE3 __attribute__ ((target ("ssse3")))
#define AVX2 __attribute__ ((target ("avx2")))
//#define AVX2_512F __attribute__ ((target ("avx2"), target("avx512f"))) //Not supported by clang
-#define AVX512F __attribute__ ((target ("avx512f")))
+#define AVX512F __attribute__ ((target ("avx512bw")))
namespace intgemm {
+// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
+class UnsupportedCPU : public std::exception {
+ public:
+ UnsupportedCPU() {}
+
+ ~UnsupportedCPU() throw() {}
+
+ const char *what() const throw() override {
+ return "Integer matrix multiplication has not been efficiently implemented for your CPU.";
+ }
+};
+
typedef unsigned int Index;
// If you want to detect the CPU and dispatch yourself, here's what to use: