Diffstat (limited to 'intgemm/intgemm.h')
-rw-r--r-- | intgemm/intgemm.h | 427 |
1 files changed, 427 insertions, 0 deletions
diff --git a/intgemm/intgemm.h b/intgemm/intgemm.h
new file mode 100644
index 0000000..8e2da02
--- /dev/null
+++ b/intgemm/intgemm.h
@@ -0,0 +1,427 @@
+#pragma once
+/* Main interface for integer matrix multiplication.
+ *
+ * We are computing C = A * B with an optional scaling factor.
+ *
+ * A is typically activations.
+ *   Rows: a multiple of 1 (no restriction).
+ *   Columns: a multiple of 64 for 8-bit or 32 for 16-bit.
+ * Use PrepareA to prepare A for multiplication.  This is meant to be fast.
+ *
+ * B is typically fixed model parameters.
+ *   Rows: a multiple of 64 for 8-bit or 32 for 16-bit.
+ *   Columns: a multiple of 8.
+ * Use PrepareB to prepare B for multiplication.  This is slower, with the
+ * intention that it will be prepared once and remembered.
+ *
+ * C is row major.
+ *
+ * Once both A and B are prepared, call Multiply.
+ *
+ * All memory (A, B, and C in float or prepared form) must be 64-byte
+ * aligned.  It's easy to write code that works on your CPU with lower
+ * alignment, but breaks on AVX512.
+ *
+ * When preparing, you provide a quantization multiplier.  Values will be
+ * multiplied by this then rounded to an integer.
+ * For 16-bit neural networks, Jacob Devlin recommends 1024.0.
+ * For 8-bit, use 127 / largest absolute value.
+ *
+ * Note that quantization saturates.  However, 16-bit does accumulation in
+ * 32-bit, which can overflow if you use too big of a multiplier.
+ *
+ * The multiply routine expects an unquantization multiplier:
+ *   unquant_mult = 1.0 / (A_quant_mult * B_quant_mult)
+ * where A_quant_mult is what you passed to PrepareA and B_quant_mult is
+ * what you passed to PrepareB.
+ *
+ * Feel free to multiply in a scaling factor to compute C = \lambda A * B by
+ * passing unquant_mult = \lambda / (A_quant_mult * B_quant_mult).
+ */
+
+#include <cstdint>
+
+#include "intgemm/intgemm_config.h"
+#include "types.h"
+#include "sse2_gemm.h"
+#include "ssse3_gemm.h"
+#include "avx2_gemm.h"
+#include "avx512_gemm.h"
+#include "avx512vnni_gemm.h"
+
+#if defined(__INTEL_COMPILER)
+#include <immintrin.h>
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__GNUC__) || defined(__clang__)
+#include <cpuid.h>
+#endif
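+
+/* Quick-start sketch of the workflow described above, using the 16-bit
+ * path.  This is illustrative only: AlignedVector and
+ * callbacks::UnquantizeAndWrite are assumed to be the library's helpers
+ * from aligned.h and callbacks.h; adapt names to your setup.
+ *
+ *   using namespace intgemm;
+ *   const Index A_rows = 1, width = 64, B_cols = 8;
+ *   AlignedVector<float> A(A_rows * width), B(width * B_cols);
+ *   // ... fill A and B with floats ...
+ *   const float quant_mult = 1024.0f;  // 16-bit recommendation above
+ *   AlignedVector<int16_t> A_prepared(A.size()), B_prepared(B.size());
+ *   Int16::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
+ *   Int16::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
+ *   AlignedVector<float> C(A_rows * B_cols);
+ *   Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols,
+ *       callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
+ */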
+
+/* Dispatch to functions based on runtime CPUID.  This adds one
+ * call-by-variable to each call. */
+
+namespace intgemm {
+
+struct Unsupported_16bit {
+  static void Quantize(const float *, int16_t *, float, Index) {
+    throw UnsupportedCPU();
+  }
+  static void PrepareB(const float *, int16_t *, float, Index, Index) {
+    throw UnsupportedCPU();
+  }
+  static void PrepareBQuantizedTransposed(const int16_t *, int16_t *, Index, Index) {
+    throw UnsupportedCPU();
+  }
+  static void PrepareBTransposed(const float *, int16_t *, float, Index, Index) {
+    throw UnsupportedCPU();
+  }
+  static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
+    throw UnsupportedCPU();
+  }
+  template <typename Callback>
+  static void Multiply(const int16_t *, const int16_t *, Index, Index, Index, Callback) {
+    throw UnsupportedCPU();
+  }
+  constexpr static const char *const kName = "16-bit Unsupported";
+};
+
+struct Unsupported_8bit {
+  static void Quantize(const float *, int8_t *, float, Index) {
+    throw UnsupportedCPU();
+  }
+  static void QuantizeU(const float *, uint8_t *, float, Index) {
+    throw UnsupportedCPU();
+  }
+  static void PrepareA(const float *, int8_t *, float, Index, Index) {
+    throw UnsupportedCPU();
+  }
+  static void PrepareBQuantizedTransposed(const int8_t *, int8_t *, Index, Index) {
+    throw UnsupportedCPU();
+  }
+  static void PrepareBTransposed(const float *, int8_t *, float, Index, Index) {
+    throw UnsupportedCPU();
+  }
+  static void PrepareB(const float *, int8_t *, float, Index, Index) {
+    throw UnsupportedCPU();
+  }
+  template <class Callback>
+  static void PrepareBias(const int8_t *, Index, Index, Callback) {
+    throw UnsupportedCPU();
+  }
+  static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) {
+    throw UnsupportedCPU();
+  }
+  template <typename Callback>
+  static void Multiply(const int8_t *, const int8_t *, Index, Index, Index, Callback) {
+    throw UnsupportedCPU();
+  }
+  template <class Callback>
+  static void Multiply8Shift(const uint8_t *, const int8_t *, Index, Index, Index, Callback) {
+    throw UnsupportedCPU();
+  }
+
+  constexpr static const char *const kName = "8-bit Unsupported";
+};
+
+#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+// These won't ever be called in this capacity, but it does let the code below compile.
+namespace avx512vnni {
+typedef Unsupported_8bit Kernels8;
+} // namespace avx512vnni
+#endif
+#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+namespace avx512bw {
+typedef Unsupported_8bit Kernels8;
+typedef Unsupported_16bit Kernels16;
+} // namespace avx512bw
+#endif
+
+/* Returns:
+ * avx512vnni if the CPU supports AVX512VNNI
+ *
+ * avx512bw if the CPU supports AVX512BW
+ *
+ * avx2 if the CPU supports AVX2
+ *
+ * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
+ *
+ * sse2 if the CPU supports SSE2
+ *
+ * unsupported otherwise
+ */
+template <class T> T ChooseCPU(T
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+    avx512vnni
+#endif
+    , T
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+    avx512bw
+#endif
+    , T avx2, T ssse3, T sse2, T unsupported) {
+#if defined(__INTEL_COMPILER)
+# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+  if (_may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)) return avx512vnni;
+# endif
+# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+  if (_may_i_use_cpu_feature(_FEATURE_AVX512BW)) return avx512bw;
+# endif
+  if (_may_i_use_cpu_feature(_FEATURE_AVX2)) return avx2;
+  if (_may_i_use_cpu_feature(_FEATURE_SSSE3)) return ssse3;
+  if (_may_i_use_cpu_feature(_FEATURE_SSE2)) return sse2;
+  return unsupported;
+#else
+// Everybody except the Intel compiler.
+# if defined(_MSC_VER)
+  int regs[4];
+  int &eax = regs[0], &ebx = regs[1], &ecx = regs[2], &edx = regs[3];
+  __cpuid(regs, 0);
+  int m = eax;
+# else
+  /* gcc and clang.
+   * If intgemm is compiled by gcc 6.4.1 then dlopened into an executable
+   * compiled by gcc 7.3.0, there will be an undefined symbol __cpu_info.
+   * Work around this by calling the intrinsics more directly instead of
+   * __builtin_cpu_supports.
+   *
+   * clang 6.0.0-1ubuntu2 supports vnni but doesn't have
+   * __builtin_cpu_supports("avx512vnni"),
+   * so use the hand-coded CPUID for clang.
+   */
+  unsigned int m = __get_cpuid_max(0, 0);
+  unsigned int eax, ebx, ecx, edx;
+# endif
+  if (m >= 7) {
+# if defined(_MSC_VER)
+    __cpuid(regs, 7);
+# else
+    __cpuid_count(7, 0, eax, ebx, ecx, edx);
+# endif
+# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+    if (ecx & (1 << 11)) return avx512vnni;
+# endif
+# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+    if (ebx & (1 << 30)) return avx512bw;
+# endif
+    if (ebx & (1 << 5)) return avx2;
+  }
+  if (m >= 1) {
+# if defined(_MSC_VER)
+    __cpuid(regs, 1);
+# else
+    __cpuid_count(1, 0, eax, ebx, ecx, edx);
+# endif
+    if (ecx & (1 << 9)) return ssse3;
+    if (edx & (1 << 26)) return sse2;
+  }
+  return unsupported;
+#endif
+}
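+
+/* ChooseCPU is an ordinary template, so it can also select plain values.
+ * A minimal illustrative sketch (the string labels below are not part of
+ * the library; note that when AVX512 support is compiled out the first
+ * parameters become unnamed but are still present, so the call always
+ * takes six arguments):
+ *
+ *   const char *arch = ChooseCPU(
+ *       "avx512vnni", "avx512bw", "avx2", "ssse3", "sse2", "unsupported");
+ */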
+
+struct TileInfo {
+  const Index a_rows;
+  const Index a_cols;
+  const Index b_rows;
+  const Index b_cols;
+};
+
+/*
+ * 8-bit matrix multiplication
+ */
+struct Int8 {
+  using Integer = int8_t;
+
+  // A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
+  static constexpr TileInfo tile_info{1, 64, 64, 8};
+
+  // Currently A is prepared by quantization but this could theoretically change.
+  // A's columns must be a multiple of 8.
+  // The number of rows is anything.
+  static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+    Quantize(input, output, quant_mult, rows * cols);
+  }
+
+  // Multiply floats by quant_mult then convert to 8-bit integers with saturation.
+  static void (*Quantize)(const float *input, int8_t *output, float quant_mult, Index size);
+
+  // Multiply floats by quant_mult then convert to 8-bit integers with saturation.
+  // A version that adds 127 to each number, making sure that all numbers are positive.
+  static void (*QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size);
+
+  // Warning: the output of PrepareB depends on the CPU.
+  // It will match the Multiply function on the same CPU though.
+  static void (*PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
+
+  // Convert from a B that was already transposed (routine not provided) and
+  // quantized (e.g. with Quantize) to the CPU-dependent format used for
+  // Multiply.  This is useful for storing a quantized model on disk in a
+  // CPU-independent fashion.
+  static void (*PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols);
+
+  // Convert from a B that was already transposed (routine not provided) to
+  // the CPU-dependent format used for Multiply.  This is useful for storing
+  // a quantized model on disk in a CPU-independent fashion.
+  static void (*PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols);
+
+  // Select columns from a prepared B matrix.  The number of selected columns must be a multiple of 8.
+  static void (*SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+
+  // Multiply C = A * B, presuming A and B have been prepared.
+  template <typename Callback>
+  static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
+    MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
+  }
+
+  static const char *const kName;
+
+private:
+  template <typename Callback>
+  struct MultiplyImpl {
+    static void (*run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
+  };
+};
+
+template <typename Callback>
+void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(
+    OMPParallelWrap<Callback, avx512vnni::Kernels8>,
+    OMPParallelWrap<Callback, avx512bw::Kernels8>,
+    OMPParallelWrap<Callback, avx2::Kernels8>,
+    OMPParallelWrap<Callback, ssse3::Kernels8>,
+    Unsupported_8bit::Multiply<Callback>,
+    Unsupported_8bit::Multiply<Callback>);
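+
+/* Example: an 8-bit multiply, as an illustrative sketch.  MaxAbsolute
+ * (declared at the bottom of this header) supplies the recommended
+ * 127 / largest-absolute-value multiplier; AlignedVector and the
+ * UnquantizeAndWrite callback are assumed as in the sketch at the top:
+ *
+ *   const float quant_mult = 127.0f / MaxAbsolute(B.begin(), B.end());
+ *   Int8::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
+ *   Int8::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
+ *   Int8::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols,
+ *       callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
+ */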
+
+/*
+ * 8-bit matrix multiplication with shifting A by 127
+ */
+struct Int8Shift {
+  using Integer = int8_t;
+
+  // A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
+  static constexpr TileInfo tile_info{1, 64, 64, 8};
+
+  // Identical to the Int8 version, except it adds 127 to each number, making sure that all numbers are positive.
+  static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+    QuantizeU(input, reinterpret_cast<uint8_t *>(output), quant_mult, rows * cols);
+  }
+
+  // Multiply floats by quant_mult then convert to 8-bit integers with saturation.
+  // A version that adds 127 to each number, making sure that all numbers are positive.
+  static void (*QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size);
+
+  // Warning: the output of PrepareB depends on the CPU.
+  // It will match the Multiply function on the same CPU though.
+  static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+    Int8::PrepareB(input, output, quant_mult, rows, cols);
+  }
+
+  // Select columns from a prepared B matrix.  The number of selected columns must be a multiple of 8.
+  static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+    Int8::SelectColumnsB(input, output, rows, cols_begin, cols_end);
+  }
+
+  // A slightly faster version compared to the Int8 one (assuming a bias is used) because of better handling of the sign bit.
+  // Multiply C = A * B + Bias, presuming A, B, and Bias have all been prepared (for A, this class's PrepareA should be used).
+  template <class Callback>
+  static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
+    MultiplyImpl<Callback>::run((const uint8_t *)A, B, A_rows, width, B_cols, callback);
+  }
+
+  // This function prepares the bias for the Multiply routine that does unsigned * signed multiplication.
+  // The function takes:
+  //   a prepared B matrix, width, B_cols, and
+  //   the callback UnquantizeAndAddBiasAndWrite(unquant_mult, Bias_matrix, Bias_matrix),
+  // where unquant_mult is computed as (-1) * (alpha) * (alpha) / (127.0f).
+  template <class Callback>
+  static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) {
+    PrepareBiasImpl<Callback>::run(B, width, B_cols, callback);
+  }
+
+  static const char *const kName;
+
+private:
+  template <typename Callback>
+  struct MultiplyImpl {
+    static void (*run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
+  };
+
+  template <typename Callback>
+  struct PrepareBiasImpl {
+    static void (*run)(const int8_t *B, Index width, Index B_cols, Callback callback);
+  };
+};
+
+template <class Callback>
+void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(
+    OMPParallelWrap8Shift<Callback, avx512vnni::Kernels8>,
+    OMPParallelWrap8Shift<Callback, avx512bw::Kernels8>,
+    OMPParallelWrap8Shift<Callback, avx2::Kernels8>,
+    OMPParallelWrap8Shift<Callback, ssse3::Kernels8>,
+    Unsupported_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift<Callback>);
+
+template <class Callback>
+void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(
+    avx512vnni::Kernels8::PrepareBias<Callback>,
+    avx512bw::Kernels8::PrepareBias<Callback>,
+    avx2::Kernels8::PrepareBias<Callback>,
+    ssse3::Kernels8::PrepareBias<Callback>,
+    ssse3::Kernels8::PrepareBias<Callback>,
+    Unsupported_8bit::PrepareBias);
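+
+/* Example: the shifted 8-bit path, as an illustrative sketch.  The bias
+ * must be prepared first so Multiply can compensate for the +127 shift of
+ * A.  Here alpha is the largest absolute value, bias holds B_cols floats
+ * and is updated in place per the PrepareBias comment above, and the
+ * callback names are assumed from callbacks.h:
+ *
+ *   const float alpha = MaxAbsolute(B.begin(), B.end());
+ *   const float quant_mult = 127.0f / alpha;
+ *   Int8Shift::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
+ *   Int8Shift::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
+ *   Int8Shift::PrepareBias(B_prepared.begin(), width, B_cols,
+ *       callbacks::UnquantizeAndAddBiasAndWrite(-alpha * alpha / 127.0f, bias.begin(), bias.begin()));
+ *   Int8Shift::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols,
+ *       callbacks::UnquantizeAndAddBiasAndWrite(1.0f / (quant_mult * quant_mult), bias.begin(), C.begin()));
+ */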
+
+/*
+ * 16-bit matrix multiplication
+ */
+struct Int16 {
+  using Integer = int16_t;
+
+  // A's size must be a multiple of 1x32, B's size must be a multiple of 32x8.
+  static constexpr TileInfo tile_info{1, 32, 32, 8};
+
+  // Currently A is prepared by quantization but this could theoretically change.
+  // A's columns must be a multiple of 8.
+  // The number of rows is anything.
+  static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+    Quantize(input, output, quant_mult, rows * cols);
+  }
+
+  // Multiply floats by quant_mult then convert to 16-bit integers with saturation.
+  static void (*Quantize)(const float *input, int16_t *output, float quant_mult, Index size);
+
+  // Warning: the output of PrepareB depends on the CPU.
+  // It will match the Multiply function on the same CPU though.
+  static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+
+  // Convert from a B that was already transposed (routine not provided) and
+  // quantized (e.g. with Quantize) to the CPU-dependent format used for
+  // Multiply.  This is useful for storing a quantized model on disk in a
+  // CPU-independent fashion.
+  static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols);
+
+  // Convert from a B that was already transposed (routine not provided) to
+  // the CPU-dependent format used for Multiply.  This is useful for storing
+  // a quantized model on disk in a CPU-independent fashion.
+  static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols);
+
+  // Select columns from a prepared B matrix.  The number of selected columns must be a multiple of 8.
+  static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+
+  // Multiply C = A * B, presuming A and B have been prepared.
+  template <typename Callback>
+  static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
+    MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
+  }
+
+  static const char *const kName;
+
+private:
+  template <typename Callback>
+  struct MultiplyImpl {
+    static void (*run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
+  };
+};
+
+template <typename Callback>
+void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(
+    OMPParallelWrap<Callback, avx512bw::Kernels16> /*TODO: VNNI 16-bit. */,
+    OMPParallelWrap<Callback, avx512bw::Kernels16>,
+    OMPParallelWrap<Callback, avx2::Kernels16>,
+    OMPParallelWrap<Callback, sse2::Kernels16>,
+    OMPParallelWrap<Callback, sse2::Kernels16>,
+    Unsupported_16bit::Multiply<Callback>);
+
+extern const CPUType kCPU;
+
+// Get the maximum absolute value of an array of floats.  The number of floats must be a multiple of 16 and 64-byte aligned.
+extern float (*MaxAbsolute)(const float *begin, const float *end);
+
+// Get a quantization value that is equal to the mean of the data plus N standard deviations.  Use 2 by default.
+extern MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool);
+
+/* Returns the mean and standard deviation of a vector.
+ * If "absolute" is set to true, it computes the mean and standard deviation
+ * of the absolute values of the vector. */
+static inline MeanStd GetVectorMeanStd(const float *begin, const float *end, bool absolute = false) {
+  return VectorMeanStd(begin, end, absolute);
+}
+
+} // namespace intgemm
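+
+/* Example: choosing a quantization multiplier from data statistics rather
+ * than the maximum, as an illustrative sketch.  This assumes MeanStd
+ * exposes mean and stddev fields; whether mean + 2 standard deviations is
+ * a good clip point depends on your model:
+ *
+ *   intgemm::MeanStd stats = intgemm::GetVectorMeanStd(B.begin(), B.end(), true);
+ *   float quant_mult = 127.0f / (stats.mean + 2 * stats.stddev);
+ */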