github.com/marian-nmt/intgemm/intgemm.git

Diffstat (limited to 'intgemm.h')
-rw-r--r--  intgemm.h  181
1 file changed, 95 insertions, 86 deletions
diff --git a/intgemm.h b/intgemm.h
index 8a63fb8..baba935 100644
--- a/intgemm.h
+++ b/intgemm.h
@@ -204,84 +204,13 @@ struct TileInfo {
const Index b_cols;
};
-/* 16-bit matrix multiplication. */
-template <typename Callback>
-struct Int16Mult {
- // Multiply C = A * B, presuming A and B have been prepared.
- static void (*Multiply)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
-};
-
-template <typename Callback>
-void (*Int16Mult<Callback>::Multiply)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512_16bit::Multiply<Callback> /*TODO VNNI 16-bit. */, AVX512_16bit::Multiply<Callback>, AVX2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, Unsupported_16bit::Multiply);
-
-struct Int16 {
- using Integer = int16_t;
-
- // A's size must be a multiple of 1x32.
- // B's size must be a multiple of 32x8.
- static constexpr TileInfo tile_info{1, 32, 32, 8};
-
- // Currently A is prepared by quantization but this could theoretically change.
- // A's columns must be a multiple of 8.
- // The number of rows is anything.
- static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- Quantize(input, output, quant_mult, rows * cols);
- }
-
- // Multiply floats by quant_mult then convert to 16-bit integers with saturation.
- // input
- static void (*Quantize)(const float *input, int16_t *output, float quant_mult, Index size);
-
- // Warning: the output of PrepareB depends on the CPU.
- // It will match the Multiply function on the same CPU though.
- static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
-
- // Convert from a B that was already transposed (routine not provided) and
- // quantized (e.g. with Quantize) to the CPU-dependent format used for
- // Multiply. This is useful for storing a quantized model on disk then in a
- // CPU-independent fashion.
- static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols);
-
- // Convert from a B that was already transposed (routine not provided) to
- // the CPU-dependent format used for Multiply. This is useful for storing
- // a quantized model on disk then in a CPU-independent fashion.
- static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mul, Index inner, Index B_untransposed_cols);
-
- // Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
- static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
-
- // Multiply C = A * B, presuming A and B have been prepared.
- template <typename Callback>
- static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
- Int16Mult<Callback>::Multiply(A, B, A_rows, width, B_cols, callback);
- }
-
- static const char *const kName;
-};
-
-/* 8-bit matrix multiplication */
-template <typename Callback>
-struct Int8Mult {
- // Multiply C = A * B, presuming A and B have been prepared.
- static void (*Multiply)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
- static void (*Multiply8Shift)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
- static void (*PrepareBias)(const int8_t *B, Index width, Index B_cols, Callback callback);
-};
-
-template <typename Callback>
-void (*Int8Mult<Callback>::Multiply)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply<Callback>, AVX512_8bit::Multiply<Callback>, AVX2_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, Unsupported_8bit::Multiply);
-
-template <class Callback>
-void (*Int8Mult<Callback>::Multiply8Shift)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply8Shift<Callback>, AVX512_8bit::Multiply8Shift<Callback>, AVX2_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift);
-
-template <class Callback>
-void (*Int8Mult<Callback>::PrepareBias)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::PrepareBias<Callback>, AVX512_8bit::PrepareBias<Callback>, AVX2_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
-
+/*
+ * 8-bit matrix multiplication
+ */
struct Int8 {
using Integer = int8_t;
- // A's size must be a multiple of 1x64.
- // B's size must be a multiple of 64x8.
+ // A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
static constexpr TileInfo tile_info{1, 64, 64, 8};
// Currently A is prepared by quantization but this could theoretically change.
@@ -319,22 +248,29 @@ struct Int8 {
// Multiply C = A * B, presuming A and B have been prepared.
template <typename Callback>
static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
- Int8Mult<Callback>::Multiply(A, B, A_rows, width, B_cols, callback);
+ MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
}
static const char *const kName;
+
+private:
+ template <typename Callback>
+ struct MultiplyImpl {
+ static void (*run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
+ };
};
-// Shifting A by 127 version of the above code
+template <typename Callback>
+void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply<Callback>, AVX512_8bit::Multiply<Callback>, AVX2_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, Unsupported_8bit::Multiply);
+
+/*
+ * 8-bit matrix multiplication with shifting A by 127
+ */
struct Int8Shift {
- typedef int8_t Integer;
+ using Integer = int8_t;
- // A's size must be a multiple of 1x64.
- static const Index kATileRow = 1;
- static const Index kATileCol = 64;
- // B's size must be a multiple of 64x8.
- static const Index kBTileRow = 64;
- static const Index kBTileCol = 8;
+ // A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
+ static constexpr TileInfo tile_info{1, 64, 64, 8};
// Identical to the Int8 Version, except it adds 127 to each number, making sure that all numbers are positive.
static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
@@ -360,7 +296,7 @@ struct Int8Shift {
// Multiply C = A * B + Bias, presuming A, B and Bias have all been prepared (for A, PrepareAnew should be used
template<class Callback>
static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
- Int8Mult<Callback>::Multiply8Shift((const uint8_t *)A, B, A_rows, width, B_cols, callback);
+ MultiplyImpl<Callback>::run((const uint8_t *)A, B, A_rows, width, B_cols, callback);
}
// This function prepares the bias for the Multiply routine that does unsigned * signed multiplication.
@@ -370,12 +306,85 @@ struct Int8Shift {
// unquant_mult is computed by (-1)*(alpha)*(alpha)/(127.0f);
template<class Callback>
static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) {
- Int8Mult<Callback>::PrepareBias(B, width, B_cols, callback);
+ PrepareBiasImpl<Callback>::run(B, width, B_cols, callback);
}
static const char *const kName;
+
+private:
+ template <typename Callback>
+ struct MultiplyImpl {
+ static void (*run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
+ };
+
+ template <typename Callback>
+ struct PrepareBiasImpl {
+ static void (*run)(const int8_t *B, Index width, Index B_cols, Callback callback);
+ };
+};
+
+template <class Callback>
+void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply8Shift<Callback>, AVX512_8bit::Multiply8Shift<Callback>, AVX2_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift);
+
+template <class Callback>
+void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::PrepareBias<Callback>, AVX512_8bit::PrepareBias<Callback>, AVX2_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
+
+/*
+ * 16-bit matrix multiplication
+ */
+struct Int16 {
+ using Integer = int16_t;
+
+ // A's size must be a multiple of 1x32, B's size must be a multiple of 32x8.
+ static constexpr TileInfo tile_info{1, 32, 32, 8};
+
+ // Currently A is prepared by quantization but this could theoretically change.
+ // A's columns must be a multiple of 8.
+ // The number of rows is anything.
+ static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ Quantize(input, output, quant_mult, rows * cols);
+ }
+
+ // Multiply floats by quant_mult then convert to 16-bit integers with saturation.
+ // input
+ static void (*Quantize)(const float *input, int16_t *output, float quant_mult, Index size);
+
+ // Warning: the output of PrepareB depends on the CPU.
+ // It will match the Multiply function on the same CPU though.
+ static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+
+ // Convert from a B that was already transposed (routine not provided) and
+ // quantized (e.g. with Quantize) to the CPU-dependent format used for
+ // Multiply. This is useful for storing a quantized model on disk then in a
+ // CPU-independent fashion.
+ static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols);
+
+ // Convert from a B that was already transposed (routine not provided) to
+ // the CPU-dependent format used for Multiply. This is useful for storing
+ // a quantized model on disk then in a CPU-independent fashion.
+ static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mul, Index inner, Index B_untransposed_cols);
+
+ // Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
+ static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+
+ // Multiply C = A * B, presuming A and B have been prepared.
+ template <typename Callback>
+ static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
+ MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
+ }
+
+ static const char *const kName;
+
+private:
+ template <typename Callback>
+ struct MultiplyImpl {
+ static void (*run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
+ };
};
+template <typename Callback>
+void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512_16bit::Multiply<Callback> /*TODO VNNI 16-bit. */, AVX512_16bit::Multiply<Callback>, AVX2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, Unsupported_16bit::Multiply);
+
extern const CPUType kCPU;
// Get the maximum absolute value of an array of floats. The number of floats must be a multiple of 16 and 64-byte aligned.
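For readers skimming the diff: the refactor folds the old free-standing Int16Mult/Int8Mult templates into private nested MultiplyImpl/PrepareBiasImpl structs, but the dispatch mechanism itself is unchanged. Each specialization holds a static function pointer that ChooseCPU initializes once, at static-initialization time, to the best kernel the running CPU supports. Below is a minimal self-contained sketch of that pattern; the names and the compile-time ChooseCPU stand-in are illustrative, not intgemm's.

```cpp
#include <cstdio>

// Stand-ins for the per-ISA kernels intgemm chooses between.
namespace avx2 { inline void Multiply(int n) { std::printf("AVX2 kernel, n=%d\n", n); } }
namespace sse2 { inline void Multiply(int n) { std::printf("SSE2 kernel, n=%d\n", n); } }

// Stand-in for ChooseCPU. The real one probes the CPU at runtime and walks the
// candidate list from newest ISA to oldest; here the decision is faked with a
// compile-time check so the sketch stays portable.
template <class F> F ChooseCPU(F preferred, F fallback) {
#if defined(__AVX2__)
  return preferred;
#else
  return fallback;
#endif
}

struct Int8Like {
  // Public entry point forwards through the pre-selected pointer, exactly like
  // Int8::Multiply forwarding to MultiplyImpl<Callback>::run in the header.
  static void Multiply(int n) { MultiplyImpl::run(n); }

 private:
  struct MultiplyImpl {
    static void (*run)(int n);
  };
};

// The pointer is bound once, before main() runs, as in the header above.
void (*Int8Like::MultiplyImpl::run)(int) = ChooseCPU(avx2::Multiply, sse2::Multiply);

int main() {
  Int8Like::Multiply(64);
}
```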
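Nothing changes for callers of the public structs; the flow is still PrepareA/PrepareB followed by Multiply with a callback. As a reminder, here is a usage sketch along the lines of the project's README using the 16-bit path. AlignedVector, callbacks::UnquantizeAndWrite, and the include paths come from other files in the library and may differ by revision; treat them as assumptions.

```cpp
#include "intgemm.h"
#include "aligned.h"  // intgemm::AlignedVector; location assumed, may differ by revision

void GemmExample(const float *A_raw, const float *B_raw,
                 intgemm::Index A_rows, intgemm::Index width, intgemm::Index B_cols,
                 float *C) {
  using intgemm::Int16;
  // Per Int16::tile_info, width must be a multiple of 32 and B_cols a multiple of 8.
  const float quant_mult = 1024.0f;  // typical 16-bit choice suggested in the README

  intgemm::AlignedVector<int16_t> A_prepared(A_rows * width);
  intgemm::AlignedVector<int16_t> B_prepared(width * B_cols);

  // Quantize row-major A; rearrange and quantize B into the CPU-dependent layout.
  Int16::PrepareA(A_raw, A_prepared.begin(), quant_mult, A_rows, width);
  Int16::PrepareB(B_raw, B_prepared.begin(), quant_mult, width, B_cols);

  // Multiply and unquantize into C on the fly through the callback.
  Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols,
                  intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C));
}
```

For the 8-bit paths the quantization multiplier is usually derived from the data, e.g. 127 divided by the result of the MaxAbsolute routine declared just after this hunk, and Int8Shift additionally needs PrepareBias run once per B before the first Multiply.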