Adding 16bit PrepareBQuantizedTransposed

author: Nikolay Bogoychev <nheart@gmail.com> 2020-02-04 21:10:14 +0300
committer: Nikolay Bogoychev <nheart@gmail.com> 2020-02-04 21:10:14 +0300
commit: 019c4186207cb9fe4763d2991f789fcdb7a7f158 (patch)
tree: 1f884635d56472dd6d0af58cdce37ea7b473265c
parent: 76a6d9f643c06880549725379b7207a259eb57b5 (diff)
2 files changed, 13 insertions, 0 deletions
diff --git a/intgemm.cc b/intgemm.cc
index 21b7ddd..850bb4e 100644
--- a/intgemm.cc
+++ b/intgemm.cc
@@ -10,6 +10,8 @@ void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, I
 
 void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB);
 
+void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBQuantizedTransposed, AVX512_16bit::PrepareBQuantizedTransposed, AVX2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
+
 void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
 
 const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName);
diff --git a/intgemm.h b/intgemm.h
index 58d2dcc..b67ceda 100644
--- a/intgemm.h
+++ b/intgemm.h
@@ -66,6 +66,11 @@ struct Unsupported_16bit {
   static void PrepareB(const float *, int16_t *, float, Index, Index) {
     throw UnsupportedCPU();
   }
+
+  static void PrepareBQuantizedTransposed(const int16_t *, int16_t *, Index, Index) {
+    throw UnsupportedCPU();  // Unconditionally throws; every argument is ignored.
+  }
+
   static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
     throw UnsupportedCPU();
   }
@@ -227,6 +232,12 @@ struct Int16 {
   // It will match the Multiply function on the same CPU though.
   static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
 
+  // Convert from a B that was already transposed (routine not provided) and
+  // quantized (e.g. with Quantize) to the CPU-dependent format used for
+  // Multiply.  This is useful for storing a quantized model on disk in a
+  // CPU-independent fashion.
+  static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols);
+
   // Select columns from a prepared B matrix.  The number of selected columns must be a multiple of 8. 
   static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
author	Nikolay Bogoychev <nheart@gmail.com>	2020-02-04 21:10:14 +0300
committer	Nikolay Bogoychev <nheart@gmail.com>	2020-02-04 21:10:14 +0300
commit	019c4186207cb9fe4763d2991f789fcdb7a7f158 (patch)
tree	1f884635d56472dd6d0af58cdce37ea7b473265c
parent	76a6d9f643c06880549725379b7207a259eb57b5 (diff)