From a72b13b72d04f0863decd46c5b9cdca24d962de3 Mon Sep 17 00:00:00 2001
From: Mateusz Chudyk
Date: Thu, 6 Feb 2020 18:32:34 +0000
Subject: Straighten functions producing test references values

---
 test/add127_test.cc   | 33 ++++++++++++++-----
 test/multiply_test.cc | 48 +++++++++------------------
 test/test.cc          | 56 ++------------------------------
 test/test.h           | 89 +++++++++++++++++++++++++++++++++++++++++++++------
 4 files changed, 122 insertions(+), 104 deletions(-)

diff --git a/test/add127_test.cc b/test/add127_test.cc
index d1b850d..ae5c08a 100644
--- a/test/add127_test.cc
+++ b/test/add127_test.cc
@@ -81,7 +81,9 @@ template <class Routine> void TestPrepareBias(Index rows, Index cols) {
   //Routine::Multiply(A_prep2.begin(), B_prep.begin(), A_rows, rows, cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, goldBias.begin(), goldBias.begin()));
   //CompareBiases(goldBias.begin(), inputBias.begin(), cols);
   AlignedVector<float> slowint_C(cols);
-  SlowRefInt(A_prep2.begin(), B_quant.begin(), slowint_C.begin(), unquant_mult_forprep, A_rows, rows, cols, goldBias.begin());
+  references::Multiply(A_prep2.begin(), B_quant.begin(), slowint_C.begin(), A_rows, rows, cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
+    return sum * unquant_mult_forprep + goldBias[info.col_idx];
+  });
   CompareBiases(slowint_C.begin(), inputBias.begin(), cols);
 }
 
@@ -127,10 +129,14 @@ template <class Routine> void TestMultiplyBiasNew(Index A_rows, Index width, Ind
   // Taking the original A_preparation which means A would be int8_t
   AlignedVector<int8_t> A_prep2(A.size());
   Routine::PrepareA(A.begin(), A_prep2.begin(), quant_mult, A_rows, width);
-  SlowRefInt(A_prep2.begin(), B_quant.begin(), slowint_C.begin(), unquant_mult, A_rows, width, B_cols, bias.begin());
+  references::Multiply(A_prep2.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
+    return sum * unquant_mult + bias[info.col_idx];
+  });
 
   AlignedVector<float> float_C(test_C.size());
-  SlowRefFloat(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, bias.begin());
+  references::MultiplyFF(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](float sum, const callbacks::OutputBufferInfo& info) {
+    return sum + bias[info.col_idx];
+  });
 
   /*ACTUAL MULTIPLICATION
   *
@@ -185,7 +191,10 @@ template <class Routine> void TestMultiplyShiftNonShift(Index A_rows, Index widt
   Routine::Multiply(A_prep_old.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), slowint_C.begin()));
 
   AlignedVector<float> float_C(test_C.size());
-  SlowRefFloat(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, bias.begin());
+  references::MultiplyFF(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](float sum, const callbacks::OutputBufferInfo& info) {
+    return sum + bias[info.col_idx];
+  });
+
   /*
    * Multiply8 shift multiplication
    */
@@ -238,10 +247,14 @@ template <class Routine> void TestMultiplyShiftInt(Index A_rows, Index width, In
   Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, B.size());
   AlignedVector<float> slowint_C(test_C.size());
   // Taking the original A_preparation which means A would be int8_t
-  //SlowRefInt(A_prep.begin(), B_quant.begin(), slowint_C.begin(), unquant_mult, A_rows, width, B_cols, bias.begin());
+  // references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
+  //   return sum * unquant_mult + bias[info.col_idx];
+  // });
 
   AlignedVector<float> float_C(test_C.size());
-  SlowRefFloat(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, bias.begin());
+  references::MultiplyFF(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](float sum, const callbacks::OutputBufferInfo& info) {
+    return sum + bias[info.col_idx];
+  });
   /*
    * Multiply8 shift multiplication
    */
@@ -252,7 +265,9 @@ template <class Routine> void TestMultiplyShiftInt(Index A_rows, Index width, In
   }
   AlignedVector<float> ShiftedBias(B_cols);
   float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f); //Minus one to invert add_ps later on
-  SlowRefInt(A_prep2.begin(), B_quant.begin(), ShiftedBias.begin(), unquant_mult_forprep, 1, width, B_cols, bias.begin());
+  references::Multiply(A_prep2.begin(), B_quant.begin(), ShiftedBias.begin(), 1, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
+    return sum * unquant_mult_forprep + bias[info.col_idx];
+  });
 
 
   //Now prepare Fast integer Bias
@@ -261,7 +276,9 @@ template <class Routine> void TestMultiplyShiftInt(Index A_rows, Index width, In
 
   // Reference INT VERSION HERE with ADD127
   // Taking the original A_preparation which means A would be int8_t
-  SlowRefInt(A_prep.begin(), B_quant.begin(), slowint_C.begin(), unquant_mult, A_rows, width, B_cols, ShiftedBias.begin());
+  references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
+    return sum * unquant_mult + ShiftedBias[info.col_idx];
+  });
 
   Compare(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(), int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
 
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index c972489..725fbca 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -18,32 +18,6 @@
 
 namespace intgemm {
 
-// Rearrange a tile of simd x unroll entries.
-template <class V> void SlowRearrangeTile(const V *from, V *to, int simd, int unroll, Index cols) {
-  for (int i = 0; i < unroll; ++i) {
-    for (int j = 0; j < simd; ++j) {
-      to[simd * i + j] = from[cols * j + i];
-    }
-  }
-}
-
-template <class V> void SlowRearrange(const V *from, V *to, int simd, int unroll, Index rows, Index cols) {
-  for (Index c = 0; c < cols; c += unroll) {
-    for (Index r = 0; r < rows; r += simd) {
-      SlowRearrangeTile(from + cols * r + c, to, simd, unroll, cols);
-      to += unroll * simd;
-    }
-  }
-}
-
-template <class V> void SlowTranspose(const V *from, V *to, Index rows, Index cols) {
-  for (Index r = 0; r < rows; ++r) {
-    for (Index c = 0; c < cols; ++c) {
-      to[rows * c + r] = from[cols * r + c];
-    }
-  }
-}
-
 INTGEMM_SSE2 TEST_CASE("Transpose 16", "[transpose]") {
   if (kCPU < CPUType::SSE2) return;
   const unsigned N = 8;
@@ -51,7 +25,7 @@ INTGEMM_SSE2 TEST_CASE("Transpose 16", "[transpose]") {
   std::iota(input.begin(), input.end(), 0);
 
   AlignedVector<int16_t> ref(N * N);
-  SlowTranspose(input.begin(), ref.begin(), N, N);
+  references::Transpose(input.begin(), ref.begin(), N, N);
 
   // Overwrite input.
   __m128i *t = input.as<__m128i>();
@@ -69,7 +43,7 @@ INTGEMM_SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
   std::iota(input.begin(), input.end(), 0);
 
   AlignedVector<int8_t> ref(input.size());
-  SlowTranspose(input.begin(), ref.begin(), N, N);
+  references::Transpose(input.begin(), ref.begin(), N, N);
 
   // Overwrite input.
   __m128i *t = input.as<__m128i>();
@@ -111,7 +85,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
   Routine::Quantize(input.begin(), quantized.begin(), 1, input.size());
   AlignedVector<Integer> reference(input.size());
   // Note this won't work for Int8/Int16 generic routines because tile sizes vary.
-  SlowRearrange(quantized.begin(), reference.begin(), Routine::kBTileRow, Routine::kBTileCol, rows, cols);
+  references::Rearragement(quantized.begin(), reference.begin(), Routine::kBTileRow, Routine::kBTileCol, rows, cols);
   CHECK_MESSAGE(memcmp(reference.begin(), test.begin(), test.size() * sizeof(Integer)) == 0, Routine::kName << " Mismatch:\n" <<
                 "Quantized Input" << '\n' << PrintMatrix(quantized.begin(), rows, cols) << "Reference" << '\n' <<
                 PrintMatrix(reference.begin(), rows, cols) << "Routine" << '\n' << PrintMatrix(test.begin(), rows, cols));
@@ -323,10 +297,14 @@ template <class Routine> void TestMultiply(Index A_rows, Index width, Index B_co
   Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, B.size());
 
   AlignedVector<float> slowint_C(test_C.size());
   // Assuming A is just quantization here.
-  SlowRefInt(A_prep.begin(), B_quant.begin(), slowint_C.begin(), unquant_mult, A_rows, width, B_cols);
+  references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
+    return sum * unquant_mult;
+  });
 
   AlignedVector<float> float_C(test_C.size());
-  SlowRefFloat(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols);
+  references::MultiplyFF(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](float sum, const callbacks::OutputBufferInfo& info) {
+    return sum;
+  });
 
   Compare(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(), int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
@@ -372,10 +350,14 @@ template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index
   Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, B.size());
 
   AlignedVector<float> slowint_C(test_C.size());
   // Assuming A is just quantization here.
-  SlowRefInt(A_prep.begin(), B_quant.begin(), slowint_C.begin(), unquant_mult, A_rows, width, B_cols, bias.begin());
+  references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
+    return sum * unquant_mult + bias[info.col_idx];
+  });
 
   AlignedVector<float> float_C(test_C.size());
-  SlowRefFloat(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, bias.begin());
+  references::MultiplyFF(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](float sum, const callbacks::OutputBufferInfo& info) {
+    return sum + bias[info.col_idx];
+  });
 
   Compare(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(), int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
 
diff --git a/test/test.cc b/test/test.cc
index 2986d82..62137a1 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -7,60 +7,8 @@ int main(int argc, char ** argv) {
 
 namespace intgemm {
 
-void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index width, Index B_cols, const float *bias) {
-  for (Index r = 0; r < A_rows; ++r) {
-    for (Index c = 0; c < B_cols; ++c) {
-      float sum = 0.0f;
-      for (Index k = 0; k < width; ++k) {
-        sum += A[r * width + k] * B[k * B_cols + c];
-      }
-      if (bias) {
-        C[r * B_cols + c] = sum + bias[c];
-      } else {
-        C[r * B_cols + c] = sum;
-      }
-    }
-  }
-}
-
-// Compute A*B slowly from integers.
-template <class Integer> void SlowRefInt(const Integer *A, const Integer *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias) {
-  for (Index r = 0; r < A_rows; ++r) {
-    for (Index c = 0; c < B_cols; ++c) {
-      int32_t sum = 0;
-      for (Index k = 0; k < width; ++k) {
-        sum += static_cast<int32_t>(A[r * width + k]) * static_cast<int32_t>(B[k * B_cols + c]);
-      }
-      if (bias) {
-        C[r * B_cols + c] = sum * unquant_mult + bias[c];
-      } else {
-        C[r * B_cols + c] = sum * unquant_mult;
-      }
-    }
-  }
-}
-
-void SlowRefInt(const uint8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias) {
-  for (Index r = 0; r < A_rows; ++r) {
-    for (Index c = 0; c < B_cols; ++c) {
-      int32_t sum = 0;
-      for (Index k = 0; k < width; ++k) {
-        sum += static_cast<int32_t>(A[r * width + k]) * static_cast<int32_t>(B[k * B_cols + c]);
-      }
-      if (bias) {
-        C[r * B_cols + c] = sum * unquant_mult + bias[c];
-      } else {
-        C[r * B_cols + c] = sum * unquant_mult;
-      }
-    }
-  }
-}
-
-template void SlowRefInt<int8_t>(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias);
-template void SlowRefInt<int16_t>(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias);
-template void SlowRefInt<int32_t>(const int32_t *A, const int32_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias);
-
 void Compare(const float *float_ref, const float *int_ref, const float *int_test, std::size_t size, std::string test_info,
-  float int_tolerance, float float_tolerance, float MSE_float_tolerance, float MSE_int_tolerance) {
+             float int_tolerance, float float_tolerance, float MSE_float_tolerance, float MSE_int_tolerance) {
   float int_sum = 0.0, float_sum = 0.0;
   for (std::size_t i = 0; i < size; ++i) {
     float int_diff = int_ref[i] - int_test[i];
@@ -74,4 +22,4 @@ void Compare(const float *float_ref, const float *int_ref, const float *int_test
   CHECK_MESSAGE(fabs(sqrt(int_sum / size)) <= MSE_int_tolerance, test_info << "Int MSE = " << sqrt(int_sum / size));
 }
 
-} //namespace intgemm
+} // namespace intgemm
diff --git a/test/test.h b/test/test.h
index 291ff45..7c294f8 100644
--- a/test/test.h
+++ b/test/test.h
@@ -1,11 +1,15 @@
 #pragma once
 
+#include "intgemm_config.h"
+
 #include "../3rd_party/catch.hpp"
-#include
 #include "../intgemm.h"
 #include "../aligned.h"
-#include "intgemm_config.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <type_traits>
 
 #define CHECK_MESSAGE(cond, msg) do { INFO(msg); CHECK(cond); } while(0)
 #define CHECK_FALSE_MESSAGE(cond, msg) do { INFO(msg); CHECK_FALSE(cond); } while(0)
@@ -21,13 +25,80 @@
 #define KERNEL_TEST_CASE(name) TEST_CASE("Kernel: " name, "[kernel_test]")
 
 namespace intgemm {
-void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index width, Index B_cols, const float *bias=nullptr);
 
-// Compute A*B slowly from integers.
-template <class Integer> void SlowRefInt(const Integer *A, const Integer *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias=nullptr);
-void SlowRefInt(const uint8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias=nullptr);
+void Compare(const float *float_ref, const float *int_ref, const float *int_test,
+             std::size_t size, std::string test_info, float int_tolerance,
+             float float_tolerance, float MSE_float_tolerance, float MSE_int_tolerance);
+
+/*
+ * References
+ */
+namespace references {
+
+// Quantize
+template <typename Type>
+void Quantize(const float* input, Type* output, float quant_mult, Index size) {
+  for (Index i = 0; i < size; ++i) {
+    float value = roundf(input[i] * quant_mult);
+    value = std::max<float>(std::numeric_limits<Type>::min(), value);
+    value = std::min<float>(std::numeric_limits<Type>::max(), value);
+    output[i] = value;
+  }
+}
+
+// Multiply A(float) x B(float)
+template <typename LambdaCallback>
+void MultiplyFF(const float* A, const float* B, float* C, Index A_rows, Index width, Index B_cols, LambdaCallback callback) {
+  for (Index r = 0; r < A_rows; ++r) {
+    for (Index c = 0; c < B_cols; ++c) {
+      float sum = 0.0f;
+      for (Index k = 0; k < width; ++k) {
+        sum += A[r * width + k] * B[k * B_cols + c];
+      }
+      C[r * B_cols + c] = callback(sum, {r, c, A_rows, B_cols});
+    }
+  }
+}
+
+// Multiply A(int) x B(int)
+template <typename TypeA, typename TypeB, typename LambdaCallback,
+          typename std::enable_if<std::is_integral<TypeA>::value>::type* = nullptr,
+          typename std::enable_if<std::is_integral<TypeB>::value>::type* = nullptr>
+void Multiply(const TypeA* A, const TypeB* B, float* C, Index A_rows, Index width, Index B_cols, LambdaCallback callback) {
+  for (Index r = 0; r < A_rows; ++r) {
+    for (Index c = 0; c < B_cols; ++c) {
+      int32_t sum = 0;
+      for (Index k = 0; k < width; ++k) {
+        sum += int32_t(A[r * width + k]) * int32_t(B[k * B_cols + c]);
+      }
+      C[r * B_cols + c] = callback(sum, {r, c, A_rows, B_cols});
+    }
+  }
+}
+
+// Matrix rearragement
+template <typename Type>
+void Rearragement(const Type* input, Type* output, int simd, int unroll, Index rows, Index cols) {
+  for (Index c = 0; c < cols; c += unroll) {
+    for (Index r = 0; r < rows; r += simd) {
+      for (Index i = 0; i < unroll; ++i)
+        for (Index j = 0; j < simd; ++j)
+          output[simd * i + j] = input[cols * r + c + cols * j + i];
+
+      output += unroll * simd;
+    }
+  }
+}
-void Compare(const float *float_ref, const float *int_ref, const float *int_test, std::size_t size, std::string test_info,
-  float int_tolerance, float float_tolerance, float MSE_float_tolerance, float MSE_int_tolerance);
+
+// Transpose
+template <typename Type>
+void Transpose(const Type* input, Type* output, Index rows, Index cols) {
+  for (Index r = 0; r < rows; ++r) {
+    for (Index c = 0; c < cols; ++c) {
+      output[rows * c + r] = input[cols * r + c];
+    }
+  }
+}
 
-} //namespace intgemm
+} // namespace references
+} // namespace intgemm
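
For context on the shape of the new API: the old SlowRefInt/SlowRefFloat helpers hard-coded the unquantization multiplier and optional bias pointer in their signatures, while references::Multiply and references::MultiplyFF take a callback that receives the raw accumulator together with the output coordinates and returns the value to store. A minimal usage sketch, not part of the commit above: ExampleBiasReference is a hypothetical name, and it assumes the test/test.h added by this patch plus intgemm's Index and callbacks::OutputBufferInfo types are available.

#include "test.h"  // the header added by this patch (assumed to be on the include path)

namespace intgemm {

// Reference GEMM with unquantize + per-column bias expressed through the
// callback-based API: the lambda sees each int32_t dot product and the output
// coordinates, and returns the float value written to the output buffer.
void ExampleBiasReference(const int8_t *A_quant, const int8_t *B_quant, const float *bias,
                          float *out, Index A_rows, Index width, Index B_cols, float unquant_mult) {
  references::Multiply(A_quant, B_quant, out, A_rows, width, B_cols,
      [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
        return sum * unquant_mult + bias[info.col_idx];
      });
}

} // namespace intgemm

The same pattern covers the bias-free and shifted-bias cases in the tests above: only the lambda changes, not the reference multiply itself.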