github.com/marian-nmt/FBGEMM.git
author     Jongsoo Park <jongsoo@fb.com>    2018-11-27 21:05:28 +0300
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>    2018-11-27 21:13:17 +0300
commit     d4ee77f5a851879f4a778f122656158663b766b5 (patch)
tree       1d1db56c63c55753bf237bc6fb4cd59810732001 /bench
parent     db52c82306e7aa10e2dde706b7205c30eec31cd5 (diff)
per-group and per-channel quantization (#14340)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14340
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/25

Per-group and per-channel quantization in fbgemm.

This diff also cleans up explicit template instantiation using macro expansion.

This diff also changes the randFill interface, which previously made it easy to mistakenly generate integer random numbers for floating-point vectors.

Using this in DNNLOWP operators will be done in a separate diff.

Reviewed By: dskhudia

Differential Revision: D13176386

fbshipit-source-id: e46c53e31e21520bded71b8ed86e8b19e010e2dd
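For readers skimming the diff: per-group and per-channel quantization means the requantization parameters (output multiplier and, in fbgemm, the weight zero point) are no longer single scalars but arrays indexed by a group of output columns. The following is a self-contained sketch of that idea, not the fbgemm API, assuming the int32 accumulators already have their zero-point corrections applied.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Standalone illustration of per-group requantization (not the fbgemm API).
// Each block of ncols_per_quant_group output columns shares one multiplier:
// per-tensor quantization is one group covering all n columns, per-channel
// quantization is one group per column.
void requantize_per_group(
    int m,
    int n,
    const std::vector<int32_t>& acc, // m x n accumulators, zero-point
                                     // corrections assumed already applied
    std::vector<uint8_t>& out,       // m x n requantized result
    const float* C_multiplier,       // one entry per quantization group
    int32_t C_zero_point,
    int ncols_per_quant_group) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      int g = j / ncols_per_quant_group; // which quantization group j is in
      float scaled = acc[i * n + j] * C_multiplier[g] + C_zero_point;
      long r = std::lround(scaled);
      out[i * n + j] = static_cast<uint8_t>(std::max(0L, std::min(255L, r)));
    }
  }
}

int main() {
  int m = 2, n = 4, ncols_per_quant_group = 2; // two groups of two columns
  std::vector<int32_t> acc = {10, 20, 30, 40, 50, 60, 70, 80};
  std::vector<uint8_t> out(m * n);
  float multipliers[2] = {0.5f, 0.25f}; // one multiplier per group
  requantize_per_group(m, n, acc, out, multipliers, /*C_zero_point=*/3,
                       ncols_per_quant_group);
}

This is why the benchmark call sites in the diff now pass &C_multiplier and &Bint8_zero_point, pointers to what can be per-group arrays, plus a trailing n ("ncols per quant group"); one group spanning all n columns reproduces the old per-tensor behavior.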
Diffstat (limited to 'bench')
-rw-r--r--   bench/BenchUtils.cc                            51
-rw-r--r--   bench/BenchUtils.h                              5
-rw-r--r--   bench/Depthwise3DBenchmark.cc                  22
-rw-r--r--   bench/DepthwiseBenchmark.cc                    22
-rw-r--r--   bench/FP16Benchmark.cc                         34
-rw-r--r--   bench/I8SpmdmBenchmark.cc                      11
-rw-r--r--   bench/Im2ColFusedRequantizeAcc16Benchmark.cc   37
-rw-r--r--   bench/Im2ColFusedRequantizeAcc32Benchmark.cc   36
-rw-r--r--   bench/PackedFloatInOutBenchmark.cc             35
-rw-r--r--   bench/PackedRequantizeAcc16Benchmark.cc       125
-rw-r--r--   bench/PackedRequantizeAcc32Benchmark.cc        66
11 files changed, 200 insertions, 244 deletions
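The first file below reworks randFill so the bounds share the element type and the distribution is chosen by tag dispatch. A consolidated sketch of the new interface as it appears in the diff (aligned_vector is the benchmarks' aligned std::vector alias; a plain std::vector stands in for it here):

#include <algorithm>
#include <random>
#include <type_traits>
#include <vector>

// Sketch only: in the benchmarks aligned_vector uses an aligned allocator.
template <typename T>
using aligned_vector = std::vector<T>;

std::default_random_engine eng;

// Integral element types draw from uniform_int_distribution...
template <typename T>
void randFill(aligned_vector<T>& vec, T low, T high, std::true_type) {
  std::uniform_int_distribution<T> dis(low, high);
  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
}

// ...and floating-point element types from uniform_real_distribution.
template <typename T>
void randFill(aligned_vector<T>& vec, T low, T high, std::false_type) {
  std::uniform_real_distribution<T> dis(low, high);
  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
}

// Entry point: the bounds now have the element type, and tag dispatch
// picks the right distribution at compile time.
template <typename T>
void randFill(aligned_vector<T>& vec, T low, T high) {
  randFill(vec, low, high, std::is_integral<T>());
}

int main() {
  aligned_vector<float> a(16);
  randFill(a, 0.0f, 4.0f); // real distribution, no silent truncation to int
  aligned_vector<int> b(16);
  randFill(b, -8, 8);      // integer distribution
}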
diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc
index db40ee0..f5ce9ef 100644
--- a/bench/BenchUtils.cc
+++ b/bench/BenchUtils.cc
@@ -5,30 +5,41 @@
* LICENSE file in the root directory of this source tree.
*/
#include "BenchUtils.h"
+
+#include <algorithm>
#include <random>
+#include <type_traits>
+
+#include <omp.h>
namespace fbgemm {
std::default_random_engine eng;
template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high) {
- std::random_device r;
- std::uniform_int_distribution<int> dis(low, high);
- for (auto& v : vec) {
- v = static_cast<T>(dis(eng));
- }
+void randFill(aligned_vector<T>& vec, T low, T high, std::true_type) {
+ std::uniform_int_distribution<T> dis(low, high);
+ std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high, std::false_type) {
+ std::uniform_real_distribution<T> dis(low, high);
+ std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high) {
+ randFill(vec, low, high, std::is_integral<T>());
}
template void
-randFill<float>(aligned_vector<float>& vec, const int low, const int high);
-template void
-randFill<uint8_t>(aligned_vector<uint8_t>& vec, const int low, const int high);
+randFill<float>(aligned_vector<float>& vec, float low, float high);
template void
-randFill<int8_t>(aligned_vector<int8_t>& vec, const int low, const int high);
-
+randFill<uint8_t>(aligned_vector<uint8_t>& vec, uint8_t low, uint8_t high);
template void
-randFill<int>(aligned_vector<int>& vec, const int low, const int high);
+randFill<int8_t>(aligned_vector<int8_t>& vec, int8_t low, int8_t high);
+template void randFill<int>(aligned_vector<int>& vec, int low, int high);
void llc_flush(std::vector<char>& llc) {
volatile char* data = llc.data();
@@ -37,4 +48,20 @@ void llc_flush(std::vector<char>& llc) {
}
}
+int fbgemm_get_num_threads() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+ return 1;
+#else
+ return omp_get_num_threads();
+#endif
+}
+
+int fbgemm_get_thread_num() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+ return 0;
+#else
+ return omp_get_thread_num();
+#endif
+}
+
} // namespace fbgemm
diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
index 8ca99df..da2ef2d 100644
--- a/bench/BenchUtils.h
+++ b/bench/BenchUtils.h
@@ -11,8 +11,11 @@
namespace fbgemm {
template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high);
+void randFill(aligned_vector<T>& vec, T low, T high);
void llc_flush(std::vector<char>& llc);
+int fbgemm_get_num_threads();
+int fbgemm_get_thread_num();
+
} // namespace fbgemm
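The two helpers declared above replace the repeated #ifdef _OPENMP / FBGEMM_MEASURE_TIME_BREAKDOWN blocks in the benchmark sources that follow. A minimal usage sketch; process_rows and run_parallel are hypothetical stand-ins for the benchmark kernels, and the 32-row rounding mirrors I8SpmdmBenchmark.cc below:

#include <algorithm>

#include "BenchUtils.h" // declares fbgemm_get_num_threads / fbgemm_get_thread_num

// Hypothetical stand-in for the depthwise / spmdm kernels the benchmarks
// call on a row range [i_begin, i_end).
void process_rows(int /*i_begin*/, int /*i_end*/) {}

void run_parallel(int M) {
#pragma omp parallel
  {
    // Both helpers fall back to 1 / 0 when OpenMP is disabled or
    // FBGEMM_MEASURE_TIME_BREAKDOWN is defined, so the call sites no longer
    // need their own #ifdef blocks.
    int num_threads = fbgemm::fbgemm_get_num_threads();
    int tid = fbgemm::fbgemm_get_thread_num();
    // Round each thread's slice up to a multiple of 32 rows, as
    // I8SpmdmBenchmark.cc does below.
    int i_per_thread = ((M + 31) / 32 + num_threads - 1) / num_threads * 32;
    int i_begin = std::min(tid * i_per_thread, M);
    int i_end = std::min(i_begin + i_per_thread, M);
    process_rows(i_begin, i_end);
  }
}

int main() {
  run_parallel(100);
}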
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index f53eeea..c65839b 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -62,10 +62,10 @@ int main() {
aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3x3_pad_1_ref(
@@ -129,13 +129,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
@@ -200,13 +195,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index 8e6d83d..b922f90 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -161,10 +161,10 @@ int main() {
aligned_vector<int8_t> B(G * R * S);
aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3_pad_1_ref(
@@ -221,13 +221,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
@@ -279,13 +274,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc
index c03f18a..fd9de5b 100644
--- a/bench/FP16Benchmark.cc
+++ b/bench/FP16Benchmark.cc
@@ -73,20 +73,24 @@ void performance_test() {
int n = s[1];
int k = s[2];
- aligned_vector<float> A(m * k, 0.f);
- aligned_vector<float> B(k * n, 0.f);
- aligned_vector<float> Cg(m * n, 1.f);
- aligned_vector<float> Cp(m * n, NAN);
+ aligned_vector<float> C_ref(m * n, 1.f);
+ aligned_vector<float> C_fb(m * n, NAN);
// initialize with small numbers
- randFill(A, 0, 4);
+ aligned_vector<int> Aint(m * k);
+ randFill(Aint, 0, 4);
+ aligned_vector<float> A(Aint.begin(), Aint.end());
- randFill(B, 0, 4);
+ aligned_vector<int> Bint(k * n);
+ randFill(Bint, 0, 4);
+ aligned_vector<float> B(Bint.begin(), Bint.end());
PackedGemmMatrixFP16 Bp(btran, k, n, alpha, B.data());
if (beta != 0.0f) {
- randFill(Cg, 0, 4);
- Cp = Cg;
+ aligned_vector<int> Cint(C_ref.size());
+ randFill(Cint, 0, 4);
+ C_ref.assign(Cint.begin(), Cint.end());
+ C_fb = C_ref;
}
double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER;
@@ -111,17 +115,17 @@ void performance_test() {
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
- Cg.data(),
+ C_ref.data(),
n);
#endif
cblas_gemm_compute(
- matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+ matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
#ifdef USE_MKL
// Compare results
- for (auto i = 0; i < Cg.size(); i++) {
- // printf("%f %f\n", Cg[i], Cp[i]);
- assert(std::abs(Cg[i] - Cp[i]) < 1e-3);
+ for (auto i = 0; i < C_ref.size(); i++) {
+ // printf("%f %f\n", C_ref[i], C_fb[i]);
+ assert(std::abs(C_ref[i] - C_fb[i]) < 1e-3);
}
#endif
}
@@ -151,7 +155,7 @@ void performance_test() {
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
- Cg.data(),
+ C_ref.data(),
n);
t_end = chrono::system_clock::now();
if (it >= 0) {
@@ -184,7 +188,7 @@ void performance_test() {
t_begin = chrono::system_clock::now();
cblas_gemm_compute(
- matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+ matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
t_end = chrono::system_clock::now();
if (it >= 0) {
diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc
index 07b73dc..4223d0c 100644
--- a/bench/I8SpmdmBenchmark.cc
+++ b/bench/I8SpmdmBenchmark.cc
@@ -77,7 +77,7 @@ int main() {
cout << M << ", " << N << ", " << K << ", ";
aligned_vector<uint8_t> A(M * K);
- randFill(A, 0, 255);
+ randFill<uint8_t>(A, 0, 255);
fbgemm::CompressedSparseColumn B_csc(K, N);
vector<int32_t> C(M * N);
@@ -156,13 +156,8 @@ int main() {
#pragma omp parallel
#endif
{
-#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
- int num_threads = 1;
- int tid = 0;
-#else
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
int i_per_thread =
((M + 31) / 32 + num_threads - 1) / num_threads * 32;
int i_begin = std::min(tid * i_per_thread, M);
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
index 2115863..cb2edf5 100644
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -125,43 +125,29 @@ void performance_test() {
chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
- aligned_vector<float> Afp32(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
aligned_vector<uint8_t> Aint8(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
-
+ conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<uint8_t> Aint8_out(
conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
- conv_p.K[1] * conv_p.IC,
- 0);
+ conv_p.K[1] * conv_p.IC);
- aligned_vector<float> Bfp32(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb2(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+ conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
// A matrix (input activations)
- randFill(Afp32, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
int32_t Aint8_zero_point = 4;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Aint8[i] = static_cast<uint8_t>(Afp32[i]);
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
// B matrix (weights)
- randFill(Bfp32, -4, 4);
+ randFill<int8_t>(Bint8, -4, 4);
// int32_t Bint8_zero_point = -3;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = static_cast<int8_t>(Bfp32[i]);
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
// reference implementation
conv_ref(
@@ -184,8 +170,7 @@ void performance_test() {
double ttot = 0.0;
string runType;
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int16_t> packA(
diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
index 7144e61..8e112d8 100644
--- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
@@ -125,45 +125,32 @@ void performance_test() {
chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
- aligned_vector<float> Afp32(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
aligned_vector<uint8_t> Aint8(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+ conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<uint8_t> Aint8_out(
conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
- conv_p.K[1] * conv_p.IC,
- 0);
+ conv_p.K[1] * conv_p.IC);
- aligned_vector<float> Bfp32(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb2(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+ conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
// cout << conv_p.toString() << endl;
// A matrix (input activations)
- randFill(Afp32, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
int32_t Aint8_zero_point = 4;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Aint8[i] = static_cast<uint8_t>(Afp32[i]);
- }
+ aligned_vector<float> Apf32(Aint8.begin(), Aint8.end());
// B matrix (weights)
- randFill(Bfp32, -4, 4);
+ randFill<int8_t>(Bint8, -4, 4);
// int32_t Bint8_zero_point = -3;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = static_cast<int8_t>(Bfp32[i]);
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
// reference implementation
conv_ref(
@@ -186,8 +173,7 @@ void performance_test() {
double ttot = 0.0;
string runType;
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int32_t> packA(
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index badbda0..79a750e 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -86,27 +86,27 @@ void performance_test() {
int k = shape[2];
float alpha = 1.f, beta = 0.f;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<float> Afp32(m * k);
+ aligned_vector<uint8_t> Aint8(Afp32.size());
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<float> Bfp32(k * n);
+ aligned_vector<int8_t> Bint8(Bfp32.size());
- aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
- aligned_vector<float> Cfp32_fb(m * n, 0.0f);
+ aligned_vector<float> Cfp32_mkl(m * n);
+ aligned_vector<float> Cfp32_fb(Cfp32_mkl.size());
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size());
// A matrix
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
float Aint8_scale = 0.11;
int32_t Aint8_zero_point = 43;
for (auto i = 0; i < Afp32.size(); ++i) {
Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
}
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
float Bint8_scale = 0.49;
@@ -116,10 +116,9 @@ void performance_test() {
}
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n);
+ vector<int32_t> col_offsets(n);
col_offsets_with_zero_pt_s8acc32_ref(
- k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+ k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
double ttot = 0;
std::string type;
@@ -172,8 +171,7 @@ void performance_test() {
// printMatrix(matrix_op_t::NoTranspose, col_offsets.data(), 1, n, n, "col
// offsets before");
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithQuantRowOffset<uint8_t> packAN(
@@ -201,12 +199,13 @@ void performance_test() {
ReQuantizeForFloat<false> outputProcObj(
doNothingObj,
Aint8_scale,
- Bint8_scale,
+ &Bint8_scale,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
packAN.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ n);
ttot = 0;
type = "FBGEMM_i8_acc32";
diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
index fd48b24..f60332f 100644
--- a/bench/PackedRequantizeAcc16Benchmark.cc
+++ b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -100,29 +100,26 @@ void performance_test() {
int n = shape[1];
int k = shape[2];
- float alpha = 1.f, beta = 0.f;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ float alpha = 1.0f, beta = 0.0f;
+ aligned_vector<uint8_t> Aint8(m * k);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
-
- aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
+ aligned_vector<float> Cfp32_mkl(m * n);
// just used for result comparisons
- aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f);
+ aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size());
// requantize results
- aligned_vector<uint8_t> Cint8_mkl(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_fb(m * n, 0.0f);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0.0f);
+ aligned_vector<uint8_t> Cint8_mkl(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size());
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
// A matrix
- randFill(Afp32, 0, 50);
+ randFill<uint8_t>(Aint8, 0, 50);
int32_t Aint8_zero_point = 43;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Aint8[i] = static_cast<uint8_t>(Afp32[i]);
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
- randFill(Bfp32, -8, 8);
+ randFill<int8_t>(Bint8, -8, 8);
+ aligned_vector<int8_t> Bint8_copy(Bint8);
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
double nops = 2.0 * static_cast<double>(NITER) * m * n * k;
double ttot = 0.0;
@@ -163,9 +160,7 @@ void performance_test() {
cout << setw(16) << runType << ", " << fixed << setw(5) << setprecision(1)
<< nops / ttot << endl;
- for (auto i = 0; i < Cfp32_mkl.size(); ++i) {
- Cint32_mkl[i] = static_cast<int32_t>(Cfp32_mkl[i]);
- }
+ Cint32_mkl.assign(Cfp32_mkl.begin(), Cfp32_mkl.end());
#endif
for (BenchmarkType bench_type :
@@ -179,23 +174,19 @@ void performance_test() {
bench_type == BenchmarkType::REQUANTIZATION)
? 0
: -30;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = static_cast<int8_t>(Bfp32[i]);
- }
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n);
+ vector<int32_t> col_offsets(n);
+ Bint8 = Bint8_copy;
col_offsets_with_zero_pt_s8acc32_ref(
- k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+ k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
row_offsets_u8acc32_ref(m, k, k, Aint8.data(), row_offsets.data());
float C_multiplier =
- (bench_type == BenchmarkType::BARE_BONE) ? 1 : 0.1234;
+ (bench_type == BenchmarkType::BARE_BONE) ? 1.0f : 0.1234f;
int32_t C_zero_pt = (bench_type == BenchmarkType::BARE_BONE) ? 0 : 5;
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -235,16 +226,14 @@ void performance_test() {
n,
Cint32_mkl.data(),
Cint8_mkl.data(),
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
row_offsets.data(),
col_offsets.data(),
- nullptr); // bias
-
- PackBMatrix<int8_t, int16_t> packedB(
- matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
+ nullptr, // bias
+ n); // ncols per quant group
CompressedSparseColumn B_csc(k, n);
@@ -254,30 +243,35 @@ void performance_test() {
default_random_engine eng;
binomial_distribution<> per_col_nnz_dist(k, density);
- vector<int> row_indices(k);
-
- int total_nnz = 0;
- for (int j = 0; j < n; ++j) {
- B_csc.ColPtr()[j] = total_nnz;
-
- int nnz_of_j = per_col_nnz_dist(eng);
- total_nnz += nnz_of_j;
-
- iota(row_indices.begin(), row_indices.end(), 0);
- shuffle(row_indices.begin(), row_indices.end(), eng);
- sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
-
- for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
- B_csc.RowIdx().push_back(row_indices[kidx]);
- // put the current B value
- B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
- // make current B value zero
- Bint8[row_indices[kidx] * n + j] = 0;
- // std::cout << "(" << row_indices[kidx] << ", " << j << ")" <<
- // endl;
+ if (bench_type == BenchmarkType::EVERYTHING) {
+ vector<int> row_indices(k);
+
+ int total_nnz = 0;
+ for (int j = 0; j < n; ++j) {
+ B_csc.ColPtr()[j] = total_nnz;
+
+ int nnz_of_j = per_col_nnz_dist(eng);
+ total_nnz += nnz_of_j;
+
+ iota(row_indices.begin(), row_indices.end(), 0);
+ shuffle(row_indices.begin(), row_indices.end(), eng);
+ sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
+
+ for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
+ B_csc.RowIdx().push_back(row_indices[kidx]);
+ // put the current B value
+ B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
+ // make current B value zero
+ Bint8[row_indices[kidx] * n + j] = 0;
+ // std::cout << "(" << row_indices[kidx] << ", " << j << ")" <<
+ // endl;
+ }
}
+ B_csc.ColPtr()[n] = total_nnz;
}
- B_csc.ColPtr()[n] = total_nnz;
+
+ PackBMatrix<int8_t, int16_t> packedB(
+ matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
// printMatrix(matrix_op_t::NoTranspose,
// Cint32_mkl.data(), m, n, n, "C mkl");
@@ -298,8 +292,7 @@ void performance_test() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAMatrix<uint8_t, int16_t> packA(
@@ -333,15 +326,16 @@ void performance_test() {
// Requantization back to int8
ReQuantizeOutput<false> reqObj(
doNothingObj,
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
bench_type == BenchmarkType::REQUANTIZATION
? nullptr
: packAWithRowOffset.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ n);
// the top most (first) operation in the output processing
// pipeline is spmdm
@@ -354,13 +348,8 @@ void performance_test() {
ReQuantizeOutput<false>>
spmdmObj(reqObj, Aint8.data(), k, B_csc);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
// printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
switch (bench_type) {
case BenchmarkType::BARE_BONE:
diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
index b3a9b38..b255b8c 100644
--- a/bench/PackedRequantizeAcc32Benchmark.cc
+++ b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -103,42 +103,35 @@ void performance_test() {
int k = shape[2];
float alpha = 1.f, beta = 0.f;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<int32_t> Cint32_local(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
- aligned_vector<uint8_t> Cint8_local(m * n, 0);
+ aligned_vector<float> Cfp32_mkl(m * n);
+ aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size());
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_local(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size());
+ aligned_vector<uint8_t> Cint8_local(Cfp32_mkl.size());
// A matrix
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
// float Aint8_scale = 0.11;
int32_t Aint8_zero_point = 43;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Afp32[i] = (float)Aint8[i];
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
// float Bint8_scale = 0.49;
int32_t Bint8_zero_point = -30;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bfp32[i] = (float)Bint8[i];
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n);
+ vector<int32_t> col_offsets(n);
col_offsets_with_zero_pt_s8acc32_ref(
- k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+ k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
double nops = 2.0 * static_cast<double>(NITER) * m * n * k;
double ttot = 0.0;
@@ -180,8 +173,7 @@ void performance_test() {
}
#endif
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
float C_multiplier = 0.1234;
int32_t C_zero_pt = 5;
@@ -197,13 +189,14 @@ void performance_test() {
n,
Cint32_local.data(),
Cint8_local.data(),
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
row_offsets.data(),
col_offsets.data(),
- nullptr); // bias
+ nullptr, // bias
+ n); // ncols per quant group
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
// unpacked");
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -248,8 +241,7 @@ void performance_test() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t> packAN(
@@ -265,21 +257,17 @@ void performance_test() {
DoNothing<> doNothingObj{};
ReQuantizeOutput<false> outputProcObj(
doNothingObj,
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
packAN.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ n);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
// printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
fbgemmPacked(
packAN,