github.com/marian-nmt/FBGEMM.git
author     Jongsoo Park <jongsoo@fb.com>    2018-11-27 21:05:28 +0300
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>    2018-11-27 21:13:17 +0300
commit     d4ee77f5a851879f4a778f122656158663b766b5 (patch)
tree       1d1db56c63c55753bf237bc6fb4cd59810732001
parent     db52c82306e7aa10e2dde706b7205c30eec31cd5 (diff)
per-group and per-channel quantization (#14340)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14340
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/25

Per-group and per-channel quantization in fbgemm.

This diff also cleans up explicit template instantiation using macro expansion, and changes the randFill interface, whose old signature made it easy to mistakenly generate integer random numbers for floating-point vectors.

Using this in DNNLOWP operators will be done in a separate diff.

Reviewed By: dskhudia

Differential Revision: D13176386

fbshipit-source-id: e46c53e31e21520bded71b8ed86e8b19e010e2dd
-rw-r--r--  bench/BenchUtils.cc                            51
-rw-r--r--  bench/BenchUtils.h                              5
-rw-r--r--  bench/Depthwise3DBenchmark.cc                  22
-rw-r--r--  bench/DepthwiseBenchmark.cc                    22
-rw-r--r--  bench/FP16Benchmark.cc                         34
-rw-r--r--  bench/I8SpmdmBenchmark.cc                      11
-rw-r--r--  bench/Im2ColFusedRequantizeAcc16Benchmark.cc   37
-rw-r--r--  bench/Im2ColFusedRequantizeAcc32Benchmark.cc   36
-rw-r--r--  bench/PackedFloatInOutBenchmark.cc             35
-rw-r--r--  bench/PackedRequantizeAcc16Benchmark.cc       125
-rw-r--r--  bench/PackedRequantizeAcc32Benchmark.cc        66
-rw-r--r--  include/fbgemm/Fbgemm.h                        46
-rw-r--r--  include/fbgemm/OutputProcessing-inl.h         173
-rw-r--r--  src/ExecuteKernelU8S8.cc                      284
-rw-r--r--  src/Fbgemm.cc                                 434
-rw-r--r--  src/QuantUtils.cc                               5
-rw-r--r--  src/RefImplementations.cc                      37
-rw-r--r--  src/RefImplementations.h                       18
-rw-r--r--  test/FP16Test.cc                               20
-rw-r--r--  test/I8DepthwiseTest.cc                        12
-rw-r--r--  test/I8SpmdmTest.cc                            11
-rw-r--r--  test/Im2ColFusedRequantizeTest.cc             220
-rw-r--r--  test/PackedRequantizeAcc16Test.cc             418
-rw-r--r--  test/PackedRequantizeTest.cc                  393
24 files changed, 1466 insertions, 1049 deletions
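
The thread running through the diff below: requantization parameters such as C_multiplier and Bq_zero_point change from scalars to pointers, so a scale and zero point can be supplied per tensor, per group, or per output channel, selected by the new QuantizationGranularity template parameter. A minimal sketch of the index selection that the Q_GRAN dispatch performs (the standalone function is illustrative, not fbgemm API):

#include <cassert>

enum class QuantizationGranularity { TENSOR, GROUP, OUT_CHANNEL };

// Illustrative only: picks which element of a per-tensor / per-group /
// per-channel parameter array applies to output column j.
// ncols = total output columns, groups = number of quantization groups.
int quant_param_index(QuantizationGranularity q_gran, int j, int ncols, int groups) {
  switch (q_gran) {
    case QuantizationGranularity::TENSOR:
      return 0;                     // one scale/zero point for the whole tensor
    case QuantizationGranularity::GROUP:
      return j / (ncols / groups);  // one per group of columns
    case QuantizationGranularity::OUT_CHANNEL:
      return j;                     // one per output column (channel)
  }
  assert(false && "unknown quantization granularity");
  return 0;
}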
diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc
index db40ee0..f5ce9ef 100644
--- a/bench/BenchUtils.cc
+++ b/bench/BenchUtils.cc
@@ -5,30 +5,41 @@
* LICENSE file in the root directory of this source tree.
*/
#include "BenchUtils.h"
+
+#include <algorithm>
#include <random>
+#include <type_traits>
+
+#include <omp.h>
namespace fbgemm {
std::default_random_engine eng;
template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high) {
- std::random_device r;
- std::uniform_int_distribution<int> dis(low, high);
- for (auto& v : vec) {
- v = static_cast<T>(dis(eng));
- }
+void randFill(aligned_vector<T>& vec, T low, T high, std::true_type) {
+ std::uniform_int_distribution<T> dis(low, high);
+ std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high, std::false_type) {
+ std::uniform_real_distribution<T> dis(low, high);
+ std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high) {
+ randFill(vec, low, high, std::is_integral<T>());
}
template void
-randFill<float>(aligned_vector<float>& vec, const int low, const int high);
-template void
-randFill<uint8_t>(aligned_vector<uint8_t>& vec, const int low, const int high);
+randFill<float>(aligned_vector<float>& vec, float low, float high);
template void
-randFill<int8_t>(aligned_vector<int8_t>& vec, const int low, const int high);
-
+randFill<uint8_t>(aligned_vector<uint8_t>& vec, uint8_t low, uint8_t high);
template void
-randFill<int>(aligned_vector<int>& vec, const int low, const int high);
+randFill<int8_t>(aligned_vector<int8_t>& vec, int8_t low, int8_t high);
+template void randFill<int>(aligned_vector<int>& vec, int low, int high);
void llc_flush(std::vector<char>& llc) {
volatile char* data = llc.data();
@@ -37,4 +48,20 @@ void llc_flush(std::vector<char>& llc) {
}
}
+int fbgemm_get_num_threads() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+ return 1;
+#else
+ return omp_get_num_threads();
+#endif
+}
+
+int fbgemm_get_thread_num() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+ return 0;
+#else
+ return omp_get_thread_num();
+#endif
+}
+
} // namespace fbgemm
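
The rewritten randFill above tag-dispatches on std::is_integral<T>, so integral element types draw from uniform_int_distribution and floating-point types from uniform_real_distribution; the old signature took int bounds for every T, which silently filled float vectors with integers. A self-contained sketch of the same dispatch pattern (names outside the fbgemm namespace are illustrative):

#include <algorithm>
#include <random>
#include <type_traits>
#include <vector>

std::default_random_engine eng;

template <typename T>
void rand_fill(std::vector<T>& vec, T low, T high, std::true_type /*integral*/) {
  std::uniform_int_distribution<T> dis(low, high);
  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
}

template <typename T>
void rand_fill(std::vector<T>& vec, T low, T high, std::false_type /*floating*/) {
  std::uniform_real_distribution<T> dis(low, high);
  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
}

template <typename T>
void rand_fill(std::vector<T>& vec, T low, T high) {
  rand_fill(vec, low, high, std::is_integral<T>());
}

int main() {
  std::vector<float> f(8);
  rand_fill(f, 0.0f, 1.0f);  // real distribution: values in [0, 1)
  std::vector<int> i(8);
  rand_fill(i, 0, 10);       // integer distribution: values in [0, 10]
}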
diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
index 8ca99df..da2ef2d 100644
--- a/bench/BenchUtils.h
+++ b/bench/BenchUtils.h
@@ -11,8 +11,11 @@
namespace fbgemm {
template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high);
+void randFill(aligned_vector<T>& vec, T low, T high);
void llc_flush(std::vector<char>& llc);
+int fbgemm_get_num_threads();
+int fbgemm_get_thread_num();
+
} // namespace fbgemm
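
fbgemm_get_num_threads and fbgemm_get_thread_num centralize the #ifdef fallback that every benchmark previously repeated inline, as the hunks below show. A minimal sketch of that guard pattern, compiling with or without -fopenmp:

#ifdef _OPENMP
#include <omp.h>
#endif
#include <cstdio>

int get_num_threads() {
#ifdef _OPENMP
  return omp_get_num_threads();
#else
  return 1;  // serial fallback
#endif
}

int get_thread_num() {
#ifdef _OPENMP
  return omp_get_thread_num();
#else
  return 0;
#endif
}

int main() {
#pragma omp parallel
  {
    std::printf("thread %d of %d\n", get_thread_num(), get_num_threads());
  }
}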
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index f53eeea..c65839b 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -62,10 +62,10 @@ int main() {
aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3x3_pad_1_ref(
@@ -129,13 +129,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
@@ -200,13 +195,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index 8e6d83d..b922f90 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -161,10 +161,10 @@ int main() {
aligned_vector<int8_t> B(G * R * S);
aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3_pad_1_ref(
@@ -221,13 +221,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
@@ -279,13 +274,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc
index c03f18a..fd9de5b 100644
--- a/bench/FP16Benchmark.cc
+++ b/bench/FP16Benchmark.cc
@@ -73,20 +73,24 @@ void performance_test() {
int n = s[1];
int k = s[2];
- aligned_vector<float> A(m * k, 0.f);
- aligned_vector<float> B(k * n, 0.f);
- aligned_vector<float> Cg(m * n, 1.f);
- aligned_vector<float> Cp(m * n, NAN);
+ aligned_vector<float> C_ref(m * n, 1.f);
+ aligned_vector<float> C_fb(m * n, NAN);
// initialize with small numbers
- randFill(A, 0, 4);
+ aligned_vector<int> Aint(m * k);
+ randFill(Aint, 0, 4);
+ aligned_vector<float> A(Aint.begin(), Aint.end());
- randFill(B, 0, 4);
+ aligned_vector<int> Bint(k * n);
+ randFill(Bint, 0, 4);
+ aligned_vector<float> B(Bint.begin(), Bint.end());
PackedGemmMatrixFP16 Bp(btran, k, n, alpha, B.data());
if (beta != 0.0f) {
- randFill(Cg, 0, 4);
- Cp = Cg;
+ aligned_vector<int> Cint(C_ref.size());
+ randFill(Cint, 0, 4);
+ C_ref.assign(Cint.begin(), Cint.end());
+ C_fb = C_ref;
}
double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER;
@@ -111,17 +115,17 @@ void performance_test() {
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
- Cg.data(),
+ C_ref.data(),
n);
#endif
cblas_gemm_compute(
- matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+ matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
#ifdef USE_MKL
// Compare results
- for (auto i = 0; i < Cg.size(); i++) {
- // printf("%f %f\n", Cg[i], Cp[i]);
- assert(std::abs(Cg[i] - Cp[i]) < 1e-3);
+ for (auto i = 0; i < C_ref.size(); i++) {
+ // printf("%f %f\n", C_ref[i], C_fb[i]);
+ assert(std::abs(C_ref[i] - C_fb[i]) < 1e-3);
}
#endif
}
@@ -151,7 +155,7 @@ void performance_test() {
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
- Cg.data(),
+ C_ref.data(),
n);
t_end = chrono::system_clock::now();
if (it >= 0) {
@@ -184,7 +188,7 @@ void performance_test() {
t_begin = chrono::system_clock::now();
cblas_gemm_compute(
- matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+ matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
t_end = chrono::system_clock::now();
if (it >= 0) {
diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc
index 07b73dc..4223d0c 100644
--- a/bench/I8SpmdmBenchmark.cc
+++ b/bench/I8SpmdmBenchmark.cc
@@ -77,7 +77,7 @@ int main() {
cout << M << ", " << N << ", " << K << ", ";
aligned_vector<uint8_t> A(M * K);
- randFill(A, 0, 255);
+ randFill<uint8_t>(A, 0, 255);
fbgemm::CompressedSparseColumn B_csc(K, N);
vector<int32_t> C(M * N);
@@ -156,13 +156,8 @@ int main() {
#pragma omp parallel
#endif
{
-#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
- int num_threads = 1;
- int tid = 0;
-#else
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
int i_per_thread =
((M + 31) / 32 + num_threads - 1) / num_threads * 32;
int i_begin = std::min(tid * i_per_thread, M);
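
The partitioning above rounds the row count up to a multiple of 32 before dividing it among threads, so each thread's slice stays 32-row aligned. A worked sketch of the arithmetic (the M and num_threads values, and the i_end clamp, are hypothetical):

#include <algorithm>
#include <cstdio>

int main() {
  int M = 1000, num_threads = 8;
  // ceil(1000 / 32) = 32 blocks of 32 rows; ceil(32 / 8) = 4 blocks per
  // thread; 4 * 32 = 128 rows per thread, always a multiple of 32.
  int i_per_thread = ((M + 31) / 32 + num_threads - 1) / num_threads * 32;
  for (int tid = 0; tid < num_threads; ++tid) {
    int i_begin = std::min(tid * i_per_thread, M);
    int i_end = std::min(i_begin + i_per_thread, M);  // last thread gets the remainder
    std::printf("thread %d: rows [%d, %d)\n", tid, i_begin, i_end);
  }
}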
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
index 2115863..cb2edf5 100644
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -125,43 +125,29 @@ void performance_test() {
chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
- aligned_vector<float> Afp32(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
aligned_vector<uint8_t> Aint8(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
-
+ conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<uint8_t> Aint8_out(
conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
- conv_p.K[1] * conv_p.IC,
- 0);
+ conv_p.K[1] * conv_p.IC);
- aligned_vector<float> Bfp32(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb2(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+ conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
// A matrix (input activations)
- randFill(Afp32, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
int32_t Aint8_zero_point = 4;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Aint8[i] = static_cast<uint8_t>(Afp32[i]);
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
// B matrix (weights)
- randFill(Bfp32, -4, 4);
+ randFill<int8_t>(Bint8, -4, 4);
// int32_t Bint8_zero_point = -3;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = static_cast<int8_t>(Bfp32[i]);
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
// reference implementation
conv_ref(
@@ -184,8 +170,7 @@ void performance_test() {
double ttot = 0.0;
string runType;
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int16_t> packA(
diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
index 7144e61..8e112d8 100644
--- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
@@ -125,45 +125,32 @@ void performance_test() {
chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
- aligned_vector<float> Afp32(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
aligned_vector<uint8_t> Aint8(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+ conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<uint8_t> Aint8_out(
conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
- conv_p.K[1] * conv_p.IC,
- 0);
+ conv_p.K[1] * conv_p.IC);
- aligned_vector<float> Bfp32(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb2(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+ conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
// cout << conv_p.toString() << endl;
// A matrix (input activations)
- randFill(Afp32, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
int32_t Aint8_zero_point = 4;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Aint8[i] = static_cast<uint8_t>(Afp32[i]);
- }
+ aligned_vector<float> Apf32(Aint8.begin(), Aint8.end());
// B matrix (weights)
- randFill(Bfp32, -4, 4);
+ randFill<int8_t>(Bint8, -4, 4);
// int32_t Bint8_zero_point = -3;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = static_cast<int8_t>(Bfp32[i]);
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
// reference implementation
conv_ref(
@@ -186,8 +173,7 @@ void performance_test() {
double ttot = 0.0;
string runType;
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int32_t> packA(
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index badbda0..79a750e 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -86,27 +86,27 @@ void performance_test() {
int k = shape[2];
float alpha = 1.f, beta = 0.f;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<float> Afp32(m * k);
+ aligned_vector<uint8_t> Aint8(Afp32.size());
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<float> Bfp32(k * n);
+ aligned_vector<int8_t> Bint8(Bfp32.size());
- aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
- aligned_vector<float> Cfp32_fb(m * n, 0.0f);
+ aligned_vector<float> Cfp32_mkl(m * n);
+ aligned_vector<float> Cfp32_fb(Cfp32_mkl.size());
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size());
// A matrix
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
float Aint8_scale = 0.11;
int32_t Aint8_zero_point = 43;
for (auto i = 0; i < Afp32.size(); ++i) {
Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
}
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
float Bint8_scale = 0.49;
@@ -116,10 +116,9 @@ void performance_test() {
}
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n);
+ vector<int32_t> col_offsets(n);
col_offsets_with_zero_pt_s8acc32_ref(
- k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+ k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
double ttot = 0;
std::string type;
@@ -172,8 +171,7 @@ void performance_test() {
// printMatrix(matrix_op_t::NoTranspose, col_offsets.data(), 1, n, n, "col
// offsets before");
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithQuantRowOffset<uint8_t> packAN(
@@ -201,12 +199,13 @@ void performance_test() {
ReQuantizeForFloat<false> outputProcObj(
doNothingObj,
Aint8_scale,
- Bint8_scale,
+ &Bint8_scale,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
packAN.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ n);
ttot = 0;
type = "FBGEMM_i8_acc32";
diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
index fd48b24..f60332f 100644
--- a/bench/PackedRequantizeAcc16Benchmark.cc
+++ b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -100,29 +100,26 @@ void performance_test() {
int n = shape[1];
int k = shape[2];
- float alpha = 1.f, beta = 0.f;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ float alpha = 1.0f, beta = 0.0f;
+ aligned_vector<uint8_t> Aint8(m * k);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
-
- aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
+ aligned_vector<float> Cfp32_mkl(m * n);
// just used for result comparisons
- aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f);
+ aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size());
// requantize results
- aligned_vector<uint8_t> Cint8_mkl(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_fb(m * n, 0.0f);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0.0f);
+ aligned_vector<uint8_t> Cint8_mkl(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size());
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
// A matrix
- randFill(Afp32, 0, 50);
+ randFill<uint8_t>(Aint8, 0, 50);
int32_t Aint8_zero_point = 43;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Aint8[i] = static_cast<uint8_t>(Afp32[i]);
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
- randFill(Bfp32, -8, 8);
+ randFill<int8_t>(Bint8, -8, 8);
+ aligned_vector<int8_t> Bint8_copy(Bint8);
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
double nops = 2.0 * static_cast<double>(NITER) * m * n * k;
double ttot = 0.0;
@@ -163,9 +160,7 @@ void performance_test() {
cout << setw(16) << runType << ", " << fixed << setw(5) << setprecision(1)
<< nops / ttot << endl;
- for (auto i = 0; i < Cfp32_mkl.size(); ++i) {
- Cint32_mkl[i] = static_cast<int32_t>(Cfp32_mkl[i]);
- }
+ Cint32_mkl.assign(Cfp32_mkl.begin(), Cfp32_mkl.end());
#endif
for (BenchmarkType bench_type :
@@ -179,23 +174,19 @@ void performance_test() {
bench_type == BenchmarkType::REQUANTIZATION)
? 0
: -30;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = static_cast<int8_t>(Bfp32[i]);
- }
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n);
+ vector<int32_t> col_offsets(n);
+ Bint8 = Bint8_copy;
col_offsets_with_zero_pt_s8acc32_ref(
- k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+ k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
row_offsets_u8acc32_ref(m, k, k, Aint8.data(), row_offsets.data());
float C_multiplier =
- (bench_type == BenchmarkType::BARE_BONE) ? 1 : 0.1234;
+ (bench_type == BenchmarkType::BARE_BONE) ? 1.0f : 0.1234f;
int32_t C_zero_pt = (bench_type == BenchmarkType::BARE_BONE) ? 0 : 5;
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -235,16 +226,14 @@ void performance_test() {
n,
Cint32_mkl.data(),
Cint8_mkl.data(),
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
row_offsets.data(),
col_offsets.data(),
- nullptr); // bias
-
- PackBMatrix<int8_t, int16_t> packedB(
- matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
+ nullptr, // bias
+ n); // ncols per quant group
CompressedSparseColumn B_csc(k, n);
@@ -254,30 +243,35 @@ void performance_test() {
default_random_engine eng;
binomial_distribution<> per_col_nnz_dist(k, density);
- vector<int> row_indices(k);
-
- int total_nnz = 0;
- for (int j = 0; j < n; ++j) {
- B_csc.ColPtr()[j] = total_nnz;
-
- int nnz_of_j = per_col_nnz_dist(eng);
- total_nnz += nnz_of_j;
-
- iota(row_indices.begin(), row_indices.end(), 0);
- shuffle(row_indices.begin(), row_indices.end(), eng);
- sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
-
- for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
- B_csc.RowIdx().push_back(row_indices[kidx]);
- // put the current B value
- B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
- // make current B value zero
- Bint8[row_indices[kidx] * n + j] = 0;
- // std::cout << "(" << row_indices[kidx] << ", " << j << ")" <<
- // endl;
+ if (bench_type == BenchmarkType::EVERYTHING) {
+ vector<int> row_indices(k);
+
+ int total_nnz = 0;
+ for (int j = 0; j < n; ++j) {
+ B_csc.ColPtr()[j] = total_nnz;
+
+ int nnz_of_j = per_col_nnz_dist(eng);
+ total_nnz += nnz_of_j;
+
+ iota(row_indices.begin(), row_indices.end(), 0);
+ shuffle(row_indices.begin(), row_indices.end(), eng);
+ sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
+
+ for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
+ B_csc.RowIdx().push_back(row_indices[kidx]);
+ // put the current B value
+ B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
+ // make current B value zero
+ Bint8[row_indices[kidx] * n + j] = 0;
+ // std::cout << "(" << row_indices[kidx] << ", " << j << ")" <<
+ // endl;
+ }
}
+ B_csc.ColPtr()[n] = total_nnz;
}
- B_csc.ColPtr()[n] = total_nnz;
+
+ PackBMatrix<int8_t, int16_t> packedB(
+ matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
// printMatrix(matrix_op_t::NoTranspose,
// Cint32_mkl.data(), m, n, n, "C mkl");
@@ -298,8 +292,7 @@ void performance_test() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAMatrix<uint8_t, int16_t> packA(
@@ -333,15 +326,16 @@ void performance_test() {
// Requantization back to int8
ReQuantizeOutput<false> reqObj(
doNothingObj,
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
bench_type == BenchmarkType::REQUANTIZATION
? nullptr
: packAWithRowOffset.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ n);
// the top most (first) operation in the output processing
// pipeline is spmdm
@@ -354,13 +348,8 @@ void performance_test() {
ReQuantizeOutput<false>>
spmdmObj(reqObj, Aint8.data(), k, B_csc);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
// printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
switch (bench_type) {
case BenchmarkType::BARE_BONE:
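
The sparse-B setup now guarded by the EVERYTHING branch draws a per-column nonzero count from a binomial distribution and picks that many distinct rows via iota/shuffle/sort. A standalone sketch of that CSC construction, with plain vectors standing in for CompressedSparseColumn:

#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

int main() {
  int k = 16, n = 4;  // hypothetical B dimensions
  double density = 0.25;
  std::default_random_engine eng;
  std::binomial_distribution<> per_col_nnz_dist(k, density);

  std::vector<int> col_ptr(n + 1), row_idx;
  std::vector<int> row_indices(k);
  int total_nnz = 0;
  for (int j = 0; j < n; ++j) {
    col_ptr[j] = total_nnz;
    int nnz_of_j = per_col_nnz_dist(eng);  // ~k*density nonzeros in column j
    total_nnz += nnz_of_j;
    // Pick nnz_of_j distinct rows, then sort them so the column is ordered.
    std::iota(row_indices.begin(), row_indices.end(), 0);
    std::shuffle(row_indices.begin(), row_indices.end(), eng);
    std::sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
    row_idx.insert(row_idx.end(), row_indices.begin(),
                   row_indices.begin() + nnz_of_j);
  }
  col_ptr[n] = total_nnz;
  return 0;
}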
diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
index b3a9b38..b255b8c 100644
--- a/bench/PackedRequantizeAcc32Benchmark.cc
+++ b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -103,42 +103,35 @@ void performance_test() {
int k = shape[2];
float alpha = 1.f, beta = 0.f;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<int32_t> Cint32_local(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
- aligned_vector<uint8_t> Cint8_local(m * n, 0);
+ aligned_vector<float> Cfp32_mkl(m * n);
+ aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size());
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_local(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size());
+ aligned_vector<uint8_t> Cint8_local(Cfp32_mkl.size());
// A matrix
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
// float Aint8_scale = 0.11;
int32_t Aint8_zero_point = 43;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Afp32[i] = (float)Aint8[i];
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
// float Bint8_scale = 0.49;
int32_t Bint8_zero_point = -30;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bfp32[i] = (float)Bint8[i];
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n);
+ vector<int32_t> col_offsets(n);
col_offsets_with_zero_pt_s8acc32_ref(
- k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+ k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
double nops = 2.0 * static_cast<double>(NITER) * m * n * k;
double ttot = 0.0;
@@ -180,8 +173,7 @@ void performance_test() {
}
#endif
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
float C_multiplier = 0.1234;
int32_t C_zero_pt = 5;
@@ -197,13 +189,14 @@ void performance_test() {
n,
Cint32_local.data(),
Cint8_local.data(),
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
row_offsets.data(),
col_offsets.data(),
- nullptr); // bias
+ nullptr, // bias
+ n); // ncols per quant group
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
// unpacked");
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -248,8 +241,7 @@ void performance_test() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t> packAN(
@@ -265,21 +257,17 @@ void performance_test() {
DoNothing<> doNothingObj{};
ReQuantizeOutput<false> outputProcObj(
doNothingObj,
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
packAN.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ n);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
// printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
fbgemmPacked(
packAN,
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index d2c2a1e..17a07e5 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -847,13 +847,19 @@ class DoSpmdmOnInpBuffer {
const int groups_;
};
+enum class QuantizationGranularity {
+ TENSOR,
+ GROUP,
+ OUT_CHANNEL,
+};
+
/**
* @brief Requantize values in inp buffer and write to out buffer.
* pass the out buffer to next op for further processing.
- *
*/
template <
bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
typename outT = std::uint8_t,
typename inT = std::int32_t,
typename nextOPType = DoNothing<outT, outT>>
@@ -863,13 +869,15 @@ class ReQuantizeOutput {
using inpType = inT;
ReQuantizeOutput(
nextOPType& nextop,
- float C_multiplier,
+ const float* C_multiplier,
std::int32_t C_zero_point,
std::int32_t Aq_zero_point,
- std::int32_t Bq_zero_point,
+ const std::int32_t* Bq_zero_point,
const std::int32_t* row_offsets,
const std::int32_t* col_offsets,
- const std::int32_t* bias)
+ const std::int32_t* bias,
+ std::uint32_t nCol,
+ int groups = 1)
: nextop_(nextop),
C_multiplier_(C_multiplier),
C_zero_point_(C_zero_point),
@@ -877,7 +885,9 @@ class ReQuantizeOutput {
Bq_zero_point_(Bq_zero_point),
q_row_offsets_(row_offsets),
q_col_offsets_(col_offsets),
- bias_(bias) {}
+ bias_(bias),
+ ncols_(nCol),
+ groups_(groups) {}
template <inst_set_t instSet>
inline int f(
@@ -897,13 +907,15 @@ class ReQuantizeOutput {
int ld_in) const;
nextOPType& nextop_;
- float C_multiplier_;
+ const float* C_multiplier_;
std::int32_t C_zero_point_;
std::int32_t Aq_zero_point_;
- std::int32_t Bq_zero_point_;
+ const std::int32_t* Bq_zero_point_;
const std::int32_t* q_row_offsets_;
const std::int32_t* q_col_offsets_;
const std::int32_t* bias_;
+ std::uint32_t ncols_;
+ int groups_;
};
/**
@@ -912,6 +924,7 @@ class ReQuantizeOutput {
*/
template <
bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
typename outT = float,
typename inT = std::int32_t,
typename nextOPType = DoNothing<outT, outT>>
@@ -922,12 +935,14 @@ class ReQuantizeForFloat {
ReQuantizeForFloat(
nextOPType& nextop,
float Aq_scale,
- float Bq_scale,
+ const float* Bq_scale,
std::int32_t Aq_zero_point,
- std::int32_t Bq_zero_point,
+ const std::int32_t* Bq_zero_point,
const std::int32_t* row_offsets,
const std::int32_t* col_offsets,
- const float* bias)
+ const float* bias,
+ std::uint32_t nCol,
+ int groups = 1)
: nextop_(nextop),
Aq_scale_(Aq_scale),
Bq_scale_(Bq_scale),
@@ -935,7 +950,9 @@ class ReQuantizeForFloat {
Bq_zero_point_(Bq_zero_point),
q_row_offsets_(row_offsets),
q_col_offsets_(col_offsets),
- bias_(bias) {}
+ bias_(bias),
+ ncols_(nCol),
+ groups_(groups) {}
template <inst_set_t instSet>
inline int f(
@@ -947,12 +964,15 @@ class ReQuantizeForFloat {
private:
nextOPType& nextop_;
- float Aq_scale_, Bq_scale_;
+ float Aq_scale_;
+ const float* Bq_scale_;
std::int32_t Aq_zero_point_;
- std::int32_t Bq_zero_point_;
+ const std::int32_t* Bq_zero_point_;
const std::int32_t* q_row_offsets_;
const std::int32_t* q_col_offsets_;
const float* bias_;
+ std::uint32_t ncols_;
+ int groups_;
};
// type specialized implementation in an include file
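
With the new constructor, a caller requesting per-output-channel requantization passes length-n arrays for the multiplier and B zero point plus the total column count nCol. A hedged construction sketch based on the signature above (the wrapper function and parameter values are hypothetical; the buffer setup follows the benchmark code elsewhere in this diff):

#include "fbgemm/Fbgemm.h"
#include <vector>

using namespace fbgemm;

// Hypothetical per-channel parameters for an n-column output.
void make_per_channel_requant(int n,
                              const std::vector<float>& C_multiplier,       // size n
                              const std::vector<std::int32_t>& Bq_zero_pt,  // size n
                              const std::int32_t* row_offsets,
                              const std::int32_t* col_offsets) {
  DoNothing<> doNothingObj{};
  // Q_GRAN selects OUT_CHANNEL; the pointer arguments are then indexed
  // per output column j rather than treated as single scalars.
  ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj(
      doNothingObj,
      C_multiplier.data(),   // one multiplier per output channel
      /*C_zero_point=*/5,
      /*Aq_zero_point=*/43,
      Bq_zero_pt.data(),     // one B zero point per output channel
      row_offsets,
      col_offsets,
      /*bias=*/nullptr,
      /*nCol=*/n);
  (void)reqObj;
}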
diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
index 59a6e0e..88a10bc 100644
--- a/include/fbgemm/OutputProcessing-inl.h
+++ b/include/fbgemm/OutputProcessing-inl.h
@@ -44,9 +44,14 @@ inline int DoSpmdmOnInpBuffer<outT, inT, nextOPType>::f(
return nextop_.template f<instSet>(out, inp, block, ld_out, ld_in);
}
-template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType>
+template <
+ bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN,
+ typename outT,
+ typename inT,
+ typename nextOPType>
template <bool A_SYMMETRIC, bool B_SYMMETRIC, bool HAS_BIAS>
-void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
+void ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f_(
outT* out,
const inT* inp,
const block_type_t& block,
@@ -54,7 +59,13 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
int ld_in) const {
// Adoption of implementation at QNNPACK/src/requantization/fp32-sse2.c
// using AVX2 instructions
- __m256 multiplier_v = _mm256_set1_ps(C_multiplier_);
+ int quant_param_idx = 0;
+ if (Q_GRAN == QuantizationGranularity::GROUP) {
+ int ncol_per_group = ncols_ / groups_;
+ int g = block.col_start / ncol_per_group;
+ quant_param_idx = g;
+ }
+ __m256 multiplier_v = _mm256_set1_ps(C_multiplier_[quant_param_idx]);
__m256i min_v = _mm256_set1_epi8(std::numeric_limits<uint8_t>::min());
__m256i max_v = _mm256_set1_epi8(std::numeric_limits<uint8_t>::max());
@@ -63,7 +74,9 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
(A_SYMMETRIC == (Aq_zero_point_ == 0)) &&
"A_SYMMETRIC == true if and only if Aq_zero_point == 0");
assert(
- (B_SYMMETRIC == (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr)) &&
+ (B_SYMMETRIC ==
+ ((Q_GRAN == QuantizationGranularity::TENSOR && Bq_zero_point_[0] == 0) ||
+ q_row_offsets_ == nullptr)) &&
"B_SYMMETRIC == true if and only if Bq_zero_point == 0 "
"or q_row_offsets_ == nullptr");
assert(
@@ -79,10 +92,22 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
constexpr int VLEN = 8;
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
- std::int32_t row_offset = (q_row_offsets_ && !B_SYMMETRIC)
- ? q_row_offsets_[i - block.row_start] * Bq_zero_point_
- : 0;
+ // Scale row_offset with Bq_zero_point
+ int32_t row_offset = 0;
+ if (B_SYMMETRIC) {
+ row_offset = 0;
+ } else if (
+ Q_GRAN == QuantizationGranularity::TENSOR ||
+ Q_GRAN == QuantizationGranularity::GROUP) {
+ row_offset =
+ q_row_offsets_[i - block.row_start] * Bq_zero_point_[quant_param_idx];
+ } else {
+ assert(
+ Q_GRAN == QuantizationGranularity::OUT_CHANNEL &&
+ "unknown quantization granularity");
+ }
__m256i row_offset_v = _mm256_set1_epi32(row_offset);
+
int j = block.col_start;
for (; j < block.col_start + (block.col_size / (VLEN * 4) * (VLEN * 4));
j += (VLEN * 4)) {
@@ -122,9 +147,33 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
}
if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(Bq_zero_point_ + j)));
+ }
x_v = _mm256_sub_epi32(x_v, row_offset_v);
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(Bq_zero_point_ + j + VLEN)));
+ }
y_v = _mm256_sub_epi32(y_v, row_offset_v);
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+ Bq_zero_point_ + j + 2 * VLEN)));
+ }
z_v = _mm256_sub_epi32(z_v, row_offset_v);
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+ Bq_zero_point_ + j + 3 * VLEN)));
+ }
w_v = _mm256_sub_epi32(w_v, row_offset_v);
}
if (HAS_BIAS) {
@@ -157,10 +206,24 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
* representation as an FP32 value, and will be rounded to nearest
* FP32 value with ties to even with default MXCSR rounding mode.
*/
- __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
- __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v);
- __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v);
- __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v);
+ __m256 x_scaled_v, y_scaled_v, z_scaled_v, w_scaled_v;
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ x_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j));
+ y_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(y_v), _mm256_loadu_ps(C_multiplier_ + j + VLEN));
+ z_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(z_v),
+ _mm256_loadu_ps(C_multiplier_ + j + 2 * VLEN));
+ w_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(w_v),
+ _mm256_loadu_ps(C_multiplier_ + j + 3 * VLEN));
+ } else {
+ x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v);
+ z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v);
+ w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v);
+ }
/*
* Convert scaled FP32 result to int32_t using CVTPS2DQ instruction.
@@ -238,6 +301,12 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
}
if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(Bq_zero_point_ + j)));
+ }
x_v = _mm256_sub_epi32(x_v, row_offset_v);
}
if (HAS_BIAS) {
@@ -246,7 +315,13 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias_ + j)));
}
- __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ __m256 x_scaled_v;
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ x_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j));
+ } else {
+ x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ }
__m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v);
__m256i x_packed_v = _mm256_adds_epi16(
@@ -281,19 +356,26 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
_mm256_castsi256_si128(x_clamped_v));
} // j loop vectorized
+ // TODO: vectorize remainder using masking
for (; j < block.col_start + block.col_size; ++j) {
int32_t raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
if (!A_SYMMETRIC) {
raw -= Aq_zero_point_ * q_col_offsets_[j];
}
if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset = q_row_offsets_[i - block.row_start] * Bq_zero_point_[j];
+ }
raw -= row_offset;
}
if (HAS_BIAS) {
raw += bias_[j];
}
- float ab = raw * C_multiplier_;
+ float ab = raw *
+ ((Q_GRAN == QuantizationGranularity::OUT_CHANNEL)
+ ? C_multiplier_[j]
+ : C_multiplier_[quant_param_idx]);
long rounded = std::lrintf(ab) + C_zero_point_;
out[i * ld_out + j] = std::max(
@@ -303,9 +385,14 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
} // i loop
}
-template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType>
+template <
+ bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN,
+ typename outT,
+ typename inT,
+ typename nextOPType>
template <inst_set_t instSet>
-inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
+inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
outT* out,
const inT* inp,
const block_type_t& block,
@@ -314,19 +401,35 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
static_assert(
std::is_same<inT, int32_t>::value,
"input data type must be of int32_t type");
+ int ncol_per_group = ncols_ / groups_;
+ assert(
+ block.col_size <= ncol_per_group &&
+ "ReQuantizeOutput should be called at most 1 group at a time.");
+ int g = block.col_start / ncol_per_group;
if (instSet == inst_set_t::anyarch) {
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
raw -= Aq_zero_point_ * q_col_offsets_[j];
+ int Bq_zero_point_idx;
+ if (Q_GRAN == QuantizationGranularity::TENSOR) {
+ Bq_zero_point_idx = 0;
+ } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+ Bq_zero_point_idx = g;
+ } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ Bq_zero_point_idx = j;
+ } else {
+ assert(false && "unknown quantization granularity");
+ }
if (q_row_offsets_) {
- raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_;
+ raw -= q_row_offsets_[i - block.row_start] *
+ Bq_zero_point_[Bq_zero_point_idx];
}
if (bias_) {
raw += bias_[j];
}
- float ab = raw * C_multiplier_;
+ float ab = raw * C_multiplier_[Bq_zero_point_idx];
long rounded = std::lrintf(ab) + C_zero_point_;
out[i * ld_out + j] = std::max(
@@ -336,8 +439,11 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
}
} else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
if (std::is_same<outT, uint8_t>::value) {
+ bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR &&
+ Bq_zero_point_[0] == 0) ||
+ q_row_offsets_ == nullptr;
if (Aq_zero_point_ == 0) {
- if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) {
+ if (b_symmetric) {
if (bias_ == nullptr) {
f_<true, true, false>(out, inp, block, ld_out, ld_in);
} else {
@@ -351,7 +457,7 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
}
}
} else {
- if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) {
+ if (b_symmetric) {
if (bias_ == nullptr) {
f_<false, true, false>(out, inp, block, ld_out, ld_in);
} else {
@@ -374,9 +480,14 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
return nextop_.template f<instSet>(out, out, block, ld_out, ld_out);
}
-template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType>
+template <
+ bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN,
+ typename outT,
+ typename inT,
+ typename nextOPType>
template <inst_set_t instSet>
-inline int ReQuantizeForFloat<FUSE_RELU, outT, inT, nextOPType>::f(
+inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
outT* out,
inT* inp,
const block_type_t& block,
@@ -388,12 +499,28 @@ inline int ReQuantizeForFloat<FUSE_RELU, outT, inT, nextOPType>::f(
static_assert(
std::is_same<float, outT>::value,
"output data type is of not expected type");
+ int ncol_per_group = ncols_ / groups_;
+ assert(
+ block.col_size <= ncol_per_group &&
+ "ReQuantizeOutput should be called at most 1 group at a time.");
+ int g = block.col_start / ncol_per_group;
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start];
raw -= Aq_zero_point_ * q_col_offsets_[j];
- raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_;
- float res = raw * Aq_scale_ * Bq_scale_;
+ int Bq_zero_point_idx;
+ if (Q_GRAN == QuantizationGranularity::TENSOR) {
+ Bq_zero_point_idx = 0;
+ } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+ Bq_zero_point_idx = g;
+ } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ Bq_zero_point_idx = j;
+ } else {
+ assert(false && "unknown quantization granularity");
+ }
+ raw -= q_row_offsets_[i - block.row_start] *
+ Bq_zero_point_[Bq_zero_point_idx];
+ float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx];
if (bias_) {
res += bias_[j];
}
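
The anyarch paths above spell out the scalar requantization math that the AVX2 path vectorizes: subtract the zero-point corrections, add bias, scale, round, clamp. A standalone sketch of one output element, with hypothetical parameter names:

#include <algorithm>
#include <cmath>
#include <cstdint>

// One element of the requantization pipeline:
// subtract zero-point corrections, add bias, scale, round, clamp to uint8.
std::uint8_t requantize_one(std::int32_t raw,
                            std::int32_t A_zero_point, std::int32_t col_offset,
                            std::int32_t row_offset, std::int32_t B_zero_point,
                            std::int32_t bias, float C_multiplier,
                            std::int32_t C_zero_point) {
  raw -= A_zero_point * col_offset;  // correct for A's zero point
  raw -= row_offset * B_zero_point;  // correct for B's zero point
  raw += bias;
  float ab = raw * C_multiplier;     // requantization scale
  long rounded = std::lrintf(ab) + C_zero_point;
  return static_cast<std::uint8_t>(std::max(0L, std::min(255L, rounded)));
}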
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 2e2035c..f1ec882 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -240,47 +240,60 @@ void ExecuteKernel<
} // for each j block
}
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<false /* FUSE_RELU*/>>;
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<true>>;
-
-template class ExecuteKernel<
- PackAWithQuantRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
- PackAWithQuantRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<true>>;
-
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<true>>;
-
-template class ExecuteKernel<
- PackAMatrix<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeOutput
+#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithRowOffset<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_Q_GRANS(ACC_T, false); \
+ INSTANTIATE_Q_GRANS(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAMatrix<uint8_t, int16_t>,
@@ -288,110 +301,127 @@ template class ExecuteKernel<
uint8_t,
ReQuantizeOutput<false>>;
-template class ExecuteKernel<
- PackAMatrix<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeForFloat
+#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, int32_t>, \
+ PackBMatrix<int8_t, int32_t>, \
+ float, \
+ ReQuantizeForFloat<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(PACK_A) \
+ INSTANTIATE_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_Q_GRANS(PACK_A, true);
+
+INSTANTIATE_RELU(PackAWithRowOffset);
+INSTANTIATE_RELU(PackAWithQuantRowOffset);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ float, \
+ ReQuantizeForFloat<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
- uint8_t,
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<false>::outType,
- int32_t,
- ReQuantizeOutput<false>>>;
+ float,
+ ReQuantizeForFloat<false /* FUSE_RELU*/>>;
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<true>::outType,
- int32_t,
- ReQuantizeOutput<true>>>;
+////////////////////////////////////////////////////////////////////////////////
+// DoSpmdmOnInpBuffer
+#define INSTANTIATE_BASE(RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithRowOffset<uint8_t, int16_t>, \
+ PackBMatrix<int8_t, int16_t>, \
+ uint8_t, \
+ DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>;
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- float,
- DoSpmdmOnInpBuffer<
- ReQuantizeForFloat<false>::outType,
- int32_t,
- ReQuantizeForFloat<false>>>;
+#define INSTANTIATE_Q_GRANS(RELU) \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+INSTANTIATE_Q_GRANS(false);
+INSTANTIATE_Q_GRANS(true);
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<true>>;
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+ float,
+ DoSpmdmOnInpBuffer<float, int32_t, ReQuantizeForFloat<false>>>;
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// memCopy
+#define INSTANTIATE_BASE(PACK_A, ACC_T) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
+ memCopy<>>;
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t, 3>,
- PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+#define INSTANTIATE_ACC_T(PACK_A) \
+ INSTANTIATE_BASE(PACK_A, int32_t) \
+ INSTANTIATE_BASE(PACK_A, int16_t)
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+INSTANTIATE_ACC_T(PackAMatrix);
+INSTANTIATE_ACC_T(PackAWithRowOffset);
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t, 3>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+#undef INSTANTIATE_ACC_T
+#undef INSTANTIATE_BASE
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
-
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
+#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
+ memCopy<>>;
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t, 3>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
+#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
+ INSTANTIATE_BASE(ACC_T, 2); \
+ INSTANTIATE_BASE(ACC_T, 3);
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+INSTANTIATE_SPATIAL_DIM(int32_t);
+INSTANTIATE_SPATIAL_DIM(int16_t);
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t, 3>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAWithQuantRowOffset<uint8_t, int32_t>,
@@ -400,12 +430,6 @@ template class ExecuteKernel<
memCopy<>>;
template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- float,
- ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
PackAMatrix<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
int32_t,
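
The instantiation macros in this file replace the long hand-written list: each INSTANTIATE_RELU(ACC_T) fans out to six explicit instantiations (two RELU values times three granularities). A sketch of the same expansion pattern on a toy template:

#include <iostream>

template <typename T, bool RELU, int Q_GRAN>
struct Kernel {
  static void run() { std::cout << RELU << ' ' << Q_GRAN << '\n'; }
};

// Same nesting as the fbgemm macros: the innermost macro emits one explicit
// instantiation; outer macros fan out over granularities, then RELU values.
#define INSTANTIATE_BASE(T, RELU, Q_GRAN) template struct Kernel<T, RELU, Q_GRAN>;
#define INSTANTIATE_Q_GRANS(T, RELU) \
  INSTANTIATE_BASE(T, RELU, 0)       \
  INSTANTIATE_BASE(T, RELU, 1)       \
  INSTANTIATE_BASE(T, RELU, 2)
#define INSTANTIATE_RELU(T)     \
  INSTANTIATE_Q_GRANS(T, false) \
  INSTANTIATE_Q_GRANS(T, true)

INSTANTIATE_RELU(int)  // expands to six 'template struct Kernel<int, ...>;' lines

#undef INSTANTIATE_RELU
#undef INSTANTIATE_Q_GRANS
#undef INSTANTIATE_BASE

int main() { Kernel<int, true, 2>::run(); }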
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index 0039daf..a8bf02f 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -198,149 +198,70 @@ bool fbgemmSupportedCPU() {
return (cpuinfo_initialize() && cpuinfo_has_x86_avx2());
}
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<true>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
- packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
- packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<true>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAMatrix<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<true>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
- packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-// 16 bit accumulation functions
-template void fbgemmPacked(
- PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeOutput
+#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix<PackAWithRowOffset<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ uint8_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_Q_GRANS(ACC_T, false); \
+ INSTANTIATE_Q_GRANS(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ uint8_t, \
+ ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ uint8_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template void fbgemmPacked(
PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
@@ -352,28 +273,109 @@ template void fbgemmPacked(
int thread_id,
int num_threads);
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<false>>&
- outProcess,
- int thread_id,
- int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeForFloat
+#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix<PACK_A<uint8_t, int32_t>, uint8_t, int32_t>& packA, \
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, \
+ float* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const ReQuantizeForFloat<RELU, Q_GRAN>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(PACK_A) \
+ INSTANTIATE_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_Q_GRANS(PACK_A, true);
+
+INSTANTIATE_RELU(PackAWithRowOffset);
+INSTANTIATE_RELU(PackAWithQuantRowOffset);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ uint8_t, \
+ ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ float* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const ReQuantizeForFloat<RELU, Q_GRAN>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template void fbgemmPacked(
PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
+ float* C,
int32_t* C_buffer,
uint32_t ldc,
- const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<true>>&
- outProcess,
+ const ReQuantizeForFloat<false>& outProcess,
int thread_id,
int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// DoSpmdmOnInpBuffer
+#define INSTANTIATE_BASE(RELU, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& \
+ packA, \
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, \
+ uint8_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const DoSpmdmOnInpBuffer< \
+ uint8_t, \
+ int32_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(RELU) \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
+
+INSTANTIATE_Q_GRANS(false);
+INSTANTIATE_Q_GRANS(true);
+
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
template void fbgemmPacked(
PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
@@ -385,49 +387,57 @@ template void fbgemmPacked(
int thread_id,
int num_threads);
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<true>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// memCopy
+#define INSTANTIATE_BASE(PACK_A, ACC_T) \
+ template void fbgemmPacked( \
+ PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ int32_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const memCopy<>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_ACC_T(PACK_A) \
+ INSTANTIATE_BASE(PACK_A, int32_t) \
+ INSTANTIATE_BASE(PACK_A, int16_t)
+
+INSTANTIATE_ACC_T(PackAMatrix);
+INSTANTIATE_ACC_T(PackAWithRowOffset);
+
+#undef INSTANTIATE_ACC_T
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \
+ template void fbgemmPacked( \
+ PackMatrix< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ uint8_t, \
+ ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ int32_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const memCopy<>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
+ INSTANTIATE_BASE(ACC_T, 2); \
+ INSTANTIATE_BASE(ACC_T, 3);
+
+INSTANTIATE_SPATIAL_DIM(int32_t);
+INSTANTIATE_SPATIAL_DIM(int16_t);
+
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BASE
template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
+ packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
int32_t* C,
int32_t* C_buffer,
uint32_t ldc,
@@ -436,26 +446,6 @@ template void fbgemmPacked(
int num_threads);
template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
int32_t* C,
@@ -465,14 +455,4 @@ template void fbgemmPacked(
int thread_id,
int num_threads);
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<false>& outProcess,
- int thread_id,
- int num_threads);
-
} // namespace fbgemm
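
Note on the macro blocks above: each INSTANTIATE_BASE invocation emits one explicit instantiation of fbgemmPacked, and the nested macros enumerate the cross product of accumulation type, ReLU fusion, and quantization granularity (the first ReQuantizeOutput block yields 2 accumulation types x 2 RELU values x 3 granularities = 12 instantiations; the Im2Col block doubles that again over SPATIAL_DIM 2 and 3). For example, INSTANTIATE_BASE(int32_t, false, QuantizationGranularity::TENSOR) in the first block expands to:

    template void fbgemmPacked(
        PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
        PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
        uint8_t* C,
        int32_t* C_buffer,
        uint32_t ldc,
        const ReQuantizeOutput<false, QuantizationGranularity::TENSOR>& outProcess,
        int thread_id,
        int num_threads);
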
diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc
index 322287b..50f619a 100644
--- a/src/QuantUtils.cc
+++ b/src/QuantUtils.cc
@@ -436,13 +436,14 @@ void RequantizeAvx2(
DoNothing<> doNothingObj{};
ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj(
doNothingObj,
- params.real_multiplier,
+ &params.real_multiplier,
params.target_qparams.zero_point,
0,
0,
nullptr,
nullptr,
- nullptr);
+ nullptr,
+ len);
requantizeObj.f<inst_set_t::avx2>(dst, src, {0, 1, 0, len}, 0, 0);
}
#endif
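
The hunk above reflects the new ReQuantizeOutput interface: the requantization multiplier (and, at other call sites, the B zero point) is now passed by pointer so that per-tensor, per-group, and per-channel quantization share one constructor, and a trailing column count tells the functor how many columns the arrays cover. A condensed sketch of the per-channel case, with argument meanings inferred from the call sites in this diff:

    DoNothing<> doNothingObj{};
    // one multiplier and one zero point per output column
    ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj(
        doNothingObj,
        C_multiplier.data(),   // float[nCol]
        C_zero_point,
        A_zero_point,
        B_zero_point.data(),   // int32_t[nCol]
        row_offsets,
        col_offsets,
        nullptr,               // bias
        nCol,                  // total number of output columns
        groups);
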
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index 097e3b5..369aea3 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -57,24 +57,25 @@ void requantize_u8acc32_ref(
int ld,
const int32_t* inp,
uint8_t* out,
- float C_multiplier,
+ const float* C_multiplier,
int32_t C_zero_point,
int32_t A_zero_point,
- int32_t B_zero_point,
+ const int32_t* B_zero_point,
const int32_t* row_offsets,
const int32_t* col_offsets,
const int32_t* bias,
+ int ncols_per_quant_group,
bool fuse_relu) {
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
int32_t raw = inp[i * ld + j];
raw -= A_zero_point * col_offsets[j];
- raw -= B_zero_point * row_offsets[i];
+ raw -= B_zero_point[j / ncols_per_quant_group] * row_offsets[i];
if (bias) {
raw += bias[j];
}
- float result = raw * C_multiplier;
+ float result = raw * C_multiplier[j / ncols_per_quant_group];
long rounded = lrintf(result) + C_zero_point;
out[i * ld + j] = std::max(
fuse_relu ? static_cast<long>(C_zero_point) : 0l,
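
The j / ncols_per_quant_group indexing above is what lets one loop serve all three granularities. Worked through for N = 8 output columns in 2 groups:

    // per-tensor  : ncols_per_quant_group = 8 -> j / 8 == 0 for every column
    // per-group   : ncols_per_quant_group = 4 -> columns 0..3 use entry 0,
    //                                            columns 4..7 use entry 1
    // per-channel : ncols_per_quant_group = 1 -> column j uses entry j

so C_multiplier and B_zero_point only ever need one entry per quantization group.
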
@@ -180,14 +181,15 @@ void col_offsets_with_zero_pt_s8acc32_ref(
int N,
int ld,
const int8_t* Bint8,
- int32_t B_zero_point,
- int32_t* col_offsets) {
+ const int32_t* B_zero_point,
+ int32_t* col_offsets,
+ int ncols_per_quant_group) {
for (int j = 0; j < N; ++j) {
int32_t sum = 0;
for (int k = 0; k < K; ++k) {
sum += Bint8[k * ld + j];
}
- col_offsets[j] = sum - B_zero_point * K;
+ col_offsets[j] = sum - B_zero_point[j / ncols_per_quant_group] * K;
}
}
@@ -578,13 +580,14 @@ void depthwise_3x3_pad_1_ref(
1,
C_int32.data() + i * K + k,
C + i * K + k,
- C_multiplier,
+ &C_multiplier,
C_zero_point,
A_zero_point,
- B_zero_point,
+ &B_zero_point,
&row_offsets[i * K + k],
col_offsets + k,
- bias ? bias + k : nullptr);
+ bias ? bias + k : nullptr,
+ 1);
}
}
};
@@ -644,13 +647,14 @@ void depthwise_3x3_per_channel_quantization_pad_1_ref(
1,
C_int32.data() + i * K + k,
C + i * K + k,
- C_multiplier[k],
+ &C_multiplier[k],
C_zero_point,
A_zero_point,
- B_zero_point[k],
+ &B_zero_point[k],
&row_offsets[i * K + k],
col_offsets + k,
- bias ? bias + k : nullptr);
+ bias ? bias + k : nullptr,
+ 1);
}
}
};
@@ -781,13 +785,14 @@ void depthwise_3x3x3_pad_1_ref(
1,
C_int32.data() + i * K + k,
C + i * K + k,
- C_multiplier,
+ &C_multiplier,
C_zero_point,
A_zero_point,
- B_zero_point,
+ &B_zero_point,
&row_offsets[i * K + k],
col_offsets + k,
- bias ? bias + k : nullptr);
+ bias ? bias + k : nullptr,
+ 1);
}
}
};
diff --git a/src/RefImplementations.h b/src/RefImplementations.h
index cec4bff..6530eff 100644
--- a/src/RefImplementations.h
+++ b/src/RefImplementations.h
@@ -39,6 +39,11 @@ void requantize_u8acc32_ref(
* @brief Reference implementation of requantization step.
* float multiplier
* @params bias can be nullptr
+ * @params ncols_per_quant_group the number of columns that share the same
+ * quantization parameter.
+ * ncols_per_quant_group == N : per-tensor quantization
+ * ncols_per_quant_group == N / groups : per-group quantization
+ * ncols_per_quant_group == 1 : per-channel quantization
*/
void requantize_u8acc32_ref(
int M,
@@ -46,13 +51,14 @@ void requantize_u8acc32_ref(
int ld,
const std::int32_t* inp,
std::uint8_t* out,
- float C_multiplier,
+ const float* C_multiplier,
std::int32_t C_zero_point,
std::int32_t A_zero_point,
- std::int32_t B_zero_point,
+ const std::int32_t* B_zero_point,
const std::int32_t* row_offsets,
const std::int32_t* col_offsets,
const std::int32_t* bias,
+ int ncols_per_quant_group,
bool fuse_relu = false);
/**
@@ -114,14 +120,18 @@ void row_offsets_u8acc32_ref(
/**
* @brief Reference implementation to compute adjusted col_offsets (sum of
* columns of B and adjusted with B_zero_point)
+ *
+ * @params ncols_per_quant_group see ncols_per_quant_group in
+ * requantize_u8acc32_ref
*/
void col_offsets_with_zero_pt_s8acc32_ref(
int K,
int N,
int ld,
const std::int8_t* Bint8,
- std::int32_t B_zero_point,
- std::int32_t* col_offsets);
+ const std::int32_t* B_zero_point,
+ std::int32_t* col_offsets,
+ int ncols_per_quant_group);
/**
* @brief Reference implementation of SPMDM (sparse matrix times dense matrix).
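
All of the updated tests derive the quantization-group width from the granularity under test in the same way; distilled into a small helper (hypothetical, not part of this diff), the mapping is:

    int ncols_per_quant_group(QuantizationGranularity q_gran, int ncols, int groups) {
      switch (q_gran) {
        case QuantizationGranularity::TENSOR:
          return ncols;          // one quantization parameter for the whole tensor
        case QuantizationGranularity::GROUP:
          return ncols / groups; // one per group of output columns
        case QuantizationGranularity::OUT_CHANNEL:
        default:
          return 1;              // one per output column (channel)
      }
    }
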
diff --git a/test/FP16Test.cc b/test/FP16Test.cc
index b5e4f13..0edcc4b 100644
--- a/test/FP16Test.cc
+++ b/test/FP16Test.cc
@@ -73,19 +73,17 @@ TEST_P(FBGemmFP16Test, Test) {
}
cerr << endl;
- aligned_vector<float> A(m * k, 0.f);
- aligned_vector<float> B(k * n, 0.f);
+ // initialize with small numbers
+ aligned_vector<int> Aint(m * k);
+ aligned_vector<int> Bint(k * n);
+ randFill(Aint, 0, 4);
+ randFill(Bint, 0, 4);
+ aligned_vector<float> A(Aint.begin(), Aint.end());
+ aligned_vector<float> B(Bint.begin(), Bint.end());
+
aligned_vector<float> C(m * n, NAN);
- // initialize with small numbers
- randFill(A, 0, 4);
- randFill(B, 0, 4);
- randFill(C, 0, 4);
-
- aligned_vector<float> A_ref, B_ref, C_ref;
- A_ref = A;
- B_ref = B;
- C_ref = C;
+ aligned_vector<float> A_ref(A), B_ref(B), C_ref(C);
if (atrans == matrix_op_t::Transpose) {
transpose_matrix(A_ref.data(), k, m);
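
The detour through integer vectors here is presumably deliberate: with the typed randFill interface, filling a float vector draws from a real-valued distribution, while this test wants small integer-valued floats that fp16 represents exactly, so it fills int vectors and converts:

    aligned_vector<int> Aint(m * k);
    randFill(Aint, 0, 4);                               // integral T -> uniform ints
    aligned_vector<float> A(Aint.begin(), Aint.end());  // exact small float values
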
diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc
index 9a19f0f..f482783 100644
--- a/test/I8DepthwiseTest.cc
+++ b/test/I8DepthwiseTest.cc
@@ -85,10 +85,10 @@ TEST(FBGemmDepthWiseTest, Test3x3) {
aligned_vector<int8_t> B(K * R * S);
aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * K), C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3_pad_1_ref(
@@ -211,10 +211,10 @@ TEST(FBGemmDepthWiseTest, Test3x3x3) {
aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3x3_pad_1_ref(
@@ -360,7 +360,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) {
int32_t C_num_rows = N * H_OUT * W_OUT;
aligned_vector<int32_t> C_ref(C_num_rows * K), C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
// Each row of G has a different range to really test per-channel
@@ -368,7 +368,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) {
vector<int32_t> B_zero_point(K);
for (auto k = 0; k < K; ++k) {
aligned_vector<int8_t> Bk(R * S);
- randFill(Bk, -16 + k, 16 + k);
+ randFill<int8_t>(Bk, -16 + k, 16 + k);
copy(Bk.begin(), Bk.end(), B.begin() + k * R * S);
B_zero_point[k] = 5 + k;
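
Note the explicit template arguments introduced here: randFill now takes low/high as T rather than int, so a call like randFill(A, 0, 86) on an aligned_vector<uint8_t> would deduce T as both uint8_t (from the vector) and int (from the literals) and fail to compile. Spelling out T lets the literals convert:

    aligned_vector<uint8_t> A(n);
    // randFill(A, 0, 86);       // ill-formed: conflicting deduction for T
    randFill<uint8_t>(A, 0, 86); // explicit T; 0 and 86 convert to uint8_t
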
diff --git a/test/I8SpmdmTest.cc b/test/I8SpmdmTest.cc
index 5bb7703..2090b63 100644
--- a/test/I8SpmdmTest.cc
+++ b/test/I8SpmdmTest.cc
@@ -66,7 +66,7 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) {
}
aligned_vector<uint8_t> A(M * K);
- randFill(A, 0, 255);
+ randFill<uint8_t>(A, 0, 255);
CompressedSparseColumn B_csc(K_adjusted, N_adjusted);
vector<int32_t> C(M * N);
@@ -127,13 +127,8 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) {
#pragma omp parallel
#endif
{
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
int i_per_thread = (M + num_threads - 1) / num_threads;
int i_begin = std::min(tid * i_per_thread, M);
int i_end = std::min(i_begin + i_per_thread, M);
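
fbgemm_get_num_threads() / fbgemm_get_thread_num() fold the recurring OpenMP #ifdef into one place. Their definitions are outside this diff; a sketch of what such wrappers plausibly look like, given the boilerplate they replace:

    #ifdef _OPENMP
    #include <omp.h>
    inline int fbgemm_get_num_threads() { return omp_get_num_threads(); }
    inline int fbgemm_get_thread_num() { return omp_get_thread_num(); }
    #else
    inline int fbgemm_get_num_threads() { return 1; }
    inline int fbgemm_get_thread_num() { return 0; }
    #endif
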
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
index c7de1a8..d8f3f7a 100644
--- a/test/Im2ColFusedRequantizeTest.cc
+++ b/test/Im2ColFusedRequantizeTest.cc
@@ -20,8 +20,22 @@
#include "src/RefImplementations.h"
using namespace std;
+using namespace fbgemm;
-namespace fbgemm {
+vector<QuantizationGranularity> qGranularityVals{
+ QuantizationGranularity::TENSOR,
+ QuantizationGranularity::GROUP,
+ QuantizationGranularity::OUT_CHANNEL};
+
+namespace {
+class fbgemmIm2colTest
+ : public testing::TestWithParam<QuantizationGranularity> {};
+}; // namespace
+
+INSTANTIATE_TEST_CASE_P(
+ InstantiationName,
+ fbgemmIm2colTest,
+ ::testing::ValuesIn(qGranularityVals));
// From Faster-RCNN with ShuffleNet
static vector<conv_param_t<>> shapes = {
@@ -71,7 +85,7 @@ static vector<conv_param_t<>> shapes = {
conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}),
};
-template <typename ACC_T>
+template <typename ACC_T, QuantizationGranularity Q_GRAN>
static void Im2colTest() {
for (auto conv_p : shapes) {
for (int groups : {1, 4}) {
@@ -80,29 +94,38 @@ static void Im2colTest() {
}
conv_p.G = groups;
aligned_vector<uint8_t> Aint8(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+ conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
-
- int32_t Aint8_zero_point, Bint8_zero_point;
+ conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+
+ int ncols_per_quant_group = conv_p.OC;
+ if (Q_GRAN == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = conv_p.OC / conv_p.G;
+ } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ int32_t Aint8_zero_point;
+ aligned_vector<int32_t> Bint8_zero_point(
+ conv_p.OC / ncols_per_quant_group);
if (is_same<ACC_T, int32_t>::value) {
- randFill(Aint8, 0, 80);
+ randFill<uint8_t>(Aint8, 0, 80);
Aint8_zero_point = 43;
- randFill(Bint8, -16, 16);
- Bint8_zero_point = -30;
+ randFill<int8_t>(Bint8, -16, 16);
+ randFill(Bint8_zero_point, -50, -10);
} else {
- randFill(Aint8, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
Aint8_zero_point = 4;
- randFill(Bint8, -4, 4);
- Bint8_zero_point = -2;
+ randFill<int8_t>(Bint8, -4, 4);
+ randFill(Bint8_zero_point, -3, -1);
}
- float C_multiplier = 0.1234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
@@ -116,16 +139,16 @@ static void Im2colTest() {
im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * NDim);
+ vector<int32_t> col_offsets(groups * NDim);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
KDimPerGroup,
NDim,
NDim,
Bint8.data() + g * KDimPerGroup * NDim,
- Bint8_zero_point,
- col_offsets.data() + g * NDim);
+ Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
+ col_offsets.data() + g * NDim,
+ ncols_per_quant_group);
}
conv_ref(
@@ -149,13 +172,14 @@ static void Im2colTest() {
conv_p.G * NDim,
Cint32_ref.data() + g * NDim,
Cint8_ref.data() + g * NDim,
- C_multiplier,
+ C_multiplier.data() + g * NDim / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * NDim,
- nullptr);
+ nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t, ACC_T> packedB(
@@ -171,8 +195,7 @@ static void Im2colTest() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, ACC_T>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, ACC_T> packA(
@@ -183,23 +206,20 @@ static void Im2colTest() {
row_offset_buf.data());
DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
+ ReQuantizeOutput<false, Q_GRAN> outputProcObj(
doNothingObj,
- C_multiplier,
+ C_multiplier.data(),
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data(),
packA.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ conv_p.G * NDim,
+ conv_p.G);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
fbgemmPacked(
packA,
@@ -236,12 +256,26 @@ static void Im2colTest() {
} // for each shape
}
-TEST(FBGemmIm2colTest, Acc32Test) {
- Im2colTest<int32_t>();
+TEST_P(fbgemmIm2colTest, Acc32Test) {
+ QuantizationGranularity q_granularity = GetParam();
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ Im2colTest<int32_t, QuantizationGranularity::TENSOR>();
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ Im2colTest<int32_t, QuantizationGranularity::GROUP>();
+ } else {
+ Im2colTest<int32_t, QuantizationGranularity::OUT_CHANNEL>();
+ }
}
-TEST(FBGemmIm2colTest, Acc16Test) {
- Im2colTest<int16_t>();
+TEST_P(fbgemmIm2colTest, Acc16Test) {
+ QuantizationGranularity q_granularity = GetParam();
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ Im2colTest<int16_t, QuantizationGranularity::TENSOR>();
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ Im2colTest<int16_t, QuantizationGranularity::GROUP>();
+ } else {
+ Im2colTest<int16_t, QuantizationGranularity::OUT_CHANNEL>();
+ }
}
static vector<conv_param_t<3>> shapes_3d = {
@@ -319,7 +353,7 @@ static vector<conv_param_t<3>> shapes_3d = {
3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, 0, 0}),
};
-template <typename ACC_T>
+template <typename ACC_T, QuantizationGranularity Q_GRAN>
static void Im2col3DTest() {
for (auto conv_p : shapes_3d) {
for (int groups : {1, 4}) {
@@ -329,32 +363,39 @@ static void Im2col3DTest() {
conv_p.G = groups;
aligned_vector<uint8_t> Aint8(
conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IN_DIM[2] *
- conv_p.IC,
- 0);
+ conv_p.IC);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] *
- conv_p.OUT_DIM[2] * conv_p.OC,
- 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
-
- int32_t Aint8_zero_point, Bint8_zero_point;
+ conv_p.OUT_DIM[2] * conv_p.OC);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+
+ int ncols_per_quant_group = conv_p.OC;
+ if (Q_GRAN == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = conv_p.OC / conv_p.G;
+ } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ int32_t Aint8_zero_point;
+ aligned_vector<int32_t> Bint8_zero_point(
+ conv_p.OC / ncols_per_quant_group);
if (is_same<ACC_T, int32_t>::value) {
- randFill(Aint8, 0, 80);
+ randFill<uint8_t>(Aint8, 0, 80);
Aint8_zero_point = 43;
- randFill(Bint8, -16, 16);
- Bint8_zero_point = -30;
+ randFill<int8_t>(Bint8, -16, 16);
+ randFill(Bint8_zero_point, -50, -10);
} else {
- randFill(Aint8, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
Aint8_zero_point = 4;
- randFill(Bint8, -4, 4);
- Bint8_zero_point = -2;
+ randFill<int8_t>(Bint8, -4, 4);
+ randFill(Bint8_zero_point, -3, -1);
}
- float C_multiplier = 0.1234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
int MDim =
@@ -369,16 +410,16 @@ static void Im2col3DTest() {
im2col3d_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * NDim);
+ vector<int32_t> col_offsets(groups * NDim);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
KDimPerGroup,
NDim,
NDim,
Bint8.data() + g * KDimPerGroup * NDim,
- Bint8_zero_point,
- col_offsets.data() + g * NDim);
+ Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
+ col_offsets.data() + g * NDim,
+ ncols_per_quant_group);
}
conv3d_ref(
@@ -402,13 +443,14 @@ static void Im2col3DTest() {
conv_p.G * NDim,
Cint32_ref.data() + g * NDim,
Cint8_ref.data() + g * NDim,
- C_multiplier,
+ C_multiplier.data() + g * NDim / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * NDim,
- nullptr);
+ nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t, ACC_T> packedB(
@@ -424,8 +466,7 @@ static void Im2col3DTest() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, ACC_T, 3>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, ACC_T, 3> packA(
@@ -436,23 +477,20 @@ static void Im2col3DTest() {
row_offset_buf.data());
DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
+ ReQuantizeOutput<false, Q_GRAN> outputProcObj(
doNothingObj,
- C_multiplier,
+ C_multiplier.data(),
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data(),
packA.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ conv_p.G * NDim,
+ conv_p.G);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
fbgemmPacked(
packA,
@@ -495,12 +533,24 @@ static void Im2col3DTest() {
} // for each shape
}
-TEST(FBGemmIm2colTest, 3DAcc32Test) {
- Im2col3DTest<int32_t>();
+TEST_P(fbgemmIm2colTest, 3DAcc32Test) {
+ QuantizationGranularity q_granularity = GetParam();
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ Im2col3DTest<int32_t, QuantizationGranularity::TENSOR>();
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ Im2col3DTest<int32_t, QuantizationGranularity::GROUP>();
+ } else {
+ Im2col3DTest<int32_t, QuantizationGranularity::OUT_CHANNEL>();
+ }
}
-TEST(FBGemmIm2colTest, 3DAcc16Test) {
- Im2col3DTest<int16_t>();
+TEST_P(fbgemmIm2colTest, 3DAcc16Test) {
+ QuantizationGranularity q_granularity = GetParam();
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ Im2col3DTest<int16_t, QuantizationGranularity::TENSOR>();
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ Im2col3DTest<int16_t, QuantizationGranularity::GROUP>();
+ } else {
+ Im2col3DTest<int16_t, QuantizationGranularity::OUT_CHANNEL>();
+ }
}
-
-} // namespace fbgemm
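
Because Q_GRAN is a compile-time template parameter while the gtest parameter is a runtime value, every TEST_P body above branches once per granularity to pick an instantiation. Each of Im2colTest and Im2col3DTest could share a small dispatcher (a hypothetical refactoring, not in this diff):

    template <typename ACC_T>
    static void runIm2colTest(QuantizationGranularity q) {
      if (q == QuantizationGranularity::TENSOR) {
        Im2colTest<ACC_T, QuantizationGranularity::TENSOR>();
      } else if (q == QuantizationGranularity::GROUP) {
        Im2colTest<ACC_T, QuantizationGranularity::GROUP>();
      } else {
        Im2colTest<ACC_T, QuantizationGranularity::OUT_CHANNEL>();
      }
    }
    // TEST_P(fbgemmIm2colTest, Acc32Test) { runIm2colTest<int32_t>(GetParam()); }
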
diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc
index cb614cd..55f6e7f 100644
--- a/test/PackedRequantizeAcc16Test.cc
+++ b/test/PackedRequantizeAcc16Test.cc
@@ -25,17 +25,34 @@
using namespace std;
using namespace fbgemm;
-std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
+vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
matrix_op_t::Transpose};
+vector<QuantizationGranularity> qGranularityVals{
+ QuantizationGranularity::TENSOR,
+ QuantizationGranularity::GROUP,
+ QuantizationGranularity::OUT_CHANNEL};
+
namespace {
-class fbgemmu8s8acc16test : public testing::TestWithParam<
- std::tuple<matrix_op_t, matrix_op_t, bool>> {};
+class fbgemmu8s8acc16WithQuantGranularityTest
+ : public testing::TestWithParam<
+ tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
+class fbgemmu8s8acc16Test
+ : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {};
}; // namespace
INSTANTIATE_TEST_CASE_P(
InstantiationName,
- fbgemmu8s8acc16test,
+ fbgemmu8s8acc16WithQuantGranularityTest,
+ ::testing::Combine(
+ ::testing::Values(matrix_op_t::NoTranspose),
+ ::testing::ValuesIn(transposeVals),
+ ::testing::Bool(),
+ ::testing::ValuesIn(qGranularityVals)));
+
+INSTANTIATE_TEST_CASE_P(
+ InstantiationName,
+ fbgemmu8s8acc16Test,
::testing::Combine(
::testing::Values(matrix_op_t::NoTranspose),
::testing::ValuesIn(transposeVals),
@@ -77,11 +94,12 @@ static vector<vector<int>> GetShapes_() {
* @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
* accumulation. Output processing: requantization -> nothing
*/
-TEST_P(fbgemmu8s8acc16test, Test) {
+TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, Test) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
- tie(atrans, btrans, test_ld) = GetParam();
+ QuantizationGranularity q_granularity;
+ tie(atrans, btrans, test_ld, q_granularity) = GetParam();
for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
@@ -93,22 +111,21 @@ TEST_P(fbgemmu8s8acc16test, Test) {
}
int k_per_group = k / groups;
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+ aligned_vector<int8_t> Bint8_ref(k * n);
- aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_ref(m * n * groups);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
- randFill(Bint8_ref, -128, 127);
- Bint8 = Bint8_ref;
+ randFill<int8_t>(Bint8_ref, -128, 127);
+ aligned_vector<int8_t> Bint8(Bint8_ref);
if (btrans == matrix_op_t::Transpose) {
aligned_vector<int8_t> Bint8_temp(Bint8.size());
@@ -124,7 +141,6 @@ TEST_P(fbgemmu8s8acc16test, Test) {
Bint8 = Bint8_temp;
}
- int32_t Bint8_zero_point = -30;
// To test lda != k , we just reduce k by half and use the original k
// as lda.
int n_adjusted = n;
@@ -137,23 +153,33 @@ TEST_P(fbgemmu8s8acc16test, Test) {
}
}
+ int ncols_per_quant_group = groups * n_adjusted;
+ if (q_granularity == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = n_adjusted;
+ } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ aligned_vector<int32_t> Bint8_zero_point(
+ groups * n_adjusted / ncols_per_quant_group);
+ randFill(Bint8_zero_point, -60, 0);
+
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
Bint8_ref.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+ col_offsets.data() + g * n_adjusted,
+ ncols_per_quant_group);
}
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
- float C_multiplier = 0.1234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
int brow = 256;
@@ -183,13 +209,14 @@ TEST_P(fbgemmu8s8acc16test, Test) {
groups * n,
Cint32_ref.data() + g * n_adjusted,
Cint8_ref.data() + g * n_adjusted,
- C_multiplier,
+ C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * n_adjusted,
- nullptr);
+ nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t, int16_t> packedBN(
@@ -205,8 +232,7 @@ TEST_P(fbgemmu8s8acc16test, Test) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t, int16_t> packAN(
@@ -219,34 +245,79 @@ TEST_P(fbgemmu8s8acc16test, Test) {
groups,
row_offset_buf.data());
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
+
DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ ReQuantizeOutput<false> outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
- fbgemmPacked(
- packAN,
- packedBN,
- Cint8_fb.data(),
- Cint32_buffer.data(),
- groups * n,
- outputProcObj,
- tid,
- num_threads);
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ ReQuantizeOutput<false, QuantizationGranularity::GROUP> outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else {
+ ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
+ outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ }
} // omp parallel
compare_validate_buffers(
@@ -264,11 +335,12 @@ TEST_P(fbgemmu8s8acc16test, Test) {
* @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
* accumulation. Output processing: spmdm -> requantization -> nothing
*/
-TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
+TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, SpMDMTest) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
- tie(atrans, btrans, test_ld) = GetParam();
+ QuantizationGranularity q_granularity;
+ tie(atrans, btrans, test_ld, q_granularity) = GetParam();
for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
@@ -283,21 +355,19 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
}
int k_per_group = k / groups;
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+ aligned_vector<int32_t> Cint32_ref(m * n * groups);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
- aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
-
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
// To test lda != k , we just reduce k by half and use the original k
// as lda.
@@ -311,18 +381,27 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
}
}
- int32_t Bint8_zero_point = -30;
+ int ncols_per_quant_group = groups * n_adjusted;
+ if (q_granularity == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = n_adjusted;
+ } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ aligned_vector<int32_t> Bint8_zero_point(
+ groups * n_adjusted / ncols_per_quant_group);
+ randFill(Bint8_zero_point, -50, -10);
+
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
- Bint8_ref.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ Bint8.data() + g * k_per_group * n,
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+ col_offsets.data() + g * n_adjusted,
+ ncols_per_quant_group);
}
CompressedSparseColumn B_csc(k_per_group, groups * n_adjusted);
@@ -366,7 +445,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
}
B_csc.ColPtr()[groups * n_adjusted] = total_nnz;
- Bint8_ref = Bint8;
+ aligned_vector<int8_t> Bint8_ref(Bint8);
if (btrans == matrix_op_t::Transpose) {
aligned_vector<int8_t> Bint8_temp(Bint8.size());
@@ -382,10 +461,10 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
Bint8 = Bint8_temp;
}
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
- float C_multiplier = 0.1234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
int brow = 256;
@@ -428,13 +507,14 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
groups * n,
Cint32_ref.data() + g * n_adjusted,
Cint8_ref.data() + g * n_adjusted,
- C_multiplier,
+ C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * n_adjusted,
- nullptr);
+ nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t, int16_t> packedB(
@@ -450,8 +530,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t, int16_t> packAN(
@@ -464,51 +543,106 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
groups,
row_offset_buf.data());
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
+
// spmdm -> requantization -> nothing
// construct an output processing pipeline in reverse order
// i.e. last output operation first
// Last operation should always be DoNothing with
// correct input and output type.
DoNothing<> doNothingObj{};
- // The second last operation is requantization back
- // to int8
- ReQuantizeOutput<false> reqObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
- // the top most (first) operation in the output processing
- // pipeline is spmdm
- // outType = final output type after fullly processing through
- // pipeline inType = initial input type at the first call to the whole
- // pipeline
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<false>::outType,
- int32_t,
- ReQuantizeOutput<false>>
- spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
-
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
- fbgemmPacked(
- packAN,
- packedB,
- Cint8_fb.data(),
- Cint32_fb.data(),
- groups * n,
- spmdmObj,
- tid,
- num_threads);
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ // The second last operation is requantization back
+ // to int8
+ ReQuantizeOutput<false> reqObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+ // the top most (first) operation in the output processing
+ // pipeline is spmdm
+            // outType = final output type after fully processing through the
+            // pipeline; inType = initial input type at the first call to the
+            // whole pipeline
+ DoSpmdmOnInpBuffer<
+ ReQuantizeOutput<false>::outType,
+ int32_t,
+ ReQuantizeOutput<false>>
+ spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+ fbgemmPacked(
+ packAN,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ groups * n,
+ spmdmObj,
+ tid,
+ num_threads);
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+ DoSpmdmOnInpBuffer<
+ ReQuantizeOutput<false>::outType,
+ int32_t,
+ ReQuantizeOutput<false, QuantizationGranularity::GROUP>>
+ spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+ fbgemmPacked(
+ packAN,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ groups * n,
+ spmdmObj,
+ tid,
+ num_threads);
+ } else {
+ ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
+ reqObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+ DoSpmdmOnInpBuffer<
+ ReQuantizeOutput<false>::outType,
+ int32_t,
+ ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>>
+ spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+ fbgemmPacked(
+ packAN,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ groups * n,
+ spmdmObj,
+ tid,
+ num_threads);
+ }
}
compare_validate_buffers(
@@ -527,7 +661,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
* @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
* accumulation. Output processing: nothing
*/
-TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
+TEST_P(fbgemmu8s8acc16Test, NoRequantizeTest) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
@@ -543,20 +677,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
}
int k_per_group = k / groups;
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+ aligned_vector<int8_t> Bint8_ref(k * n);
- aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_ref(m * n * groups);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
- randFill(Bint8_ref, -128, 127);
- Bint8 = Bint8_ref;
+ randFill<int8_t>(Bint8_ref, -128, 127);
+ aligned_vector<int8_t> Bint8(Bint8_ref);
if (btrans == matrix_op_t::Transpose) {
aligned_vector<int8_t> Bint8_temp(Bint8.size());
@@ -586,20 +719,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
}
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
Bint8_ref.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ &Bint8_zero_point,
+ col_offsets.data() + g * n_adjusted,
+ n_adjusted);
}
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
int brow = 256;
for (int g = 0; g < groups; ++g) {
@@ -636,8 +768,7 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t, int16_t> packAN(
@@ -654,13 +785,8 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
DoNothing<int32_t, int32_t> doNothingObj{};
memCopy<> outputProcObj(doNothingObj);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
fbgemmPacked(
packAN,
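
A note on the output pipelines used throughout these tests: as the comments in SpMDMTest say, a pipeline is built in reverse, innermost (last-executed) stage first. Condensed from the TENSOR branch above:

    DoNothing<> doNothingObj{};                           // terminal stage
    ReQuantizeOutput<false> reqObj(                       // second stage: int32 -> uint8
        doNothingObj, C_multiplier.data(), C_zero_pt, Aint8_zero_point,
        Bint8_zero_point.data(), packAN.getRowOffsetBuffer(), col_offsets.data(),
        nullptr, groups * n_adjusted, groups);
    DoSpmdmOnInpBuffer<
        ReQuantizeOutput<false>::outType,                 // uint8_t leaves the pipeline
        int32_t,                                          // int32_t enters the pipeline
        ReQuantizeOutput<false>>
        spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); // first stage: add the
                                                          // sparse-matrix contribution
    fbgemmPacked(packAN, packedB, Cint8_fb.data(), Cint32_fb.data(),
        groups * n, spmdmObj, tid, num_threads);
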
diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc
index a5744c0..9873e3f 100644
--- a/test/PackedRequantizeTest.cc
+++ b/test/PackedRequantizeTest.cc
@@ -25,17 +25,35 @@
using namespace std;
using namespace fbgemm;
-std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
- matrix_op_t::Transpose};
+vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
+ matrix_op_t::Transpose};
+
+vector<QuantizationGranularity> qGranularityVals{
+ QuantizationGranularity::TENSOR,
+ QuantizationGranularity::GROUP,
+ QuantizationGranularity::OUT_CHANNEL};
namespace {
-class fbgemmu8s8acc32test : public testing::TestWithParam<
- std::tuple<matrix_op_t, matrix_op_t, bool>> {};
+class fbgemmu8s8acc32WithQuantGranularityTest
+ : public testing::TestWithParam<
+ tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
+class fbgemmu8s8acc32Test
+ : public testing::TestWithParam<
+ tuple<matrix_op_t, matrix_op_t, bool>> {};
}; // namespace
INSTANTIATE_TEST_CASE_P(
InstantiationName,
- fbgemmu8s8acc32test,
+ fbgemmu8s8acc32WithQuantGranularityTest,
+ ::testing::Combine(
+ ::testing::Values(matrix_op_t::NoTranspose),
+ ::testing::ValuesIn(transposeVals),
+ ::testing::Bool(),
+ ::testing::ValuesIn(qGranularityVals)));
+
+INSTANTIATE_TEST_CASE_P(
+ InstantiationName,
+ fbgemmu8s8acc32Test,
::testing::Combine(
::testing::Values(matrix_op_t::NoTranspose),
::testing::ValuesIn(transposeVals),
@@ -77,11 +95,12 @@ static vector<vector<int>> GetShapes_() {
* @brief Unit test for uint8 matrix A, int8 matrix B, and 32-bit
* accumulation. Output processing: requantization -> nothing
*/
-TEST_P(fbgemmu8s8acc32test, Test) {
+TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, Test) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
- tie(atrans, btrans, test_ld) = GetParam();
+ QuantizationGranularity q_granularity;
+ tie(atrans, btrans, test_ld, q_granularity) = GetParam();
for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
@@ -95,22 +114,21 @@ TEST_P(fbgemmu8s8acc32test, Test) {
int k_per_group = k / groups;
// mxk matrix
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
// kxn matrix
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+ aligned_vector<int8_t> Bint8_ref(k * n);
- aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_ref(m * n * groups);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
- randFill(Bint8_ref, -128, 127);
+ randFill<int8_t>(Bint8_ref, -128, 127);
for (int g = 0; g < groups; ++g) {
avoidOverflow(
m,
@@ -122,7 +140,7 @@ TEST_P(fbgemmu8s8acc32test, Test) {
n);
}
- Bint8 = Bint8_ref;
+ aligned_vector<int8_t> Bint8(Bint8_ref);
// initialize bias
aligned_vector<int32_t> bias_int32(groups * n);
@@ -146,7 +164,6 @@ TEST_P(fbgemmu8s8acc32test, Test) {
Bint8 = Bint8_temp;
}
- int32_t Bint8_zero_point = -30;
// To test lda != k , we just reduce k by half and use the original k
// as lda.
int n_adjusted = n;
@@ -159,23 +176,33 @@ TEST_P(fbgemmu8s8acc32test, Test) {
}
}
+ int ncols_per_quant_group = groups * n_adjusted;
+ if (q_granularity == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = n_adjusted;
+ } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ aligned_vector<int32_t> Bint8_zero_point(
+ groups * n_adjusted / ncols_per_quant_group);
+ randFill(Bint8_zero_point, -50, -10);
+
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
Bint8_ref.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+ col_offsets.data() + g * n_adjusted,
+ ncols_per_quant_group);
}
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
- float C_multiplier = 0.001234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
for (int g = 0; g < groups; ++g) {
@@ -203,13 +230,14 @@ TEST_P(fbgemmu8s8acc32test, Test) {
groups * n,
Cint32_ref.data() + g * n_adjusted,
Cint8_ref.data() + g * n_adjusted,
- C_multiplier,
+ C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * n_adjusted,
- bias ? (bias + g * n_adjusted) : nullptr);
+ bias ? (bias + g * n_adjusted) : nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t> packedBN(
@@ -225,8 +253,7 @@ TEST_P(fbgemmu8s8acc32test, Test) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t> packAN(
@@ -239,34 +266,80 @@ TEST_P(fbgemmu8s8acc32test, Test) {
groups,
row_offset_buf.data());
- DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- bias);
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ DoNothing<> doNothingObj{};
- fbgemmPacked(
- packAN,
- packedBN,
- Cint8_fb.data(),
- Cint32_buffer.data(),
- groups * n,
- outputProcObj,
- tid,
- num_threads);
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ ReQuantizeOutput<false> outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ bias,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ ReQuantizeOutput<false, QuantizationGranularity::GROUP>
+ outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ bias,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else {
+ ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
+ outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ bias,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ }
}
// printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(),
// m, n_adjusted, n, "C local");
@@ -287,11 +360,12 @@ TEST_P(fbgemmu8s8acc32test, Test) {
* accumulation. Directly output fp32 matrix C. Output processing:
* requantization -> nothing
*/
-TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
+TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, TestFloatInputOutput) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
- tie(atrans, btrans, test_ld) = GetParam();
+ QuantizationGranularity q_granularity;
+ tie(atrans, btrans, test_ld, q_granularity) = GetParam();
for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
@@ -303,26 +377,26 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
}
int k_per_group = k / groups;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
+ aligned_vector<float> Afp32(m * k);
+ aligned_vector<uint8_t> Aint8(Afp32.size());
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+ aligned_vector<float> Bfp32(k * n);
+ aligned_vector<int8_t> Bint8(Bfp32.size());
- aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
- aligned_vector<float> Cfp32_fb(Cfp32_ref.size(), 0.0f);
+ aligned_vector<float> Cfp32_ref(m * n * groups);
+ aligned_vector<float> Cfp32_fb(Cfp32_ref.size());
- aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size(), 0);
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size());
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
float Aint8_scale = 0.11;
for (auto i = 0; i < Afp32.size(); ++i) {
Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
}
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
for (int g = 0; g < groups; ++g) {
avoidOverflow(
m,
@@ -333,11 +407,6 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
Bint8.data() + g * k_per_group * n,
n);
}
- int32_t Bint8_zero_point = -30;
- float Bint8_scale = 0.49;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
- }
// To test lda != k, we just reduce k by half and use the original k
// as lda.
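The comment above describes the leading-dimension trick used throughout these tests: halve k but keep the original k as lda, so each row of A is read with a stride larger than its logical width. A hedged illustration against a plain reference GEMM (plainGemmRef is hypothetical, not an fbgemm routine):

#include <vector>

// Row-major reference GEMM with an explicit leading dimension for A; shows
// why lda may legitimately exceed k (rows of A are strided in memory).
void plainGemmRef(int m, int n, int k, const float* A, int lda,
                  const float* B, int ldb, float* C, int ldc) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float acc = 0;
      for (int p = 0; p < k; ++p)
        acc += A[i * lda + p] * B[p * ldb + j];
      C[i * ldc + j] = acc;
    }
}

int main() {
  std::vector<float> A(2 * 4, 1.0f); // m = 2 rows stored 4 wide, but k = 2
  std::vector<float> B(2 * 3, 1.0f); // k = 2, n = 3
  std::vector<float> C(2 * 3, 0.0f);
  plainGemmRef(2, 3, /*k=*/2, A.data(), /*lda=*/4, B.data(), 3, C.data(), 3);
  return C[0] == 2.0f ? 0 : 1; // only the first k = 2 of each row is read
}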
@@ -351,17 +420,37 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
}
}
+ int ncols_per_quant_group = groups * n_adjusted;
+ if (q_granularity == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = n_adjusted;
+ } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ aligned_vector<int32_t> Bint8_zero_point(
+ groups * n_adjusted / ncols_per_quant_group);
+ randFill(Bint8_zero_point, -50, -10);
+ aligned_vector<float> Bint8_scale(Bint8_zero_point.size());
+ randFill(Bint8_scale, 0.49f / 2, 0.49f * 3 / 2);
+ for (int i = 0; i < k; ++i) {
+ int g = i / k_per_group;
+ for (int j = 0; j < n_adjusted; ++j) {
+ int quant_group = (g * n_adjusted + j) / ncols_per_quant_group;
+ Bfp32[i * n + j] = Bint8_scale[quant_group] *
+ (Bint8[i * n + j] - Bint8_zero_point[quant_group]);
+ }
+ }
+
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
Bint8.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+ col_offsets.data() + g * n_adjusted,
+ ncols_per_quant_group);
}
if (btrans == matrix_op_t::Transpose) {
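The hunk above is the core of the new granularity support. ncols_per_quant_group collapses all three granularities into one indexing rule: column j of group g takes its scale and zero point from quant group (g * n_adjusted + j) / ncols_per_quant_group, and col_offsets_with_zero_pt_s8acc32_ref receives the same value so the column offsets bake in the matching zero point. A self-contained sketch of that rule (names mirror the test; plain n stands in for n_adjusted):

#include <cstdint>
#include <vector>

enum class QuantizationGranularity { TENSOR, GROUP, OUT_CHANNEL };

// How many output columns share one (scale, zero_point) pair; mirrors the
// if/else-if added in the hunk above.
int colsPerQuantGroup(QuantizationGranularity q, int groups, int n) {
  switch (q) {
    case QuantizationGranularity::TENSOR:      return groups * n; // one group for everything
    case QuantizationGranularity::GROUP:       return n;          // one per GEMM group
    case QuantizationGranularity::OUT_CHANNEL: return 1;          // one per output column
  }
  return groups * n;
}

// Dequantize B the way the test does: column j of group g uses the parameters
// of quant group (g * n + j) / ncols_per_quant_group.
void dequantizeB(const std::vector<int8_t>& Bint8,
                 const std::vector<float>& scale,
                 const std::vector<int32_t>& zero_point,
                 int k, int n, int k_per_group, int ncols_per_quant_group,
                 std::vector<float>& Bfp32) {
  for (int i = 0; i < k; ++i) {
    int g = i / k_per_group;
    for (int j = 0; j < n; ++j) {
      int qg = (g * n + j) / ncols_per_quant_group;
      Bfp32[i * n + j] = scale[qg] * (Bint8[i * n + j] - zero_point[qg]);
    }
  }
}

int main() {
  int groups = 2, n = 4, k = 4, k_per_group = 2;
  int npq = colsPerQuantGroup(QuantizationGranularity::GROUP, groups, n);
  std::vector<int8_t> B(k * n, 1);
  std::vector<float> scale(groups * n / npq, 0.5f);
  std::vector<int32_t> zp(scale.size(), -1);
  std::vector<float> Bf(k * n);
  dequantizeB(B, scale, zp, k, n, k_per_group, npq, Bf);
  return Bf[0] == 1.0f ? 0 : 1; // 0.5 * (1 - (-1)) == 1
}

Note how TENSOR degenerates to a single quant group and OUT_CHANNEL to one group per column; the test then draws a random zero point and scale for every quant group rather than one pair for all of B.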
@@ -404,8 +493,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithQuantRowOffset<uint8_t> packAN(
@@ -420,47 +508,96 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
groups,
row_offset_buf.data());
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
+
DoNothing<float, float> doNothingObj{};
- ReQuantizeForFloat<false> outputProcObj(
- doNothingObj,
- Aint8_scale,
- Bint8_scale,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ ReQuantizeForFloat<false> outputProcObj(
+ doNothingObj,
+ Aint8_scale,
+ Bint8_scale.data(),
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
- fbgemmPacked(
- packAN,
- packedBN,
- Cfp32_fb.data(),
- reinterpret_cast<int32_t*>(Cfp32_fb.data()),
- groups * n,
- outputProcObj,
- tid,
- num_threads);
- }
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cfp32_fb.data(),
+ reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ ReQuantizeForFloat<false, QuantizationGranularity::GROUP>
+ outputProcObj(
+ doNothingObj,
+ Aint8_scale,
+ Bint8_scale.data(),
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
- float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
- float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
- float atol = (maximum - minimum) / 255 / 1.9;
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cfp32_fb.data(),
+ reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else {
+ ReQuantizeForFloat<false, QuantizationGranularity::OUT_CHANNEL>
+ outputProcObj(
+ doNothingObj,
+ Aint8_scale,
+ Bint8_scale.data(),
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cfp32_fb.data(),
+ reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ }
+ }
+
+ float maximum = 0;
+ for (int i = 0; i < m; ++i) {
+ for (int j = 0; j < groups * n_adjusted; ++j) {
+ float c = Cfp32_ref[i * groups * n + j];
+ maximum = std::max(maximum, std::abs(c));
+ }
+ }
compare_validate_buffers(
Cfp32_ref.data(),
Cfp32_fb.data(),
m,
groups * n_adjusted,
groups * n,
- atol);
+ maximum * 1e-5f);
} // for each groups
} // for each shape
}
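The tolerance change in this hunk is deliberate: the removed atol was an absolute bound tied to the reference range divided over 255 quantization steps, which does not scale once B carries per-group or per-channel scales; the replacement bounds the error relative to the largest reference magnitude (max|ref| * 1e-5). A simplified stand-in for compare_validate_buffers using that bound (validate is illustrative only, not the fbgemm helper):

#include <cmath>
#include <cstdio>

// Compare test output against a reference with an absolute tolerance derived
// from the largest reference magnitude, as the updated test computes it.
bool validate(const float* ref, const float* test, int m, int n, int ld) {
  float maximum = 0;
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      maximum = std::fmax(maximum, std::fabs(ref[i * ld + j]));
  const float atol = maximum * 1e-5f;
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      if (std::fabs(ref[i * ld + j] - test[i * ld + j]) > atol) {
        std::printf("mismatch at (%d,%d)\n", i, j);
        return false;
      }
  return true;
}

int main() {
  float ref[] = {1.0f, -2.0f, 3.0f, 4.0f};
  float test[] = {1.0f, -2.0f, 3.0f, 4.00001f};
  return validate(ref, test, 2, 2, 2) ? 0 : 1; // 1e-5 < 4 * 1e-5, so it passes
}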
@@ -470,7 +607,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
* accumulation. Output processing: requantization -> nothing. Symmetric: the
* zero point is 0.
*/
-TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
+TEST_P(fbgemmu8s8acc32Test, TestSymmetricQuantizedInputOutput) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
@@ -486,22 +623,17 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
}
int k_per_group = k / groups;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
+ aligned_vector<uint8_t> Aint8(m * k);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+ aligned_vector<float> Cfp32_ref(m * n * groups);
+ aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size());
- aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
- aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size(), 0);
-
- randFill(Afp32, 0, 255);
- for (auto i = 0; i < Afp32.size(); i++) {
- Aint8[i] = (uint8_t)Afp32[i];
- }
+ randFill<uint8_t>(Aint8, 0, 255);
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
// initialize B matrix
- randFill(Bfp32, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
for (int g = 0; g < groups; ++g) {
avoidOverflow(
m,
@@ -509,13 +641,11 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
k_per_group,
Aint8.data() + g * k_per_group,
k,
- Bfp32.data() + g * k_per_group * n,
+ Bint8.data() + g * k_per_group * n,
n);
}
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = (int8_t)Bfp32[i];
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
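The one-liner above replaces the removed element-by-element cast loop: constructing a float vector from the int8 vector's iterator range converts each element through the implicit int8_t-to-float promotion. A tiny demonstration (std::vector used in place of aligned_vector, which behaves the same for this purpose):

#include <cstdint>
#include <vector>

int main() {
  std::vector<int8_t> q = {-128, 0, 127};
  std::vector<float> f(q.begin(), q.end()); // {-128.f, 0.f, 127.f}
  return f[2] == 127.0f ? 0 : 1;
}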
// To test lda != k, we just reduce k by half and use the original k
// as lda.
@@ -577,13 +707,8 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
DoNothing<int32_t, int32_t> doNothingObj{};
memCopy<> outputProcObj(doNothingObj);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
fbgemmPacked(
packAN,
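Finally, the recurring cleanup across these hunks replaces the per-test #ifdef _OPENMP boilerplate with fbgemm_get_num_threads() and fbgemm_get_thread_num(). Their definitions are not part of this patch; a plausible shape, assuming they simply wrap the OpenMP calls with serial fallbacks, is:

#ifdef _OPENMP
#include <omp.h>
#endif

// Sketch of the thread helpers used above: they fold the #ifdef into one
// place so every test body stays branch-free. (Assumed shape only; the real
// definitions live elsewhere in the fbgemm tree.)
inline int fbgemm_get_num_threads() {
#ifdef _OPENMP
  return omp_get_num_threads();
#else
  return 1;
#endif
}

inline int fbgemm_get_thread_num() {
#ifdef _OPENMP
  return omp_get_thread_num();
#else
  return 0;
#endif
}

int main() { return fbgemm_get_thread_num(); } // 0 outside a parallel region

Inside an omp parallel region each thread then receives its slice of rows from fbgemmPacked via the (tid, num_threads) pair, with row_offset_buf kept thread-local.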