24 files changed, 1466 insertions, 1049 deletions
diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc
index db40ee0..f5ce9ef 100644
--- a/bench/BenchUtils.cc
+++ b/bench/BenchUtils.cc
@@ -5,30 +5,41 @@
  * LICENSE file in the root directory of this source tree.
  */
 #include "BenchUtils.h"
+
+#include <algorithm>
 #include <random>
+#include <type_traits>
+
+#include <omp.h>
 
 namespace fbgemm {
 
 std::default_random_engine eng;
 
 template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high) {
-  std::random_device r;
-  std::uniform_int_distribution<int> dis(low, high);
-  for (auto& v : vec) {
-    v = static_cast<T>(dis(eng));
-  }
+void randFill(aligned_vector<T>& vec, T low, T high, std::true_type) {
+  std::uniform_int_distribution<T> dis(low, high);
+  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high, std::false_type) {
+  std::uniform_real_distribution<T> dis(low, high);
+  std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high) {
+  randFill(vec, low, high, std::is_integral<T>());
 }
 
 template void
-randFill<float>(aligned_vector<float>& vec, const int low, const int high);
-template void
-randFill<uint8_t>(aligned_vector<uint8_t>& vec, const int low, const int high);
+randFill<float>(aligned_vector<float>& vec, float low, float high);
 template void
-randFill<int8_t>(aligned_vector<int8_t>& vec, const int low, const int high);
-
+randFill<uint8_t>(aligned_vector<uint8_t>& vec, uint8_t low, uint8_t high);
 template void
-randFill<int>(aligned_vector<int>& vec, const int low, const int high);
+randFill<int8_t>(aligned_vector<int8_t>& vec, int8_t low, int8_t high);
+template void randFill<int>(aligned_vector<int>& vec, int low, int high);
 
 void llc_flush(std::vector<char>& llc) {
   volatile char* data = llc.data();
@@ -37,4 +48,20 @@ void llc_flush(std::vector<char>& llc) {
   }
 }
 
+int fbgemm_get_num_threads() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+  return 1;
+#else
+  return omp_get_num_threads();
+#endif
+}
+
+int fbgemm_get_thread_num() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+  return 0;
+#else
+  return omp_get_thread_num();
+#endif
+}
+
 } // namespace fbgemm
diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
index 8ca99df..da2ef2d 100644
--- a/bench/BenchUtils.h
+++ b/bench/BenchUtils.h
@@ -11,8 +11,11 @@
 namespace fbgemm {
 
 template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high);
+void randFill(aligned_vector<T>& vec, T low, T high);
 
 void llc_flush(std::vector<char>& llc);
 
+int fbgemm_get_num_threads();
+int fbgemm_get_thread_num();
+
 } // namespace fbgemm
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index f53eeea..c65839b 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -62,10 +62,10 @@ int main() {
     aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
         C(C_ref.size());
 
-    randFill(A, 0, 86);
+    randFill<uint8_t>(A, 0, 86);
     int32_t A_zero_point = 43;
 
-    randFill(B, -16, 16);
+    randFill<int8_t>(B, -16, 16);
     int32_t B_zero_point = 5;
 
     depthwise_3x3x3_pad_1_ref(
@@ -129,13 +129,8 @@ int main() {
       t_begin = chrono::system_clock::now();
 #pragma omp parallel
       {
-#if _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
         depthwise_3x3x3_pad_1(
             N,
             T,
@@ -200,13 +195,8 @@ int main() {
       t_begin = chrono::system_clock::now();
 #pragma omp parallel
       {
-#if _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
         depthwise_3x3x3_pad_1(
             N,
             T,
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index 8e6d83d..b922f90 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -161,10 +161,10 @@ int main() {
     aligned_vector<int8_t> B(G * R * S);
     aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size());
 
-    randFill(A, 0, 86);
+    randFill<uint8_t>(A, 0, 86);
     int32_t A_zero_point = 43;
 
-    randFill(B, -16, 16);
+    randFill<int8_t>(B, -16, 16);
     int32_t B_zero_point = 5;
 
     depthwise_3x3_pad_1_ref(
@@ -221,13 +221,8 @@ int main() {
       t_begin = chrono::system_clock::now();
 #pragma omp parallel
       {
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
         depthwise_3x3_pad_1(
             N,
             H,
@@ -279,13 +274,8 @@ int main() {
       t_begin = chrono::system_clock::now();
 #pragma omp parallel
       {
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
         depthwise_3x3_pad_1(
             N,
             H,
diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc
index c03f18a..fd9de5b 100644
--- a/bench/FP16Benchmark.cc
+++ b/bench/FP16Benchmark.cc
@@ -73,20 +73,24 @@ void performance_test() {
     int n = s[1];
     int k = s[2];
 
-    aligned_vector<float> A(m * k, 0.f);
-    aligned_vector<float> B(k * n, 0.f);
-    aligned_vector<float> Cg(m * n, 1.f);
-    aligned_vector<float> Cp(m * n, NAN);
+    aligned_vector<float> C_ref(m * n, 1.f);
+    aligned_vector<float> C_fb(m * n, NAN);
 
     // initialize with small numbers
-    randFill(A, 0, 4);
+    aligned_vector<int> Aint(m * k);
+    randFill(Aint, 0, 4);
+    aligned_vector<float> A(Aint.begin(), Aint.end());
 
-    randFill(B, 0, 4);
+    aligned_vector<int> Bint(k * n);
+    randFill(Bint, 0, 4);
+    aligned_vector<float> B(Bint.begin(), Bint.end());
     PackedGemmMatrixFP16 Bp(btran, k, n, alpha, B.data());
 
     if (beta != 0.0f) {
-      randFill(Cg, 0, 4);
-      Cp = Cg;
+      aligned_vector<int> Cint(C_ref.size());
+      randFill(Cint, 0, 4);
+      C_ref.assign(Cint.begin(), Cint.end());
+      C_fb = C_ref;
     }
 
     double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER;
@@ -111,17 +115,17 @@ void performance_test() {
           B.data(),
           (btran == matrix_op_t::NoTranspose) ? n : k,
           beta,
-          Cg.data(),
+          C_ref.data(),
           n);
 #endif
       cblas_gemm_compute(
-          matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+          matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
 
 #ifdef USE_MKL
       // Compare results
-      for (auto i = 0; i < Cg.size(); i++) {
-        // printf("%f %f\n", Cg[i], Cp[i]);
-        assert(std::abs(Cg[i] - Cp[i]) < 1e-3);
+      for (auto i = 0; i < C_ref.size(); i++) {
+        // printf("%f %f\n", C_ref[i], C_fb[i]);
+        assert(std::abs(C_ref[i] - C_fb[i]) < 1e-3);
       }
 #endif
     }
@@ -151,7 +155,7 @@ void performance_test() {
           B.data(),
           (btran == matrix_op_t::NoTranspose) ? n : k,
           beta,
-          Cg.data(),
+          C_ref.data(),
           n);
       t_end = chrono::system_clock::now();
       if (it >= 0) {
@@ -184,7 +188,7 @@ void performance_test() {
 
       t_begin = chrono::system_clock::now();
       cblas_gemm_compute(
-          matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+          matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
       t_end = chrono::system_clock::now();
 
       if (it >= 0) {
diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc
index 07b73dc..4223d0c 100644
--- a/bench/I8SpmdmBenchmark.cc
+++ b/bench/I8SpmdmBenchmark.cc
@@ -77,7 +77,7 @@ int main() {
         cout << M << ", " << N << ", " << K << ", ";
 
         aligned_vector<uint8_t> A(M * K);
-        randFill(A, 0, 255);
+        randFill<uint8_t>(A, 0, 255);
 
         fbgemm::CompressedSparseColumn B_csc(K, N);
         vector<int32_t> C(M * N);
@@ -156,13 +156,8 @@ int main() {
 #pragma omp parallel
 #endif
           {
-#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
-            int num_threads = 1;
-            int tid = 0;
-#else
-            int num_threads = omp_get_num_threads();
-            int tid = omp_get_thread_num();
-#endif
+            int num_threads = fbgemm_get_num_threads();
+            int tid = fbgemm_get_thread_num();
             int i_per_thread =
                 ((M + 31) / 32 + num_threads - 1) / num_threads * 32;
             int i_begin = std::min(tid * i_per_thread, M);
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
index 2115863..cb2edf5 100644
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -125,43 +125,29 @@ void performance_test() {
 
   chrono::time_point<chrono::high_resolution_clock> begin, end;
   for (auto conv_p : shapes) {
-    aligned_vector<float> Afp32(
-        conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
     aligned_vector<uint8_t> Aint8(
-        conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
-
+        conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
     aligned_vector<uint8_t> Aint8_out(
         conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
-            conv_p.K[1] * conv_p.IC,
-        0);
+        conv_p.K[1] * conv_p.IC);
 
-    aligned_vector<float> Bfp32(
-        conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
     aligned_vector<int8_t> Bint8(
-        conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+        conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
 
     aligned_vector<int32_t> Cint32_ref(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
-    aligned_vector<int32_t> Cint32_fb(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
-    aligned_vector<int32_t> Cint32_fb2(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+    aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+    aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
 
     // A matrix (input activations)
-    randFill(Afp32, 0, 5);
+    randFill<uint8_t>(Aint8, 0, 5);
     int32_t Aint8_zero_point = 4;
-    for (auto i = 0; i < Afp32.size(); ++i) {
-      Aint8[i] = static_cast<uint8_t>(Afp32[i]);
-    }
+    aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
 
     // B matrix (weights)
-    randFill(Bfp32, -4, 4);
+    randFill<int8_t>(Bint8, -4, 4);
     // int32_t Bint8_zero_point = -3;
-    for (auto i = 0; i < Bfp32.size(); ++i) {
-      Bint8[i] = static_cast<int8_t>(Bfp32[i]);
-    }
+    aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
 
     // reference implementation
     conv_ref(
@@ -184,8 +170,7 @@ void performance_test() {
     double ttot = 0.0;
     string runType;
 
-    vector<int32_t> row_offset_buf;
-    row_offset_buf.resize(
+    vector<int32_t> row_offset_buf(
         PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());
 
     PackAWithIm2Col<uint8_t, int16_t> packA(
diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
index 7144e61..8e112d8 100644
--- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
@@ -125,45 +125,32 @@ void performance_test() {
 
   chrono::time_point<chrono::high_resolution_clock> begin, end;
   for (auto conv_p : shapes) {
-    aligned_vector<float> Afp32(
-        conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
     aligned_vector<uint8_t> Aint8(
-        conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+        conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
 
     aligned_vector<uint8_t> Aint8_out(
         conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
-            conv_p.K[1] * conv_p.IC,
-        0);
+        conv_p.K[1] * conv_p.IC);
 
-    aligned_vector<float> Bfp32(
-        conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
     aligned_vector<int8_t> Bint8(
-        conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+        conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
 
     aligned_vector<int32_t> Cint32_ref(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
-    aligned_vector<int32_t> Cint32_fb(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
-    aligned_vector<int32_t> Cint32_fb2(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+    aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+    aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
 
     // cout << conv_p.toString() << endl;
 
     // A matrix (input activations)
-    randFill(Afp32, 0, 5);
+    randFill<uint8_t>(Aint8, 0, 5);
     int32_t Aint8_zero_point = 4;
-    for (auto i = 0; i < Afp32.size(); ++i) {
-      Aint8[i] = static_cast<uint8_t>(Afp32[i]);
-    }
+    aligned_vector<float> Apf32(Aint8.begin(), Aint8.end());
 
     // B matrix (weights)
-    randFill(Bfp32, -4, 4);
+    randFill<int8_t>(Bint8, -4, 4);
     // int32_t Bint8_zero_point = -3;
-    for (auto i = 0; i < Bfp32.size(); ++i) {
-      Bint8[i] = static_cast<int8_t>(Bfp32[i]);
-    }
+    aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
 
     // reference implementation
     conv_ref(
@@ -186,8 +173,7 @@ void performance_test() {
     double ttot = 0.0;
     string runType;
 
-    vector<int32_t> row_offset_buf;
-    row_offset_buf.resize(
+    vector<int32_t> row_offset_buf(
         PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize());
 
     PackAWithIm2Col<uint8_t, int32_t> packA(
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index badbda0..79a750e 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -86,27 +86,27 @@ void performance_test() {
     int k = shape[2];
 
     float alpha = 1.f, beta = 0.f;
-    aligned_vector<float> Afp32(m * k, 0.0f);
-    aligned_vector<uint8_t> Aint8(m * k, 0);
+    aligned_vector<float> Afp32(m * k);
+    aligned_vector<uint8_t> Aint8(Afp32.size());
 
-    aligned_vector<float> Bfp32(k * n, 0.0f);
-    aligned_vector<int8_t> Bint8(k * n, 0);
+    aligned_vector<float> Bfp32(k * n);
+    aligned_vector<int8_t> Bint8(Bfp32.size());
 
-    aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
-    aligned_vector<float> Cfp32_fb(m * n, 0.0f);
+    aligned_vector<float> Cfp32_mkl(m * n);
+    aligned_vector<float> Cfp32_fb(Cfp32_mkl.size());
 
-    aligned_vector<uint8_t> Cint8_fb(m * n, 0);
-    aligned_vector<int32_t> Cint32_buffer(m * n, 0);
+    aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
+    aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size());
 
     // A matrix
-    randFill(Aint8, 0, 255);
+    randFill<uint8_t>(Aint8, 0, 255);
     float Aint8_scale = 0.11;
     int32_t Aint8_zero_point = 43;
     for (auto i = 0; i < Afp32.size(); ++i) {
       Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
     }
 
-    randFill(Bint8, -128, 127);
+    randFill<int8_t>(Bint8, -128, 127);
     avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
 
     float Bint8_scale = 0.49;
@@ -116,10 +116,9 @@ void performance_test() {
     }
 
     // computing column offset
-    vector<int32_t> col_offsets;
-    col_offsets.resize(n);
+    vector<int32_t> col_offsets(n);
     col_offsets_with_zero_pt_s8acc32_ref(
-        k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+        k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
 
     double ttot = 0;
     std::string type;
@@ -172,8 +171,7 @@ void performance_test() {
     // printMatrix(matrix_op_t::NoTranspose, col_offsets.data(), 1, n, n, "col
     // offsets before");
 
-    vector<int32_t> row_offset_buf;
-    row_offset_buf.resize(
+    vector<int32_t> row_offset_buf(
         PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
 
     PackAWithQuantRowOffset<uint8_t> packAN(
@@ -201,12 +199,13 @@ void performance_test() {
     ReQuantizeForFloat<false> outputProcObj(
         doNothingObj,
         Aint8_scale,
-        Bint8_scale,
+        &Bint8_scale,
         Aint8_zero_point,
-        Bint8_zero_point,
+        &Bint8_zero_point,
         packAN.getRowOffsetBuffer(),
         col_offsets.data(),
-        nullptr);
+        nullptr,
+        n);
 
     ttot = 0;
     type = "FBGEMM_i8_acc32";
diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
index fd48b24..f60332f 100644
--- a/bench/PackedRequantizeAcc16Benchmark.cc
+++ b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -100,29 +100,26 @@ void performance_test() {
     int n = shape[1];
     int k = shape[2];
 
-    float alpha = 1.f, beta = 0.f;
-    aligned_vector<float> Afp32(m * k, 0.0f);
-    aligned_vector<uint8_t> Aint8(m * k, 0);
+    float alpha = 1.0f, beta = 0.0f;
+    aligned_vector<uint8_t> Aint8(m * k);
+    aligned_vector<int8_t> Bint8(k * n);
 
-    aligned_vector<float> Bfp32(k * n, 0.0f);
-    aligned_vector<int8_t> Bint8(k * n, 0);
-
-    aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
+    aligned_vector<float> Cfp32_mkl(m * n);
     // just used for result comparisons
-    aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f);
+    aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size());
     // requantize results
-    aligned_vector<uint8_t> Cint8_mkl(m * n, 0.0f);
-    aligned_vector<int32_t> Cint32_fb(m * n, 0.0f);
-    aligned_vector<uint8_t> Cint8_fb(m * n, 0.0f);
+    aligned_vector<uint8_t> Cint8_mkl(Cfp32_mkl.size());
+    aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size());
+    aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
 
     // A matrix
-    randFill(Afp32, 0, 50);
+    randFill<uint8_t>(Aint8, 0, 50);
     int32_t Aint8_zero_point = 43;
-    for (auto i = 0; i < Afp32.size(); ++i) {
-      Aint8[i] = static_cast<uint8_t>(Afp32[i]);
-    }
+    aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
 
-    randFill(Bfp32, -8, 8);
+    randFill<int8_t>(Bint8, -8, 8);
+    aligned_vector<int8_t> Bint8_copy(Bint8);
+    aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
 
     double nops = 2.0 * static_cast<double>(NITER) * m * n * k;
     double ttot = 0.0;
@@ -163,9 +160,7 @@ void performance_test() {
     cout << setw(16) << runType << ", " << fixed << setw(5) << setprecision(1)
          << nops / ttot << endl;
 
-    for (auto i = 0; i < Cfp32_mkl.size(); ++i) {
-      Cint32_mkl[i] = static_cast<int32_t>(Cfp32_mkl[i]);
-    }
+    Cint32_mkl.assign(Cfp32_mkl.begin(), Cfp32_mkl.end());
 #endif
 
     for (BenchmarkType bench_type :
@@ -179,23 +174,19 @@ void performance_test() {
                                   bench_type == BenchmarkType::REQUANTIZATION)
           ? 0
           : -30;
-      for (auto i = 0; i < Bfp32.size(); ++i) {
-        Bint8[i] = static_cast<int8_t>(Bfp32[i]);
-      }
 
       // computing column offset
-      vector<int32_t> col_offsets;
-      col_offsets.resize(n);
+      vector<int32_t> col_offsets(n);
+      Bint8 = Bint8_copy;
       col_offsets_with_zero_pt_s8acc32_ref(
-          k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+          k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
 
-      vector<int32_t> row_offsets;
-      row_offsets.resize(m);
+      vector<int32_t> row_offsets(m);
 
       row_offsets_u8acc32_ref(m, k, k, Aint8.data(), row_offsets.data());
 
       float C_multiplier =
-          (bench_type == BenchmarkType::BARE_BONE) ? 1 : 0.1234;
+          (bench_type == BenchmarkType::BARE_BONE) ? 1.0f : 0.1234f;
       int32_t C_zero_pt = (bench_type == BenchmarkType::BARE_BONE) ? 0 : 5;
 
       // printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -235,16 +226,14 @@ void performance_test() {
           n,
           Cint32_mkl.data(),
           Cint8_mkl.data(),
-          C_multiplier,
+          &C_multiplier,
           C_zero_pt,
           Aint8_zero_point,
-          Bint8_zero_point,
+          &Bint8_zero_point,
           row_offsets.data(),
           col_offsets.data(),
-          nullptr); // bias
-
-      PackBMatrix<int8_t, int16_t> packedB(
-          matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
+          nullptr, // bias
+          n); // ncols per quant group
 
       CompressedSparseColumn B_csc(k, n);
 
@@ -254,30 +243,35 @@ void performance_test() {
       default_random_engine eng;
       binomial_distribution<> per_col_nnz_dist(k, density);
 
-      vector<int> row_indices(k);
-
-      int total_nnz = 0;
-      for (int j = 0; j < n; ++j) {
-        B_csc.ColPtr()[j] = total_nnz;
-
-        int nnz_of_j = per_col_nnz_dist(eng);
-        total_nnz += nnz_of_j;
-
-        iota(row_indices.begin(), row_indices.end(), 0);
-        shuffle(row_indices.begin(), row_indices.end(), eng);
-        sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
-
-        for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
-          B_csc.RowIdx().push_back(row_indices[kidx]);
-          // put the current B value
-          B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
-          // make current B value zero
-          Bint8[row_indices[kidx] * n + j] = 0;
-          // std::cout << "(" << row_indices[kidx] << ", " << j << ")" <<
-          // endl;
+      if (bench_type == BenchmarkType::EVERYTHING) {
+        vector<int> row_indices(k);
+
+        int total_nnz = 0;
+        for (int j = 0; j < n; ++j) {
+          B_csc.ColPtr()[j] = total_nnz;
+
+          int nnz_of_j = per_col_nnz_dist(eng);
+          total_nnz += nnz_of_j;
+
+          iota(row_indices.begin(), row_indices.end(), 0);
+          shuffle(row_indices.begin(), row_indices.end(), eng);
+          sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
+
+          for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
+            B_csc.RowIdx().push_back(row_indices[kidx]);
+            // put the current B value
+            B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
+            // make current B value zero
+            Bint8[row_indices[kidx] * n + j] = 0;
+            // std::cout << "(" << row_indices[kidx] << ", " << j << ")" <<
+            // endl;
+          }
         }
+        B_csc.ColPtr()[n] = total_nnz;
       }
-      B_csc.ColPtr()[n] = total_nnz;
+
+      PackBMatrix<int8_t, int16_t> packedB(
+          matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
 
       // printMatrix(matrix_op_t::NoTranspose,
       // Cint32_mkl.data(), m, n, n, "C mkl");
@@ -298,8 +292,7 @@ void performance_test() {
 #pragma omp parallel
 #endif
         {
-          vector<int32_t> row_offset_buf;
-          row_offset_buf.resize(
+          vector<int32_t> row_offset_buf(
               PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
 
           PackAMatrix<uint8_t, int16_t> packA(
@@ -333,15 +326,16 @@ void performance_test() {
           // Requantization back to int8
           ReQuantizeOutput<false> reqObj(
               doNothingObj,
-              C_multiplier,
+              &C_multiplier,
               C_zero_pt,
               Aint8_zero_point,
-              Bint8_zero_point,
+              &Bint8_zero_point,
               bench_type == BenchmarkType::REQUANTIZATION
                   ? nullptr
                   : packAWithRowOffset.getRowOffsetBuffer(),
               col_offsets.data(),
-              nullptr);
+              nullptr,
+              n);
 
           // the top most (first) operation in the output processing
           // pipeline is spmdm
@@ -354,13 +348,8 @@ void performance_test() {
               ReQuantizeOutput<false>>
               spmdmObj(reqObj, Aint8.data(), k, B_csc);
 
-#ifdef _OPENMP
-          int num_threads = omp_get_num_threads();
-          int tid = omp_get_thread_num();
-#else
-          int num_threads = 1;
-          int tid = 0;
-#endif
+          int num_threads = fbgemm_get_num_threads();
+          int tid = fbgemm_get_thread_num();
           // printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
           switch (bench_type) {
             case BenchmarkType::BARE_BONE:
diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
index b3a9b38..b255b8c 100644
--- a/bench/PackedRequantizeAcc32Benchmark.cc
+++ b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -103,42 +103,35 @@ void performance_test() {
     int k = shape[2];
 
     float alpha = 1.f, beta = 0.f;
-    aligned_vector<float> Afp32(m * k, 0.0f);
-    aligned_vector<uint8_t> Aint8(m * k, 0);
+    aligned_vector<uint8_t> Aint8(m * k);
 
-    aligned_vector<float> Bfp32(k * n, 0.0f);
-    aligned_vector<int8_t> Bint8(k * n, 0);
+    aligned_vector<int8_t> Bint8(k * n);
 
-    aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
-    aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f);
-    aligned_vector<int32_t> Cint32_fb(m * n, 0);
-    aligned_vector<uint8_t> Cint8_fb(m * n, 0);
-    aligned_vector<int32_t> Cint32_local(m * n, 0);
-    aligned_vector<int32_t> Cint32_buffer(m * n, 0);
-    aligned_vector<uint8_t> Cint8_local(m * n, 0);
+    aligned_vector<float> Cfp32_mkl(m * n);
+    aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size());
+    aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size());
+    aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
+    aligned_vector<int32_t> Cint32_local(Cfp32_mkl.size());
+    aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size());
+    aligned_vector<uint8_t> Cint8_local(Cfp32_mkl.size());
 
     // A matrix
-    randFill(Aint8, 0, 255);
+    randFill<uint8_t>(Aint8, 0, 255);
     // float Aint8_scale = 0.11;
     int32_t Aint8_zero_point = 43;
-    for (auto i = 0; i < Afp32.size(); ++i) {
-      Afp32[i] = (float)Aint8[i];
-    }
+    aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
 
-    randFill(Bint8, -128, 127);
+    randFill<int8_t>(Bint8, -128, 127);
     avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
 
     // float Bint8_scale = 0.49;
     int32_t Bint8_zero_point = -30;
-    for (auto i = 0; i < Bfp32.size(); ++i) {
-      Bfp32[i] = (float)Bint8[i];
-    }
+    aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
 
     // computing column offset
-    vector<int32_t> col_offsets;
-    col_offsets.resize(n);
+    vector<int32_t> col_offsets(n);
     col_offsets_with_zero_pt_s8acc32_ref(
-        k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+        k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
 
     double nops = 2.0 * static_cast<double>(NITER) * m * n * k;
     double ttot = 0.0;
@@ -180,8 +173,7 @@ void performance_test() {
     }
 #endif
 
-    vector<int32_t> row_offsets;
-    row_offsets.resize(m);
+    vector<int32_t> row_offsets(m);
 
     float C_multiplier = 0.1234;
     int32_t C_zero_pt = 5;
@@ -197,13 +189,14 @@ void performance_test() {
         n,
         Cint32_local.data(),
         Cint8_local.data(),
-        C_multiplier,
+        &C_multiplier,
         C_zero_pt,
         Aint8_zero_point,
-        Bint8_zero_point,
+        &Bint8_zero_point,
         row_offsets.data(),
         col_offsets.data(),
-        nullptr); // bias
+        nullptr, // bias
+        n); // ncols per quant group
     // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
     // unpacked");
     // printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -248,8 +241,7 @@ void performance_test() {
 #pragma omp parallel
 #endif
       {
-        vector<int32_t> row_offset_buf;
-        row_offset_buf.resize(
+        vector<int32_t> row_offset_buf(
             PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
 
         PackAWithRowOffset<uint8_t> packAN(
@@ -265,21 +257,17 @@ void performance_test() {
         DoNothing<> doNothingObj{};
         ReQuantizeOutput<false> outputProcObj(
             doNothingObj,
-            C_multiplier,
+            &C_multiplier,
             C_zero_pt,
             Aint8_zero_point,
-            Bint8_zero_point,
+            &Bint8_zero_point,
             packAN.getRowOffsetBuffer(),
             col_offsets.data(),
-            nullptr);
+            nullptr,
+            n);
 
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
         // printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
         fbgemmPacked(
             packAN,
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index d2c2a1e..17a07e5 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -847,13 +847,19 @@ class DoSpmdmOnInpBuffer {
   const int groups_;
 };
 
+enum class QuantizationGranularity {
+  TENSOR,
+  GROUP,
+  OUT_CHANNEL,
+};
+
 /**
  * @brief Requantize values in inp buffer and write to out buffer.
  *        pass the out buffer to next op for further processing.
- *
  */
 template <
     bool FUSE_RELU,
+    QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
     typename outT = std::uint8_t,
     typename inT = std::int32_t,
     typename nextOPType = DoNothing<outT, outT>>
@@ -863,13 +869,15 @@ class ReQuantizeOutput {
   using inpType = inT;
   ReQuantizeOutput(
       nextOPType& nextop,
-      float C_multiplier,
+      const float* C_multiplier,
       std::int32_t C_zero_point,
       std::int32_t Aq_zero_point,
-      std::int32_t Bq_zero_point,
+      const std::int32_t* Bq_zero_point,
       const std::int32_t* row_offsets,
       const std::int32_t* col_offsets,
-      const std::int32_t* bias)
+      const std::int32_t* bias,
+      std::uint32_t nCol,
+      int groups = 1)
       : nextop_(nextop),
         C_multiplier_(C_multiplier),
         C_zero_point_(C_zero_point),
@@ -877,7 +885,9 @@ class ReQuantizeOutput {
         Bq_zero_point_(Bq_zero_point),
         q_row_offsets_(row_offsets),
         q_col_offsets_(col_offsets),
-        bias_(bias) {}
+        bias_(bias),
+        ncols_(nCol),
+        groups_(groups) {}
 
   template <inst_set_t instSet>
   inline int f(
@@ -897,13 +907,15 @@ class ReQuantizeOutput {
       int ld_in) const;
 
   nextOPType& nextop_;
-  float C_multiplier_;
+  const float* C_multiplier_;
   std::int32_t C_zero_point_;
   std::int32_t Aq_zero_point_;
-  std::int32_t Bq_zero_point_;
+  const std::int32_t* Bq_zero_point_;
   const std::int32_t* q_row_offsets_;
   const std::int32_t* q_col_offsets_;
   const std::int32_t* bias_;
+  std::uint32_t ncols_;
+  int groups_;
 };
 
 /**
@@ -912,6 +924,7 @@ class ReQuantizeOutput {
  */
 template <
     bool FUSE_RELU,
+    QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
     typename outT = float,
     typename inT = std::int32_t,
     typename nextOPType = DoNothing<outT, outT>>
@@ -922,12 +935,14 @@ class ReQuantizeForFloat {
   ReQuantizeForFloat(
       nextOPType& nextop,
       float Aq_scale,
-      float Bq_scale,
+      const float* Bq_scale,
       std::int32_t Aq_zero_point,
-      std::int32_t Bq_zero_point,
+      const std::int32_t* Bq_zero_point,
       const std::int32_t* row_offsets,
       const std::int32_t* col_offsets,
-      const float* bias)
+      const float* bias,
+      std::uint32_t nCol,
+      int groups = 1)
       : nextop_(nextop),
         Aq_scale_(Aq_scale),
         Bq_scale_(Bq_scale),
@@ -935,7 +950,9 @@ class ReQuantizeForFloat {
         Bq_zero_point_(Bq_zero_point),
         q_row_offsets_(row_offsets),
         q_col_offsets_(col_offsets),
-        bias_(bias) {}
+        bias_(bias),
+        ncols_(nCol),
+        groups_(groups) {}
 
   template <inst_set_t instSet>
   inline int f(
@@ -947,12 +964,15 @@ class ReQuantizeForFloat {
 
  private:
   nextOPType& nextop_;
-  float Aq_scale_, Bq_scale_;
+  float Aq_scale_;
+  const float* Bq_scale_;
   std::int32_t Aq_zero_point_;
-  std::int32_t Bq_zero_point_;
+  const std::int32_t* Bq_zero_point_;
   const std::int32_t* q_row_offsets_;
   const std::int32_t* q_col_offsets_;
   const float* bias_;
+  std::uint32_t ncols_;
+  int groups_;
 };
 
 // type specialized implementation in an include file
diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
index 59a6e0e..88a10bc 100644
--- a/include/fbgemm/OutputProcessing-inl.h
+++ b/include/fbgemm/OutputProcessing-inl.h
@@ -44,9 +44,14 @@ inline int DoSpmdmOnInpBuffer<outT, inT, nextOPType>::f(
   return nextop_.template f<instSet>(out, inp, block, ld_out, ld_in);
 }
 
-template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType>
+template <
+    bool FUSE_RELU,
+    QuantizationGranularity Q_GRAN,
+    typename outT,
+    typename inT,
+    typename nextOPType>
 template <bool A_SYMMETRIC, bool B_SYMMETRIC, bool HAS_BIAS>
-void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
+void ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f_(
     outT* out,
     const inT* inp,
     const block_type_t& block,
@@ -54,7 +59,13 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
     int ld_in) const {
   // Adoption of implementation at QNNPACK/src/requantization/fp32-sse2.c
   // using AVX2 instructions
-  __m256 multiplier_v = _mm256_set1_ps(C_multiplier_);
+  int quant_param_idx = 0;
+  if (Q_GRAN == QuantizationGranularity::GROUP) {
+    int ncol_per_group = ncols_ / groups_;
+    int g = block.col_start / ncol_per_group;
+    quant_param_idx = g;
+  }
+  __m256 multiplier_v = _mm256_set1_ps(C_multiplier_[quant_param_idx]);
 
   __m256i min_v = _mm256_set1_epi8(std::numeric_limits<uint8_t>::min());
   __m256i max_v = _mm256_set1_epi8(std::numeric_limits<uint8_t>::max());
@@ -63,7 +74,9 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
       (A_SYMMETRIC == (Aq_zero_point_ == 0)) &&
       "A_SYMMETRIC == true if and only if Aq_zero_point == 0");
   assert(
-      (B_SYMMETRIC == (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr)) &&
+      (B_SYMMETRIC ==
+       ((Q_GRAN == QuantizationGranularity::TENSOR && Bq_zero_point_[0] == 0) ||
+        q_row_offsets_ == nullptr)) &&
       "B_SYMMETRIC == true if and only if Bq_zero_point == 0 "
       "or q_row_offsets_ == nullptr");
   assert(
@@ -79,10 +92,22 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
 
   constexpr int VLEN = 8;
   for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
-    std::int32_t row_offset = (q_row_offsets_ && !B_SYMMETRIC)
-        ? q_row_offsets_[i - block.row_start] * Bq_zero_point_
-        : 0;
+    // Scale row_offset with Bq_zero_point
+    int32_t row_offset = 0;
+    if (B_SYMMETRIC) {
+      row_offset = 0;
+    } else if (
+        Q_GRAN == QuantizationGranularity::TENSOR ||
+        Q_GRAN == QuantizationGranularity::GROUP) {
+      row_offset =
+          q_row_offsets_[i - block.row_start] * Bq_zero_point_[quant_param_idx];
+    } else {
+      assert(
+          Q_GRAN == QuantizationGranularity::OUT_CHANNEL &&
+          "unknown quantization granularity");
+    }
     __m256i row_offset_v = _mm256_set1_epi32(row_offset);
+
     int j = block.col_start;
     for (; j < block.col_start + (block.col_size / (VLEN * 4) * (VLEN * 4));
          j += (VLEN * 4)) {
@@ -122,9 +147,33 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
       }
 
       if (!B_SYMMETRIC) {
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset_v = _mm256_mullo_epi32(
+              _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+              _mm256_loadu_si256(
+                  reinterpret_cast<const __m256i*>(Bq_zero_point_ + j)));
+        }
         x_v = _mm256_sub_epi32(x_v, row_offset_v);
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset_v = _mm256_mullo_epi32(
+              _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+              _mm256_loadu_si256(
+                  reinterpret_cast<const __m256i*>(Bq_zero_point_ + j + VLEN)));
+        }
         y_v = _mm256_sub_epi32(y_v, row_offset_v);
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset_v = _mm256_mullo_epi32(
+              _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+              _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+                  Bq_zero_point_ + j + 2 * VLEN)));
+        }
         z_v = _mm256_sub_epi32(z_v, row_offset_v);
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset_v = _mm256_mullo_epi32(
+              _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+              _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+                  Bq_zero_point_ + j + 3 * VLEN)));
+        }
         w_v = _mm256_sub_epi32(w_v, row_offset_v);
       }
       if (HAS_BIAS) {
@@ -157,10 +206,24 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
        * representation as an FP32 value, and will be rounded to nearest
        * FP32 value with ties to even with default MXCSR rounding mode.
        */
-      __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
-      __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v);
-      __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v);
-      __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v);
+      __m256 x_scaled_v, y_scaled_v, z_scaled_v, w_scaled_v;
+      if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        x_scaled_v = _mm256_mul_ps(
+            _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j));
+        y_scaled_v = _mm256_mul_ps(
+            _mm256_cvtepi32_ps(y_v), _mm256_loadu_ps(C_multiplier_ + j + VLEN));
+        z_scaled_v = _mm256_mul_ps(
+            _mm256_cvtepi32_ps(z_v),
+            _mm256_loadu_ps(C_multiplier_ + j + 2 * VLEN));
+        w_scaled_v = _mm256_mul_ps(
+            _mm256_cvtepi32_ps(w_v),
+            _mm256_loadu_ps(C_multiplier_ + j + 3 * VLEN));
+      } else {
+        x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+        y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v);
+        z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v);
+        w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v);
+      }
 
       /*
        * Convert scaled FP32 result to int32_t using CVTPS2DQ instruction.
@@ -238,6 +301,12 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
       }
 
       if (!B_SYMMETRIC) {
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset_v = _mm256_mullo_epi32(
+              _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+              _mm256_loadu_si256(
+                  reinterpret_cast<const __m256i*>(Bq_zero_point_ + j)));
+        }
         x_v = _mm256_sub_epi32(x_v, row_offset_v);
       }
       if (HAS_BIAS) {
@@ -246,7 +315,13 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
             _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias_ + j)));
       }
 
-      __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+      __m256 x_scaled_v;
+      if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        x_scaled_v = _mm256_mul_ps(
+            _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j));
+      } else {
+        x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+      }
       __m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v);
 
       __m256i x_packed_v = _mm256_adds_epi16(
@@ -281,19 +356,26 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
           _mm256_castsi256_si128(x_clamped_v));
     } // j loop vectorized
 
+    // TODO: vectorize remainder using masking
     for (; j < block.col_start + block.col_size; ++j) {
       int32_t raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
       if (!A_SYMMETRIC) {
         raw -= Aq_zero_point_ * q_col_offsets_[j];
       }
       if (!B_SYMMETRIC) {
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset = q_row_offsets_[i - block.row_start] * Bq_zero_point_[j];
+        }
         raw -= row_offset;
       }
       if (HAS_BIAS) {
         raw += bias_[j];
       }
 
-      float ab = raw * C_multiplier_;
+      float ab = raw *
+          ((Q_GRAN == QuantizationGranularity::OUT_CHANNEL)
+               ? C_multiplier_[j]
+               : C_multiplier_[quant_param_idx]);
       long rounded = std::lrintf(ab) + C_zero_point_;
 
       out[i * ld_out + j] = std::max(
@@ -303,9 +385,14 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
   } // i loop
 }
 
-template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType>
+template <
+    bool FUSE_RELU,
+    QuantizationGranularity Q_GRAN,
+    typename outT,
+    typename inT,
+    typename nextOPType>
 template <inst_set_t instSet>
-inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
+inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
     outT* out,
     const inT* inp,
     const block_type_t& block,
@@ -314,19 +401,35 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
   static_assert(
       std::is_same<inT, int32_t>::value,
       "input data type must be of int32_t type");
+  int ncol_per_group = ncols_ / groups_;
+  assert(
+      block.col_size <= ncol_per_group &&
+      "ReQuantizeOutput should be called at most 1 group at a time.");
+  int g = block.col_start / ncol_per_group;
   if (instSet == inst_set_t::anyarch) {
     for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
       for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
         inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
         raw -= Aq_zero_point_ * q_col_offsets_[j];
+        int Bq_zero_point_idx;
+        if (Q_GRAN == QuantizationGranularity::TENSOR) {
+          Bq_zero_point_idx = 0;
+        } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+          Bq_zero_point_idx = g;
+        } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          Bq_zero_point_idx = j;
+        } else {
+          assert(false && "unknown quantization granularity");
+        }
         if (q_row_offsets_) {
-          raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_;
+          raw -= q_row_offsets_[i - block.row_start] *
+              Bq_zero_point_[Bq_zero_point_idx];
         }
         if (bias_) {
           raw += bias_[j];
         }
 
-        float ab = raw * C_multiplier_;
+        float ab = raw * C_multiplier_[Bq_zero_point_idx];
         long rounded = std::lrintf(ab) + C_zero_point_;
 
         out[i * ld_out + j] = std::max(
@@ -336,8 +439,11 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
     }
   } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
     if (std::is_same<outT, uint8_t>::value) {
+      bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR &&
+                             Bq_zero_point_[0] == 0) ||
+          q_row_offsets_ == nullptr;
       if (Aq_zero_point_ == 0) {
-        if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) {
+        if (b_symmetric) {
           if (bias_ == nullptr) {
             f_<true, true, false>(out, inp, block, ld_out, ld_in);
           } else {
@@ -351,7 +457,7 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
           }
         }
       } else {
-        if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) {
+        if (b_symmetric) {
           if (bias_ == nullptr) {
             f_<false, true, false>(out, inp, block, ld_out, ld_in);
           } else {
@@ -374,9 +480,14 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
   return nextop_.template f<instSet>(out, out, block, ld_out, ld_out);
 }
 
-template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType>
+template <
+    bool FUSE_RELU,
+    QuantizationGranularity Q_GRAN,
+    typename outT,
+    typename inT,
+    typename nextOPType>
 template <inst_set_t instSet>
-inline int ReQuantizeForFloat<FUSE_RELU, outT, inT, nextOPType>::f(
+inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
     outT* out,
     inT* inp,
     const block_type_t& block,
@@ -388,12 +499,28 @@ inline int ReQuantizeForFloat<FUSE_RELU, outT, inT, nextOPType>::f(
   static_assert(
       std::is_same<float, outT>::value,
       "output data type is of not expected type");
+  int ncol_per_group = ncols_ / groups_;
+  assert(
+      block.col_size <= ncol_per_group &&
+      "ReQuantizeOutput should be called at most 1 group at a time.");
+  int g = block.col_start / ncol_per_group;
   for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
     for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
       inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start];
       raw -= Aq_zero_point_ * q_col_offsets_[j];
-      raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_;
-      float res = raw * Aq_scale_ * Bq_scale_;
+      int Bq_zero_point_idx;
+      if (Q_GRAN == QuantizationGranularity::TENSOR) {
+        Bq_zero_point_idx = 0;
+      } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+        Bq_zero_point_idx = g;
+      } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        Bq_zero_point_idx = j;
+      } else {
+        assert(false && "unknown quantization granularity");
+      }
+      raw -= q_row_offsets_[i - block.row_start] *
+          Bq_zero_point_[Bq_zero_point_idx];
+      float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx];
       if (bias_) {
         res += bias_[j];
       }
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 2e2035c..f1ec882 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -240,47 +240,60 @@ void ExecuteKernel<
 
   } // for each j block
 }
-template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    uint8_t,
-    ReQuantizeOutput<false /* FUSE_RELU*/>>;
 
-template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    uint8_t,
-    ReQuantizeOutput<true>>;
-
-template class ExecuteKernel<
-    PackAWithQuantRowOffset<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    float,
-    ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
-    PackAWithQuantRowOffset<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    float,
-    ReQuantizeForFloat<true>>;
-
-template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    float,
-    ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    float,
-    ReQuantizeForFloat<true>>;
-
-template class ExecuteKernel<
-    PackAMatrix<uint8_t, int16_t>,
-    PackBMatrix<int8_t, int16_t>,
-    int32_t,
-    memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeOutput
+#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \
+  template class ExecuteKernel<               \
+      PackAWithRowOffset<uint8_t, ACC_T>,     \
+      PackBMatrix<int8_t, ACC_T>,             \
+      uint8_t,                                \
+      ReQuantizeOutput<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU)                          \
+  INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(ACC_T) \
+  INSTANTIATE_Q_GRANS(ACC_T, false);   \
+  INSTANTIATE_Q_GRANS(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+  template class ExecuteKernel<                            \
+      PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>,        \
+      PackBMatrix<int8_t, ACC_T>,                          \
+      uint8_t,                                             \
+      ReQuantizeOutput<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM)                          \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(                                                            \
+      ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+  INSTANTIATE_Q_GRANS(ACC_T, RELU, 2);       \
+  INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T)   \
+  INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+  INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
 
 template class ExecuteKernel<
     PackAMatrix<uint8_t, int16_t>,
@@ -288,110 +301,127 @@ template class ExecuteKernel<
     uint8_t,
     ReQuantizeOutput<false>>;
 
-template class ExecuteKernel<
-    PackAMatrix<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    int32_t,
-    memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeForFloat
+#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \
+  template class ExecuteKernel<                \
+      PACK_A<uint8_t, int32_t>,                \
+      PackBMatrix<int8_t, int32_t>,            \
+      float,                                   \
+      ReQuantizeForFloat<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(PACK_A, RELU)                          \
+  INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(PACK_A)      \
+  INSTANTIATE_Q_GRANS(PACK_A, false); \
+  INSTANTIATE_Q_GRANS(PACK_A, true);
+
+INSTANTIATE_RELU(PackAWithRowOffset);
+INSTANTIATE_RELU(PackAWithQuantRowOffset);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+  template class ExecuteKernel<                            \
+      PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>,        \
+      PackBMatrix<int8_t, ACC_T>,                          \
+      float,                                               \
+      ReQuantizeForFloat<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM)                          \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(                                                            \
+      ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+  INSTANTIATE_Q_GRANS(ACC_T, RELU, 2);       \
+  INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T)          \
+  INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+  INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
 
 template class ExecuteKernel<
     PackAWithRowOffset<uint8_t, int16_t>,
     PackBMatrix<int8_t, int16_t>,
-    uint8_t,
-    DoSpmdmOnInpBuffer<
-        ReQuantizeOutput<false>::outType,
-        int32_t,
-        ReQuantizeOutput<false>>>;
+    float,
+    ReQuantizeForFloat<false /* FUSE_RELU*/>>;
 
-template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int16_t>,
-    PackBMatrix<int8_t, int16_t>,
-    uint8_t,
-    DoSpmdmOnInpBuffer<
-        ReQuantizeOutput<true>::outType,
-        int32_t,
-        ReQuantizeOutput<true>>>;
+////////////////////////////////////////////////////////////////////////////////
+// DoSpmdmOnInpBuffer
+#define INSTANTIATE_BASE(RELU, Q_GRAN)      \
+  template class ExecuteKernel<             \
+      PackAWithRowOffset<uint8_t, int16_t>, \
+      PackBMatrix<int8_t, int16_t>,         \
+      uint8_t,                              \
+      DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>;
 
-template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int16_t>,
-    PackBMatrix<int8_t, int16_t>,
-    float,
-    DoSpmdmOnInpBuffer<
-        ReQuantizeForFloat<false>::outType,
-        int32_t,
-        ReQuantizeForFloat<false>>>;
+#define INSTANTIATE_Q_GRANS(RELU)                          \
+  INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
 
-template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int16_t>,
-    PackBMatrix<int8_t, int16_t>,
-    uint8_t,
-    ReQuantizeOutput<false>>;
+INSTANTIATE_Q_GRANS(false);
+INSTANTIATE_Q_GRANS(true);
 
-template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int16_t>,
-    PackBMatrix<int8_t, int16_t>,
-    uint8_t,
-    ReQuantizeOutput<true>>;
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
 
 template class ExecuteKernel<
     PackAWithRowOffset<uint8_t, int16_t>,
     PackBMatrix<int8_t, int16_t>,
-    int32_t,
-    memCopy<>>;
+    float,
+    DoSpmdmOnInpBuffer<float, int32_t, ReQuantizeForFloat<false>>>;
 
-template class ExecuteKernel<
-    PackAWithIm2Col<uint8_t, int16_t>,
-    PackBMatrix<int8_t, int16_t>,
-    int32_t,
-    memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// memCopy
+#define INSTANTIATE_BASE(PACK_A, ACC_T) \
+  template class ExecuteKernel<         \
+      PACK_A<uint8_t, ACC_T>,           \
+      PackBMatrix<int8_t, ACC_T>,       \
+      int32_t,                            \
+      memCopy<>>;
 
-template class ExecuteKernel<
-    PackAWithIm2Col<uint8_t, int16_t, 3>,
-    PackBMatrix<int8_t, int16_t>,
-    int32_t,
-    memCopy<>>;
+#define INSTANTIATE_ACC_T(PACK_A)   \
+  INSTANTIATE_BASE(PACK_A, int32_t) \
+  INSTANTIATE_BASE(PACK_A, int16_t)
 
-template class ExecuteKernel<
-    PackAWithIm2Col<uint8_t, int16_t>,
-    PackBMatrix<int8_t, int16_t>,
-    uint8_t,
-    ReQuantizeOutput<false>>;
+INSTANTIATE_ACC_T(PackAMatrix);
+INSTANTIATE_ACC_T(PackAWithRowOffset);
 
-template class ExecuteKernel<
-    PackAWithIm2Col<uint8_t, int16_t, 3>,
-    PackBMatrix<int8_t, int16_t>,
-    uint8_t,
-    ReQuantizeOutput<false>>;
+#undef INSTANTIATE_ACC_T
+#undef INSTANTIATE_BASE
 
-template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    int32_t,
-    memCopy<>>;
-
-template class ExecuteKernel<
-    PackAWithIm2Col<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    int32_t,
-    memCopy<>>;
+#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM)        \
+  template class ExecuteKernel<                     \
+      PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+      PackBMatrix<int8_t, ACC_T>,                   \
+      int32_t,                                        \
+      memCopy<>>;
 
-template class ExecuteKernel<
-    PackAWithIm2Col<uint8_t, int32_t, 3>,
-    PackBMatrix<int8_t, int32_t>,
-    int32_t,
-    memCopy<>>;
+#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
+  INSTANTIATE_BASE(ACC_T, 2);          \
+  INSTANTIATE_BASE(ACC_T, 3);
 
-template class ExecuteKernel<
-    PackAWithIm2Col<uint8_t, int32_t>,
-    PackBMatrix<int8_t, int32_t>,
-    uint8_t,
-    ReQuantizeOutput<false>>;
+INSTANTIATE_SPATIAL_DIM(int32_t);
+INSTANTIATE_SPATIAL_DIM(int16_t);
 
-template class ExecuteKernel<
-    PackAWithIm2Col<uint8_t, int32_t, 3>,
-    PackBMatrix<int8_t, int32_t>,
-    uint8_t,
-    ReQuantizeOutput<false>>;
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BASE
 
 template class ExecuteKernel<
     PackAWithQuantRowOffset<uint8_t, int32_t>,
@@ -400,12 +430,6 @@ template class ExecuteKernel<
     memCopy<>>;
 
 template class ExecuteKernel<
-    PackAWithRowOffset<uint8_t, int16_t>,
-    PackBMatrix<int8_t, int16_t>,
-    float,
-    ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
     PackAMatrix<uint8_t, int16_t>,
     PackBMatrix<int8_t, int16_t>,
     int32_t,
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index 0039daf..a8bf02f 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -198,149 +198,70 @@ bool fbgemmSupportedCPU() {
   return (cpuinfo_initialize() && cpuinfo_has_x86_avx2());
 }
 
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    uint8_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeOutput<false>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    uint8_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeOutput<true>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
-        packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    float* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeForFloat<false>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
-        packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    float* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeForFloat<true>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAMatrix<uint8_t, int32_t>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    int32_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const memCopy<>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    float* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeForFloat<false>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    float* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeForFloat<true>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    int32_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const memCopy<>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    int32_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const memCopy<>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    int32_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const memCopy<>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    uint8_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeOutput<false>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    uint8_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeOutput<false>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
-        packA,
-    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
-    int32_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const memCopy<>& outProcess,
-    int thread_id,
-    int num_threads);
-
-// 16 bit accumulation functions
-template void fbgemmPacked(
-    PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    int32_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const memCopy<>& outProcess,
-    int thread_id,
-    int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeOutput
+#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN)                                \
+  template void fbgemmPacked(                                                \
+      PackMatrix<PackAWithRowOffset<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \
+      PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB,          \
+      uint8_t* C,                                                            \
+      int32_t* C_buffer,                                                     \
+      uint32_t ldc,                                                          \
+      const ReQuantizeOutput<RELU, Q_GRAN>& outProcess,                      \
+      int thread_id,                                                         \
+      int num_threads);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU)                          \
+  INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(ACC_T) \
+  INSTANTIATE_Q_GRANS(ACC_T, false);   \
+  INSTANTIATE_Q_GRANS(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN)          \
+  template void fbgemmPacked(                                       \
+      PackMatrix<                                                   \
+          PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>,             \
+          uint8_t,                                                  \
+          ACC_T>& packA,                                            \
+      PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+      uint8_t* C,                                                   \
+      int32_t* C_buffer,                                            \
+      uint32_t ldc,                                                 \
+      const ReQuantizeOutput<RELU, Q_GRAN>& outProcess,             \
+      int thread_id,                                                \
+      int num_threads);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM)                          \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(                                                            \
+      ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+  INSTANTIATE_Q_GRANS(ACC_T, RELU, 2);       \
+  INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T)   \
+  INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+  INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
 
 template void fbgemmPacked(
     PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
@@ -352,28 +273,109 @@ template void fbgemmPacked(
     int thread_id,
     int num_threads);
 
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    uint8_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<false>>&
-        outProcess,
-    int thread_id,
-    int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeForFloat
+#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN)                          \
+  template void fbgemmPacked(                                           \
+      PackMatrix<PACK_A<uint8_t, int32_t>, uint8_t, int32_t>& packA,    \
+      PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, \
+      float* C,                                                         \
+      int32_t* C_buffer,                                                \
+      uint32_t ldc,                                                     \
+      const ReQuantizeForFloat<RELU, Q_GRAN>& outProcess,               \
+      int thread_id,                                                    \
+      int num_threads);
+
+#define INSTANTIATE_Q_GRANS(PACK_A, RELU)                          \
+  INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(PACK_A)      \
+  INSTANTIATE_Q_GRANS(PACK_A, false); \
+  INSTANTIATE_Q_GRANS(PACK_A, true);
+
+INSTANTIATE_RELU(PackAWithRowOffset);
+INSTANTIATE_RELU(PackAWithQuantRowOffset);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN)          \
+  template void fbgemmPacked(                                       \
+      PackMatrix<                                                   \
+          PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>,             \
+          uint8_t,                                                  \
+          ACC_T>& packA,                                            \
+      PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+      float* C,                                                     \
+      int32_t* C_buffer,                                            \
+      uint32_t ldc,                                                 \
+      const ReQuantizeForFloat<RELU, Q_GRAN>& outProcess,           \
+      int thread_id,                                                \
+      int num_threads);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM)                          \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(                                                            \
+      ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+  INSTANTIATE_Q_GRANS(ACC_T, RELU, 2);       \
+  INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T)          \
+  INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+  INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
 
 template void fbgemmPacked(
     PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
     PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    uint8_t* C,
+    float* C,
     int32_t* C_buffer,
     uint32_t ldc,
-    const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<true>>&
-        outProcess,
+    const ReQuantizeForFloat<false>& outProcess,
     int thread_id,
     int num_threads);
 
+////////////////////////////////////////////////////////////////////////////////
+// DoSpmdmOnInpBuffer
+#define INSTANTIATE_BASE(RELU, Q_GRAN)                                    \
+  template void fbgemmPacked(                                             \
+      PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& \
+          packA,                                                          \
+      PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,   \
+      uint8_t* C,                                                         \
+      int32_t* C_buffer,                                                  \
+      uint32_t ldc,                                                       \
+      const DoSpmdmOnInpBuffer<                                           \
+          uint8_t,                                                        \
+          int32_t,                                                        \
+          ReQuantizeOutput<RELU, Q_GRAN>>& outProcess,                    \
+      int thread_id,                                                      \
+      int num_threads);
+
+#define INSTANTIATE_Q_GRANS(RELU)                          \
+  INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
+
+INSTANTIATE_Q_GRANS(false);
+INSTANTIATE_Q_GRANS(true);
+
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
 template void fbgemmPacked(
     PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
     PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
@@ -385,49 +387,57 @@ template void fbgemmPacked(
     int thread_id,
     int num_threads);
 
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    uint8_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeOutput<false>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    uint8_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeOutput<true>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    int32_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const memCopy<>& outProcess,
-    int thread_id,
-    int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// memCopy
+#define INSTANTIATE_BASE(PACK_A, ACC_T)                             \
+  template void fbgemmPacked(                                       \
+      PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA,    \
+      PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+      int32_t* C,                                                   \
+      int32_t* C_buffer,                                            \
+      uint32_t ldc,                                                 \
+      const memCopy<>& outProcess,                                  \
+      int thread_id,                                                \
+      int num_threads);
+
+#define INSTANTIATE_ACC_T(PACK_A)   \
+  INSTANTIATE_BASE(PACK_A, int32_t) \
+  INSTANTIATE_BASE(PACK_A, int16_t)
+
+INSTANTIATE_ACC_T(PackAMatrix);
+INSTANTIATE_ACC_T(PackAWithRowOffset);
+
+#undef INSTANTIATE_ACC_T
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM)                        \
+  template void fbgemmPacked(                                       \
+      PackMatrix<                                                   \
+          PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>,             \
+          uint8_t,                                                  \
+          ACC_T>& packA,                                            \
+      PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+      int32_t* C,                                                   \
+      int32_t* C_buffer,                                            \
+      uint32_t ldc,                                                 \
+      const memCopy<>& outProcess,                                  \
+      int thread_id,                                                \
+      int num_threads);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
+  INSTANTIATE_BASE(ACC_T, 2);          \
+  INSTANTIATE_BASE(ACC_T, 3);
+
+INSTANTIATE_SPATIAL_DIM(int32_t);
+INSTANTIATE_SPATIAL_DIM(int16_t);
+
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BASE
 
 template void fbgemmPacked(
-    PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    int32_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const memCopy<>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+    PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
+        packA,
+    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
     int32_t* C,
     int32_t* C_buffer,
     uint32_t ldc,
@@ -436,26 +446,6 @@ template void fbgemmPacked(
     int num_threads);
 
 template void fbgemmPacked(
-    PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    uint8_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeOutput<false>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
-    PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    uint8_t* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeOutput<false>& outProcess,
-    int thread_id,
-    int num_threads);
-
-template void fbgemmPacked(
     PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
     PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
     int32_t* C,
@@ -465,14 +455,4 @@ template void fbgemmPacked(
     int thread_id,
     int num_threads);
 
-template void fbgemmPacked(
-    PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
-    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
-    float* C,
-    int32_t* C_buffer,
-    uint32_t ldc,
-    const ReQuantizeForFloat<false>& outProcess,
-    int thread_id,
-    int num_threads);
-
 } // namespace fbgemm
diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc
index 322287b..50f619a 100644
--- a/src/QuantUtils.cc
+++ b/src/QuantUtils.cc
@@ -436,13 +436,14 @@ void RequantizeAvx2(
   DoNothing<> doNothingObj{};
   ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj(
     doNothingObj,
-    params.real_multiplier,
+    &params.real_multiplier,
     params.target_qparams.zero_point,
     0,
     0,
     nullptr,
     nullptr,
-    nullptr);
+    nullptr,
+    len);
   requantizeObj.f<inst_set_t::avx2>(dst, src, {0, 1, 0, len}, 0, 0);
 }
 #endif
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index 097e3b5..369aea3 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -57,24 +57,25 @@ void requantize_u8acc32_ref(
     int ld,
     const int32_t* inp,
     uint8_t* out,
-    float C_multiplier,
+    const float* C_multiplier,
     int32_t C_zero_point,
     int32_t A_zero_point,
-    int32_t B_zero_point,
+    const int32_t* B_zero_point,
     const int32_t* row_offsets,
     const int32_t* col_offsets,
     const int32_t* bias,
+    int ncols_per_quant_group,
     bool fuse_relu) {
   for (int i = 0; i < M; ++i) {
     for (int j = 0; j < N; ++j) {
       int32_t raw = inp[i * ld + j];
       raw -= A_zero_point * col_offsets[j];
-      raw -= B_zero_point * row_offsets[i];
+      raw -= B_zero_point[j / ncols_per_quant_group] * row_offsets[i];
       if (bias) {
         raw += bias[j];
       }
 
-      float result = raw * C_multiplier;
+      float result = raw * C_multiplier[j / ncols_per_quant_group];
       long rounded = lrintf(result) + C_zero_point;
       out[i * ld + j] = std::max(
           fuse_relu ? static_cast<long>(C_zero_point) : 0l,
@@ -180,14 +181,15 @@ void col_offsets_with_zero_pt_s8acc32_ref(
     int N,
     int ld,
     const int8_t* Bint8,
-    int32_t B_zero_point,
-    int32_t* col_offsets) {
+    const int32_t* B_zero_point,
+    int32_t* col_offsets,
+    int ncols_per_quant_group) {
   for (int j = 0; j < N; ++j) {
     int32_t sum = 0;
     for (int k = 0; k < K; ++k) {
       sum += Bint8[k * ld + j];
     }
-    col_offsets[j] = sum - B_zero_point * K;
+    col_offsets[j] = sum - B_zero_point[j / ncols_per_quant_group] * K;
   }
 }
 
@@ -578,13 +580,14 @@ void depthwise_3x3_pad_1_ref(
           1,
           C_int32.data() + i * K + k,
           C + i * K + k,
-          C_multiplier,
+          &C_multiplier,
           C_zero_point,
           A_zero_point,
-          B_zero_point,
+          &B_zero_point,
           &row_offsets[i * K + k],
           col_offsets + k,
-          bias ? bias + k : nullptr);
+          bias ? bias + k : nullptr,
+          1);
     }
   }
 };
@@ -644,13 +647,14 @@ void depthwise_3x3_per_channel_quantization_pad_1_ref(
           1,
           C_int32.data() + i * K + k,
           C + i * K + k,
-          C_multiplier[k],
+          &C_multiplier[k],
           C_zero_point,
           A_zero_point,
-          B_zero_point[k],
+          &B_zero_point[k],
           &row_offsets[i * K + k],
           col_offsets + k,
-          bias ? bias + k : nullptr);
+          bias ? bias + k : nullptr,
+          1);
     }
   }
 };
@@ -781,13 +785,14 @@ void depthwise_3x3x3_pad_1_ref(
           1,
           C_int32.data() + i * K + k,
           C + i * K + k,
-          C_multiplier,
+          &C_multiplier,
           C_zero_point,
           A_zero_point,
-          B_zero_point,
+          &B_zero_point,
           &row_offsets[i * K + k],
           col_offsets + k,
-          bias ? bias + k : nullptr);
+          bias ? bias + k : nullptr,
+          1);
     }
   }
 };
diff --git a/src/RefImplementations.h b/src/RefImplementations.h
index cec4bff..6530eff 100644
--- a/src/RefImplementations.h
+++ b/src/RefImplementations.h
@@ -39,6 +39,11 @@ void requantize_u8acc32_ref(
  * @brief Reference implementation of requantization step.
  * float multiplier
  * @params bias can be nullptr
+ * @params ncols_per_quant_group the number of columns share the same
+ *         quantization parameter.
+ *         ncols_per_quant_group == N : per-tensor quantization
+ *         ncols_per_quant_group == N / groups : per-group quantization
+ *         ncols_per_quant_group == 1 : per-channel quantization
  */
 void requantize_u8acc32_ref(
     int M,
@@ -46,13 +51,14 @@ void requantize_u8acc32_ref(
     int ld,
     const std::int32_t* inp,
     std::uint8_t* out,
-    float C_multiplier,
+    const float* C_multiplier,
     std::int32_t C_zero_point,
     std::int32_t A_zero_point,
-    std::int32_t B_zero_point,
+    const std::int32_t* B_zero_point,
     const std::int32_t* row_offsets,
     const std::int32_t* col_offsets,
     const std::int32_t* bias,
+    int ncols_per_quant_group,
     bool fuse_relu = false);
 
 /**
@@ -114,14 +120,18 @@ void row_offsets_u8acc32_ref(
 /**
  * @brief Reference implementation to compute adjusted col_offsets (sum of
  * columns of B and adjusted with B_zero_point)
+ *
+ * @params ncols_per_quant_group see ncols_per_quant_group in
+ *         requantize_u8acc32_ref
  */
 void col_offsets_with_zero_pt_s8acc32_ref(
     int K,
     int N,
     int ld,
     const std::int8_t* Bint8,
-    std::int32_t B_zero_point,
-    std::int32_t* col_offsets);
+    const std::int32_t* B_zero_point,
+    std::int32_t* col_offsets,
+    int ncols_per_quant_group);
 
 /**
  * @brief Reference implementation of SPMDM (sparse matrix times dense matrix).
diff --git a/test/FP16Test.cc b/test/FP16Test.cc
index b5e4f13..0edcc4b 100644
--- a/test/FP16Test.cc
+++ b/test/FP16Test.cc
@@ -73,19 +73,17 @@ TEST_P(FBGemmFP16Test, Test) {
     }
     cerr << endl;
 
-    aligned_vector<float> A(m * k, 0.f);
-    aligned_vector<float> B(k * n, 0.f);
+    // initialize with small numbers
+    aligned_vector<int> Aint(m * k);
+    aligned_vector<int> Bint(k * n);
+    randFill(Aint, 0, 4);
+    randFill(Bint, 0, 4);
+    aligned_vector<float> A(Aint.begin(), Aint.end());
+    aligned_vector<float> B(Bint.begin(), Bint.end());
+
     aligned_vector<float> C(m * n, NAN);
 
-    // initialize with small numbers
-    randFill(A, 0, 4);
-    randFill(B, 0, 4);
-    randFill(C, 0, 4);
-
-    aligned_vector<float> A_ref, B_ref, C_ref;
-    A_ref = A;
-    B_ref = B;
-    C_ref = C;
+    aligned_vector<float> A_ref(A), B_ref(B), C_ref(C);
 
     if (atrans == matrix_op_t::Transpose) {
       transpose_matrix(A_ref.data(), k, m);
diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc
index 9a19f0f..f482783 100644
--- a/test/I8DepthwiseTest.cc
+++ b/test/I8DepthwiseTest.cc
@@ -85,10 +85,10 @@ TEST(FBGemmDepthWiseTest, Test3x3) {
     aligned_vector<int8_t> B(K * R * S);
     aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * K), C(C_ref.size());
 
-    randFill(A, 0, 86);
+    randFill<uint8_t>(A, 0, 86);
     int32_t A_zero_point = 43;
 
-    randFill(B, -16, 16);
+    randFill<int8_t>(B, -16, 16);
     int32_t B_zero_point = 5;
 
     depthwise_3x3_pad_1_ref(
@@ -211,10 +211,10 @@ TEST(FBGemmDepthWiseTest, Test3x3x3) {
     aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
         C(C_ref.size());
 
-    randFill(A, 0, 86);
+    randFill<uint8_t>(A, 0, 86);
     int32_t A_zero_point = 43;
 
-    randFill(B, -16, 16);
+    randFill<int8_t>(B, -16, 16);
     int32_t B_zero_point = 5;
 
     depthwise_3x3x3_pad_1_ref(
@@ -360,7 +360,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) {
     int32_t C_num_rows = N * H_OUT * W_OUT;
     aligned_vector<int32_t> C_ref(C_num_rows * K), C(C_ref.size());
 
-    randFill(A, 0, 86);
+    randFill<uint8_t>(A, 0, 86);
     int32_t A_zero_point = 43;
 
     // Each row of G has a different range to really test per-channel
@@ -368,7 +368,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) {
     vector<int32_t> B_zero_point(K);
     for (auto k = 0; k < K; ++k) {
       aligned_vector<int8_t> Bk(R * S);
-      randFill(Bk, -16 + k, 16 + k);
+      randFill<int8_t>(Bk, -16 + k, 16 + k);
       copy(Bk.begin(), Bk.end(), B.begin() + k * R * S);
 
       B_zero_point[k] = 5 + k;
diff --git a/test/I8SpmdmTest.cc b/test/I8SpmdmTest.cc
index 5bb7703..2090b63 100644
--- a/test/I8SpmdmTest.cc
+++ b/test/I8SpmdmTest.cc
@@ -66,7 +66,7 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) {
     }
 
     aligned_vector<uint8_t> A(M * K);
-    randFill(A, 0, 255);
+    randFill<uint8_t>(A, 0, 255);
 
     CompressedSparseColumn B_csc(K_adjusted, N_adjusted);
     vector<int32_t> C(M * N);
@@ -127,13 +127,8 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) {
 #pragma omp parallel
 #endif
     {
-#ifdef _OPENMP
-      int num_threads = omp_get_num_threads();
-      int tid = omp_get_thread_num();
-#else
-      int num_threads = 1;
-      int tid = 0;
-#endif
+      int num_threads = fbgemm_get_num_threads();
+      int tid = fbgemm_get_thread_num();
       int i_per_thread = (M + num_threads - 1) / num_threads;
       int i_begin = std::min(tid * i_per_thread, M);
       int i_end = std::min(i_begin + i_per_thread, M);
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
index c7de1a8..d8f3f7a 100644
--- a/test/Im2ColFusedRequantizeTest.cc
+++ b/test/Im2ColFusedRequantizeTest.cc
@@ -20,8 +20,22 @@
 #include "src/RefImplementations.h"
 
 using namespace std;
+using namespace fbgemm;
 
-namespace fbgemm {
+vector<QuantizationGranularity> qGranularityVals{
+    QuantizationGranularity::TENSOR,
+    QuantizationGranularity::GROUP,
+    QuantizationGranularity::OUT_CHANNEL};
+
+namespace {
+class fbgemmIm2colTest
+    : public testing::TestWithParam<QuantizationGranularity> {};
+}; // namespace
+
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    fbgemmIm2colTest,
+    ::testing::ValuesIn(qGranularityVals));
 
 // From Faster-RCNN with ShuffleNet
 static vector<conv_param_t<>> shapes = {
@@ -71,7 +85,7 @@ static vector<conv_param_t<>> shapes = {
     conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}),
 };
 
-template <typename ACC_T>
+template <typename ACC_T, QuantizationGranularity Q_GRAN>
 static void Im2colTest() {
   for (auto conv_p : shapes) {
     for (int groups : {1, 4}) {
@@ -80,29 +94,38 @@ static void Im2colTest() {
       }
       conv_p.G = groups;
       aligned_vector<uint8_t> Aint8(
-          conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+          conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
       aligned_vector<int8_t> Bint8(
-          conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+          conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
       aligned_vector<int32_t> Cint32_ref(
-          conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
-      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
-      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
-
-      int32_t Aint8_zero_point, Bint8_zero_point;
+          conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+
+      int ncols_per_quant_group = conv_p.OC;
+      if (Q_GRAN == QuantizationGranularity::GROUP) {
+        ncols_per_quant_group = conv_p.OC / conv_p.G;
+      } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        ncols_per_quant_group = 1;
+      }
+      int32_t Aint8_zero_point;
+      aligned_vector<int32_t> Bint8_zero_point(
+          conv_p.OC / ncols_per_quant_group);
       if (is_same<ACC_T, int32_t>::value) {
-        randFill(Aint8, 0, 80);
+        randFill<uint8_t>(Aint8, 0, 80);
         Aint8_zero_point = 43;
-        randFill(Bint8, -16, 16);
-        Bint8_zero_point = -30;
+        randFill<int8_t>(Bint8, -16, 16);
+        randFill(Bint8_zero_point, -50, -10);
       } else {
-        randFill(Aint8, 0, 5);
+        randFill<uint8_t>(Aint8, 0, 5);
         Aint8_zero_point = 4;
-        randFill(Bint8, -4, 4);
-        Bint8_zero_point = -2;
+        randFill<int8_t>(Bint8, -4, 4);
+        randFill(Bint8_zero_point, -3, -1);
       }
 
-      float C_multiplier = 0.1234;
+      aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+      randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
       int32_t C_zero_pt = 5;
 
       int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
@@ -116,16 +139,16 @@ static void Im2colTest() {
       im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
 
       // computing column offset
-      vector<int32_t> col_offsets;
-      col_offsets.resize(groups * NDim);
+      vector<int32_t> col_offsets(groups * NDim);
       for (int g = 0; g < groups; ++g) {
         col_offsets_with_zero_pt_s8acc32_ref(
             KDimPerGroup,
             NDim,
             NDim,
             Bint8.data() + g * KDimPerGroup * NDim,
-            Bint8_zero_point,
-            col_offsets.data() + g * NDim);
+            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
+            col_offsets.data() + g * NDim,
+            ncols_per_quant_group);
       }
 
       conv_ref(
@@ -149,13 +172,14 @@ static void Im2colTest() {
             conv_p.G * NDim,
             Cint32_ref.data() + g * NDim,
             Cint8_ref.data() + g * NDim,
-            C_multiplier,
+            C_multiplier.data() + g * NDim / ncols_per_quant_group,
             C_zero_pt,
             Aint8_zero_point,
-            Bint8_zero_point,
+            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
             row_offsets.data(),
             col_offsets.data() + g * NDim,
-            nullptr);
+            nullptr,
+            ncols_per_quant_group);
       }
 
       PackBMatrix<int8_t, ACC_T> packedB(
@@ -171,8 +195,7 @@ static void Im2colTest() {
 #pragma omp parallel
 #endif
       {
-        vector<int32_t> row_offset_buf;
-        row_offset_buf.resize(
+        vector<int32_t> row_offset_buf(
             PackAWithIm2Col<uint8_t, ACC_T>::rowOffsetBufferSize());
 
         PackAWithIm2Col<uint8_t, ACC_T> packA(
@@ -183,23 +206,20 @@ static void Im2colTest() {
             row_offset_buf.data());
 
         DoNothing<> doNothingObj{};
-        ReQuantizeOutput<false> outputProcObj(
+        ReQuantizeOutput<false, Q_GRAN> outputProcObj(
             doNothingObj,
-            C_multiplier,
+            C_multiplier.data(),
             C_zero_pt,
             Aint8_zero_point,
-            Bint8_zero_point,
+            Bint8_zero_point.data(),
             packA.getRowOffsetBuffer(),
             col_offsets.data(),
-            nullptr);
+            nullptr,
+            conv_p.G * NDim,
+            conv_p.G);
 
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
 
         fbgemmPacked(
             packA,
@@ -236,12 +256,26 @@ static void Im2colTest() {
   } // for each shape
 }
 
-TEST(FBGemmIm2colTest, Acc32Test) {
-  Im2colTest<int32_t>();
+TEST_P(fbgemmIm2colTest, Acc32Test) {
+  QuantizationGranularity q_granularity = GetParam();
+  if (q_granularity == QuantizationGranularity::TENSOR) {
+    Im2colTest<int32_t, QuantizationGranularity::TENSOR>();
+  } else if (q_granularity == QuantizationGranularity::GROUP) {
+    Im2colTest<int32_t, QuantizationGranularity::GROUP>();
+  } else {
+    Im2colTest<int32_t, QuantizationGranularity::OUT_CHANNEL>();
+  }
 }
 
-TEST(FBGemmIm2colTest, Acc16Test) {
-  Im2colTest<int16_t>();
+TEST_P(fbgemmIm2colTest, Acc16Test) {
+  QuantizationGranularity q_granularity = GetParam();
+  if (q_granularity == QuantizationGranularity::TENSOR) {
+    Im2colTest<int16_t, QuantizationGranularity::TENSOR>();
+  } else if (q_granularity == QuantizationGranularity::GROUP) {
+    Im2colTest<int16_t, QuantizationGranularity::GROUP>();
+  } else {
+    Im2colTest<int16_t, QuantizationGranularity::OUT_CHANNEL>();
+  }
 }
 
 static vector<conv_param_t<3>> shapes_3d = {
@@ -319,7 +353,7 @@ static vector<conv_param_t<3>> shapes_3d = {
         3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, 0, 0}),
 };
 
-template <typename ACC_T>
+template <typename ACC_T, QuantizationGranularity Q_GRAN>
 static void Im2col3DTest() {
   for (auto conv_p : shapes_3d) {
     for (int groups : {1, 4}) {
@@ -329,32 +363,39 @@ static void Im2col3DTest() {
       conv_p.G = groups;
       aligned_vector<uint8_t> Aint8(
           conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IN_DIM[2] *
-              conv_p.IC,
-          0);
+          conv_p.IC);
       aligned_vector<int8_t> Bint8(
-          conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC, 0);
+          conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC);
       aligned_vector<int32_t> Cint32_ref(
           conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] *
-              conv_p.OUT_DIM[2] * conv_p.OC,
-          0);
-      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
-      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
-      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
-
-      int32_t Aint8_zero_point, Bint8_zero_point;
+          conv_p.OUT_DIM[2] * conv_p.OC);
+      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+
+      int ncols_per_quant_group = conv_p.OC;
+      if (Q_GRAN == QuantizationGranularity::GROUP) {
+        ncols_per_quant_group = conv_p.OC / conv_p.G;
+      } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        ncols_per_quant_group = 1;
+      }
+      int32_t Aint8_zero_point;
+      aligned_vector<int32_t> Bint8_zero_point(
+          conv_p.OC / ncols_per_quant_group);
       if (is_same<ACC_T, int32_t>::value) {
-        randFill(Aint8, 0, 80);
+        randFill<uint8_t>(Aint8, 0, 80);
         Aint8_zero_point = 43;
-        randFill(Bint8, -16, 16);
-        Bint8_zero_point = -30;
+        randFill<int8_t>(Bint8, -16, 16);
+        randFill(Bint8_zero_point, -50, -10);
       } else {
-        randFill(Aint8, 0, 5);
+        randFill<uint8_t>(Aint8, 0, 5);
         Aint8_zero_point = 4;
-        randFill(Bint8, -4, 4);
-        Bint8_zero_point = -2;
+        randFill<int8_t>(Bint8, -4, 4);
+        randFill(Bint8_zero_point, -3, -1);
       }
 
-      float C_multiplier = 0.1234;
+      aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+      randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
       int32_t C_zero_pt = 5;
 
       int MDim =
@@ -369,16 +410,16 @@ static void Im2col3DTest() {
       im2col3d_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
 
       // computing column offset
-      vector<int32_t> col_offsets;
-      col_offsets.resize(groups * NDim);
+      vector<int32_t> col_offsets(groups * NDim);
       for (int g = 0; g < groups; ++g) {
         col_offsets_with_zero_pt_s8acc32_ref(
             KDimPerGroup,
             NDim,
             NDim,
             Bint8.data() + g * KDimPerGroup * NDim,
-            Bint8_zero_point,
-            col_offsets.data() + g * NDim);
+            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
+            col_offsets.data() + g * NDim,
+            ncols_per_quant_group);
       }
 
       conv3d_ref(
@@ -402,13 +443,14 @@ static void Im2col3DTest() {
             conv_p.G * NDim,
             Cint32_ref.data() + g * NDim,
             Cint8_ref.data() + g * NDim,
-            C_multiplier,
+            C_multiplier.data() + g * NDim / ncols_per_quant_group,
             C_zero_pt,
             Aint8_zero_point,
-            Bint8_zero_point,
+            Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
             row_offsets.data(),
             col_offsets.data() + g * NDim,
-            nullptr);
+            nullptr,
+            ncols_per_quant_group);
       }
 
       PackBMatrix<int8_t, ACC_T> packedB(
@@ -424,8 +466,7 @@ static void Im2col3DTest() {
 #pragma omp parallel
 #endif
       {
-        vector<int32_t> row_offset_buf;
-        row_offset_buf.resize(
+        vector<int32_t> row_offset_buf(
             PackAWithIm2Col<uint8_t, ACC_T, 3>::rowOffsetBufferSize());
 
         PackAWithIm2Col<uint8_t, ACC_T, 3> packA(
@@ -436,23 +477,20 @@ static void Im2col3DTest() {
             row_offset_buf.data());
 
         DoNothing<> doNothingObj{};
-        ReQuantizeOutput<false> outputProcObj(
+        ReQuantizeOutput<false, Q_GRAN> outputProcObj(
             doNothingObj,
-            C_multiplier,
+            C_multiplier.data(),
             C_zero_pt,
             Aint8_zero_point,
-            Bint8_zero_point,
+            Bint8_zero_point.data(),
             packA.getRowOffsetBuffer(),
             col_offsets.data(),
-            nullptr);
+            nullptr,
+            conv_p.G * NDim,
+            conv_p.G);
 
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
 
         fbgemmPacked(
             packA,
@@ -495,12 +533,24 @@ static void Im2col3DTest() {
   } // for each shape
 }
 
-TEST(FBGemmIm2colTest, 3DAcc32Test) {
-  Im2col3DTest<int32_t>();
+TEST_P(fbgemmIm2colTest, 3DAcc32Test) {
+  QuantizationGranularity q_granularity = GetParam();
+  if (q_granularity == QuantizationGranularity::TENSOR) {
+    Im2col3DTest<int32_t, QuantizationGranularity::TENSOR>();
+  } else if (q_granularity == QuantizationGranularity::GROUP) {
+    Im2col3DTest<int32_t, QuantizationGranularity::GROUP>();
+  } else {
+    Im2col3DTest<int32_t, QuantizationGranularity::OUT_CHANNEL>();
+  }
 }
 
-TEST(FBGemmIm2colTest, 3DAcc16Test) {
-  Im2col3DTest<int16_t>();
+TEST_P(fbgemmIm2colTest, 3DAcc16Test) {
+  QuantizationGranularity q_granularity = GetParam();
+  if (q_granularity == QuantizationGranularity::TENSOR) {
+    Im2col3DTest<int16_t, QuantizationGranularity::TENSOR>();
+  } else if (q_granularity == QuantizationGranularity::GROUP) {
+    Im2col3DTest<int16_t, QuantizationGranularity::GROUP>();
+  } else {
+    Im2col3DTest<int16_t, QuantizationGranularity::OUT_CHANNEL>();
+  }
 }
-
-} // namespace fbgemm
diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc
index cb614cd..55f6e7f 100644
--- a/test/PackedRequantizeAcc16Test.cc
+++ b/test/PackedRequantizeAcc16Test.cc
@@ -25,17 +25,34 @@
 using namespace std;
 using namespace fbgemm;
 
-std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
+vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
                                        matrix_op_t::Transpose};
 
+vector<QuantizationGranularity> qGranularityVals{
+    QuantizationGranularity::TENSOR,
+    QuantizationGranularity::GROUP,
+    QuantizationGranularity::OUT_CHANNEL};
+
 namespace {
-class fbgemmu8s8acc16test : public testing::TestWithParam<
-                                std::tuple<matrix_op_t, matrix_op_t, bool>> {};
+class fbgemmu8s8acc16WithQuantGranularityTest
+    : public testing::TestWithParam<
+          tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
+class fbgemmu8s8acc16Test
+    : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {};
 }; // namespace
 
 INSTANTIATE_TEST_CASE_P(
     InstantiationName,
-    fbgemmu8s8acc16test,
+    fbgemmu8s8acc16WithQuantGranularityTest,
+    ::testing::Combine(
+        ::testing::Values(matrix_op_t::NoTranspose),
+        ::testing::ValuesIn(transposeVals),
+        ::testing::Bool(),
+        ::testing::ValuesIn(qGranularityVals)));
+
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    fbgemmu8s8acc16Test,
     ::testing::Combine(
         ::testing::Values(matrix_op_t::NoTranspose),
         ::testing::ValuesIn(transposeVals),
@@ -77,11 +94,12 @@ static vector<vector<int>> GetShapes_() {
  * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
  * accumulation. Output processing: requantization -> nothing
  */
-TEST_P(fbgemmu8s8acc16test, Test) {
+TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, Test) {
   vector<vector<int>> shapes(GetShapes_());
   matrix_op_t atrans, btrans;
   bool test_ld;
-  tie(atrans, btrans, test_ld) = GetParam();
+  QuantizationGranularity q_granularity;
+  tie(atrans, btrans, test_ld, q_granularity) = GetParam();
 
   for (auto shape : shapes) {
     for (int groups : {1, 3, 4}) {
@@ -93,22 +111,21 @@ TEST_P(fbgemmu8s8acc16test, Test) {
       }
       int k_per_group = k / groups;
 
-      aligned_vector<uint8_t> Aint8(m * k, 0);
+      aligned_vector<uint8_t> Aint8(m * k);
 
-      aligned_vector<int8_t> Bint8(k * n, 0);
-      aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+      aligned_vector<int8_t> Bint8_ref(k * n);
 
-      aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
-      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
-      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
-      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
-      aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+      aligned_vector<int32_t> Cint32_ref(m * n * groups);
+      aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+      aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+      aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
 
-      randFill(Aint8, 0, 255);
+      randFill<uint8_t>(Aint8, 0, 255);
       int32_t Aint8_zero_point = 43;
 
-      randFill(Bint8_ref, -128, 127);
-      Bint8 = Bint8_ref;
+      randFill<int8_t>(Bint8_ref, -128, 127);
+      aligned_vector<int8_t> Bint8(Bint8_ref);
 
       if (btrans == matrix_op_t::Transpose) {
         aligned_vector<int8_t> Bint8_temp(Bint8.size());
@@ -124,7 +141,6 @@ TEST_P(fbgemmu8s8acc16test, Test) {
         Bint8 = Bint8_temp;
       }
 
-      int32_t Bint8_zero_point = -30;
       // To test lda != k , we just reduce k by half and use the original k
       // as lda.
       int n_adjusted = n;
@@ -137,23 +153,33 @@ TEST_P(fbgemmu8s8acc16test, Test) {
         }
       }
 
+      int ncols_per_quant_group = groups * n_adjusted;
+      if (q_granularity == QuantizationGranularity::GROUP) {
+        ncols_per_quant_group = n_adjusted;
+      } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+        ncols_per_quant_group = 1;
+      }
+      aligned_vector<int32_t> Bint8_zero_point(
+          groups * n_adjusted / ncols_per_quant_group);
+      randFill(Bint8_zero_point, -60, 0);
+
       // computing column offset
-      vector<int32_t> col_offsets;
-      col_offsets.resize(groups * n_adjusted);
+      vector<int32_t> col_offsets(groups * n_adjusted);
       for (int g = 0; g < groups; ++g) {
         col_offsets_with_zero_pt_s8acc32_ref(
             k_per_group,
             n_adjusted,
             n,
             Bint8_ref.data() + g * k_per_group * n,
-            Bint8_zero_point,
-            col_offsets.data() + g * n_adjusted);
+            Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+            col_offsets.data() + g * n_adjusted,
+            ncols_per_quant_group);
       }
 
-      vector<int32_t> row_offsets;
-      row_offsets.resize(m);
+      vector<int32_t> row_offsets(m);
 
-      float C_multiplier = 0.1234;
+      aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+      randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
       int32_t C_zero_pt = 5;
 
       int brow = 256;
@@ -183,13 +209,14 @@ TEST_P(fbgemmu8s8acc16test, Test) {
             groups * n,
             Cint32_ref.data() + g * n_adjusted,
             Cint8_ref.data() + g * n_adjusted,
-            C_multiplier,
+            C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
             C_zero_pt,
             Aint8_zero_point,
-            Bint8_zero_point,
+            Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
             row_offsets.data(),
             col_offsets.data() + g * n_adjusted,
-            nullptr);
+            nullptr,
+            ncols_per_quant_group);
       }
 
       PackBMatrix<int8_t, int16_t> packedBN(
@@ -205,8 +232,7 @@ TEST_P(fbgemmu8s8acc16test, Test) {
 #pragma omp parallel
 #endif
       {
-        vector<int32_t> row_offset_buf;
-        row_offset_buf.resize(
+        vector<int32_t> row_offset_buf(
             PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
 
         PackAWithRowOffset<uint8_t, int16_t> packAN(
@@ -219,34 +245,79 @@ TEST_P(fbgemmu8s8acc16test, Test) {
             groups,
             row_offset_buf.data());
 
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
+
         DoNothing<> doNothingObj{};
-        ReQuantizeOutput<false> outputProcObj(
-            doNothingObj,
-            C_multiplier,
-            C_zero_pt,
-            Aint8_zero_point,
-            Bint8_zero_point,
-            packAN.getRowOffsetBuffer(),
-            col_offsets.data(),
-            nullptr);
 
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        if (q_granularity == QuantizationGranularity::TENSOR) {
+          ReQuantizeOutput<false> outputProcObj(
+              doNothingObj,
+              C_multiplier.data(),
+              C_zero_pt,
+              Aint8_zero_point,
+              Bint8_zero_point.data(),
+              packAN.getRowOffsetBuffer(),
+              col_offsets.data(),
+              nullptr,
+              groups * n_adjusted,
+              groups);
 
-        fbgemmPacked(
-            packAN,
-            packedBN,
-            Cint8_fb.data(),
-            Cint32_buffer.data(),
-            groups * n,
-            outputProcObj,
-            tid,
-            num_threads);
+          fbgemmPacked(
+              packAN,
+              packedBN,
+              Cint8_fb.data(),
+              Cint32_buffer.data(),
+              groups * n,
+              outputProcObj,
+              tid,
+              num_threads);
+        } else if (q_granularity == QuantizationGranularity::GROUP) {
+          ReQuantizeOutput<false, QuantizationGranularity::GROUP> outputProcObj(
+              doNothingObj,
+              C_multiplier.data(),
+              C_zero_pt,
+              Aint8_zero_point,
+              Bint8_zero_point.data(),
+              packAN.getRowOffsetBuffer(),
+              col_offsets.data(),
+              nullptr,
+              groups * n_adjusted,
+              groups);
+
+          fbgemmPacked(
+              packAN,
+              packedBN,
+              Cint8_fb.data(),
+              Cint32_buffer.data(),
+              groups * n,
+              outputProcObj,
+              tid,
+              num_threads);
+        } else {
+          ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
+              outputProcObj(
+                  doNothingObj,
+                  C_multiplier.data(),
+                  C_zero_pt,
+                  Aint8_zero_point,
+                  Bint8_zero_point.data(),
+                  packAN.getRowOffsetBuffer(),
+                  col_offsets.data(),
+                  nullptr,
+                  groups * n_adjusted,
+                  groups);
+
+          fbgemmPacked(
+              packAN,
+              packedBN,
+              Cint8_fb.data(),
+              Cint32_buffer.data(),
+              groups * n,
+              outputProcObj,
+              tid,
+              num_threads);
+        }
       } // omp parallel
 
       compare_validate_buffers(
@@ -264,11 +335,12 @@ TEST_P(fbgemmu8s8acc16test, Test) {
  * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
  * accumulation. Output processing: spmdm -> requantization -> nothing
  */
-TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
+TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, SpMDMTest) {
   vector<vector<int>> shapes(GetShapes_());
   matrix_op_t atrans, btrans;
   bool test_ld;
-  tie(atrans, btrans, test_ld) = GetParam();
+  QuantizationGranularity q_granularity;
+  tie(atrans, btrans, test_ld, q_granularity) = GetParam();
 
   for (auto shape : shapes) {
     for (int groups : {1, 3, 4}) {
@@ -283,21 +355,19 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
         }
         int k_per_group = k / groups;
 
-        aligned_vector<uint8_t> Aint8(m * k, 0);
+        aligned_vector<uint8_t> Aint8(m * k);
+        aligned_vector<int8_t> Bint8(k * n);
 
-        aligned_vector<int8_t> Bint8(k * n, 0);
-        aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+        aligned_vector<int32_t> Cint32_ref(m * n * groups);
+        aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+        aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+        aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+        aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
 
-        aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
-        aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
-        aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
-        aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
-        aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
-
-        randFill(Aint8, 0, 255);
+        randFill<uint8_t>(Aint8, 0, 255);
         int32_t Aint8_zero_point = 43;
 
-        randFill(Bint8, -128, 127);
+        randFill<int8_t>(Bint8, -128, 127);
 
         // To test lda != k , we just reduce k by half and use the original k
         // as lda.
@@ -311,18 +381,27 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
           }
         }
 
-        int32_t Bint8_zero_point = -30;
+        int ncols_per_quant_group = groups * n_adjusted;
+        if (q_granularity == QuantizationGranularity::GROUP) {
+          ncols_per_quant_group = n_adjusted;
+        } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+          ncols_per_quant_group = 1;
+        }
+        aligned_vector<int32_t> Bint8_zero_point(
+            groups * n_adjusted / ncols_per_quant_group);
+        randFill(Bint8_zero_point, -50, -10);
+
         // computing column offset
-        vector<int32_t> col_offsets;
-        col_offsets.resize(groups * n_adjusted);
+        vector<int32_t> col_offsets(groups * n_adjusted);
         for (int g = 0; g < groups; ++g) {
           col_offsets_with_zero_pt_s8acc32_ref(
               k_per_group,
               n_adjusted,
               n,
-              Bint8_ref.data() + g * k_per_group * n,
-              Bint8_zero_point,
-              col_offsets.data() + g * n_adjusted);
+              Bint8.data() + g * k_per_group * n,
+              Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+              col_offsets.data() + g * n_adjusted,
+              ncols_per_quant_group);
         }
 
         CompressedSparseColumn B_csc(k_per_group, groups * n_adjusted);
@@ -366,7 +445,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
         }
         B_csc.ColPtr()[groups * n_adjusted] = total_nnz;
 
-        Bint8_ref = Bint8;
+        aligned_vector<int8_t> Bint8_ref(Bint8);
 
         if (btrans == matrix_op_t::Transpose) {
           aligned_vector<int8_t> Bint8_temp(Bint8.size());
@@ -382,10 +461,10 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
           Bint8 = Bint8_temp;
         }
 
-        vector<int32_t> row_offsets;
-        row_offsets.resize(m);
+        vector<int32_t> row_offsets(m);
 
-        float C_multiplier = 0.1234;
+        aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+        randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
         int32_t C_zero_pt = 5;
 
         int brow = 256;
@@ -428,13 +507,14 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
               groups * n,
               Cint32_ref.data() + g * n_adjusted,
               Cint8_ref.data() + g * n_adjusted,
-              C_multiplier,
+              C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
               C_zero_pt,
               Aint8_zero_point,
-              Bint8_zero_point,
+              Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
               row_offsets.data(),
               col_offsets.data() + g * n_adjusted,
-              nullptr);
+              nullptr,
+              ncols_per_quant_group);
         }
 
         PackBMatrix<int8_t, int16_t> packedB(
@@ -450,8 +530,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
 #pragma omp parallel
 #endif
         {
-          vector<int32_t> row_offset_buf;
-          row_offset_buf.resize(
+          vector<int32_t> row_offset_buf(
               PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
 
           PackAWithRowOffset<uint8_t, int16_t> packAN(
@@ -464,51 +543,106 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
               groups,
               row_offset_buf.data());
 
+          int num_threads = fbgemm_get_num_threads();
+          int tid = fbgemm_get_thread_num();
+
           // spmdm -> requantization -> nothing
           // construct an output processing pipeline in reverse order
           // i.e. last output operation first
           // Last operation should always be DoNothing with
           // correct input and output type.
           DoNothing<> doNothingObj{};
-          // The second last operation is requantization back
-          // to int8
-          ReQuantizeOutput<false> reqObj(
-              doNothingObj,
-              C_multiplier,
-              C_zero_pt,
-              Aint8_zero_point,
-              Bint8_zero_point,
-              packAN.getRowOffsetBuffer(),
-              col_offsets.data(),
-              nullptr);
-          // the top most (first) operation in the output processing
-          // pipeline is spmdm
-          // outType = final output type after fullly processing through
-          // pipeline inType = initial input type at the first call to the whole
-          // pipeline
-          DoSpmdmOnInpBuffer<
-              ReQuantizeOutput<false>::outType,
-              int32_t,
-              ReQuantizeOutput<false>>
-              spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
-
-#ifdef _OPENMP
-          int num_threads = omp_get_num_threads();
-          int tid = omp_get_thread_num();
-#else
-          int num_threads = 1;
-          int tid = 0;
-#endif
 
-          fbgemmPacked(
-              packAN,
-              packedB,
-              Cint8_fb.data(),
-              Cint32_fb.data(),
-              groups * n,
-              spmdmObj,
-              tid,
-              num_threads);
+          if (q_granularity == QuantizationGranularity::TENSOR) {
+            // The second last operation is requantization back
+            // to int8
+            ReQuantizeOutput<false> reqObj(
+                doNothingObj,
+                C_multiplier.data(),
+                C_zero_pt,
+                Aint8_zero_point,
+                Bint8_zero_point.data(),
+                packAN.getRowOffsetBuffer(),
+                col_offsets.data(),
+                nullptr,
+                groups * n_adjusted,
+                groups);
+            // the top most (first) operation in the output processing
+            // pipeline is spmdm
+            // outType = final output type after fullly processing through
+            // pipeline inType = initial input type at the first call to the
+            // whole pipeline
+            DoSpmdmOnInpBuffer<
+                ReQuantizeOutput<false>::outType,
+                int32_t,
+                ReQuantizeOutput<false>>
+                spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+            fbgemmPacked(
+                packAN,
+                packedB,
+                Cint8_fb.data(),
+                Cint32_fb.data(),
+                groups * n,
+                spmdmObj,
+                tid,
+                num_threads);
+          } else if (q_granularity == QuantizationGranularity::GROUP) {
+            ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj(
+                doNothingObj,
+                C_multiplier.data(),
+                C_zero_pt,
+                Aint8_zero_point,
+                Bint8_zero_point.data(),
+                packAN.getRowOffsetBuffer(),
+                col_offsets.data(),
+                nullptr,
+                groups * n_adjusted,
+                groups);
+            DoSpmdmOnInpBuffer<
+                ReQuantizeOutput<false>::outType,
+                int32_t,
+                ReQuantizeOutput<false, QuantizationGranularity::GROUP>>
+                spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+            fbgemmPacked(
+                packAN,
+                packedB,
+                Cint8_fb.data(),
+                Cint32_fb.data(),
+                groups * n,
+                spmdmObj,
+                tid,
+                num_threads);
+          } else {
+            ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
+                reqObj(
+                    doNothingObj,
+                    C_multiplier.data(),
+                    C_zero_pt,
+                    Aint8_zero_point,
+                    Bint8_zero_point.data(),
+                    packAN.getRowOffsetBuffer(),
+                    col_offsets.data(),
+                    nullptr,
+                    groups * n_adjusted,
+                    groups);
+            DoSpmdmOnInpBuffer<
+                ReQuantizeOutput<false>::outType,
+                int32_t,
+                ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>>
+                spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+            fbgemmPacked(
+                packAN,
+                packedB,
+                Cint8_fb.data(),
+                Cint32_fb.data(),
+                groups * n,
+                spmdmObj,
+                tid,
+                num_threads);
+          }
         }
 
         compare_validate_buffers(
@@ -527,7 +661,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
  * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
  * accumulation. Output processing: nothing
  */
-TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
+TEST_P(fbgemmu8s8acc16Test, NoRequantizeTest) {
   vector<vector<int>> shapes(GetShapes_());
   matrix_op_t atrans, btrans;
   bool test_ld;
@@ -543,20 +677,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
       }
       int k_per_group = k / groups;
 
-      aligned_vector<uint8_t> Aint8(m * k, 0);
+      aligned_vector<uint8_t> Aint8(m * k);
 
-      aligned_vector<int8_t> Bint8(k * n, 0);
-      aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+      aligned_vector<int8_t> Bint8_ref(k * n);
 
-      aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
-      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
-      aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+      aligned_vector<int32_t> Cint32_ref(m * n * groups);
+      aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+      aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
 
-      randFill(Aint8, 0, 255);
+      randFill<uint8_t>(Aint8, 0, 255);
       int32_t Aint8_zero_point = 43;
 
-      randFill(Bint8_ref, -128, 127);
-      Bint8 = Bint8_ref;
+      randFill<int8_t>(Bint8_ref, -128, 127);
+      aligned_vector<int8_t> Bint8(Bint8_ref);
 
       if (btrans == matrix_op_t::Transpose) {
         aligned_vector<int8_t> Bint8_temp(Bint8.size());
@@ -586,20 +719,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
       }
 
       // computing column offset
-      vector<int32_t> col_offsets;
-      col_offsets.resize(groups * n_adjusted);
+      vector<int32_t> col_offsets(groups * n_adjusted);
       for (int g = 0; g < groups; ++g) {
         col_offsets_with_zero_pt_s8acc32_ref(
             k_per_group,
             n_adjusted,
             n,
             Bint8_ref.data() + g * k_per_group * n,
-            Bint8_zero_point,
-            col_offsets.data() + g * n_adjusted);
+            &Bint8_zero_point,
+            col_offsets.data() + g * n_adjusted,
+            n_adjusted);
       }
 
-      vector<int32_t> row_offsets;
-      row_offsets.resize(m);
+      vector<int32_t> row_offsets(m);
 
       int brow = 256;
       for (int g = 0; g < groups; ++g) {
@@ -636,8 +768,7 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
 #pragma omp parallel
 #endif
       {
-        vector<int32_t> row_offset_buf;
-        row_offset_buf.resize(
+        vector<int32_t> row_offset_buf(
             PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
 
         PackAWithRowOffset<uint8_t, int16_t> packAN(
@@ -654,13 +785,8 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
         DoNothing<int32_t, int32_t> doNothingObj{};
         memCopy<> outputProcObj(doNothingObj);
 
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
 
         fbgemmPacked(
             packAN,
diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc
index a5744c0..9873e3f 100644
--- a/test/PackedRequantizeTest.cc
+++ b/test/PackedRequantizeTest.cc
@@ -25,17 +25,35 @@
 using namespace std;
 using namespace fbgemm;
 
-std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
-                                       matrix_op_t::Transpose};
+vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
+                                  matrix_op_t::Transpose};
+
+vector<QuantizationGranularity> qGranularityVals{
+    QuantizationGranularity::TENSOR,
+    QuantizationGranularity::GROUP,
+    QuantizationGranularity::OUT_CHANNEL};
 
 namespace {
-class fbgemmu8s8acc32test : public testing::TestWithParam<
-                                std::tuple<matrix_op_t, matrix_op_t, bool>> {};
+class fbgemmu8s8acc32WithQuantGranularityTest
+    : public testing::TestWithParam<
+          tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
+class fbgemmu8s8acc32Test
+    : public testing::TestWithParam<
+          tuple<matrix_op_t, matrix_op_t, bool>> {};
 }; // namespace
 
 INSTANTIATE_TEST_CASE_P(
     InstantiationName,
-    fbgemmu8s8acc32test,
+    fbgemmu8s8acc32WithQuantGranularityTest,
+    ::testing::Combine(
+        ::testing::Values(matrix_op_t::NoTranspose),
+        ::testing::ValuesIn(transposeVals),
+        ::testing::Bool(),
+        ::testing::ValuesIn(qGranularityVals)));
+
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    fbgemmu8s8acc32Test,
     ::testing::Combine(
         ::testing::Values(matrix_op_t::NoTranspose),
         ::testing::ValuesIn(transposeVals),
@@ -77,11 +95,12 @@ static vector<vector<int>> GetShapes_() {
  * @brief Unit test for uint8 matrix A, int8 matrix B, and 32-bit
  * accumulation. Output processing: requantization -> nothing
  */
-TEST_P(fbgemmu8s8acc32test, Test) {
+TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, Test) {
   vector<vector<int>> shapes(GetShapes_());
   matrix_op_t atrans, btrans;
   bool test_ld;
-  tie(atrans, btrans, test_ld) = GetParam();
+  QuantizationGranularity q_granularity;
+  tie(atrans, btrans, test_ld, q_granularity) = GetParam();
 
   for (auto shape : shapes) {
     for (int groups : {1, 3, 4}) {
@@ -95,22 +114,21 @@ TEST_P(fbgemmu8s8acc32test, Test) {
         int k_per_group = k / groups;
 
         // mxk matrix
-        aligned_vector<uint8_t> Aint8(m * k, 0);
+        aligned_vector<uint8_t> Aint8(m * k);
 
         // kxn matrix
-        aligned_vector<int8_t> Bint8(k * n, 0);
-        aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+        aligned_vector<int8_t> Bint8_ref(k * n);
 
-        aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
-        aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
-        aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
-        aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
-        aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+        aligned_vector<int32_t> Cint32_ref(m * n * groups);
+        aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+        aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+        aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+        aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
 
-        randFill(Aint8, 0, 255);
+        randFill<uint8_t>(Aint8, 0, 255);
         int32_t Aint8_zero_point = 43;
 
-        randFill(Bint8_ref, -128, 127);
+        randFill<int8_t>(Bint8_ref, -128, 127);
         for (int g = 0; g < groups; ++g) {
           avoidOverflow(
               m,
@@ -122,7 +140,7 @@ TEST_P(fbgemmu8s8acc32test, Test) {
               n);
         }
 
-        Bint8 = Bint8_ref;
+        aligned_vector<int8_t> Bint8(Bint8_ref);
 
         // initialize bias
         aligned_vector<int32_t> bias_int32(groups * n);
@@ -146,7 +164,6 @@ TEST_P(fbgemmu8s8acc32test, Test) {
           Bint8 = Bint8_temp;
         }
 
-        int32_t Bint8_zero_point = -30;
         // To test lda != k , we just reduce k by half and use the original k
         // as lda.
         int n_adjusted = n;
@@ -159,23 +176,33 @@ TEST_P(fbgemmu8s8acc32test, Test) {
           }
         }
 
+        int ncols_per_quant_group = groups * n_adjusted;
+        if (q_granularity == QuantizationGranularity::GROUP) {
+          ncols_per_quant_group = n_adjusted;
+        } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+          ncols_per_quant_group = 1;
+        }
+        aligned_vector<int32_t> Bint8_zero_point(
+            groups * n_adjusted / ncols_per_quant_group);
+        randFill(Bint8_zero_point, -50, -10);
+
         // computing column offset
-        vector<int32_t> col_offsets;
-        col_offsets.resize(groups * n_adjusted);
+        vector<int32_t> col_offsets(groups * n_adjusted);
         for (int g = 0; g < groups; ++g) {
           col_offsets_with_zero_pt_s8acc32_ref(
               k_per_group,
               n_adjusted,
               n,
               Bint8_ref.data() + g * k_per_group * n,
-              Bint8_zero_point,
-              col_offsets.data() + g * n_adjusted);
+              Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+              col_offsets.data() + g * n_adjusted,
+              ncols_per_quant_group);
         }
 
-        vector<int32_t> row_offsets;
-        row_offsets.resize(m);
+        vector<int32_t> row_offsets(m);
 
-        float C_multiplier = 0.001234;
+        aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+        randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
         int32_t C_zero_pt = 5;
 
         for (int g = 0; g < groups; ++g) {
@@ -203,13 +230,14 @@ TEST_P(fbgemmu8s8acc32test, Test) {
               groups * n,
               Cint32_ref.data() + g * n_adjusted,
               Cint8_ref.data() + g * n_adjusted,
-              C_multiplier,
+              C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
               C_zero_pt,
               Aint8_zero_point,
-              Bint8_zero_point,
+              Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
               row_offsets.data(),
               col_offsets.data() + g * n_adjusted,
-              bias ? (bias + g * n_adjusted) : nullptr);
+              bias ? (bias + g * n_adjusted) : nullptr,
+              ncols_per_quant_group);
         }
 
         PackBMatrix<int8_t> packedBN(
@@ -225,8 +253,7 @@ TEST_P(fbgemmu8s8acc32test, Test) {
 #pragma omp parallel
 #endif
         {
-          vector<int32_t> row_offset_buf;
-          row_offset_buf.resize(
+          vector<int32_t> row_offset_buf(
               PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
 
           PackAWithRowOffset<uint8_t> packAN(
@@ -239,34 +266,80 @@ TEST_P(fbgemmu8s8acc32test, Test) {
               groups,
               row_offset_buf.data());
 
-          DoNothing<> doNothingObj{};
-          ReQuantizeOutput<false> outputProcObj(
-              doNothingObj,
-              C_multiplier,
-              C_zero_pt,
-              Aint8_zero_point,
-              Bint8_zero_point,
-              packAN.getRowOffsetBuffer(),
-              col_offsets.data(),
-              bias);
+          int num_threads = fbgemm_get_num_threads();
+          int tid = fbgemm_get_thread_num();
 
-#ifdef _OPENMP
-          int num_threads = omp_get_num_threads();
-          int tid = omp_get_thread_num();
-#else
-          int num_threads = 1;
-          int tid = 0;
-#endif
+          DoNothing<> doNothingObj{};
 
-          fbgemmPacked(
-              packAN,
-              packedBN,
-              Cint8_fb.data(),
-              Cint32_buffer.data(),
-              groups * n,
-              outputProcObj,
-              tid,
-              num_threads);
+          if (q_granularity == QuantizationGranularity::TENSOR) {
+            ReQuantizeOutput<false> outputProcObj(
+                doNothingObj,
+                C_multiplier.data(),
+                C_zero_pt,
+                Aint8_zero_point,
+                Bint8_zero_point.data(),
+                packAN.getRowOffsetBuffer(),
+                col_offsets.data(),
+                bias,
+                groups * n_adjusted,
+                groups);
+
+            fbgemmPacked(
+                packAN,
+                packedBN,
+                Cint8_fb.data(),
+                Cint32_buffer.data(),
+                groups * n,
+                outputProcObj,
+                tid,
+                num_threads);
+          } else if (q_granularity == QuantizationGranularity::GROUP) {
+            ReQuantizeOutput<false, QuantizationGranularity::GROUP>
+                outputProcObj(
+                    doNothingObj,
+                    C_multiplier.data(),
+                    C_zero_pt,
+                    Aint8_zero_point,
+                    Bint8_zero_point.data(),
+                    packAN.getRowOffsetBuffer(),
+                    col_offsets.data(),
+                    bias,
+                    groups * n_adjusted,
+                    groups);
+
+            fbgemmPacked(
+                packAN,
+                packedBN,
+                Cint8_fb.data(),
+                Cint32_buffer.data(),
+                groups * n,
+                outputProcObj,
+                tid,
+                num_threads);
+          } else {
+            ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
+                outputProcObj(
+                    doNothingObj,
+                    C_multiplier.data(),
+                    C_zero_pt,
+                    Aint8_zero_point,
+                    Bint8_zero_point.data(),
+                    packAN.getRowOffsetBuffer(),
+                    col_offsets.data(),
+                    bias,
+                    groups * n_adjusted,
+                    groups);
+
+            fbgemmPacked(
+                packAN,
+                packedBN,
+                Cint8_fb.data(),
+                Cint32_buffer.data(),
+                groups * n,
+                outputProcObj,
+                tid,
+                num_threads);
+          }
         }
         // printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(),
         // m, n_adjusted, n, "C local");
@@ -287,11 +360,12 @@ TEST_P(fbgemmu8s8acc32test, Test) {
  * accumulation. Directly output fp32 matrix C. Output processing:
  * requantization -> nothing
  */
-TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
+TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, TestFloatInputOutput) {
   vector<vector<int>> shapes(GetShapes_());
   matrix_op_t atrans, btrans;
   bool test_ld;
-  tie(atrans, btrans, test_ld) = GetParam();
+  QuantizationGranularity q_granularity;
+  tie(atrans, btrans, test_ld, q_granularity) = GetParam();
 
   for (auto shape : shapes) {
     for (int groups : {1, 3, 4}) {
@@ -303,26 +377,26 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
       }
       int k_per_group = k / groups;
 
-      aligned_vector<float> Afp32(m * k, 0.0f);
-      aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
+      aligned_vector<float> Afp32(m * k);
+      aligned_vector<uint8_t> Aint8(Afp32.size());
 
-      aligned_vector<float> Bfp32(k * n, 0.0f);
-      aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+      aligned_vector<float> Bfp32(k * n);
+      aligned_vector<int8_t> Bint8(Bfp32.size());
 
-      aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
-      aligned_vector<float> Cfp32_fb(Cfp32_ref.size(), 0.0f);
+      aligned_vector<float> Cfp32_ref(m * n * groups);
+      aligned_vector<float> Cfp32_fb(Cfp32_ref.size());
 
-      aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size(), 0);
-      aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size(), 0);
+      aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size());
+      aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size());
 
-      randFill(Aint8, 0, 255);
+      randFill<uint8_t>(Aint8, 0, 255);
       int32_t Aint8_zero_point = 43;
       float Aint8_scale = 0.11;
       for (auto i = 0; i < Afp32.size(); ++i) {
         Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
       }
 
-      randFill(Bint8, -128, 127);
+      randFill<int8_t>(Bint8, -128, 127);
       for (int g = 0; g < groups; ++g) {
         avoidOverflow(
             m,
@@ -333,11 +407,6 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
             Bint8.data() + g * k_per_group * n,
             n);
       }
-      int32_t Bint8_zero_point = -30;
-      float Bint8_scale = 0.49;
-      for (auto i = 0; i < Bfp32.size(); ++i) {
-        Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
-      }
 
       // To test lda != k , we just reduce k by half and use the original k
       // as lda.
@@ -351,17 +420,37 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
         }
       }
 
+      int ncols_per_quant_group = groups * n_adjusted;
+      if (q_granularity == QuantizationGranularity::GROUP) {
+        ncols_per_quant_group = n_adjusted;
+      } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+        ncols_per_quant_group = 1;
+      }
+      aligned_vector<int32_t> Bint8_zero_point(
+          groups * n_adjusted / ncols_per_quant_group);
+      randFill(Bint8_zero_point, -50, -10);
+      aligned_vector<float> Bint8_scale(Bint8_zero_point.size());
+      randFill(Bint8_scale, 0.49f / 2, 0.49f * 3 / 2);
+      for (int i = 0; i < k; ++i) {
+        int g = i / k_per_group;
+        for (int j = 0; j < n_adjusted; ++j) {
+          int quant_group = (g * n_adjusted + j) / ncols_per_quant_group;
+          Bfp32[i * n + j] = Bint8_scale[quant_group] *
+              (Bint8[i * n + j] - Bint8_zero_point[quant_group]);
+        }
+      }
+
       // computing column offset
-      vector<int32_t> col_offsets;
-      col_offsets.resize(groups * n_adjusted);
+      vector<int32_t> col_offsets(groups * n_adjusted);
       for (int g = 0; g < groups; ++g) {
         col_offsets_with_zero_pt_s8acc32_ref(
             k_per_group,
             n_adjusted,
             n,
             Bint8.data() + g * k_per_group * n,
-            Bint8_zero_point,
-            col_offsets.data() + g * n_adjusted);
+            Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+            col_offsets.data() + g * n_adjusted,
+            ncols_per_quant_group);
       }
 
       if (btrans == matrix_op_t::Transpose) {
@@ -404,8 +493,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
 #pragma omp parallel
 #endif
       {
-        vector<int32_t> row_offset_buf;
-        row_offset_buf.resize(
+        vector<int32_t> row_offset_buf(
             PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
 
         PackAWithQuantRowOffset<uint8_t> packAN(
@@ -420,47 +508,96 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
             groups,
             row_offset_buf.data());
 
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
+
         DoNothing<float, float> doNothingObj{};
-        ReQuantizeForFloat<false> outputProcObj(
-            doNothingObj,
-            Aint8_scale,
-            Bint8_scale,
-            Aint8_zero_point,
-            Bint8_zero_point,
-            packAN.getRowOffsetBuffer(),
-            col_offsets.data(),
-            nullptr);
 
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        if (q_granularity == QuantizationGranularity::TENSOR) {
+          ReQuantizeForFloat<false> outputProcObj(
+              doNothingObj,
+              Aint8_scale,
+              Bint8_scale.data(),
+              Aint8_zero_point,
+              Bint8_zero_point.data(),
+              packAN.getRowOffsetBuffer(),
+              col_offsets.data(),
+              nullptr,
+              groups * n_adjusted,
+              groups);
 
-        fbgemmPacked(
-            packAN,
-            packedBN,
-            Cfp32_fb.data(),
-            reinterpret_cast<int32_t*>(Cfp32_fb.data()),
-            groups * n,
-            outputProcObj,
-            tid,
-            num_threads);
-      }
+          fbgemmPacked(
+              packAN,
+              packedBN,
+              Cfp32_fb.data(),
+              reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+              groups * n,
+              outputProcObj,
+              tid,
+              num_threads);
+        } else if (q_granularity == QuantizationGranularity::GROUP) {
+          ReQuantizeForFloat<false, QuantizationGranularity::GROUP>
+              outputProcObj(
+                  doNothingObj,
+                  Aint8_scale,
+                  Bint8_scale.data(),
+                  Aint8_zero_point,
+                  Bint8_zero_point.data(),
+                  packAN.getRowOffsetBuffer(),
+                  col_offsets.data(),
+                  nullptr,
+                  groups * n_adjusted,
+                  groups);
 
-      float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
-      float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
-      float atol = (maximum - minimum) / 255 / 1.9;
+          fbgemmPacked(
+              packAN,
+              packedBN,
+              Cfp32_fb.data(),
+              reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+              groups * n,
+              outputProcObj,
+              tid,
+              num_threads);
+        } else {
+          ReQuantizeForFloat<false, QuantizationGranularity::OUT_CHANNEL>
+              outputProcObj(
+                  doNothingObj,
+                  Aint8_scale,
+                  Bint8_scale.data(),
+                  Aint8_zero_point,
+                  Bint8_zero_point.data(),
+                  packAN.getRowOffsetBuffer(),
+                  col_offsets.data(),
+                  nullptr,
+                  groups * n_adjusted,
+                  groups);
 
+          fbgemmPacked(
+              packAN,
+              packedBN,
+              Cfp32_fb.data(),
+              reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+              groups * n,
+              outputProcObj,
+              tid,
+              num_threads);
+        }
+      }
+
+      float maximum = 0;
+      for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < groups * n_adjusted; ++j) {
+          float c = Cfp32_ref[i * groups * n + j];
+          maximum = std::max(maximum, std::abs(c));
+        }
+      }
       compare_validate_buffers(
           Cfp32_ref.data(),
           Cfp32_fb.data(),
           m,
           groups * n_adjusted,
           groups * n,
-          atol);
+          maximum * 1e-5f);
     } // for each groups
   } // for each shape
 }
@@ -470,7 +607,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
  * accumulation. Output processing: requantization -> nothing. Symmetric: the
  * zero point is 0.
  */
-TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
+TEST_P(fbgemmu8s8acc32Test, TestSymmetricQuantizedInputOutput) {
   vector<vector<int>> shapes(GetShapes_());
   matrix_op_t atrans, btrans;
   bool test_ld;
@@ -486,22 +623,17 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
       }
       int k_per_group = k / groups;
 
-      aligned_vector<float> Afp32(m * k, 0.0f);
-      aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
+      aligned_vector<uint8_t> Aint8(m * k);
+      aligned_vector<int8_t> Bint8(k * n);
 
-      aligned_vector<float> Bfp32(k * n, 0.0f);
-      aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+      aligned_vector<float> Cfp32_ref(m * n * groups);
+      aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size());
 
-      aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
-      aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size(), 0);
-
-      randFill(Afp32, 0, 255);
-      for (auto i = 0; i < Afp32.size(); i++) {
-        Aint8[i] = (uint8_t)Afp32[i];
-      }
+      randFill<uint8_t>(Aint8, 0, 255);
+      aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
 
       // initialize B matrix
-      randFill(Bfp32, -128, 127);
+      randFill<int8_t>(Bint8, -128, 127);
       for (int g = 0; g < groups; ++g) {
         avoidOverflow(
             m,
@@ -509,13 +641,11 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
             k_per_group,
             Aint8.data() + g * k_per_group,
             k,
-            Bfp32.data() + g * k_per_group * n,
+            Bint8.data() + g * k_per_group * n,
             n);
       }
 
-      for (auto i = 0; i < Bfp32.size(); ++i) {
-        Bint8[i] = (int8_t)Bfp32[i];
-      }
+      aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
 
       // To test lda != k , we just reduce k by half and use the original k
       // as lda.
@@ -577,13 +707,8 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
         DoNothing<int32_t, int32_t> doNothingObj{};
         memCopy<> outputProcObj(doNothingObj);
 
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
 
         fbgemmPacked(
             packAN,