-rw-r--r-- | bench/Im2ColFusedRequantizeAcc16Benchmark.cc |   6
-rw-r--r-- | bench/Im2ColFusedRequantizeAcc32Benchmark.cc |   6
-rw-r--r-- | include/fbgemm/Fbgemm.h                      |  53
-rw-r--r-- | include/fbgemm/OutputProcessing-inl.h        |   5
-rw-r--r-- | src/ExecuteKernelU8S8.cc                     |  55
-rw-r--r-- | src/Fbgemm.cc                                |  73
-rw-r--r-- | src/PackAMatrix.cc                           |  21
-rw-r--r-- | src/PackAWithIm2Col.cc                       |   6
-rw-r--r-- | src/PackBMatrix.cc                           |  70
-rw-r--r-- | src/PackMatrix.cc                            |   3
-rw-r--r-- | src/PackWithQuantRowOffset.cc                |  15
-rw-r--r-- | src/PackWithRowOffset.cc                     |  15
-rw-r--r-- | src/RefImplementations.cc                    |  34
-rw-r--r-- | src/RefImplementations.h                     |   7
-rw-r--r-- | test/Im2ColFusedRequantizeTest.cc            |  24
-rw-r--r-- | test/PackedRequantizeAcc16Test.cc            | 957
-rw-r--r-- | test/PackedRequantizeTest.cc                 | 948
-rw-r--r-- | test/QuantizationHelpers.cc                  |  25
-rw-r--r-- | test/QuantizationHelpers.h                   |  10
-rw-r--r-- | test/TestUtils.cc                            |  29
-rw-r--r-- | test/TestUtils.h                             |  19
21 files changed, 1298 insertions(+), 1083 deletions(-)
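This patch threads a new `groups` parameter through packing, kernel execution, and the reference implementations, turning a single GEMM into `groups` independent GEMMs over column slices of A and row blocks of B (grouped-convolution style). A minimal usage sketch, distilled from the updated tests below; the sizes, zero points, and the zero-filled col_offsets are illustrative assumptions, not part of the patch:

#include <cstdint>
#include <vector>
#include "fbgemm/Fbgemm.h"

using namespace fbgemm;

void grouped_gemm_sketch() {
  // groups must divide k; B stores the per-group (k/groups) x n blocks
  // stacked along its rows, and C is m x (groups * n).
  int m = 64, n = 32, k = 48, groups = 3;

  std::vector<std::uint8_t> Aint8(m * k, 1);
  std::vector<std::int8_t> Bint8(k * n, 1);
  std::vector<std::uint8_t> Cint8(m * groups * n, 0);
  std::vector<std::int32_t> C_buffer(m * groups * n, 0);

  std::int32_t Aint8_zero_point = 43, Bint8_zero_point = -30; // illustrative
  float C_multiplier = 0.1234f;
  std::int32_t C_zero_pt = 5;
  // Real code computes per-group column offsets (see the tests below);
  // zeros keep this sketch self-contained.
  std::vector<std::int32_t> col_offsets(groups * n, 0);

  std::vector<std::int32_t> row_offset_buf(
      PackAWithRowOffset<std::uint8_t>::rowOffsetBufferSize());

  // groups is now passed to both packing classes and must match.
  PackAWithRowOffset<std::uint8_t> packA(
      matrix_op_t::NoTranspose, m, k, Aint8.data(), k,
      nullptr, groups, Aint8_zero_point, row_offset_buf.data());
  PackBMatrix<std::int8_t> packB(
      matrix_op_t::NoTranspose, k, n, Bint8.data(), n,
      nullptr, groups, Bint8_zero_point);

  DoNothing<> doNothing{};
  ReQuantizeOutput<false> outputProc(
      doNothing, C_multiplier, C_zero_pt, Aint8_zero_point,
      Bint8_zero_point, packA.getRowOffsetBuffer(), col_offsets.data(),
      /*bias=*/nullptr);

  // ldc is groups * n: group g writes columns [g*n, (g+1)*n) of C.
  fbgemmPacked(packA, packB, Cint8.data(), C_buffer.data(),
               groups * n, outputProc, /*thread_id=*/0, /*num_threads=*/1);
}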
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc index 48b744a..8827b4c 100644 --- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc @@ -189,7 +189,11 @@ void performance_test() { PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int16_t> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int16_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc index 9201e52..b87f7d7 100644 --- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc @@ -191,7 +191,11 @@ void performance_test() { PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int32_t> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int32_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index bd1c86e..d05e099 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -20,6 +20,10 @@ #include "Types.h" #include "Utils.h" +// Turning on this option will print out time breakdown of each stage (e.g., +// input packing, the main GEMM kernel, each output processing pipeline). +// Please note that currently this option won't report accurate timing if +// multiple threads are used. // #define FBGEMM_MEASURE_TIME_BREAKDOWN #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN @@ -80,13 +84,22 @@ class PackMatrix { * overhead of internal memory allocation everytime a PackMatrix * is constructed. The client code can query how big patm should * be with packedBufferSize function. + * @param groups when groups > 1, we compute groups number of GEMMs each + * multiplies A.rows by A.cols/A.groups matrix with + * B.rows/B.groups by B.cols matrix (in conventional BLAS + * terminology, this is a batched GEMM but we use the name group + * to follow deep learning terminology). The result matrix has + * dimension A.rows by B.cols*B.groups . + * A.groups must be same as B.groups, A.groups must divide + * A.cols, and B.groups must divide B.rows and C.cols. * @param zero_pt the quantized value that maps to 0.0f floating-point number. */ PackMatrix( std::int32_t rows, std::int32_t cols, inpType* pmat, - std::int32_t zero_pt); + int groups = 1, + std::int32_t zero_pt = 0); /** * @return true usually when the matrix is constant matrix (e.g., weight @@ -228,6 +241,10 @@ class PackMatrix { return last_bcol_; } + int numGroups() const { + return G_; + } + /** * @return True if the last column block has fewer columns than the block * size. 
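To make the grouped dimension convention documented above concrete, a worked example with illustrative shapes (not taken from the patch):

  groups = 4
  A: 128 x 256  ->  per-group slice of A: 128 x (256/4) = 128 x 64
  B: 256 x 96   ->  per-group block of B: (256/4) x 96  =  64 x 96
  C: 128 x (96*4) = 128 x 384, i.e. A.rows by B.cols*B.groups,
     with group g occupying columns [g*96, (g+1)*96) of C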
@@ -275,6 +292,7 @@ class PackMatrix { private: std::int32_t nrows_, ncols_; + int G_; std::int32_t zero_pt_; block_type_t packedBlock_; ///< The block in the source matrix just packed std::int32_t last_brow_, last_bcol_; @@ -295,9 +313,6 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> { PackAMatrix() = delete; // no default constructor - /** - * TODO: currently only groups == 1 supported - */ PackAMatrix( matrix_op_t trans, std::int32_t nRow, @@ -305,7 +320,7 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> { const inpType* smat, std::int32_t ld, inpType* pmat = nullptr, - std::int32_t groups = 1, + int groups = 1, std::int32_t zero_pt = 0); /** @@ -351,7 +366,6 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> { matrix_op_t trans_; const T* smat_; std::int32_t ld_; - std::int32_t G_; std::int32_t row_interleave_B_; }; @@ -370,9 +384,6 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> { PackBMatrix() = delete; // no default constructor - /** - * TODO: Currently only groups == 1 supported. - */ PackBMatrix( matrix_op_t trans, std::int32_t nRow, @@ -380,7 +391,7 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> { const inpType* smat, std::int32_t ld, inpType* pmat = nullptr, - std::int32_t groups = 1, + int groups = 1, std::int32_t zero_pt = 0); /** @@ -437,7 +448,6 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> { matrix_op_t trans_; const T* smat_; std::int32_t ld_; - std::int32_t G_; std::int32_t row_interleave_; }; @@ -533,9 +543,6 @@ class PackAWithRowOffset final using accType = accT; PackAWithRowOffset() = delete; // no default constructor - /** - * TODO: Currently only groups == 1 supported - */ PackAWithRowOffset( matrix_op_t trans, std::uint32_t nRow, @@ -543,7 +550,7 @@ class PackAWithRowOffset final const T* smat, std::uint32_t ld, inpType* pmat = nullptr, - std::uint32_t groups = 1, + int groups = 1, std::int32_t zero_pt = 0, std::int32_t* row_offset = nullptr); @@ -600,7 +607,6 @@ class PackAWithRowOffset final matrix_op_t trans_; const T* smat_; std::uint32_t ld_; - std::uint32_t G_; std::int32_t* row_offset_; bool rowOffsetAllocatedHere; std::int32_t row_interleave_B_; @@ -621,9 +627,6 @@ class PackAWithQuantRowOffset final using accType = accT; PackAWithQuantRowOffset() = delete; // no default constructor - /** - * TODO: Currently only groups == 1 supported - */ PackAWithQuantRowOffset( matrix_op_t trans, std::int32_t nRow, @@ -633,7 +636,7 @@ class PackAWithQuantRowOffset final inpType* pmat = nullptr, float scale = 1.0f, std::int32_t zero_pt = 0, - std::int32_t groups = 1, + int groups = 1, std::int32_t* row_offset = nullptr); /** @@ -690,7 +693,6 @@ class PackAWithQuantRowOffset final const float* smat_; std::int32_t ld_; float scale_; - std::int32_t G_; std::int32_t* row_offset_; bool rowOffsetAllocatedHere; std::int32_t row_interleave_B_; @@ -811,7 +813,8 @@ class ReluOutput { * processing pipeline. * * SPMDM (SParse Matrix times Dense Matrix) inplace on the 32-bit input buffer - * (inp). After modifying the input buffer, pass it to the next op + * (inp). After modifying the input buffer, pass it to the next op. + * When groups > 1, each group is numRows() x (numCols()/groups) matrix. 
*/ template < typename outT = std::int32_t, @@ -825,8 +828,9 @@ class DoSpmdmOnInpBuffer { nextOPType& nextop, const std::uint8_t* A, int lda, - const CompressedSparseColumn& B_csc) - : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc) {} + const CompressedSparseColumn& B_csc, + int groups = 1) + : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc), groups_(groups) {} template <inst_set_t instSet> inline int f( @@ -841,6 +845,7 @@ class DoSpmdmOnInpBuffer { const std::uint8_t* A_; const int lda_; const CompressedSparseColumn& B_csc_; + const int groups_; }; /** diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h index 13f614a..71d8046 100644 --- a/include/fbgemm/OutputProcessing-inl.h +++ b/include/fbgemm/OutputProcessing-inl.h @@ -28,7 +28,10 @@ template <typename outT, typename inT, typename nextOPType> template<inst_set_t instSet> inline int DoSpmdmOnInpBuffer<outT, inT, nextOPType>::f(outT* out, inT* inp, const block_type_t& block, int ld_out, int ld_in) const { - B_csc_.SpMDM(block, A_, lda_, true, inp, ld_in); + assert(B_csc_.NumOfCols() % groups_ == 0); + int n_per_group = B_csc_.NumOfCols() / groups_; + int g = block.col_start / n_per_group; + B_csc_.SpMDM(block, A_ + g * B_csc_.NumOfRows(), lda_, true, inp, ld_in); return nextop_.template f<instSet>(out, inp, block, ld_out, ld_in); } diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index b3f8c15..c2079b1 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -8,7 +8,6 @@ #include <cpuinfo.h> #include <chrono> - #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN double kernel_time = 0.0; double postprocessing_time = 0.0; @@ -84,8 +83,10 @@ void ExecuteKernel< int32_t packed_rows_A = packedA_.numPackedRows(); int32_t row_start_A = packedA_.packedRowStart(); - bool lastKBlock = packedB_.isThisLastKBlock(kBlock); - bool accum = kBlock > 0; + int group = kBlock / packedB_.blockRows(); + int NDim = packedB_.numCols(); + bool lastKBlock = packedB_.isThisLastKBlock(kBlock % packedB_.blockRows()); + bool accum = (kBlock % packedB_.blockRows()) > 0; typename BaseType::jit_micro_kernel_fp fn; @@ -120,7 +121,6 @@ void ExecuteKernel< #endif for (int jb = 0; jb < bColBlocks; ++jb) { - bBuf = packedB_.getBuf(jb, kBlock); // prefetch addr of the next packed block of B matrix bBuf_pf = packedB_.getBuf(jb == bColBlocks - 1 ? jb : jb + 1, kBlock); @@ -128,12 +128,14 @@ void ExecuteKernel< // Reuse the first rowblock of C_buffer_ unless when C_buffer_ is same as // matC_ (inplace output processing) int32_t* C_buffer_row_start = C_buffer_ + - ((C_buffer_ == reinterpret_cast<int32_t*>(matC_)) ? row_start_A * ldc_ - : 0); + ((C_buffer_ == reinterpret_cast<int32_t*>(matC_)) + ? row_start_A * ldc_ + NDim * group + : 0); int32_t* C_buffer_start = C_buffer_row_start + jb * nbSize_; int32_t leadingDim = ldc_; if (packedB_.isThereColRemainder() && (jb == bColBlocks - 1)) { - // In case we will access memory past C_buffer_, we use C_tile_ instead. + // In case we will access memory past C_buffer_, we use C_tile_ scratchpad + // instead. 
C_buffer_start = C_tile_; leadingDim = nbSize_; } @@ -146,14 +148,15 @@ void ExecuteKernel< leadingDim); #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - kernel_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) + .count(); + kernel_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); #endif - // Output processing is done only once per rowblock + // Output processing is done only once per rowblock to amortize overhead + // and for better spatial locality. if (lastKBlock && jb == bColBlocks - 1) { // When C_tile_ is used for the last column block, we need a separate // handling for the last column block. @@ -166,14 +169,14 @@ void ExecuteKernel< outputProcess_.template f<inst_set_t::avx2>( matC_, C_buffer_row_start, - {row_start_A, packed_rows_A, 0, nSize}, + {row_start_A, packed_rows_A, NDim * group, nSize}, ldc_, ldc_); } else if (cpuinfo_has_x86_avx2()) { outputProcess_.template f<inst_set_t::avx2>( matC_, C_buffer_row_start, - {row_start_A, packed_rows_A, 0, nSize}, + {row_start_A, packed_rows_A, NDim * group, nSize}, ldc_, ldc_); } else { @@ -183,20 +186,28 @@ void ExecuteKernel< } if (C_buffer_start == C_tile_) { + // When C_tile_ scratchpad was used to avoid accessing memory past + // C_buffer_ . if (cpuinfo_has_x86_avx512f()) { // TODO: avx512 path // Currently use avx2 code outputProcess_.template f<inst_set_t::avx2>( matC_, C_tile_, - {row_start_A, packed_rows_A, jb * nbSize_, packedB_.lastBcol()}, + {row_start_A, + packed_rows_A, + NDim * group + jb * nbSize_, + packedB_.lastBcol()}, ldc_, leadingDim); } else if (cpuinfo_has_x86_avx2()) { outputProcess_.template f<inst_set_t::avx2>( matC_, C_tile_, - {row_start_A, packed_rows_A, jb * nbSize_, packedB_.lastBcol()}, + {row_start_A, + packed_rows_A, + NDim * group + jb * nbSize_, + packedB_.lastBcol()}, ldc_, leadingDim); } else { @@ -207,11 +218,11 @@ void ExecuteKernel< } // output processing #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - postprocessing_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) + .count(); + postprocessing_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); #endif } // for each j block diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index f8f0d34..8376077 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -76,23 +76,31 @@ void fbgemmPacked( throw std::runtime_error("Failed to initialize cpuinfo!"); } + if (!packB.isPrePacked()) { + throw std::runtime_error("B matrix must be prepacked"); + } + if (packA.numGroups() != packB.numGroups()) { + throw std::runtime_error( + "A.groups = " + std::to_string(packA.numGroups()) + " and B.groups = " + + std::to_string(packB.numGroups()) + " are not the same"); + } + int MDim = packA.numRows(); int KDim = packB.numRows(); + int KDimPerGroup = packB.numRows() / packB.numGroups(); + int NDim = packB.numCols(); int mBlocks = (MDim + MCB - 1) / MCB; - int kBlocks = (KDim + KCB - 1) / KCB; + int kBlocks = (KDimPerGroup + KCB - 1) / KCB; // remainders int _mc = MDim % MCB; - int _kc = KDim % KCB; 
+ int _kc = KDimPerGroup % KCB; int kc, mc; block_type_t blockA{0, 0, 0, 0}; - // B must be prepacked - assert(packB.isPrePacked() && "B matrix must be prepacked"); - #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN std::chrono::time_point<std::chrono::high_resolution_clock> t_very_start, t_start, t_end; @@ -101,37 +109,48 @@ void fbgemmPacked( t_very_start = std::chrono::high_resolution_clock::now(); #endif - ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType> - exeKernelObj(packA, packB, 0, C, C_buffer, ldc, outProcess); // ToDo: thread based work division - for (int i = 0; i < mBlocks; ++i) { - mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc; - for (int k = 0; k < kBlocks; ++k) { - kc = (k != kBlocks - 1 || _kc == 0) ? KCB : _kc; - // pack A matrix - blockA = {i * MCB, mc, k * KCB, kc}; - - packA.pack(blockA); + for (int g = 0; g < packA.numGroups(); ++g) { + ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType> + exeKernelObj( + packA, + packB, + 0, + C, + C_buffer, + ldc, + outProcess); + for (int i = 0; i < mBlocks; ++i) { + mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc; + for (int k = 0; k < kBlocks; ++k) { + kc = (k != kBlocks - 1 || _kc == 0) ? KCB : _kc; + // pack A matrix + blockA = {i * MCB, mc, g * KDimPerGroup + k * KCB, kc}; + + packA.pack(blockA); #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - packing_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>( + t_end - t_start) + .count(); + packing_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); #endif - exeKernelObj.execute(k); + exeKernelObj.execute(g * kBlocks + k); #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - computing_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>( + t_end - t_start) + .count(); + computing_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); #endif + } } - } + } // for each group #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN t_end = std::chrono::high_resolution_clock::now(); diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc index cd991ca..988a27b 100644 --- a/src/PackAMatrix.cc +++ b/src/PackAMatrix.cc @@ -20,15 +20,17 @@ PackAMatrix<T, accT>::PackAMatrix( const T* smat, int32_t ld, inpType* pmat, - int32_t groups, + int groups, std::int32_t zero_pt) - : PackMatrix<PackAMatrix<T, accT>, T, accT>(nRow, nCol, pmat, zero_pt), + : PackMatrix<PackAMatrix<T, accT>, T, accT>( + nRow, + nCol, + pmat, + groups, + zero_pt), trans_(trans), smat_(smat), - ld_(ld), - G_(groups) { - assert(G_ == 1 && "Groups != 1 not supported yet"); - + ld_(ld) { if (cpuinfo_has_x86_avx512f()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; @@ -43,6 +45,11 @@ PackAMatrix<T, accT>::PackAMatrix( // TODO: Have default slower path assert(0 && "unsupported architecure"); } + if (BaseType::numCols() % groups != 0) { + throw std::runtime_error( + "groups = " + std::to_string(groups) + + " does not divide numCols = " + std::to_string(BaseType::numCols())); + } if (!pmat) { 
BaseType::buf_ = (T*)fbgemmAlignedAlloc( 64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)); @@ -63,7 +70,7 @@ void PackAMatrix<T, accT>::pack(const block_type_t& block) { if (tr) { for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { - T val = smat_[i + ld_ * j]; + T val = smat_[i + j * ld_]; out[addr(i, j) - addr(block.row_start, block.col_start)] = val; } // zero fill diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index 71efced..7ca7059 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -36,6 +36,7 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( std::multiplies<int>()) * conv_p.IC, pmat, + conv_p.G, zero_pt), conv_p_(conv_p), sdata_(sdata) { @@ -55,6 +56,11 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( // TODO: Have default slower path assert(0 && "unsupported architecure"); } + if (BaseType::numCols() % conv_p.G != 0) { + throw std::runtime_error( + "groups = " + std::to_string(conv_p.G) + + " does not divide numCols = " + std::to_string(BaseType::numCols())); + } if (pmat) { BaseType::buf_ = pmat; } else { diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index 485afb1..1bb7d4b 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -20,15 +20,17 @@ PackBMatrix<T, accT>::PackBMatrix( const T* smat, int32_t ld, inpType* pmat, - int32_t groups, + int groups, std::int32_t zero_pt) - : PackMatrix<PackBMatrix<T, accT>, T, accT>(nRow, nCol, pmat, zero_pt), + : PackMatrix<PackBMatrix<T, accT>, T, accT>( + nRow, + nCol, + pmat, + groups, + zero_pt), trans_(trans), smat_(smat), - ld_(ld), - G_(groups) { - assert(G_ == 1 && "Groups != 1 not supported yet"); - + ld_(ld) { if (cpuinfo_has_x86_avx512f()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB; @@ -42,14 +44,22 @@ PackBMatrix<T, accT>::PackBMatrix( // Error assert(0 && "unknown architecure"); } - block_type_t block{0, BaseType::numRows(), 0, BaseType::numCols()}; + if (BaseType::numRows() % groups != 0) { + throw std::runtime_error( + "groups = " + std::to_string(groups) + + " does not divide numRows = " + std::to_string(BaseType::numRows())); + } + + // blocking for one group + block_type_t block{ + 0, BaseType::numRows() / BaseType::numGroups(), 0, BaseType::numCols()}; BaseType::packedBlock(block); if (!pmat) { BaseType::bufAllocatedHere_ = true; BaseType::buf_ = (T*)fbgemmAlignedAlloc( 64, - BaseType::blockRows() * BaseType::brow_ * BaseType::blockCols() * - BaseType::bcol_ * sizeof(T)); + BaseType::numGroups() * BaseType::blockRows() * BaseType::brow_ * + BaseType::blockCols() * BaseType::bcol_ * sizeof(T)); } pack(block); } @@ -59,26 +69,28 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) { assert((BaseType::blockRowSize() % row_interleave_) == 0); BaseType::packedBlock(block); - T* out = BaseType::getBuf(); bool tr = (trans_ == matrix_op_t::Transpose); - for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { - for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { - T val = tr ? 
smat_[i + ld_ * j] : smat_[i * ld_ + j]; - out[addr(i, j) - addr(block.row_start, block.col_start)] = - tconv(val, out[addr(i, j)]); + for (int g = 0; g < this->numGroups(); ++g) { + T* out = BaseType::getBuf() + + g * this->packedBufferSize(block.row_size, block.col_size); + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { + T val = tr ? smat_[g * block.row_size + i + ld_ * j] + : smat_[(g * block.row_size + i) * ld_ + j]; + out[addr(i, j)] = tconv(val, out[addr(i, j)]); + } } - } - // fill the remaining with zero. - // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. - for (int i = block.row_start + block.row_size; - i < (block.row_start + block.row_size + row_interleave_ - 1) / - row_interleave_ * row_interleave_; - ++i) { - for (int j = block.col_start; j < block.col_start + block.col_size; j++) { - out[addr(i, j) - addr(block.row_start, block.col_start)] = - tconv(0, out[addr(i, j)]); + // fill the remaining with zero. + // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. + for (int i = block.row_start + block.row_size; + i < (block.row_start + block.row_size + row_interleave_ - 1) / + row_interleave_ * row_interleave_; + ++i) { + for (int j = block.col_start; j < block.col_start + block.col_size; j++) { + out[addr(i, j)] = tconv(0, out[addr(i, j)]); + } } - } + } // for each group } template <typename T, typename accT> @@ -109,7 +121,8 @@ void PackBMatrix<T, accT>::printPackedMatrix(std::string name) { << "[" << BaseType::blockRowSize() << ", " << BaseType::blockColSize() << "]" << std::endl; - T* out = BaseType::getBuf(); + T* out = BaseType::getBuf() + + this->packedBufferSize(this->numPackedRows(), this->numPackedCols()); for (auto nr = 0; nr < BaseType::blockRows(); ++nr) { auto rows = (nr == BaseType::blockRows() - 1) ? 
BaseType::lastBrow() : BaseType::blockRowSize(); @@ -150,7 +163,8 @@ bool PackBMatrix<T, accT>::metaEquals(const PackBMatrix<T, accT>& that) const { BaseType::numPackedRows() != that.numPackedRows() || BaseType::numPackedCols() != that.numPackedCols() || BaseType::zeroPoint() != that.zeroPoint() || trans_ != that.trans_ || - G_ != that.G_ || row_interleave_ != that.row_interleave_) { + BaseType::numGroups() != that.numGroups() || + row_interleave_ != that.row_interleave_) { return false; } diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc index b8b8388..a577057 100644 --- a/src/PackMatrix.cc +++ b/src/PackMatrix.cc @@ -18,8 +18,9 @@ PackMatrix<PT, inpType, accType>::PackMatrix( int32_t rows, int32_t cols, inpType* buf, + int groups, int32_t zero_pt) - : buf_(buf), nrows_(rows), ncols_(cols), zero_pt_(zero_pt) { + : buf_(buf), nrows_(rows), ncols_(cols), G_(groups), zero_pt_(zero_pt) { bufAllocatedHere_ = false; if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); diff --git a/src/PackWithQuantRowOffset.cc b/src/PackWithQuantRowOffset.cc index a3d7e0f..2bd4b15 100644 --- a/src/PackWithQuantRowOffset.cc +++ b/src/PackWithQuantRowOffset.cc @@ -25,21 +25,19 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( inpType* pmat, float scale, int32_t zero_pt, - int32_t groups, + int groups, int32_t* row_offset) : PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT>( nRow, nCol, pmat, + groups, zero_pt), trans_(trans), smat_(smat), ld_(ld), scale_(scale), - G_(groups), row_offset_(row_offset) { - assert(G_ == 1 && "Groups != 1 not supported yet"); - rowOffsetAllocatedHere = false; if (cpuinfo_has_x86_avx512f()) { @@ -56,6 +54,11 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( // TODO: Have default slower path assert(0 && "unknown architecure"); } + if (BaseType::numCols() % groups != 0) { + throw std::runtime_error( + "groups = " + std::to_string(groups) + + " does not divide numCols = " + std::to_string(BaseType::numCols())); + } if (pmat) { BaseType::buf_ = pmat; } else { @@ -73,7 +76,6 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( template <typename T, typename accT> void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) { assert(block.row_start % BaseType::blockRowSize() == 0); - assert(block.col_start % BaseType::blockColSize() == 0); assert(block.row_size <= BaseType::blockRowSize()); assert(block.col_size <= BaseType::blockColSize()); @@ -88,7 +90,8 @@ void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) { T* out = BaseType::getBuf(); bool tr = (trans_ == matrix_op_t::Transpose); // accumulate into row offset? 
- bool row_offset_acc = (block.col_start != 0); + bool row_offset_acc = + (block.col_start % (this->numCols() / this->numGroups())) != 0; int32_t* row_offset_buf = getRowOffsetBuffer(); float smat_transposed[block.row_size * block.col_size]; diff --git a/src/PackWithRowOffset.cc b/src/PackWithRowOffset.cc index dec3f70..1935161 100644 --- a/src/PackWithRowOffset.cc +++ b/src/PackWithRowOffset.cc @@ -22,21 +22,19 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( const T* smat, uint32_t ld, inpType* pmat, - uint32_t groups, + int groups, int32_t zero_pt, int32_t* row_offset) : PackMatrix<PackAWithRowOffset<T, accT>, T, accT>( nRow, nCol, pmat, + groups, zero_pt), trans_(trans), smat_(smat), ld_(ld), - G_(groups), row_offset_(row_offset) { - assert(G_ == 1 && "Groups != 1 not supported yet"); - rowOffsetAllocatedHere = false; if (cpuinfo_has_x86_avx512f()) { @@ -53,6 +51,11 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( // TODO: Have default slower path assert(0 && "unknown architecure"); } + if (BaseType::numCols() % groups != 0) { + throw std::runtime_error( + "groups = " + std::to_string(groups) + + " does not divide numCols = " + std::to_string(BaseType::numCols())); + } if (pmat) { BaseType::buf_ = pmat; } else { @@ -70,7 +73,6 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( template <typename T, typename accT> void PackAWithRowOffset<T, accT>::pack(const block_type_t& block) { assert(block.row_start % BaseType::blockRowSize() == 0); - assert(block.col_start % BaseType::blockColSize() == 0); assert(block.row_size <= BaseType::blockRowSize()); assert(block.col_size <= BaseType::blockColSize()); @@ -85,7 +87,8 @@ void PackAWithRowOffset<T, accT>::pack(const block_type_t& block) { T* out = BaseType::getBuf(); bool tr = (trans_ == matrix_op_t::Transpose); // accumulate into row offset? 
- bool row_offset_acc = (block.col_start != 0); + bool row_offset_acc = + (block.col_start % (this->numCols() / this->numGroups())) != 0; int32_t* row_offset_buf = getRowOffsetBuffer(); if (tr) { for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index dc41c27..45eb9f9 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -93,8 +93,8 @@ void matmul_u8i8acc32_ref( const uint8_t* Aint8, const int8_t* Bint8, int32_t* Cint32) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < M; ++i) { + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { int32_t sum = 0; for (int k = 0; k < K; ++k) { sum += static_cast<int32_t>(Aint8[i * lda + k]) * @@ -116,8 +116,8 @@ void matmul_u8i8acc16_ref( const uint8_t* Aint8, const int8_t* Bint8, int32_t* Cint32) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < M; ++i) { + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { int32_t sum = 0, sum_32bit = 0; for (int k = 0; k < K; k += 2) { int a0 = Aint8[i * lda + k]; @@ -148,8 +148,8 @@ void matmul_fp_ref( const float* Afp32, const float* Bfp32, float* Cfp32) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < M; ++i) { + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { float sum = 0; for (int k = 0; k < K; ++k) { sum += Afp32[i * lda + k] * Bfp32[k * ldb + j]; @@ -198,8 +198,10 @@ void spmdm_ref( fbgemm::CompressedSparseColumn& B, bool accumulation, int32_t* C, - int ldc) { + int ldc, + int groups /*=1*/) { int N = B.NumOfCols(); + assert(N % groups == 0); if (!accumulation) { for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { @@ -207,15 +209,17 @@ void spmdm_ref( } } } - for (int j = 0; j < N; ++j) { - for (int k = B.ColPtr()[j]; k < B.ColPtr()[j + 1]; ++k) { - int row = B.RowIdx()[k]; - int w = B.Values()[k]; - for (int i = 0; i < M; ++i) { - C[i * ldc + j] += A[i * lda + row] * w; + for (int g = 0; g < groups; ++g) { + for (int j = g * (N / groups); j < (g + 1) * (N / groups); ++j) { + for (int k = B.ColPtr()[j]; k < B.ColPtr()[j + 1]; ++k) { + int row = g * B.NumOfRows() + B.RowIdx()[k]; + int w = B.Values()[k]; + for (int i = 0; i < M; ++i) { + C[i * ldc + j] += A[i * lda + row] * w; + } } - } - } // for each column of B + } // for each column of B + } // for each group } int32_t clip_16bit(int32_t x) { diff --git a/src/RefImplementations.h b/src/RefImplementations.h index 9e81ce1..cec4bff 100644 --- a/src/RefImplementations.h +++ b/src/RefImplementations.h @@ -125,6 +125,10 @@ void col_offsets_with_zero_pt_s8acc32_ref( /** * @brief Reference implementation of SPMDM (sparse matrix times dense matrix). + * + * @param groups when > 1, for gth group, we multiply + * A[:,g*(A.ncols/groups):(g+1)*(A.ncols/groups)] sub-matrix with + * B[:,g*(B.ncols/groups):(g+1)*(B.ncols/groups)] sub-matrix . */ void spmdm_ref( int M, @@ -133,7 +137,8 @@ void spmdm_ref( CompressedSparseColumn& B, bool accumulation, std::int32_t* C, - int ldc); + int ldc, + int groups = 1); /* * @brief Trim a 32-bit integer to a 16-bit integer. 
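The grouped spmdm_ref above treats B as `groups` sparse k_per_group x n blocks laid side by side in a single CSC matrix, with row indices local to each group (hence the `g * B.NumOfRows() + B.RowIdx()[k]` dense-row mapping). A minimal call sketch with an empty sparse B, assuming the internal src/RefImplementations.h header and the fbgemm namespace; sizes and the empty fill are illustrative:

#include <cstdint>
#include <vector>
#include "fbgemm/Fbgemm.h"
#include "RefImplementations.h"

void grouped_spmdm_sketch() {
  int m = 8, n = 16, groups = 2, k_per_group = 32;
  // One CSC matrix holding all groups: k_per_group rows and
  // groups * n columns; column g*n + j belongs to group g.
  fbgemm::CompressedSparseColumn B_csc(k_per_group, groups * n);
  for (int j = 0; j <= groups * n; ++j) {
    B_csc.ColPtr()[j] = 0; // empty B, just to keep the sketch runnable
  }
  std::vector<std::uint8_t> A(m * groups * k_per_group, 1);
  std::vector<std::int32_t> C(m * groups * n, 0);
  fbgemm::spmdm_ref(m, A.data(), /*lda=*/groups * k_per_group, B_csc,
                    /*accumulation=*/false, C.data(), /*ldc=*/groups * n,
                    groups);
}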
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc index 391b993..bab3e72 100644 --- a/test/Im2ColFusedRequantizeTest.cc +++ b/test/Im2ColFusedRequantizeTest.cc @@ -135,7 +135,11 @@ TEST(FBGemmIm2colTest, Acc32Test) { PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int32_t> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int32_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); @@ -254,7 +258,11 @@ TEST(FBGemmIm2colTest, Acc16Test) { PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int16_t> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int16_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); @@ -452,7 +460,11 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) { PackAWithIm2Col<uint8_t, int32_t, 3>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int32_t, 3> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int32_t> packedB( matrix_op_t::NoTranspose, @@ -588,7 +600,11 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) { PackAWithIm2Col<uint8_t, int16_t, 3>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int16_t, 3> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int16_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index 82ae96f..7f0a7c7 100644 --- a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -82,141 +82,169 @@ TEST_P(fbgemmu8s8acc16test, Test) { tie(atrans, btrans, test_ld) = GetParam(); for (auto shape : shapes) { - int m = shape[0]; - int n = shape[1]; - int k = shape[2]; - - aligned_vector<uint8_t> Aint8(m * k, 0); - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(k * n, 0); - aligned_vector<int32_t> Cint32_local(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); - aligned_vector<int32_t> Cint32_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_local(m * n, 0); - - randFill(Aint8, 0, 255); - int32_t Aint8_zero_point = 43; - - randFill(Bint8_ref, -128, 127); - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8[i] = Bint8_ref[i]; - } - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - int32_t Bint8_zero_point = -30; - // To test lda != k , we just reduce k by half and use the original k - // as lda. 
- int k_adjusted = k; - int n_adjusted = n; - if (test_ld) { - assert( - atrans == matrix_op_t::NoTranspose && "This case is not handled yet"); - k_adjusted = std::max(k / 2, 1); - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); + for (int groups : {1, 3, 4}) { + int m = shape[0]; + int n = shape[1]; + int k = shape[2]; + if (k % groups != 0) { + continue; } - } - - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8_ref.data(), - Bint8_zero_point, - col_offsets.data()); - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - float C_multiplier = 0.1234; - int32_t C_zero_pt = 5; - - int brow = 256; - matmul_u8i8acc16_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - brow, - Aint8.data(), - Bint8_ref.data(), - Cint32_local.data()); - - row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data()); - - requantize_u8acc32_ref( - m, - n_adjusted, - n, - Cint32_local.data(), - Cint8_local.data(), - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - row_offsets.data(), - col_offsets.data(), - nullptr); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize( - PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t, int16_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - PackBMatrix<int8_t, int16_t> packedBN( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, - nullptr, - 1, - Bint8_zero_point); - - DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); - - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_buffer.data(), - n, - outputProcObj, - 0, - 1); - - compare_validate_buffers( - Cint8_local.data(), Cint8_fb.data(), m, n, n, static_cast<uint8_t>(0)); - } + int k_per_group = k / groups; + + aligned_vector<uint8_t> Aint8(m * k, 0); + + aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + + aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + + randFill(Aint8, 0, 255); + int32_t Aint8_zero_point = 43; + + randFill(Bint8_ref, -128, 127); + Bint8 = Bint8_ref; + + if (btrans == matrix_op_t::Transpose) { + aligned_vector<int8_t> Bint8_temp(Bint8.size()); + for (int g = 0; g < groups; ++g) { + transpose_matrix( + k_per_group, + n, + Bint8.data() + g * k_per_group * n, + n, + Bint8_temp.data() + g * k_per_group, + groups * k_per_group); + } + Bint8 = Bint8_temp; + } + + int32_t Bint8_zero_point = -30; + // To test lda != k , we just reduce k by half and use the original k + // as lda. 
+ int n_adjusted = n; + if (test_ld) { + assert( + atrans == matrix_op_t::NoTranspose && + "This case is not handled yet"); + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } + } + + // computing column offset + vector<int32_t> col_offsets; + col_offsets.resize(groups * n_adjusted); + for (int g = 0; g < groups; ++g) { + col_offsets_with_zero_pt_s8acc32_ref( + k_per_group, + n_adjusted, + n, + Bint8_ref.data() + g * k_per_group * n, + Bint8_zero_point, + col_offsets.data() + g * n_adjusted); + } + + vector<int32_t> row_offsets; + row_offsets.resize(m); + + float C_multiplier = 0.1234; + int32_t C_zero_pt = 5; + + int brow = 256; + for (int g = 0; g < groups; ++g) { + matmul_u8i8acc16_ref( + m, + n_adjusted, + k_per_group, + k, + n, + groups * n, + brow, + Aint8.data() + g * k_per_group, + Bint8_ref.data() + g * k_per_group * n, + Cint32_ref.data() + g * n_adjusted); + + row_offsets_u8acc32_ref( + m, + k_per_group, + k, + Aint8.data() + g * k_per_group, + row_offsets.data()); + + requantize_u8acc32_ref( + m, + n_adjusted, + groups * n, + Cint32_ref.data() + g * n_adjusted, + Cint8_ref.data() + g * n_adjusted, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + row_offsets.data(), + col_offsets.data() + g * n_adjusted, + nullptr); + } + + vector<int32_t> row_offset_buf; + row_offset_buf.resize( + PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); + + PackAWithRowOffset<uint8_t, int16_t> packAN( + matrix_op_t::NoTranspose, + m, + k, + Aint8.data(), + k, + nullptr, + groups, + Aint8_zero_point, + row_offset_buf.data()); + + PackBMatrix<int8_t, int16_t> packedBN( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? k : n, + nullptr, + groups, + Bint8_zero_point); + + DoNothing<> doNothingObj{}; + ReQuantizeOutput<false> outputProcObj( + doNothingObj, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + 0, + 1); + + compare_validate_buffers( + Cint8_ref.data(), + Cint8_fb.data(), + m, + groups * n_adjusted, + groups * n, + static_cast<uint8_t>(0)); + } // for each groups + } // for each shape } /** @@ -230,180 +258,244 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { tie(atrans, btrans, test_ld) = GetParam(); for (auto shape : shapes) { - int m = shape[0]; - int n = shape[1]; - int k = shape[2]; - - aligned_vector<uint8_t> Aint8(m * k, 0); - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(k * n, 0); - aligned_vector<int32_t> Cint32_local(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); - aligned_vector<int32_t> Cint32_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_local(m * n, 0); - - randFill(Aint8, 0, 255); - int32_t Aint8_zero_point = 43; - - randFill(Bint8, -128, 127); - - // To test lda != k , we just reduce k by half and use the original k - // as lda. 
- int k_adjusted = k; - int n_adjusted = n; - if (test_ld) { - assert( - atrans == matrix_op_t::NoTranspose && "This case is not handled yet"); - k_adjusted = std::max(k / 2, 1); - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); - } - } - - int32_t Bint8_zero_point = -30; - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8.data(), - Bint8_zero_point, - col_offsets.data()); - - CompressedSparseColumn B_csc(k_adjusted, n_adjusted); - float density = 0.001f; - // deterministic random number - default_random_engine eng; - binomial_distribution<> per_col_nnz_dist(k_adjusted, density); - uniform_int_distribution<> value_dist( - numeric_limits<int8_t>::min() / 2, numeric_limits<int8_t>::max() / 2); - - vector<int> row_indices(k_adjusted); - int total_nnz = 0; - for (int j = 0; j < n_adjusted; ++j) { - B_csc.ColPtr()[j] = total_nnz; - - int nnz_of_j = per_col_nnz_dist(eng); - total_nnz += nnz_of_j; - - iota(row_indices.begin(), row_indices.end(), 0); - shuffle(row_indices.begin(), row_indices.end(), eng); - sort(row_indices.begin(), row_indices.begin() + nnz_of_j); - - for (int kidx = 0; kidx < nnz_of_j; ++kidx) { - B_csc.RowIdx().push_back(row_indices[kidx]); - // put the current B value - B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]); - // make current B value zero - Bint8[row_indices[kidx] * n + j] = 0; - } - } - B_csc.ColPtr()[n_adjusted] = total_nnz; - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8_ref[i] = Bint8[i]; - } - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - float C_multiplier = 0.1234; - int32_t C_zero_pt = 5; - - int brow = 256; - matmul_u8i8acc16_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - brow, - Aint8.data(), - Bint8_ref.data(), - Cint32_local.data()); - - bool accumulation = true; - spmdm_ref(m, Aint8.data(), k, B_csc, accumulation, Cint32_local.data(), n); - - row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data()); - - requantize_u8acc32_ref( - m, - n_adjusted, - n, - Cint32_local.data(), - Cint8_local.data(), - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - row_offsets.data(), - col_offsets.data(), - nullptr); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize( - PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t, int16_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - // spmdm -> requantization -> nothing - // construct an output processing pipeline in reverse order - // i.e. last output operation first - // Last operation should always be DoNothing with - // correct input and output type. 
- DoNothing<> doNothingObj{}; - // The second last operation is requantization back - // to int8 - ReQuantizeOutput<false> reqObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); - // the top most (first) operation in the output processing - // pipeline is spmdm - // outType = final output type after fullly processing through pipeline - // inType = initial input type at the first call to the whole pipeline - DoSpmdmOnInpBuffer< - ReQuantizeOutput<false>::outType, - int32_t, - ReQuantizeOutput<false>> - spmdmObj(reqObj, Aint8.data(), k, B_csc); - - PackBMatrix<int8_t, int16_t> packedB( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n); - - fbgemmPacked( - packAN, packedB, Cint8_fb.data(), Cint32_fb.data(), n, spmdmObj, 0, 1); - - compare_validate_buffers( - Cint8_local.data(), Cint8_fb.data(), m, n, n, static_cast<uint8_t>(0)); - } + for (int groups : {1, 3, 4}) { + // very small density to test hyper sparsity case + // moderate density to test the implementation using transpose + for (float density : {0.0001f, 0.1f}) { + int m = shape[0]; + int n = shape[1]; + int k = shape[2]; + if (k % groups != 0) { + continue; + } + int k_per_group = k / groups; + + aligned_vector<uint8_t> Aint8(m * k, 0); + + aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + + aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + + randFill(Aint8, 0, 255); + int32_t Aint8_zero_point = 43; + + randFill(Bint8, -128, 127); + + // To test lda != k , we just reduce k by half and use the original k + // as lda. + int n_adjusted = n; + if (test_ld) { + assert( + atrans == matrix_op_t::NoTranspose && + "This case is not handled yet"); + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } + } + + int32_t Bint8_zero_point = -30; + // computing column offset + vector<int32_t> col_offsets; + col_offsets.resize(groups * n_adjusted); + for (int g = 0; g < groups; ++g) { + col_offsets_with_zero_pt_s8acc32_ref( + k_per_group, + n_adjusted, + n, + Bint8_ref.data() + g * k_per_group * n, + Bint8_zero_point, + col_offsets.data() + g * n_adjusted); + } + + CompressedSparseColumn B_csc(k_per_group, groups * n_adjusted); + // Make sure density is big enough. Otherwise, we're not really testing + // spmdm. 
+ // deterministic random number + default_random_engine eng; + binomial_distribution<> per_col_nnz_dist(k_per_group, density); + + vector<int> row_indices(k_per_group); + int total_nnz = 0; + for (int g = 0; g < groups; ++g) { + for (int j = 0; j < n_adjusted; ++j) { + B_csc.ColPtr()[g * n_adjusted + j] = total_nnz; + + int nnz_of_j = per_col_nnz_dist(eng); + total_nnz += nnz_of_j; + + iota(row_indices.begin(), row_indices.end(), 0); + shuffle(row_indices.begin(), row_indices.end(), eng); + sort(row_indices.begin(), row_indices.begin() + nnz_of_j); + + for (int kidx = 0; kidx < nnz_of_j; ++kidx) { + int rowidx = row_indices[kidx]; + B_csc.RowIdx().push_back(rowidx); + int8_t* bptr = &Bint8[(g * k_per_group + rowidx) * n + j]; + int b_remainder = 0; + if (kidx % 2 == 1) { + // Make sure abs(b_prev + *bptr - b_remainder) <= 128 + int b_prev = B_csc.Values().back(); + b_remainder = std::max(b_prev + *bptr - 128, b_remainder); + b_remainder = std::min(b_prev + *bptr + 128, b_remainder); + } + // put a portion of current B value that won't saturate + // _mm256_maddubs_epi16 . + B_csc.Values().push_back(*bptr - b_remainder); + // put the remainder + *bptr = b_remainder; + } + } + } + B_csc.ColPtr()[groups * n_adjusted] = total_nnz; + + Bint8_ref = Bint8; + + if (btrans == matrix_op_t::Transpose) { + aligned_vector<int8_t> Bint8_temp(Bint8.size()); + for (int g = 0; g < groups; ++g) { + transpose_matrix( + k_per_group, + n, + Bint8.data() + g * k_per_group * n, + n, + Bint8_temp.data() + g * k_per_group, + groups * k_per_group); + } + Bint8 = Bint8_temp; + } + + vector<int32_t> row_offsets; + row_offsets.resize(m); + + float C_multiplier = 0.1234; + int32_t C_zero_pt = 5; + + int brow = 256; + for (int g = 0; g < groups; ++g) { + matmul_u8i8acc16_ref( + m, + n_adjusted, + k_per_group, + k, + n, + groups * n, + brow, + Aint8.data() + g * k_per_group, + Bint8_ref.data() + g * k_per_group * n, + Cint32_ref.data() + g * n_adjusted); + } + + bool accumulation = true; + spmdm_ref( + m, + Aint8.data(), + k, + B_csc, + accumulation, + Cint32_ref.data(), + groups * n, + groups); + + for (int g = 0; g < groups; ++g) { + row_offsets_u8acc32_ref( + m, + k_per_group, + k, + Aint8.data() + g * k_per_group, + row_offsets.data()); + + requantize_u8acc32_ref( + m, + n_adjusted, + groups * n, + Cint32_ref.data() + g * n_adjusted, + Cint8_ref.data() + g * n_adjusted, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + row_offsets.data(), + col_offsets.data() + g * n_adjusted, + nullptr); + } + + vector<int32_t> row_offset_buf; + row_offset_buf.resize( + PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); + + PackAWithRowOffset<uint8_t, int16_t> packAN( + matrix_op_t::NoTranspose, + m, + k, + Aint8.data(), + k, + nullptr, + groups, + Aint8_zero_point, + row_offset_buf.data()); + + // spmdm -> requantization -> nothing + // construct an output processing pipeline in reverse order + // i.e. last output operation first + // Last operation should always be DoNothing with + // correct input and output type. 
+ DoNothing<> doNothingObj{}; + // The second last operation is requantization back + // to int8 + ReQuantizeOutput<false> reqObj( + doNothingObj, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr); + // the top most (first) operation in the output processing + // pipeline is spmdm + // outType = final output type after fullly processing through pipeline + // inType = initial input type at the first call to the whole pipeline + DoSpmdmOnInpBuffer< + ReQuantizeOutput<false>::outType, + int32_t, + ReQuantizeOutput<false>> + spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); + + PackBMatrix<int8_t, int16_t> packedB( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? k : n, + nullptr, + groups, + Bint8_zero_point); + + fbgemmPacked( + packAN, + packedB, + Cint8_fb.data(), + Cint32_fb.data(), + groups * n, + spmdmObj, + 0, + 1); + + compare_validate_buffers( + Cint8_ref.data(), + Cint8_fb.data(), + m, + groups * n_adjusted, + groups * n, + static_cast<uint8_t>(0)); + } // for each density + } // for each groups + } // for each shape } /** @@ -417,119 +509,140 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { tie(atrans, btrans, test_ld) = GetParam(); for (auto shape : shapes) { - int m = shape[0]; - int n = shape[1]; - int k = shape[2]; - - aligned_vector<uint8_t> Aint8(m * k, 0); - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(k * n, 0); - aligned_vector<int32_t> Cint32_local(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); - aligned_vector<int32_t> Cint32_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_local(m * n, 0); - - randFill(Aint8, 0, 255); - int32_t Aint8_zero_point = 43; - - randFill(Bint8_ref, -128, 127); - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8[i] = Bint8_ref[i]; - } - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - int32_t Bint8_zero_point = -30; - // To test lda != k , we just reduce k by half and use the original k - // as lda. 
- int k_adjusted = k; - int n_adjusted = n; - if (test_ld) { - assert( - atrans == matrix_op_t::NoTranspose && "This case is not handled yet"); - k_adjusted = std::max(k / 2, 1); - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); + for (int groups : {1, 3, 4}) { + int m = shape[0]; + int n = shape[1]; + int k = shape[2]; + if (k % groups != 0) { + continue; + } + int k_per_group = k / groups; + + aligned_vector<uint8_t> Aint8(m * k, 0); + + aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + + aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + + randFill(Aint8, 0, 255); + int32_t Aint8_zero_point = 43; + + randFill(Bint8_ref, -128, 127); + Bint8 = Bint8_ref; + + if (btrans == matrix_op_t::Transpose) { + aligned_vector<int8_t> Bint8_temp(Bint8.size()); + for (int g = 0; g < groups; ++g) { + transpose_matrix( + k_per_group, + n, + Bint8.data() + g * k_per_group * n, + n, + Bint8_temp.data() + g * k_per_group, + groups * k_per_group); + } + Bint8 = Bint8_temp; } - } - - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8_ref.data(), - Bint8_zero_point, - col_offsets.data()); - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - int brow = 256; - matmul_u8i8acc16_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - brow, - Aint8.data(), - Bint8_ref.data(), - Cint32_local.data()); - - row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data()); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize( - PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t, int16_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - PackBMatrix<int8_t, int16_t> packedBN( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, - nullptr, - 1, - Bint8_zero_point); - - // DoNothing<> doNothingObj{}; - DoNothing<int32_t, int32_t> doNothingObj{}; - memCopy<> outputProcObj(doNothingObj); - fbgemmPacked( - packAN, - packedBN, - Cint32_fb.data(), - Cint32_buffer.data(), - n, - outputProcObj, - 0, - 1); - - compare_validate_buffers( - Cint32_local.data(), - Cint32_fb.data(), - m, - n, - n, - static_cast<int32_t>(0)); - } + + int32_t Bint8_zero_point = -30; + // To test lda != k , we just reduce k by half and use the original k + // as lda. 
+ int n_adjusted = n; + if (test_ld) { + assert( + atrans == matrix_op_t::NoTranspose && + "This case is not handled yet"); + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } + } + + // computing column offset + vector<int32_t> col_offsets; + col_offsets.resize(groups * n_adjusted); + for (int g = 0; g < groups; ++g) { + col_offsets_with_zero_pt_s8acc32_ref( + k_per_group, + n_adjusted, + n, + Bint8_ref.data() + g * k_per_group * n, + Bint8_zero_point, + col_offsets.data() + g * n_adjusted); + } + + vector<int32_t> row_offsets; + row_offsets.resize(m); + + int brow = 256; + for (int g = 0; g < groups; ++g) { + matmul_u8i8acc16_ref( + m, + n_adjusted, + k_per_group, + k, + n, + groups * n, + brow, + Aint8.data() + g * k_per_group, + Bint8_ref.data() + g * k_per_group * n, + Cint32_ref.data() + g * n_adjusted); + + row_offsets_u8acc32_ref( + m, + k_per_group, + k, + Aint8.data() + g * k_per_group, + row_offsets.data()); + } + + vector<int32_t> row_offset_buf; + row_offset_buf.resize( + PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); + + PackAWithRowOffset<uint8_t, int16_t> packAN( + matrix_op_t::NoTranspose, + m, + k, + Aint8.data(), + k, + nullptr, + groups, + Aint8_zero_point, + row_offset_buf.data()); + + PackBMatrix<int8_t, int16_t> packedBN( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? k : n, + nullptr, + groups, + Bint8_zero_point); + + // DoNothing<> doNothingObj{}; + DoNothing<int32_t, int32_t> doNothingObj{}; + memCopy<> outputProcObj(doNothingObj); + fbgemmPacked( + packAN, + packedBN, + Cint32_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + 0, + 1); + + compare_validate_buffers( + Cint32_ref.data(), + Cint32_fb.data(), + m, + groups * n_adjusted, + groups * n, + static_cast<int32_t>(0)); + } // for each groups + } // for each shape } diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index e0c9850..7653989 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -83,145 +83,191 @@ TEST_P(fbgemmu8s8acc32test, Test) { tie(atrans, btrans, test_ld) = GetParam(); for (auto shape : shapes) { - int m = shape[0]; - int n = shape[1]; - int k = shape[2]; - - aligned_vector<uint8_t> Aint8(m * k, 0); - - // nxk matrix - aligned_vector<int8_t> Bint8(k * n, 0); - // kxn matrix - aligned_vector<int8_t> Bint8_ref(k * n, 0); - - aligned_vector<int32_t> Cint32_ref(m * n, 0.0f); - aligned_vector<int32_t> Cint32_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<int32_t> Cint32_local(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); - aligned_vector<uint8_t> Cint8_local(m * n, 0); - - randFill(Aint8, 0, 255); - int32_t Aint8_zero_point = 43; - - randFill(Bint8_ref, -128, 127); - avoidOverflow(m, n, k, Aint8.data(), Bint8_ref.data()); - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8[i] = Bint8_ref[i]; - } - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - int32_t Bint8_zero_point = -30; - // To test lda != k , we just reduce k by half and use the original k - // as lda. 
- int k_adjusted = k; - int n_adjusted = n; - if (test_ld) { - assert( - atrans == matrix_op_t::NoTranspose && "This case is not handled yet"); - k_adjusted = std::max(k / 2, 1); - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); - } - } - - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8_ref.data(), - Bint8_zero_point, - col_offsets.data()); - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - float C_multiplier = 0.1234; - int32_t C_zero_pt = 5; - - matmul_u8i8acc32_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - Aint8.data(), - Bint8_ref.data(), - Cint32_local.data()); - - row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data()); - - requantize_u8acc32_ref( - m, - n_adjusted, - n, - Cint32_local.data(), - Cint8_local.data(), - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - row_offsets.data(), - col_offsets.data(), - nullptr); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - PackBMatrix<int8_t> packedBN( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, - nullptr, - 1, - Bint8_zero_point); - - DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); - - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_buffer.data(), - n, - outputProcObj, - 0, - 1); - // printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(), - // m, n_adjusted, n, "C local"); - compare_validate_buffers( - Cint8_local.data(), Cint8_fb.data(), m, n, n, static_cast<uint8_t>(0)); - } + for (int groups : {1, 3, 4}) { + for (bool test_bias: {false, true}) { + int m = shape[0]; + int n = shape[1]; + int k = shape[2]; + if (k % groups != 0) { + continue; + } + int k_per_group = k / groups; + + // mxk matrix + aligned_vector<uint8_t> Aint8(m * k, 0); + + // kxn matrix + aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + + aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + + randFill(Aint8, 0, 255); + int32_t Aint8_zero_point = 43; + + randFill(Bint8_ref, -128, 127); + for (int g = 0; g < groups; ++g) { + avoidOverflow( + m, + n, + k_per_group, + Aint8.data() + g * k_per_group, + k, + Bint8_ref.data() + g * k_per_group * n, + n); + } + + Bint8 = Bint8_ref; + + // initialize bias + aligned_vector<int32_t> bias_int32(groups * n); + int32_t* bias = nullptr; + if (test_bias) { + randFill(bias_int32, -128, 127); + bias = bias_int32.data(); + } + + if (btrans == matrix_op_t::Transpose) { + aligned_vector<int8_t> Bint8_temp(Bint8.size()); + for (int g = 0; g < groups; ++g) { + transpose_matrix( + k_per_group, + n, + Bint8.data() + g * k_per_group * n, + n, + Bint8_temp.data() + g * k_per_group, + groups * k_per_group); + } + Bint8 = Bint8_temp; + } + + int32_t Bint8_zero_point = -30; + // 
+        // To test lda != k , we just reduce k by half and use the original k
+        // as lda.
+        int n_adjusted = n;
+        if (test_ld) {
+          assert(
+              atrans == matrix_op_t::NoTranspose &&
+              "This case is not handled yet");
+          if (btrans == matrix_op_t::NoTranspose) {
+            n_adjusted = std::max(n / 2, 1);
+          }
+        }
+
+        // computing column offset
+        vector<int32_t> col_offsets;
+        col_offsets.resize(groups * n_adjusted);
+        for (int g = 0; g < groups; ++g) {
+          col_offsets_with_zero_pt_s8acc32_ref(
+              k_per_group,
+              n_adjusted,
+              n,
+              Bint8_ref.data() + g * k_per_group * n,
+              Bint8_zero_point,
+              col_offsets.data() + g * n_adjusted);
+        }
+
+        vector<int32_t> row_offsets;
+        row_offsets.resize(m);
+
+        float C_multiplier = 0.001234;
+        int32_t C_zero_pt = 5;
+
+        for (int g = 0; g < groups; ++g) {
+          matmul_u8i8acc32_ref(
+              m,
+              n_adjusted,
+              k_per_group,
+              k,
+              n,
+              groups * n,
+              Aint8.data() + g * k_per_group,
+              Bint8_ref.data() + g * k_per_group * n,
+              Cint32_ref.data() + g * n_adjusted);
+
+          row_offsets_u8acc32_ref(
+              m,
+              k_per_group,
+              k,
+              Aint8.data() + g * k_per_group,
+              row_offsets.data());
+
+          requantize_u8acc32_ref(
+              m,
+              n_adjusted,
+              groups * n,
+              Cint32_ref.data() + g * n_adjusted,
+              Cint8_ref.data() + g * n_adjusted,
+              C_multiplier,
+              C_zero_pt,
+              Aint8_zero_point,
+              Bint8_zero_point,
+              row_offsets.data(),
+              col_offsets.data() + g * n_adjusted,
+              bias ? (bias + g * n_adjusted) : nullptr);
+        }
+
+        vector<int32_t> row_offset_buf;
+        row_offset_buf.resize(
+            PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
+
+        PackAWithRowOffset<uint8_t> packAN(
+            matrix_op_t::NoTranspose,
+            m,
+            k,
+            Aint8.data(),
+            k,
+            nullptr,
+            groups,
+            Aint8_zero_point,
+            row_offset_buf.data());
+
+        PackBMatrix<int8_t> packedBN(
+            btrans,
+            k,
+            n_adjusted,
+            Bint8.data(),
+            (btrans == matrix_op_t::Transpose) ? k : n,
+            nullptr,
+            groups,
+            Bint8_zero_point);
+
+        DoNothing<> doNothingObj{};
+        ReQuantizeOutput<false> outputProcObj(
+            doNothingObj,
+            C_multiplier,
+            C_zero_pt,
+            Aint8_zero_point,
+            Bint8_zero_point,
+            packAN.getRowOffsetBuffer(),
+            col_offsets.data(),
+            bias);
+
+        fbgemmPacked(
+            packAN,
+            packedBN,
+            Cint8_fb.data(),
+            Cint32_buffer.data(),
+            groups * n,
+            outputProcObj,
+            0,
+            1);
+        // printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(),
+        // m, n_adjusted, n, "C local");
+        compare_validate_buffers(
+            Cint8_ref.data(),
+            Cint8_fb.data(),
+            m,
+            groups * n_adjusted,
+            groups * n,
+            static_cast<uint8_t>(0));
+      } // test_bias
+    } // for each groups
+  } // for each shape
 }
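A note on the grouped semantics this hunk exercises: with G groups, the GEMM decomposes into G independent multiplications of column slices of A against row blocks of B, each writing one column block of C. That is why the reference calls above offset A by g * k_per_group, B by g * k_per_group * n, and C by g * n_adjusted, while the packed path gets the same semantics from the groups argument passed to PackAWithRowOffset and PackBMatrix. A minimal sketch of that decomposition (grouped_gemm_ref is a hypothetical helper for illustration, not FBGEMM API):

    #include <cstdint>

    // A is m x k (lda = k); B stores G row-major blocks of k_per_group x n
    // stacked along rows; C is m x (G * n) with ldc = G * n.
    void grouped_gemm_ref(
        int m, int n, int k, int G,
        const std::uint8_t* A, const std::int8_t* B, std::int32_t* C) {
      int k_per_group = k / G; // the test skips shapes where G does not divide k
      for (int g = 0; g < G; ++g) {
        const std::uint8_t* Ag = A + g * k_per_group;    // column slice of A
        const std::int8_t* Bg = B + g * k_per_group * n; // row block of B
        std::int32_t* Cg = C + g * n;                    // column block of C
        for (int i = 0; i < m; ++i) {
          for (int j = 0; j < n; ++j) {
            std::int32_t sum = 0;
            for (int kk = 0; kk < k_per_group; ++kk) {
              sum += static_cast<std::int32_t>(Ag[i * k + kk]) *
                  static_cast<std::int32_t>(Bg[kk * n + j]);
            }
            Cg[i * G * n + j] = sum;
          }
        }
      }
    }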

 /**
@@ -236,129 +282,163 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
   tie(atrans, btrans, test_ld) = GetParam();

   for (auto shape : shapes) {
-    int m = shape[0];
-    int n = shape[1];
-    int k = shape[2];
-
-    aligned_vector<float> Afp32(m * k, 0.0f);
-    aligned_vector<uint8_t> Aint8(m * k, 0);
-
-    aligned_vector<float> Bfp32(k * n, 0.0f);
-    aligned_vector<int8_t> Bint8(k * n, 0);
-
-    aligned_vector<float> Cfp32_ref(m * n, 0.0f);
-    aligned_vector<float> Cfp32_fb(m * n, 0.0f);
-
-    aligned_vector<uint8_t> Cint8_fb(m * n, 0);
-    aligned_vector<int32_t> Cint32_buffer(m * n, 0);
-
-    randFill(Aint8, 0, 255);
-    int32_t Aint8_zero_point = 43;
-    float Aint8_scale = 0.11;
-    for (auto i = 0; i < Afp32.size(); ++i) {
-      Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
-    }
-
-    randFill(Bint8, -128, 127);
-    avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
-    int32_t Bint8_zero_point = -30;
-    float Bint8_scale = 0.49;
-    for (auto i = 0; i < Bfp32.size(); ++i) {
-      Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
-    }
-
-    // To test lda != k , we just reduce k by half and use the original k
-    // as lda.
-    int k_adjusted = k;
-    int n_adjusted = n;
-    if (test_ld) {
-      assert(
-          atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
-      k_adjusted = std::max(k / 2, 1);
-      if (btrans == matrix_op_t::NoTranspose) {
-        n_adjusted = std::max(n / 2, 1);
+    for (int groups : {1, 3, 4}) {
+      int m = shape[0];
+      int n = shape[1];
+      int k = shape[2];
+      if (k % groups != 0) {
+        continue;
+      }
+      int k_per_group = k / groups;
+
+      aligned_vector<float> Afp32(m * k, 0.0f);
+      aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
+
+      aligned_vector<float> Bfp32(k * n, 0.0f);
+      aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+
+      aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
+      aligned_vector<float> Cfp32_fb(Cfp32_ref.size(), 0.0f);
+
+      aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size(), 0);
+      aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size(), 0);
+
+      randFill(Aint8, 0, 255);
+      int32_t Aint8_zero_point = 43;
+      float Aint8_scale = 0.11;
+      for (auto i = 0; i < Afp32.size(); ++i) {
+        Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
+      }
+
+      randFill(Bint8, -128, 127);
+      for (int g = 0; g < groups; ++g) {
+        avoidOverflow(
+            m,
+            n,
+            k_per_group,
+            Aint8.data() + g * k_per_group,
+            k,
+            Bint8.data() + g * k_per_group * n,
+            n);
+      }
+      int32_t Bint8_zero_point = -30;
+      float Bint8_scale = 0.49;
+      for (auto i = 0; i < Bfp32.size(); ++i) {
+        Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
+      }
+
+      // To test lda != k , we just reduce k by half and use the original k
+      // as lda.
+      int n_adjusted = n;
+      if (test_ld) {
+        assert(
+            atrans == matrix_op_t::NoTranspose &&
+            "This case is not handled yet");
+        if (btrans == matrix_op_t::NoTranspose) {
+          n_adjusted = std::max(n / 2, 1);
+        }
+      }
+
+      // computing column offset
+      vector<int32_t> col_offsets;
+      col_offsets.resize(groups * n_adjusted);
+      for (int g = 0; g < groups; ++g) {
+        col_offsets_with_zero_pt_s8acc32_ref(
+            k_per_group,
+            n_adjusted,
+            n,
+            Bint8.data() + g * k_per_group * n,
+            Bint8_zero_point,
+            col_offsets.data() + g * n_adjusted);
+      }
+
+      if (btrans == matrix_op_t::Transpose) {
+        aligned_vector<int8_t> Bint8_temp(Bint8.size());
+        for (int g = 0; g < groups; ++g) {
+          transpose_matrix(
+              k_per_group,
+              n,
+              Bint8.data() + g * k_per_group * n,
+              n,
+              Bint8_temp.data() + g * k_per_group,
+              groups * k_per_group);
+        }
+        Bint8 = Bint8_temp;
+      }
+
+      for (int g = 0; g < groups; ++g) {
+        matmul_fp_ref(
+            m,
+            n_adjusted,
+            k_per_group,
+            k,
+            n,
+            groups * n,
+            Afp32.data() + g * k_per_group,
+            Bfp32.data() + g * k_per_group * n,
+            Cfp32_ref.data() + g * n_adjusted);
       }
-    }
-
-    // computing column offset
-    vector<int32_t> col_offsets;
-    col_offsets.resize(n_adjusted);
-    col_offsets_with_zero_pt_s8acc32_ref(
-        k_adjusted,
-        n_adjusted,
-        n,
-        Bint8.data(),
-        Bint8_zero_point,
-        col_offsets.data());
-
-    if (btrans == matrix_op_t::Transpose) {
-      transpose_matrix(Bint8.data(), k, n);
-    }
-
-    matmul_fp_ref(
-        m,
-        n_adjusted,
-        k_adjusted,
-        k,
-        n,
-        n,
-        Afp32.data(),
-        Bfp32.data(),
-        Cfp32_ref.data());
-
-    vector<int32_t> row_offset_buf;
-    row_offset_buf.resize(
-        PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
-
-    PackAWithQuantRowOffset<uint8_t> packAN(
-        matrix_op_t::NoTranspose,
-        m,
-        k_adjusted,
-        Afp32.data(),
-        k,
-        nullptr, /*buffer for packed matrix*/
-        Aint8_scale,
-        Aint8_zero_point,
-        1, /*groups*/
-        row_offset_buf.data());
-
-    PackBMatrix<int8_t> packedBN(
-        btrans,
-        k_adjusted,
-        n_adjusted,
-        Bint8.data(),
-        (btrans == matrix_op_t::Transpose) ? k : n,
-        nullptr,
-        1,
-        Bint8_zero_point);
-
-    DoNothing<float, float> doNothingObj{};
-    ReQuantizeForFloat<false> outputProcObj(
-        doNothingObj,
-        Aint8_scale,
-        Bint8_scale,
-        Aint8_zero_point,
-        Bint8_zero_point,
-        packAN.getRowOffsetBuffer(),
-        col_offsets.data(),
-        nullptr);
-
-    fbgemmPacked(
-        packAN,
-        packedBN,
-        Cfp32_fb.data(),
-        (int32_t*)Cfp32_fb.data(),
-        n,
-        outputProcObj,
-        0,
-        1);
-
-    float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
-    float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
-    float atol = (maximum - minimum) / 255 / 1.9;
-
-    compare_validate_buffers(Cfp32_ref.data(), Cfp32_fb.data(), m, n, n, atol);
-  }
+
+      vector<int32_t> row_offset_buf;
+      row_offset_buf.resize(
+          PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
+
+      PackAWithQuantRowOffset<uint8_t> packAN(
+          matrix_op_t::NoTranspose,
+          m,
+          k,
+          Afp32.data(),
+          k,
+          nullptr, /*buffer for packed matrix*/
+          Aint8_scale,
+          Aint8_zero_point,
+          groups,
+          row_offset_buf.data());
+
+      PackBMatrix<int8_t> packedBN(
+          btrans,
+          k,
+          n_adjusted,
+          Bint8.data(),
+          (btrans == matrix_op_t::Transpose) ? k : n,
+          nullptr,
+          groups,
+          Bint8_zero_point);
+
+      DoNothing<float, float> doNothingObj{};
+      ReQuantizeForFloat<false> outputProcObj(
+          doNothingObj,
+          Aint8_scale,
+          Bint8_scale,
+          Aint8_zero_point,
+          Bint8_zero_point,
+          packAN.getRowOffsetBuffer(),
+          col_offsets.data(),
+          nullptr);
+
+      fbgemmPacked(
+          packAN,
+          packedBN,
+          Cfp32_fb.data(),
+          reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+          groups * n,
+          outputProcObj,
+          0,
+          1);
+
+      float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
+      float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
+      float atol = (maximum - minimum) / 255 / 1.9;
+
+      compare_validate_buffers(
+          Cfp32_ref.data(),
+          Cfp32_fb.data(),
+          m,
+          groups * n_adjusted,
+          groups * n,
+          atol);
+    } // for each groups
+  } // for each shape
 }
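A note on the tolerance computed above: the float-output path cannot be compared bit-for-bit against the fp32 reference, so atol is scaled to the output range. The formula treats the observed range as if quantized to uint8, one step being (maximum - minimum) / 255, and accepts slightly more than half a step of discrepancy (dividing by 1.9 rather than 2 leaves a little slack for rounding). For example, a reference output spanning [-100, 155] has range 255, so one step is 255 / 255 = 1.0 and atol = 1.0 / 1.9, roughly 0.526.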

 /**
@@ -373,253 +453,121 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
   tie(atrans, btrans, test_ld) = GetParam();

   for (auto shape : shapes) {
-    int m = shape[0];
-    int n = shape[1];
-    int k = shape[2];
-
-    aligned_vector<float> Afp32(m * k, 0.0f);
-    aligned_vector<uint8_t> Aint8(m * k, 0);
-
-    aligned_vector<float> Bfp32(k * n, 0.0f);
-    aligned_vector<int8_t> Bint8(k * n, 0);
-
-    aligned_vector<float> Cfp32_ref(m * n, 0.0f);
-    aligned_vector<int32_t> Cint32_fb(m * n, 0);
-
-    randFill(Afp32, 0, 255);
-    for (auto i = 0; i < Afp32.size(); i++) {
-      Aint8[i] = (uint8_t)Afp32[i];
-    }
-
-    // initialize B matrix
-    randFill(Bfp32, -128, 127);
-    avoidOverflow(m, n, k, Aint8.data(), Bfp32.data());
-
-    for (auto i = 0; i < Bfp32.size(); ++i) {
-      Bint8[i] = (int8_t)Bfp32[i];
-    }
-
-    // To test lda != k , we just reduce k by half and use the original k
-    // as lda.
-    int m_adjusted = m;
-    int n_adjusted = n;
-    int k_adjusted = k;
-    if (test_ld) {
-      assert(
-          atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
-      k_adjusted = std::max(k / 2, 1);
-      if (btrans == matrix_op_t::NoTranspose) {
-        n_adjusted = std::max(n / 2, 1);
-      }
-    }
-
-    if (btrans == matrix_op_t::Transpose) {
-      transpose_matrix(Bint8.data(), k, n);
-    }
-
-    matmul_fp_ref(
-        m,
-        n_adjusted,
-        k_adjusted,
-        k,
-        n,
-        n,
-        Afp32.data(),
-        Bfp32.data(),
-        Cfp32_ref.data());
-
-    DoNothing<int32_t, int32_t> doNothingObj{};
-    memCopy<> outputProcObj(doNothingObj);
-    // A zero point and row offset not required
-    PackAMatrix<uint8_t> packAN(
-        matrix_op_t::NoTranspose, m, k_adjusted, Aint8.data(), k);
-
-    // B zero point defaults to 0
-    PackBMatrix<int8_t> packedBN(
-        btrans,
-        k_adjusted,
-        n_adjusted,
-        Bint8.data(),
-        (btrans == matrix_op_t::Transpose) ? k : n);
-
-    fbgemmPacked(
-        packAN,
-        packedBN,
-        Cint32_fb.data(),
-        Cint32_fb.data(),
-        n,
-        outputProcObj,
-        0,
-        1);
-
-    // correctness check
-    for (int i = 0; i < m_adjusted; ++i) {
-      for (int j = 0; j < n_adjusted; ++j) {
-        float expected = Cfp32_ref[i * n + j];
-        int32_t actual = Cint32_fb[i * n + j];
-        EXPECT_EQ(expected, actual)
-            << "GEMM results differ at (" << i << ", " << j << "). ref "
-            << expected << " FBGemm " << actual;
+    for (int groups : {1, 3, 4}) {
+      int m = shape[0];
+      int n = shape[1];
+      int k = shape[2];
+      if (k % groups != 0) {
+        continue;
       }
-    }
-  }
-}
+      int k_per_group = k / groups;

-/**
- * @brief Unit test for unt8 matrix A, int8 matrix B, and 32-bit
- * accumulation. Output processing: requantization with bias -> nothing.
- * Asymmetric: the zero point is not 0.
- */
-TEST_P(fbgemmu8s8acc32test, TestAsymmetricQuantizedWithBias) {
-  vector<vector<int>> shapes(GetShapes_());
-  matrix_op_t atrans, btrans;
-  bool test_ld;
-  tie(atrans, btrans, test_ld) = GetParam();
+      aligned_vector<float> Afp32(m * k, 0.0f);
+      aligned_vector<uint8_t> Aint8(Afp32.size(), 0);

-  for (auto shape : shapes) {
-    int m = shape[0];
-    int n = shape[1];
-    int k = shape[2];
+      aligned_vector<float> Bfp32(k * n, 0.0f);
+      aligned_vector<int8_t> Bint8(Bfp32.size(), 0);

-    aligned_vector<uint8_t> Aint8(m * k, 0);
-    aligned_vector<uint8_t> Aint8_ref(m * k, 0);
+      aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
+      aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size(), 0);

-    aligned_vector<int8_t> Bint8(k * n, 0);
-    aligned_vector<int8_t> Bint8_ref(k * n, 0);
+      randFill(Afp32, 0, 255);
+      for (auto i = 0; i < Afp32.size(); i++) {
+        Aint8[i] = (uint8_t)Afp32[i];
+      }

-    aligned_vector<int32_t> Cint32_fb(m * n, 0);
-    aligned_vector<int32_t> Cint32_ref(m * n, 0);
+      // initialize B matrix
+      randFill(Bfp32, -128, 127);
+      for (int g = 0; g < groups; ++g) {
+        avoidOverflow(
+            m,
+            n,
+            k_per_group,
+            Aint8.data() + g * k_per_group,
+            k,
+            Bfp32.data() + g * k_per_group * n,
+            n);
+      }

-    aligned_vector<uint8_t> Cint8_fb(m * n, 0);
-    aligned_vector<uint8_t> Cint8_ref(m * n, 0);
+      for (auto i = 0; i < Bfp32.size(); ++i) {
+        Bint8[i] = (int8_t)Bfp32[i];
+      }

-    int n_adjusted = n;
-    int k_adjusted = k;
+      // To test lda != k , we just reduce k by half and use the original k
+      // as lda.
+      int n_adjusted = n;
+      if (test_ld) {
+        assert(
+            atrans == matrix_op_t::NoTranspose &&
+            "This case is not handled yet");
+        if (btrans == matrix_op_t::NoTranspose) {
+          n_adjusted = std::max(n / 2, 1);
+        }
+      }
+
+      if (btrans == matrix_op_t::Transpose) {
+        aligned_vector<int8_t> Bint8_temp(Bint8.size());
+        for (int g = 0; g < groups; ++g) {
+          transpose_matrix(
+              k_per_group,
+              n,
+              Bint8.data() + g * k_per_group * n,
+              n,
+              Bint8_temp.data() + g * k_per_group,
+              groups * k_per_group);
+        }
+        Bint8 = Bint8_temp;
+      }
+
+      for (int g = 0; g < groups; ++g) {
+        matmul_fp_ref(
+            m,
+            n_adjusted,
+            k_per_group,
+            k,
+            n,
+            groups * n,
+            Afp32.data() + g * k_per_group,
+            Bfp32.data() + g * k_per_group * n,
+            Cfp32_ref.data() + g * n_adjusted);
+      }

-    if (test_ld) {
-      assert(
-          atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
-      k_adjusted = std::max(k / 2, 1);
-      if (btrans == matrix_op_t::NoTranspose) {
-        n_adjusted = std::max(n / 2, 1);
+      DoNothing<int32_t, int32_t> doNothingObj{};
+      memCopy<> outputProcObj(doNothingObj);
+      // A zero point and row offset not required
+      PackAMatrix<uint8_t> packAN(
+          matrix_op_t::NoTranspose, m, k, Aint8.data(), k, nullptr, groups);
+
+      // B zero point defaults to 0
+      PackBMatrix<int8_t> packedBN(
+          btrans,
+          k,
+          n_adjusted,
+          Bint8.data(),
+          (btrans == matrix_op_t::Transpose) ? k : n,
+          nullptr,
+          groups);
+
+      fbgemmPacked(
+          packAN,
+          packedBN,
+          Cint32_fb.data(),
+          Cint32_fb.data(),
+          groups * n,
+          outputProcObj,
+          0,
+          1);
+
+      // correctness check
+      for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < groups * n_adjusted; ++j) {
+          float expected = Cfp32_ref[i * groups * n + j];
+          int32_t actual = Cint32_fb[i * groups * n + j];
+          EXPECT_EQ(expected, actual)
+              << "GEMM results differ at (" << i << ", " << j << "). ref "
ref " + << expected << " FBGemm " << actual; + } } - } - - // A and B have scale 1, so exactly represented after quantization - randFill(Aint8, 0, 255); - randFill(Bint8, -128, 127); - avoidOverflow(m, n, k, Aint8.data(), Bint8.data()); - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8_ref[i] = Bint8[i]; - } - - for (auto i = 0; i < Aint8.size(); ++i) { - Aint8_ref[i] = Aint8[i]; - } - - int32_t Aint8_zero_point = 55; - int32_t Bint8_zero_point = -17; - - // initialize bias - aligned_vector<int32_t> bias_int32(n); - randFill(bias_int32, -128, 127); - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8_ref.data(), - Bint8_zero_point, - col_offsets.data()); - - matmul_u8i8acc32_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - Aint8.data(), - Bint8_ref.data(), - Cint32_ref.data()); - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - row_offsets_u8acc32_ref( - m, k_adjusted, k, Aint8_ref.data(), row_offsets.data()); - - float C_multiplier = 0.1234; - int32_t C_zero_pt = 5; - - requantize_u8acc32_ref( - m, - n_adjusted, - n, - Cint32_ref.data(), - Cint8_ref.data(), - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - row_offsets.data(), - col_offsets.data(), - bias_int32.data()); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - PackBMatrix<int8_t> packedBN( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, - nullptr, - 1, - Bint8_zero_point); - - DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - bias_int32.data()); - - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_fb.data(), - n, - outputProcObj, - 0, - 1); - - compare_validate_buffers( - Cint8_fb.data(), Cint8_ref.data(), m, n, n, static_cast<uint8_t>(0)); - } + } // for each groups + } // for each shape } diff --git a/test/QuantizationHelpers.cc b/test/QuantizationHelpers.cc index eab08de..cb3edd3 100644 --- a/test/QuantizationHelpers.cc +++ b/test/QuantizationHelpers.cc @@ -17,12 +17,19 @@ namespace fbgemm { * @brief Make sure we won't have overflows from vpmaddubsw instruction. 
diff --git a/test/QuantizationHelpers.cc b/test/QuantizationHelpers.cc
index eab08de..cb3edd3 100644
--- a/test/QuantizationHelpers.cc
+++ b/test/QuantizationHelpers.cc
@@ -17,12 +17,19 @@ namespace fbgemm {
  * @brief Make sure we won't have overflows from vpmaddubsw instruction.
  */
 template <typename T>
-void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B) {
+void avoidOverflow(
+    int m,
+    int n,
+    int k,
+    const uint8_t* Aint8,
+    int lda,
+    T* B,
+    int ldb) {
   for (int i = 0; i < m; ++i) {
     for (int j = 0; j < n; ++j) {
       for (int kk = 0; kk < k / 2 * 2; kk += 2) {
-        int a0 = Aint8[i * k + kk], a1 = Aint8[i * k + kk + 1];
-        int b0 = B[kk * n + j], b1 = B[(kk + 1) * n + j];
+        int a0 = Aint8[i * lda + kk], a1 = Aint8[i * lda + kk + 1];
+        int b0 = B[kk * ldb + j], b1 = B[(kk + 1) * ldb + j];
         int sum_pair = a0 * b0 + a1 * b1;
         if (sum_pair < numeric_limits<int16_t>::lowest()) {
           int b1_adjusted =
@@ -43,13 +50,23 @@ void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B) {
           assert(
               new_sum_pair >= numeric_limits<int16_t>::lowest() &&
               new_sum_pair <= numeric_limits<int16_t>::max());
-          B[(kk + 1) * n + j] = b1_adjusted;
+          B[(kk + 1) * ldb + j] = b1_adjusted;
         }
       }
     } // for each j
   } // for each i
 }

+template <typename T>
+void avoidOverflow(
+    int m,
+    int n,
+    int k,
+    const uint8_t* Aint8,
+    T* B) {
+  return avoidOverflow(m, n, k, Aint8, k, B, n);
+}
+
 template void
 avoidOverflow(int m, int n, int k, const uint8_t* Aint8, int8_t* B);
 template void
diff --git a/test/QuantizationHelpers.h b/test/QuantizationHelpers.h
index 42c3e08..d645a4e 100644
--- a/test/QuantizationHelpers.h
+++ b/test/QuantizationHelpers.h
@@ -13,6 +13,16 @@ namespace fbgemm {
  * @brief Make sure we won't have overflows from vpmaddubsw instruction.
  */
 template <typename T>
+void avoidOverflow(
+    int m,
+    int n,
+    int k,
+    const uint8_t* Aint8,
+    int lda,
+    T* B,
+    int ldb);
+
+template <typename T>
 void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B);
 } // namespace fbgemm
diff --git a/test/TestUtils.cc b/test/TestUtils.cc
index 5cc14ef..3200246 100644
--- a/test/TestUtils.cc
+++ b/test/TestUtils.cc
@@ -79,18 +79,25 @@ template bool
 check_all_zero_entries<uint8_t>(const uint8_t* test, int m, int n);

 template <typename T>
-void transpose_matrix(T* ref, int n, int k) {
-  aligned_vector<T> local(n * k, 0);
-  for (int i = 0; i < n; ++i) {
-    for (int j = 0; j < k; ++j) {
-      local[j * n + i] = ref[i * k + j];
+void transpose_matrix(
+    int M,
+    int N,
+    const T* src,
+    int ld_src,
+    T* dst,
+    int ld_dst) {
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < M; ++j) {
+      dst[i * ld_dst + j] = src[i + j * ld_src];
     }
-  }
-  for (int i = 0; i < k; ++i) {
-    for (int j = 0; j < n; ++j) {
-      ref[i * n + j] = local[i * n + j];
-    }
-  }
+  } // for each output row
+}
+
+template <typename T>
+void transpose_matrix(T* ref, int n, int k) {
+  std::vector<T> local(n * k);
+  transpose_matrix(n, k, ref, k, local.data(), n);
+  memcpy(ref, local.data(), n * k * sizeof(T));
 }

 template void transpose_matrix<float>(float* ref, int n, int k);
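The out-of-place transpose_matrix implemented above takes an M x N source with leading dimension ld_src and writes the N x M transpose into a destination with its own leading dimension; the tests use it to deposit each group's transposed k_per_group x n block into an interleaved B with ld_dst = groups * k_per_group. A small illustration of the semantics (values arbitrary; assumes an int8_t instantiation is available, as the tests require):

    // 2x3 row-major source transposed into the first two columns of a 3x4
    // destination (ld_dst = 4); columns beyond M are left untouched.
    std::int8_t src[2 * 3] = {1, 2, 3,
                              4, 5, 6};
    std::int8_t dst[3 * 4] = {0};
    fbgemm::transpose_matrix(2, 3, src, 3, dst, 4);
    // dst now holds {1, 4, 0, 0,
    //                2, 5, 0, 0,
    //                3, 6, 0, 0}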
diff --git a/test/TestUtils.h b/test/TestUtils.h
index 6cc365f..2cb7b88 100644
--- a/test/TestUtils.h
+++ b/test/TestUtils.h
@@ -32,9 +32,24 @@ bool check_all_zero_entries(const T* test, int m, int n);

 /*
  * @brief In-place transposition for nxk matrix ref.
- * @params n number of rows in output (number of columns in input)
- * @params k number of columns in output (number of rows in input)
+ * @params n number of rows in input (number of columns in output)
+ * @params k number of columns in input (number of rows in output)
  */
 template <typename T>
 void transpose_matrix(T* ref, int n, int k);
+
+/*
+ * @brief Out-of-place transposition for an M*N matrix src.
+ * @params M number of rows in input
+ * @params N number of columns in input
+ */
+template <typename T>
+void transpose_matrix(
+    int M,
+    int N,
+    const T* src,
+    int ld_src,
+    T* dst,
+    int ld_dst);
+
 } // namespace fbgemm
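One last note on compatibility: both helpers keep their legacy signatures as thin wrappers, so existing call sites need no change. For avoidOverflow the old contiguous form is just the strided form with lda = k and ldb = n, and the in-place transpose_matrix(ref, n, k) now forwards to the out-of-place overload through a temporary buffer. Illustrative equivalence:

    // These two calls do the same thing after this change:
    fbgemm::avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
    fbgemm::avoidOverflow(
        m, n, k, Aint8.data(), /*lda=*/k, Bint8.data(), /*ldb=*/n);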