Merge upstream master

author: Young Jin Kim <youki@microsoft.com> 2019-08-01 22:38:23 +0300
committer: Young Jin Kim <youki@microsoft.com> 2019-08-01 22:38:23 +0300
commit: eb8fede25bd048da6fd396654936703a474f0504 (patch)
tree: 943fd29e7e173fb1075b9886b0309765f5f4b114
parent: e4ed5196cbec0d0a485578996b09912e92927e02 (diff)
parent: f712cb2328a2b29424bdaeecb9c0731da2cd997b (diff)
23 files changed, 1322 insertions, 376 deletions
diff --git a/README.md b/README.md
index 2335b81..d287c44 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,9 @@ General build instructions are as follows:
 ```
 git clone --recursive https://github.com/pytorch/FBGEMM.git
 cd FBGEMM
+# if you are updating an existing checkout
+git submodule sync
+git submodule update --init --recursive
 mkdir build && cd build
 cmake ..
 make
diff --git a/bench/ConvUnifiedBenchmark.cc b/bench/ConvUnifiedBenchmark.cc
index 6bc2cf4..b450beb 100644
--- a/bench/ConvUnifiedBenchmark.cc
+++ b/bench/ConvUnifiedBenchmark.cc
@@ -26,16 +26,18 @@ using namespace fbgemm;
 
 // 2D conv shapes
 vector<conv_param_t<2>> shapes_2d = {
-    // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
-    // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right
-    // 2D convolutions
-    // regular
-    conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-    // groupwise
-    conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-
-    // DW
-    conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+  // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
+  // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right
+  // 2D convolutions
+  // regular
+  conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+  // groupwise
+  conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+  // DW
+  conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+  // Pointwise
+  conv_param_t<>(1, 128, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})
+
 };
 
 // 3D conv shapes
@@ -43,9 +45,11 @@ vector<conv_param_t<3>> shapes_3d = {
   // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h, stride_w},
   // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right}
   // Regular
-  conv_param_t<3>(1, 64, 64, {32, 56, 56}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
+  conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
   // Depthwise
-  conv_param_t<3>(1, 64, 64, {32, 56, 56}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1})
+  conv_param_t<3>(1, 64, 64, {8, 14, 14}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
+  // Pointwise
+  conv_param_t<3>(1, 128, 128, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0})
 };
 
 template <int SPATIAL_DIM, typename Acc_t>
@@ -110,6 +114,9 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
     aligned_vector<int8_t> Bint8(
         kernel_dim * conv_p.IC * (conv_p.OC / conv_p.G));
 
+    aligned_vector<int8_t> Bint8_tr(
+        kernel_dim * conv_p.IC * (conv_p.OC / conv_p.G));
+
     int im_out_dim = accumulate(
         conv_p.OUT_DIM.begin(), conv_p.OUT_DIM.end(), 1, multiplies<int>());
     aligned_vector<int32_t> Cint32_ref(conv_p.MB * im_out_dim * conv_p.OC);
@@ -132,14 +139,14 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
     randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2);
     int32_t C_zero_point = 5;
 
-    aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
-
     // reference implementation
+    // conv_ref expects weights to be in G (R S C/G) K/G
+    transposeConvWeights<SPATIAL_DIM>(conv_p, Bint8.data(), Bint8_tr.data());
     conv_ref(
         conv_p,
         Aint8.data(),
         Aint8_zero_point,
-        Bint8.data(),
+        Bint8_tr.data(),
         Cint32_ref.data());
 
     // matrix dimensions after im2col
@@ -162,7 +169,7 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
           KDimPerGroup,
           OC_per_G,
           OC_per_G,
-          Bint8.data() + g * KDimPerGroup * OC_per_G,
+          Bint8_tr.data() + g * KDimPerGroup * OC_per_G,
           Bint8_zero_point.data(),
           col_offsets.data() + g * OC_per_G,
           conv_p.OC);
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index bdcf308..7f428ed 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -489,6 +489,15 @@ class FBGEMM_API PackBMatrix final
   const T* smat_;
   std::int32_t ld_;
   std::int32_t row_interleave_;
+
+  /**
+   * @brief Internal function performing both pack & unpack
+   */
+  void pack_unpack_(
+      const block_type_t& block,
+      T* unpack_buf,
+      T* pack_buf,
+      bool ispack);
 };
 
 /**
@@ -521,6 +530,11 @@ class FBGEMM_API PackWeightMatrixForGConv {
   void pack();
 
   /**
+   * @brief Unpacks a pmat buffer into source matrix.
+   */
+  void unpack(T* origin_buf);
+
+  /**
    * @brief Return packed data
    */
   inpType* getBuf() {
@@ -543,6 +557,22 @@ class FBGEMM_API PackWeightMatrixForGConv {
   const T* sdata_;
   T* pdata_;
   bool bufAllocatedHere_;
+
+  /**
+   * @brief Internal function performing both pack & unpack
+   */
+  void pack_unpack_(const T* src, T* dst, bool ispack);
+
+  /**
+   * @brief Get the index of the unpacked data
+   */
+  int unpacked_index_(int r, int s, int k, int g, int c, bool tr);
+
+  /**
+   * @brief Get the index of the packed data
+   */
+  int packed_index_(int r, int s, int k, int g, int c);
+
 };
 
 /**
@@ -588,7 +618,40 @@ class FBGEMM_API PackWeightsForConv {
     return W_gconv_packed_;
   }
 
+  std::shared_ptr<PackBMatrix<T, accT>> getPackedWForPointwise() {
+    return W_pointwise_packed_;
+  }
+
+  int inputChannels() {
+    return conv_param_.IC;
+  }
+
+  int outputChannels() {
+    return conv_param_.OC;
+  }
+
+  std::array<int, SPATIAL_DIM> kernelDims() {
+    return conv_param_.K;
+  }
+
+  int groups() {
+    return conv_param_.G;
+  }
+
+  /**
+   * @brief Returns true if the packed weights would work for the given
+   * convolution parameters, and false otherwise
+   */
+  bool isPackingCompliant(const conv_param_t<SPATIAL_DIM>& conv_p);
+
+  /**
+   * @brief Unpack packed matrix into origin_buf (used for serialization to
+   * recover the weight matrix).
+   */
+  void unpack(T* origin_buf);
+
  private:
+  const conv_param_t<SPATIAL_DIM> conv_param_;
   // Packed weights if we use im2col based convolution implementation
   std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_;
   // Packed weights if we use 2D depthwise convolution implementation
@@ -599,6 +662,8 @@ class FBGEMM_API PackWeightsForConv {
   // implementation
   std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
       W_gconv_packed_;
+  // Packed weights if we use direct gemm for pointwise convolution
+  std::shared_ptr<PackBMatrix<T, accT>> W_pointwise_packed_;
 };
 
 /**
@@ -1374,6 +1439,13 @@ template <int SPATIAL_DIM>
 FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p);
 
 /**
+ * @brief Is this convolution a direct matrix-matrix multiplication, i.e., 1x1
+ * (aka pointwise) with unit stride, unit dilation, and zero padding?
+ */
+template <int SPATIAL_DIM>
+FBGEMM_API bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);
+
+/**
  * @brief Allocate __size bytes of uninitialized storage whose alignment is
  * specified by __align.
  */
diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h
index 069ff77..e7b0ec4 100644
--- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h
+++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h
@@ -16,7 +16,7 @@ namespace fbgemm {
 template <int KERNEL_PROD>
 class FBGEMM_API PackedDepthWiseConvMatrix {
  public:
-  // smat in RSG layout
+  // smat in GRS layout
   PackedDepthWiseConvMatrix(int K, const std::int8_t* smat);
   virtual ~PackedDepthWiseConvMatrix();
 
@@ -24,6 +24,17 @@ class FBGEMM_API PackedDepthWiseConvMatrix {
     return pmat_;
   }
 
+  /**
+   * @brief Unpacks pmat_ into unpacked_data.
+   * Used for recovering the weight matrix into the original format
+   */
+  void unpack(std::int8_t* unpacked_data);
+
+  /**
+   * @brief Returns the index into pmat_ for the given row and column of smat.
+   */
+  int addr(int r, int c);
+
  private:
   int K_;
   std::int8_t* pmat_;
@@ -31,6 +42,13 @@ class FBGEMM_API PackedDepthWiseConvMatrix {
 
 using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>;
 using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>;
+using Packed1ConvMatrix = PackedDepthWiseConvMatrix<1>;
+using Packed2ConvMatrix = PackedDepthWiseConvMatrix<2>;
+using Packed3ConvMatrix = PackedDepthWiseConvMatrix<3>;
+using Packed4ConvMatrix = PackedDepthWiseConvMatrix<4>;
+using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>;
+using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>;
+using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>;
 
 /**
  * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
index 68fe177..eac0bcd 100644
--- a/include/fbgemm/Utils.h
+++ b/include/fbgemm/Utils.h
@@ -44,7 +44,7 @@ enum class inst_set_t { anyarch, avx2, avx512 };
 /**
  * @brief Typed enum for optimized paths for convolutions
  */
-enum class optimized_conv_t { depthwise, groupwise, im2col };
+enum class optimized_conv_t { depthwise, groupwise, pointwise, im2col };
 
 /**
  * @brief Typed enum for implementation type.
diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc
index 5db63f6..027e6c5 100644
--- a/src/FbgemmConv.cc
+++ b/src/FbgemmConv.cc
@@ -6,8 +6,9 @@
  */
 
 #include <algorithm>
-#include <iostream>
+#include <numeric>
 #include <vector>
+#include <functional>
 #include "fbgemm/Fbgemm.h"
 
 namespace fbgemm {
@@ -33,12 +34,24 @@ bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
          });
 }
 
+template <int SPATIAL_DIM>
+bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
+  return std::accumulate(conv_p.K.begin(), conv_p.K.end(), 0) == SPATIAL_DIM &&
+      std::accumulate(conv_p.stride.begin(), conv_p.stride.end(), 0) ==
+      SPATIAL_DIM &&
+      std::accumulate(conv_p.dilation.begin(), conv_p.dilation.end(), 0) ==
+      SPATIAL_DIM &&
+      std::accumulate(conv_p.pad.begin(), conv_p.pad.end(), 0) == 0;
+}
+
 template <int SPATIAL_DIM, typename ACC_T>
 optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
   if (takeDepthWiseFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
     return optimized_conv_t::depthwise;
   } else if (fbgemmOptimizedGConv<SPATIAL_DIM>(conv_p)) {
     return optimized_conv_t::groupwise;
+  } else if (takePointWiseFastPath<SPATIAL_DIM>(conv_p)) {
+    return optimized_conv_t::pointwise;
   } else {
     return optimized_conv_t::im2col;
   }
@@ -58,6 +71,13 @@ int fbgemmConv(
   static_assert(
       SPATIAL_DIM == 2 || SPATIAL_DIM == 3,
       "Only 2D and 3D convolutions are supported");
+
+  if (!packed_weights.isPackingCompliant(conv_p)) {
+    throw std::logic_error(
+        "[FBGEMM_CONV_ERROR] Prepacked weights can't be used"
+        " with these convolution parameters!");
+  }
+
   switch (ConvFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
     case optimized_conv_t::depthwise: {
       // 2D and 3D depthwise fast path
@@ -134,11 +154,44 @@ int fbgemmConv(
           num_threads);
       break;
     }
+    case optimized_conv_t::pointwise: {
+      std::vector<int32_t> row_offset_buf(
+          PackAWithRowOffset<uint8_t>::rowOffsetBufferSize(blocking_params));
+      int image_dim = std::accumulate(
+          conv_p.IN_DIM.begin(),
+          conv_p.IN_DIM.end(),
+          1,
+          std::multiplies<int>());
+      PackAWithRowOffset<uint8_t, ACC_T> packA(
+          matrix_op_t::NoTranspose,
+          conv_p.MB * image_dim,
+          conv_p.IC,
+          activations,
+          conv_p.IC,
+          nullptr,
+          conv_p.G,
+          row_offset_buf.data(),
+          blocking_params);
+
+      outProcess.setRowOffsets(row_offset_buf.data());
+      fbgemmPacked(
+          packA,
+          *(packed_weights.getPackedWForPointwise()),
+          out,
+          outBuffer,
+          conv_p.OC,
+          outProcess,
+          thread_id,
+          num_threads,
+          blocking_params);
+      break;
+    }
     case optimized_conv_t::im2col: {
       // All other convolutions go through im2col-based implementation
       // std::cout << "Im2col path" << std::endl;
       std::vector<int32_t> row_offset_buf(
-          PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>::rowOffsetBufferSize());
+          PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>
+          ::rowOffsetBufferSize(blocking_params));
 
       const std::int32_t* b_zero_point = outProcess.getBZeroPoint();
       bool b_symmetric = b_zero_point[0] == 0;
diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc
index f96d1d2..183a8a9 100644
--- a/src/FbgemmI8DepthwiseAvx2.cc
+++ b/src/FbgemmI8DepthwiseAvx2.cc
@@ -170,6 +170,45 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix(
 }
 
 template <int KERNEL_PROD>
+int PackedDepthWiseConvMatrix<KERNEL_PROD>::addr(int r, int c) {
+  constexpr int KERNEL_PROD_ALIGNED = (KERNEL_PROD + 1) / 2 * 2;
+  if (c >= KERNEL_PROD / 4 * 4 &&
+      (KERNEL_PROD % 4 == 1 || KERNEL_PROD % 4 == 2)) {
+    int kBlock = r / 32;
+    int reg_idx = (r % 16) / 8 + c / 4 * 4;
+
+    int blk_idx = kBlock * KERNEL_PROD_ALIGNED + reg_idx;
+
+    int r_ = r % 8;
+    int c_ = c % 4;
+
+    int in_blk_idx = (r % 32) / 16 * 16 + 2 * r_ + c_;
+    return blk_idx * 32 + in_blk_idx;
+
+  } else {
+    int kBlock = r / 32;
+    int reg_idx = (r % 16) / 4 + c / 4 * 4;
+
+    int blk_idx = kBlock * KERNEL_PROD_ALIGNED + reg_idx;
+
+    int r_ = r % 4;
+    int c_ = c % 4;
+
+    int in_blk_idx = (r % 32) / 16 * 16 + 4 * r_ + c_;
+    return blk_idx * 32 + in_blk_idx;
+  }
+}
+
+template <int KERNEL_PROD>
+void PackedDepthWiseConvMatrix<KERNEL_PROD>::unpack(int8_t* unpacked_data) {
+  for (int r = 0; r < K_; ++r) {
+    for (int c = 0; c < KERNEL_PROD; ++c) {
+      unpacked_data[r * KERNEL_PROD + c] = pmat_[addr(r, c)];
+    }
+  }
+}
+
+template <int KERNEL_PROD>
 PackedDepthWiseConvMatrix<KERNEL_PROD>::~PackedDepthWiseConvMatrix() {
 #ifdef _MSC_VER
   _aligned_free(pmat_);
@@ -180,6 +219,13 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::~PackedDepthWiseConvMatrix() {
 
 template class PackedDepthWiseConvMatrix<3 * 3>;
 template class PackedDepthWiseConvMatrix<3 * 3 * 3>;
+template class PackedDepthWiseConvMatrix<1>;
+template class PackedDepthWiseConvMatrix<2>;
+template class PackedDepthWiseConvMatrix<3>;
+template class PackedDepthWiseConvMatrix<4>;
+template class PackedDepthWiseConvMatrix<5>;
+template class PackedDepthWiseConvMatrix<5 * 2>;
+template class PackedDepthWiseConvMatrix<11 * 1>;
 
 // c = a0 * b0 + a1 * b1 + a2 * b2 + a3 * b3
 // A is in uint8_t
diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc
index 87adaba..143e11d 100644
--- a/src/PackAMatrix.cc
+++ b/src/PackAMatrix.cc
@@ -34,31 +34,29 @@ PackAMatrix<T, accT>::PackAMatrix(
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
+  if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) {
+    assert(0 && "unknown architecure");
+  }
+
   if (params) {
-    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
-      BaseType::brow_ = params->MCB;
-      BaseType::bcol_ = params->KCB;
-      row_interleave_B_ = params->ROW_INTERLEAVE;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecure");
-    }
+    BaseType::brow_ = params->MCB;
+    BaseType::bcol_ = params->KCB;
+    row_interleave_B_ = params->ROW_INTERLEAVE;
   } else {
     if (fbgemmHasAvx512Support()) {
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
       row_interleave_B_ =
           PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-    } else if (fbgemmHasAvx2Support()) {
+    } else {
+      // AVX2
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
       row_interleave_B_ =
           PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecure");
     }
   }
+
   if (BaseType::numCols() % groups != 0) {
     throw std::runtime_error(
         "groups = " + std::to_string(groups) +
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index e55dd4e..d731654 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -49,32 +49,29 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
+  if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) {
+    assert(0 && "unknown architecure");
+  }
 
   if (params) {
-    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
-      BaseType::brow_ = params->MCB;
-      BaseType::bcol_ = params->KCB;
-      row_interleave_B_ = params->ROW_INTERLEAVE;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecure");
-    }
+    BaseType::brow_ = params->MCB;
+    BaseType::bcol_ = params->KCB;
+    row_interleave_B_ = params->ROW_INTERLEAVE;
   } else {
     if (fbgemmHasAvx512Support()) {
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
       row_interleave_B_ =
           PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-    } else if (fbgemmHasAvx2Support()) {
+    } else {
+      // AVX2
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
       row_interleave_B_ =
           PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecure");
     }
   }
+
   if (BaseType::numCols() % conv_p.G != 0) {
     throw std::runtime_error(
         "groups = " + std::to_string(conv_p.G) +
diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc
index 305a298..52caed4 100644
--- a/src/PackAWithQuantRowOffset.cc
+++ b/src/PackAWithQuantRowOffset.cc
@@ -45,32 +45,31 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
-  rowOffsetAllocatedHere = false;
+  if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) {
+    assert(0 && "unknown architecure");
+  }
+
   if (params) {
-    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
-      BaseType::brow_ = params->MCB;
-      BaseType::bcol_ = params->KCB;
-      row_interleave_B_ = params->ROW_INTERLEAVE;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecure");
-    }
+    BaseType::brow_ = params->MCB;
+    BaseType::bcol_ = params->KCB;
+    row_interleave_B_ = params->ROW_INTERLEAVE;
   } else {
     if (fbgemmHasAvx512Support()) {
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
       row_interleave_B_ =
           PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-    } else if (fbgemmHasAvx2Support()) {
+    } else {
+      // AVX2
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
       row_interleave_B_ =
           PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unknown architecure");
     }
   }
+
+  rowOffsetAllocatedHere = false;
+
   if (BaseType::numCols() % groups != 0) {
     throw std::runtime_error(
         "groups = " + std::to_string(groups) +
diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc
index b791817..733bf5c 100644
--- a/src/PackAWithRowOffset.cc
+++ b/src/PackAWithRowOffset.cc
@@ -39,32 +39,31 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset(
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
-  rowOffsetAllocatedHere = false;
+  if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) {
+    assert(0 && "unknown architecure");
+  }
+
   if (params) {
-    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
-      BaseType::brow_ = params->MCB;
-      BaseType::bcol_ = params->KCB;
-      row_interleave_B_ = params->ROW_INTERLEAVE;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecure");
-    }
+    BaseType::brow_ = params->MCB;
+    BaseType::bcol_ = params->KCB;
+    row_interleave_B_ = params->ROW_INTERLEAVE;
   } else {
     if (fbgemmHasAvx512Support()) {
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
       row_interleave_B_ =
           PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-    } else if (fbgemmHasAvx2Support()) {
+    } else {
+      // AVX2
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
       row_interleave_B_ =
           PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unknown architecure");
     }
   }
+
+  rowOffsetAllocatedHere = false;
+
   if (BaseType::numCols() % groups != 0) {
     throw std::runtime_error(
         "groups = " + std::to_string(groups) +
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index 4d86d45..b19b5d4 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -188,31 +188,29 @@ PackBMatrix<T, accT>::PackBMatrix(
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
+  if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) {
+    assert(0 && "unknown architecure");
+  }
+
   if (params) {
-    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
-      BaseType::brow_ = params->KCB;
-      BaseType::bcol_ = params->NCB;
-      row_interleave_ = params->ROW_INTERLEAVE;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecure");
-    }
+    BaseType::brow_ = params->KCB;
+    BaseType::bcol_ = params->NCB;
+    row_interleave_ = params->ROW_INTERLEAVE;
   } else {
     if (fbgemmHasAvx512Support()) {
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB;
       row_interleave_ =
           PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-    } else if (fbgemmHasAvx2Support()) {
+    } else {
+      // AVX2
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
       BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::NCB;
       row_interleave_ =
           PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE;
-    } else {
-      // Error
-      assert(0 && "unknown architecure");
     }
   }
+
   if (BaseType::numRows() % groups != 0) {
     throw std::runtime_error(
         "groups = " + std::to_string(groups) +
@@ -292,7 +290,11 @@ PackBMatrix<T, accT>::PackBMatrix(
 }
 
 template <typename T, typename accT>
-void PackBMatrix<T, accT>::pack(const block_type_t& block) {
+void PackBMatrix<T, accT>::pack_unpack_(
+    const block_type_t& block,
+    T* unpack_buf,
+    T* pack_buf,
+    bool ispack) {
   assert((BaseType::blockRowSize() % row_interleave_) == 0);
   assert((block.row_start % BaseType::blockRowSize()) == 0);
   assert((block.col_start % BaseType::blockColSize()) == 0);
@@ -300,7 +302,7 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
   BaseType::packedBlock(block);
   bool tr = (trans_ == matrix_op_t::Transpose);
   for (int g = 0; g < BaseType::numGroups(); ++g) {
-    T* out = BaseType::getBuf() +
+    T* pack_buf_cur = pack_buf +
         g * BaseType::packedBufferSize(block.row_size, block.col_size);
     for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
       int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
@@ -326,10 +328,16 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
             c_blk_offset * BaseType::blockRowSize() * BaseType::blockColSize() +
             c_idx_offset * row_interleave_;
 
-        int out_idx = r_offset + c_offset;
-        T val = tr ? smat_[i + (g * block.col_size + j) * ld_]
-                   : smat_[(g * block.row_size + i) * ld_ + j];
-        out[out_idx] = val;
+        if (ispack) {
+          pack_buf_cur[r_offset + c_offset] = tr
+              ? unpack_buf[i + (g * block.col_size + j) * ld_]
+              : unpack_buf[(g * block.row_size + i) * ld_ + j];
+        } else {
+          T* unpack_buf_cur = tr
+              ? &(unpack_buf[i + (g * block.col_size + j) * ld_])
+              : &(unpack_buf[(g * block.row_size + i) * ld_ + j]);
+          *unpack_buf_cur = pack_buf_cur[r_offset + c_offset];
+        }
 
         c_idx_offset++;
         if (c_idx_offset == BaseType::blockColSize()) {
@@ -338,78 +346,45 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
         }
       }
     }
-    // fill the remaining with zero.
-    // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill.
-    for (int i = block.row_start + block.row_size;
-         i < (block.row_start + block.row_size + row_interleave_ - 1) /
-             row_interleave_ * row_interleave_;
-         ++i) {
-      int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
-              (BaseType::blockRowSize() * BaseType::blockColSize()) +
-          (i % BaseType::blockRowSize() / row_interleave_) *
-              BaseType::blockColSize() * row_interleave_ +
-          i % row_interleave_;
-      for (int j = block.col_start; j < block.col_start + block.col_size; j++) {
-        int c_offset = (j / BaseType::blockColSize()) *
-                BaseType::blockRowSize() * BaseType::blockColSize() +
-            (j % BaseType::blockColSize()) * row_interleave_;
+    if (ispack) {
+      // fill the remaining with zero.
+      // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill.
+      for (int i = block.row_start + block.row_size;
+           i < (block.row_start + block.row_size + row_interleave_ - 1) /
+               row_interleave_ * row_interleave_;
+           ++i) {
+        int r_offset =
+            ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
+                (BaseType::blockRowSize() * BaseType::blockColSize()) +
+            (i % BaseType::blockRowSize() / row_interleave_) *
+                BaseType::blockColSize() * row_interleave_ +
+            i % row_interleave_;
+        for (int j = block.col_start; j < block.col_start + block.col_size;
+             j++) {
+          int c_offset = (j / BaseType::blockColSize()) *
+                  BaseType::blockRowSize() * BaseType::blockColSize() +
+              (j % BaseType::blockColSize()) * row_interleave_;
 
-        int out_idx = r_offset + c_offset;
-        out[out_idx] = 0;
+          int out_idx = r_offset + c_offset;
+          pack_buf_cur[out_idx] = 0;
+        }
       }
     }
   } // for each group
 }
 
 template <typename T, typename accT>
-void PackBMatrix<T, accT>::unpack(T* origin_buf) {
-  bool tr = (trans_ == matrix_op_t::Transpose);
-  for (int g = 0; g < this->numGroups(); ++g) {
-    T* out = BaseType::getBuf() +
-        g *
-            BaseType::packedBufferSize(
-                BaseType::numPackedRows(), BaseType::numPackedCols());
-    for (int i = BaseType::packedRowStart();
-         i < BaseType::packedRowStart() + BaseType::numPackedRows();
-         ++i) {
-      int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
-              (BaseType::blockRowSize() * BaseType::blockColSize()) +
-          (i % BaseType::blockRowSize() / row_interleave_) *
-              BaseType::blockColSize() * row_interleave_ +
-          i % row_interleave_;
-
-      int c_start_offset =
-          (BaseType::packedColStart() / BaseType::blockColSize()) *
-              BaseType::blockRowSize() * BaseType::blockColSize() +
-          (BaseType::packedColStart() % BaseType::blockColSize()) *
-              row_interleave_;
-
-      int c_idx_offset = 0;
-      int c_blk_offset = 0;
-      for (int j = BaseType::packedColStart();
-           j < BaseType::packedColStart() + BaseType::numPackedCols();
-           ++j) {
-        int c_offset = c_start_offset +
-            c_blk_offset * BaseType::blockRowSize() * BaseType::blockColSize() +
-            c_idx_offset * row_interleave_;
-
-        int out_idx = r_offset + c_offset;
-
-        T val = out[out_idx];
-        if (tr) {
-          origin_buf[i + (g * BaseType::numPackedCols() + j) * ld_] = val;
-        } else {
-          origin_buf[(g * BaseType::numPackedRows() + i) * ld_ + j] = val;
-        }
+void PackBMatrix<T, accT>::pack(const block_type_t& block) {
+  pack_unpack_(block, const_cast<T*>(smat_), BaseType::getBuf(), true);
+}
 
-        c_idx_offset++;
-        if (c_idx_offset == BaseType::blockColSize()) {
-          c_idx_offset = 0;
-          c_blk_offset++;
-        }
-      }
-    }
-  } // for each group
+template <typename T, typename accT>
+void PackBMatrix<T, accT>::unpack(T* origin_buf) {
+  block_type_t blockB{BaseType::packedRowStart(),
+                      BaseType::numPackedRows(),
+                      BaseType::packedColStart(),
+                      BaseType::numPackedCols()};
+  pack_unpack_(blockB, origin_buf, BaseType::getBuf(), false);
 }
 
 template <typename T, typename accT>
diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc
index 33227fb..c7503dd 100644
--- a/src/PackMatrix.cc
+++ b/src/PackMatrix.cc
@@ -36,45 +36,37 @@ int PackMatrix<PT, inpType, accType>::packedBufferSize(
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
+  if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) {
+    assert(0 && "unknown architecure");
+  }
+
   int MCB, KCB, NCB;
   if (params) {
-    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
-      MCB = params->MCB;
-      NCB = params->NCB;
-      KCB = params->KCB;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecure");
-    }
+    MCB = params->MCB;
+    NCB = params->NCB;
+    KCB = params->KCB;
   } else {
     if (fbgemmHasAvx512Support()) {
       MCB = PackingTraits<inpType, accType, inst_set_t::avx512>::MCB;
       NCB = PackingTraits<inpType, accType, inst_set_t::avx512>::NCB;
       KCB = PackingTraits<inpType, accType, inst_set_t::avx512>::KCB;
-    } else if (fbgemmHasAvx2Support()) {
+    } else {
+      // AVX2
       MCB = PackingTraits<inpType, accType, inst_set_t::avx2>::MCB;
       NCB = PackingTraits<inpType, accType, inst_set_t::avx2>::NCB;
       KCB = PackingTraits<inpType, accType, inst_set_t::avx2>::KCB;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecure");
-      return -1;
     }
   }
 
-  if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
-    if (isA()) {
-      return MCB * KCB;
-    } else {
-      int rowBlock = KCB;
-      int colBlock = NCB;
-      return (((rows + rowBlock - 1) / rowBlock) * rowBlock) *
-          (((cols + colBlock - 1) / colBlock) * colBlock);
-    }
+  if (isA()) {
+    return MCB * KCB;
   } else {
-    // TODO: Have default slower path
-    assert(0 && "unsupported architecure");
+    int rowBlock = KCB;
+    int colBlock = NCB;
+    return (((rows + rowBlock - 1) / rowBlock) * rowBlock) *
+        (((cols + colBlock - 1) / colBlock) * colBlock);
   }
+
   return -1;
 }
 
diff --git a/src/PackWeightMatrixForGConv.cc b/src/PackWeightMatrixForGConv.cc
index 0fb0e2c..ba6adf3 100644
--- a/src/PackWeightMatrixForGConv.cc
+++ b/src/PackWeightMatrixForGConv.cc
@@ -36,8 +36,61 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv(
 }
 
 /**
- * @brief Pack weight tensor in a suitable format required for the optimized
- * kernel.
+ * @brief Get the index of the unpacked data for a given <r, s, k, g, c, tr>
+ *
+ * Non-transposed: G (R S C/G) K/G
+ * Transposed: G K/G (R S C/G)
+ * Using inline as this will be called frequently
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpacked_index_(
+    int r, int s, int k, int g, int c, bool tr) {
+  // Get the full dimensions
+  int R = conv_param_.K[0];
+  int S = conv_param_.K[1];
+  int G = conv_param_.G;
+  int IC_per_G = conv_param_.IC / G;
+  int OC_per_G = conv_param_.OC / G;
+
+  int idx;
+  if (tr) {
+    idx = (((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c;
+  } else {
+    idx = (((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k;
+  }
+  return idx;
+}
+
+/**
+ * @brief Get the index of the packed data for a given <r, s, k, g, c>
+ *
+ * The index may differ depending on IC_per_G.
+ * Using inline as this will be called frequently
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::packed_index_(
+    int r, int s, int k, int g, int c) {
+  // Get the full dimensions
+  int R = conv_param_.K[0];
+  int S = conv_param_.K[1];
+  int G = conv_param_.G;
+  int IC_per_G = conv_param_.IC / G;
+  int OC_per_G = conv_param_.OC / G;
+
+  int idx;
+  // For IC_per_G == 4, we need to work on 2 groups at a time
+  if (IC_per_G == 4) {
+    idx = (((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 + (g % 2))
+      * IC_per_G + c;
+  } else {
+    idx = ((((g * (IC_per_G / 4) + (c / 4)) * R + r) * S + s) * OC_per_G + k)
+      * 4 + (c % 4);
+  }
+  return idx;
+}
+
+/**
+ * @brief Pack or unpack matrix
  *
  * Let IC_per_G be number of input channels per group and OC_per_G be number of
  * output channels per group.
@@ -53,15 +106,17 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv(
  * on 2 groups at a time and full SIMD width can be efficiently utilized even
  * while working on 1 group at a time.
  * In this case, the layout is G (C/4) R S K 4
- */
+*/
+
 template <typename T, typename accT, int SPATIAL_DIM>
-void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
+void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack_unpack_(
+    const T* src, T* dst, bool ispack) {
   // filters are assumed to be in G RS C/G K/G format
   int R = conv_param_.K[0];
   int S = conv_param_.K[1];
   int G = conv_param_.G;
-  int IC_per_G = conv_param_.IC / conv_param_.G;
-  int OC_per_G = conv_param_.OC / conv_param_.G;
+  int IC_per_G = conv_param_.IC / G;
+  int OC_per_G = conv_param_.OC / G;
 
   // If transpose option is set, the weight matrix is in layout G K/G (R S C/G)
   // instead of G (R S C/G) K/G
@@ -73,25 +128,13 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
         for (int k = 0; k < OC_per_G; ++k) {
           for (int g = 0; g < G; ++g) {
             for (int c = 0; c < IC_per_G; ++c) {
-              inpType b = tr
-                  ? sdata_
-                        [(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c]
-                  : sdata_
-                        [(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k];
-              if (IC_per_G == 4) {
-                // For IC_per_G == 4, we need to work on 2 groups at a time
-                pdata_
-                    [(((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 +
-                      (g % 2)) *
-                         IC_per_G +
-                     c] = b;
+              int p_idx = packed_index_(r, s, k, g, c);
+              int up_idx = unpacked_index_(r, s, k, g, c, tr);
+              // Pack: src (unpacked) -> dst (packed)
+              if (ispack) {
+                dst[p_idx] = src[up_idx];
               } else {
-                pdata_
-                    [((((g * (IC_per_G / 4) + (c / 4)) * R + r) * S + s) *
-                          OC_per_G +
-                      k) *
-                         4 +
-                     (c % 4)] = b;
+                dst[up_idx] = src[p_idx];
               }
             }
           }
@@ -99,14 +142,54 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
       }
     }
   } else {
+    // For pack & transposed, call transposeConvWeights()
+    // G K/G (R S C/G) => G (R S C/G) K/G
     if (tr) {
-      // conv_ref expects weights to be in G (R S C/G) K/G format
-      transposeConvWeights(conv_param_, sdata_, pdata_);
+      if (ispack) {
+        transposeConvWeights(conv_param_, src, dst);
+      } else {
+      // TODO: Wrap this as an inverseTransposeConvWeights()?
+      // For unpack & transposed, apply the inverse of transposeConvWeights()
+      // G (R S C/G) K/G => G K/G (R S C/G)
+        for (int r = 0; r < R; ++r) {
+          for (int s = 0; s < S; ++s) {
+            for (int k = 0; k < OC_per_G; ++k) {
+              for (int g = 0; g < G; ++g) {
+                for (int c = 0; c < IC_per_G; ++c) {
+                  dst[(((g * OC_per_G + k) * R + r) * S + s)
+                    * IC_per_G + c] =
+                    src[(((g * R + r) * S + s) * IC_per_G + c)
+                    * OC_per_G + k];
+                }
+              }
+            }
+          }
+        }
+      }  // end if(ispack)
     } else {
       // just copy the data for not supported cases
-      memcpy(pdata_, sdata_, G * R * S * OC_per_G * IC_per_G * sizeof(inpType));
-    }
-  }
+      memcpy(dst, src,
+          G * R * S * OC_per_G * IC_per_G * sizeof(inpType));
+    } //end if(tr)
+  } // end if(fbgemmOptimizedGConv(conv_param_))
+}
+
+/**
+ * @brief Pack weight tensor in a suitable format required for the optimized
+ * kernel.
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
+  pack_unpack_(sdata_, pdata_, true);
+}
+
+/**
+ * @brief Unpack the packed weight tensor (for the optimized kernel)
+ * to the original form.
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpack(T* origin_buf) {
+  pack_unpack_(const_cast<const T*>(pdata_), origin_buf, false);
 }
 
 template class PackWeightMatrixForGConv<int8_t, int32_t, 2>;
diff --git a/src/PackWeightsForConv.cc b/src/PackWeightsForConv.cc
index c811144..25b04af 100644
--- a/src/PackWeightsForConv.cc
+++ b/src/PackWeightsForConv.cc
@@ -4,6 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+#include <algorithm>
 #include <memory>
 #include "fbgemm/Fbgemm.h"
 
@@ -13,7 +14,8 @@ template <int SPATIAL_DIM, typename T, typename accT>
 PackWeightsForConv<SPATIAL_DIM, T, accT>::PackWeightsForConv(
     const conv_param_t<SPATIAL_DIM>& conv_p,
     const T* sdata,
-    const BlockingFactors* blocking_params) {
+    const BlockingFactors* blocking_params)
+    : conv_param_(conv_p) {
   static_assert(
       SPATIAL_DIM == 2 || SPATIAL_DIM == 3,
       "Only 2D and 3D convolutions are supported");
@@ -42,18 +44,36 @@ PackWeightsForConv<SPATIAL_DIM, T, accT>::PackWeightsForConv(
       W_dw_3D_packed_ = nullptr;
       W_gconv_packed_ =
           std::make_shared<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>(
-              matrix_op_t::NoTranspose, conv_p, sdata, nullptr);
+              matrix_op_t::Transpose, conv_p, sdata, nullptr);
+      break;
+    }
+    case optimized_conv_t::pointwise: {
+      W_im2col_packed_ = nullptr;
+      W_dw_2D_packed_ = nullptr;
+      W_dw_3D_packed_ = nullptr;
+      W_gconv_packed_ = nullptr;
+      int NDim = conv_p.OC / conv_p.G;
+      int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
+      W_pointwise_packed_ = std::make_shared<PackBMatrix<T, accT>>(
+          matrix_op_t::Transpose,
+          KDim,
+          NDim,
+          sdata,
+          KDim / conv_p.G,
+          nullptr,
+          conv_p.G,
+          blocking_params);
       break;
     }
     case optimized_conv_t::im2col: {
       int NDim = conv_p.OC / conv_p.G;
       int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
       W_im2col_packed_ = std::make_shared<PackBMatrix<T, accT>>(
-          matrix_op_t::NoTranspose,
+          matrix_op_t::Transpose,
           KDim,
           NDim,
           sdata,
-          NDim,
+          KDim / conv_p.G,
           nullptr,
           conv_p.G,
           blocking_params);
@@ -65,6 +85,46 @@ PackWeightsForConv<SPATIAL_DIM, T, accT>::PackWeightsForConv(
   } // switch
 }
 
+template <int SPATIAL_DIM, typename T, typename accT>
+void PackWeightsForConv<SPATIAL_DIM, T, accT>::unpack(T* origin_buf) {
+  if (W_dw_2D_packed_) {
+    W_dw_2D_packed_->unpack(origin_buf);
+  } else if (W_dw_3D_packed_) {
+    W_dw_3D_packed_->unpack(origin_buf);
+  } else if (W_gconv_packed_) {
+    W_gconv_packed_->unpack(origin_buf);
+  } else if (W_im2col_packed_) {
+    W_im2col_packed_->unpack(origin_buf);
+  } else if (W_pointwise_packed_) {
+    W_pointwise_packed_->unpack(origin_buf);
+  } else {
+    assert(false && "At least one packed weights object should exist");
+  }
+}
+
+template <int SPATIAL_DIM, typename T, typename accT>
+bool PackWeightsForConv<SPATIAL_DIM, T, accT>::isPackingCompliant(
+    const conv_param_t<SPATIAL_DIM>& test_conv_p) {
+  return conv_param_.IC == test_conv_p.IC && conv_param_.OC == test_conv_p.OC &&
+      conv_param_.G == test_conv_p.G &&
+      std::equal(
+             conv_param_.K.begin(),
+             conv_param_.K.end(),
+             test_conv_p.K.begin()) &&
+      std::equal(
+             conv_param_.stride.begin(),
+             conv_param_.stride.end(),
+             test_conv_p.stride.begin()) &&
+      std::equal(
+             conv_param_.pad.begin(),
+             conv_param_.pad.end(),
+             test_conv_p.pad.begin()) &&
+      std::equal(
+             conv_param_.dilation.begin(),
+             conv_param_.dilation.end(),
+             test_conv_p.dilation.begin());
+}
+
 template class PackWeightsForConv<2, int8_t, int32_t>;
 template class PackWeightsForConv<3, int8_t, int32_t>;
 
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index b4b0c2b..e3c0eac 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -181,8 +181,7 @@ void cblas_sgemm_ref(
     int ldb,
     float beta,
     float* Cfp32,
-    int ldc
-    ) {
+    int ldc) {
   for (int i = 0; i < m; ++i) {
     for (int j = 0; j < n; ++j) {
       float sum = 0;
@@ -204,7 +203,6 @@ void cblas_sgemm_ref(
   }
 }
 
-
 void row_offsets_u8acc32_ref(
     int M,
     int K,
@@ -542,21 +540,49 @@ void transposeConvWeights(
     const conv_param_t<SPATIAL_DIM>& conv_p,
     const std::int8_t* src,
     std::int8_t* dest) {
-  assert(SPATIAL_DIM == 2 && "Only 2D supported currently");
-  int R = conv_p.K[0];
-  int S = conv_p.K[1];
   int G = conv_p.G;
   int IC_per_G = conv_p.IC / conv_p.G;
   int OC_per_G = conv_p.OC / conv_p.G;
 
-  // Transforms weights from  G K/G (R S C/G) to G (R S C/G) K/G format.
-  for (int r = 0; r < R; ++r) {
-    for (int s = 0; s < S; ++s) {
-      for (int k = 0; k < OC_per_G; ++k) {
-        for (int g = 0; g < G; ++g) {
-          for (int c = 0; c < IC_per_G; ++c) {
-            dest[(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k] =
-                src[(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c];
+  assert(
+      (SPATIAL_DIM == 3 || SPATIAL_DIM == 2) &&
+      "Only 2D and 3D convolutions are supported");
+  if (SPATIAL_DIM == 2) {
+    int R = conv_p.K[0];
+    int S = conv_p.K[1];
+    // Transforms weights from  G K/G (R S C/G) to G (R S C/G) K/G format.
+    for (int r = 0; r < R; ++r) {
+      for (int s = 0; s < S; ++s) {
+        for (int k = 0; k < OC_per_G; ++k) {
+          for (int g = 0; g < G; ++g) {
+            for (int c = 0; c < IC_per_G; ++c) {
+              dest[(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k] =
+                  src[(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c];
+            }
+          }
+        }
+      }
+    }
+  } else {
+    // Transforms weights from  G K/G (T R S C/G) to G (T R S C/G) K/G format.
+    int T = conv_p.K[0];
+    int R = conv_p.K[1];
+    int S = conv_p.K[2];
+    for (int t = 0; t < T; ++t) {
+      for (int r = 0; r < R; ++r) {
+        for (int s = 0; s < S; ++s) {
+          for (int k = 0; k < OC_per_G; ++k) {
+            for (int g = 0; g < G; ++g) {
+              for (int c = 0; c < IC_per_G; ++c) {
+                dest
+                    [((((g * T + t) * R + r) * S + s) * IC_per_G + c) *
+                         OC_per_G +
+                     k] =
+                        src[((((g * OC_per_G + k) * T + t) * R + r) * S + s) *
+                                IC_per_G +
+                            c];
+              }
+            }
           }
         }
       }
diff --git a/test/GConvTest.cc b/test/GConvTest.cc
index 84f0d52..0074535 100644
--- a/test/GConvTest.cc
+++ b/test/GConvTest.cc
@@ -43,6 +43,8 @@ class fbgemmGConvAcc32WithQuantGranularityTest
           QuantizationGranularity,
           bool,
           bool>> {};
+class fbgemmGConvPackTest
+    : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t>> {};
 }; // namespace
 
 INSTANTIATE_TEST_CASE_P(
@@ -61,6 +63,13 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::ValuesIn(qGranularityVals),
         ::testing::Bool(), // A symmetric
         ::testing::Bool())); // B symmetric
+
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    fbgemmGConvPackTest,
+    ::testing::Combine(
+        ::testing::Values(matrix_op_t::NoTranspose),
+        ::testing::ValuesIn(transposeVals)));
 /**
  * @brief Shapes for unit test.
  */
@@ -413,3 +422,51 @@ TEST_P(fbgemmGConvAcc32Test, NoRequantizeTest) {
         static_cast<int32_t>(0));
   } // for each shape
 }
+
+/**
+ * @brief Unit test for packing and unpacking the weight tensor
+ */
+TEST_P(fbgemmGConvPackTest, PackUnpackTest) {
+  vector<conv_param_t<>> shapes(GetShapes_());
+  matrix_op_t atrans, btrans;
+  tie(atrans, btrans) = GetParam();
+
+  for (auto conv_p : shapes) {
+    int R = conv_p.K[0];
+    int S = conv_p.K[1];
+    int IC_per_G = conv_p.IC / conv_p.G;
+    int OC_per_G = conv_p.OC / conv_p.G;
+
+    // Weights -- test the packing/unpacking of only the weights
+    // when btrans == Transpose, the weight matrix is in layout G K/G (R S C/G)
+    // instead of G (R S C/G) K/G
+    int weight_len = R * S * conv_p.G * IC_per_G * OC_per_G;
+    aligned_vector<int8_t> Bint8(weight_len, 0);
+
+    // Random fill the weights
+    randFill<int8_t>(Bint8, -4, 4);
+
+    // Instantiate the object
+    PackWeightMatrixForGConv<int8_t> packedWeights(
+        btrans, conv_p, Bint8.data(), nullptr);
+
+    // Setup a buffer to get pack -> unpacked results
+    aligned_vector<int8_t> unpack_buf(weight_len, 0);
+
+    // START Actual pack-unpack operations
+    // Perform packing first. This should populate pdata_ of packedWeights
+    packedWeights.pack();
+
+    // Next perform unpacking
+    packedWeights.unpack(unpack_buf.data());
+    // END actual pack-unpack operations
+
+    // Sanity check
+    for (int i = 0; i < weight_len; ++i) {
+      EXPECT_EQ(Bint8.data()[i], unpack_buf.data()[i])
+        << "Pack/Unpack results differ at index " << i
+        << ", Reference: " << static_cast<int> (Bint8.data()[i])
+        << ", Pack-Unpacked: " << static_cast<int> (unpack_buf.data()[i]);
+    }
+  } // for each shape
+}
diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc
index 11bd625..0604879 100644
--- a/test/I8DepthwiseTest.cc
+++ b/test/I8DepthwiseTest.cc
@@ -69,8 +69,16 @@ static vector<vector<int>> shapes = {
 };
 
 namespace {
-class FBGemmDepthWiseTest
-    : public testing::TestWithParam<tuple<bool, bool>> {};
+
+class FBGemmDepthWiseTest : public testing::TestWithParam<tuple<bool, bool>> {};
+
+// Two parameters are K (or Groups) and kernel_prod, i.e.,
+// (output_channels)(kernel_prod)
+// output_channels == Groups.
+// For example, kernel_prod for 3x3 convolution is 9
+class FBGemmDepthWisePackUnpackTest
+    : public testing::TestWithParam<tuple<int, int>> {};
+
 } // namespace
 
 INSTANTIATE_TEST_CASE_P(
@@ -78,6 +86,13 @@ INSTANTIATE_TEST_CASE_P(
     FBGemmDepthWiseTest,
     ::testing::Combine(::testing::Bool(), ::testing::Bool()));
 
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    FBGemmDepthWisePackUnpackTest,
+    ::testing::Combine(
+        ::testing::ValuesIn({8, 16, 24, 32, 40, 64, 72}),
+        ::testing::ValuesIn({1, 2, 3, 4, 5, 9, 10, 11, 27})));
+
 TEST_P(FBGemmDepthWiseTest, Test3x3) {
   bool a_symmetric, b_symmetric;
   tie(a_symmetric, b_symmetric) = GetParam();
@@ -297,8 +312,8 @@ TEST_P(FBGemmDepthWiseTest, Test3x3x3) {
             for (int k = 0; k < K; ++k) {
               int32_t expected = C_uint8_ref
                   [(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k];
-              int32_t actual = C_uint8
-                  [(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k];
+              int32_t actual =
+                  C_uint8[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k];
               EXPECT_EQ(expected, actual)
                   << "Depthwise 3x3 results differ at (" << n << ", " << t
                   << ", " << h << ", " << w << ", " << k << ").";
@@ -561,4 +576,50 @@ TEST(FBGemmDepthWiseTest, Test3x3x3PerChannelQuantization) {
   } // for each shape
 } // Test3x3PerChannelQuantization
 
+TEST_P(FBGemmDepthWisePackUnpackTest, TestPackUnpack) {
+  int K, kernel_prod;
+  tie(K, kernel_prod) = GetParam();
+
+  ASSERT_EQ(K % 8, 0)
+      << "output channels (== groups) should be a multiple of 8";
+  aligned_vector<int8_t> B(K * kernel_prod);
+  randFill<int8_t>(B, -16, 16);
+
+  aligned_vector<int8_t> BUnpacked(K * kernel_prod);
+
+  if (kernel_prod == 1) {
+    Packed1ConvMatrix BPacked(K, B.data());
+    BPacked.unpack(BUnpacked.data());
+  } else if (kernel_prod == 2) {
+    Packed2ConvMatrix BPacked(K, B.data());
+    BPacked.unpack(BUnpacked.data());
+  } else if (kernel_prod == 3) {
+    Packed3ConvMatrix BPacked(K, B.data());
+    BPacked.unpack(BUnpacked.data());
+  } else if (kernel_prod == 4) {
+    Packed4ConvMatrix BPacked(K, B.data());
+    BPacked.unpack(BUnpacked.data());
+  } else if (kernel_prod == 5) {
+    Packed5ConvMatrix BPacked(K, B.data());
+    BPacked.unpack(BUnpacked.data());
+  } else if (kernel_prod == 9) {
+    Packed3x3ConvMatrix BPacked(K, B.data());
+    BPacked.unpack(BUnpacked.data());
+  } else if (kernel_prod == 10) {
+    Packed10ConvMatrix BPacked(K, B.data());
+    BPacked.unpack(BUnpacked.data());
+  } else if (kernel_prod == 11) {
+    Packed11ConvMatrix BPacked(K, B.data());
+    BPacked.unpack(BUnpacked.data());
+  } else if (kernel_prod == 27) {
+    Packed3x3x3ConvMatrix BPacked(K, B.data());
+    BPacked.unpack(BUnpacked.data());
+  } else {
+    ASSERT_TRUE(false);
+  }
+
+  ASSERT_EQ(B, BUnpacked)
+      << "Original and unpacked data elements are not the same";
+} // TestPackUnpack
+
 } // namespace fbgemm
diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc
index 20f860e..23af3eb 100644
--- a/test/PackedRequantizeAcc16Test.cc
+++ b/test/PackedRequantizeAcc16Test.cc
@@ -27,7 +27,7 @@ using namespace std;
 using namespace fbgemm;
 
 vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
-                                       matrix_op_t::Transpose};
+                                  matrix_op_t::Transpose};
 
 vector<QuantizationGranularity> qGranularityVals{
     QuantizationGranularity::TENSOR,
@@ -40,6 +40,8 @@ class fbgemmu8s8acc16WithQuantGranularityTest
           tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
 class fbgemmu8s8acc16Test
     : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {};
+class fbgemmPackUnpackAcc16Test
+    : public testing::TestWithParam<tuple<matrix_op_t, bool>> {};
 }; // namespace
 
 INSTANTIATE_TEST_CASE_P(
@@ -59,6 +61,11 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::ValuesIn(transposeVals),
         ::testing::Bool()));
 
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    fbgemmPackUnpackAcc16Test,
+    ::testing::Combine(::testing::ValuesIn(transposeVals), ::testing::Bool()));
+
 /**
  * @brief Shapes for unit test.
  */
@@ -810,3 +817,66 @@ TEST_P(fbgemmu8s8acc16Test, NoRequantizeTest) {
     } // for each groups
   } // for each shape
 }
+
+/**
+ * @brief Unit test for packing and unpacking the weight tensor.
+ */
+TEST_P(fbgemmPackUnpackAcc16Test, TestPackUnpack) {
+  vector<vector<int>> shapes(GetShapes_());
+  matrix_op_t btrans;
+  bool test_ld;
+  tie(btrans, test_ld) = GetParam();
+
+  for (auto shape : shapes) {
+    for (int groups : {1, 3, 4}) {
+      int n = shape[1];
+      int k = shape[2];
+
+      if (k % groups != 0) {
+        continue;
+      }
+      int k_per_group = k / groups;
+
+      // kxn matrix
+      aligned_vector<int8_t> Bint8(k * n);
+      randFill<int8_t>(Bint8, -128, 127);
+
+      // To test ld != n, we halve n (NoTranspose only) and keep the
+      // original n as the leading dimension.
+      int n_adjusted = n;
+      if (test_ld) {
+        if (btrans == matrix_op_t::NoTranspose) {
+          n_adjusted = std::max(n / 2, 1);
+        }
+      }
+
+      // Note that packing for weight is performed during the constructor
+      // stage.
+      PackBMatrix<int8_t, int16_t> packedWeights(
+          btrans,
+          k,
+          n_adjusted,
+          Bint8.data(),
+          (btrans == matrix_op_t::Transpose) ? k_per_group : n,
+          nullptr,
+          groups);
+
+      // Setup a buffer to get pack -> unpacked results
+      aligned_vector<int8_t> unpack_buf(k * n, 0);
+
+      // Perform unpacking
+      packedWeights.unpack(unpack_buf.data());
+
+      // Sanity check
+      for (int i = 0; i < k; i++) {
+        for (int j = 0; j < n_adjusted; j++) {
+          EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j])
+              << "Pack/Unpack results differ at index (" << i << ", " << j
+              << ", Reference: " << static_cast<int>(Bint8.data()[i * n + j])
+              << ", Pack-Unpacked: "
+              << static_cast<int>(unpack_buf.data()[i * n + j]);
+        }
+      }
+    }
+  }
+}
diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc
index fd827b0..11ef6ff 100644
--- a/test/PackedRequantizeTest.cc
+++ b/test/PackedRequantizeTest.cc
@@ -39,6 +39,8 @@ class fbgemmu8s8acc32WithQuantGranularityTest
           tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
 class fbgemmu8s8acc32Test
     : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {};
+class fbgemmPackUnpackAcc32Test
+    : public testing::TestWithParam<tuple<matrix_op_t, bool>> {};
 }; // namespace
 
 INSTANTIATE_TEST_CASE_P(
@@ -58,6 +60,11 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::ValuesIn(transposeVals),
         ::testing::Bool()));
 
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    fbgemmPackUnpackAcc32Test,
+    ::testing::Combine(::testing::ValuesIn(transposeVals), ::testing::Bool()));
+
 /**
  * @brief Shapes for unit test.
  */
@@ -749,3 +756,66 @@ TEST_P(fbgemmu8s8acc32Test, TestSymmetricQuantizedInputOutput) {
     } // for each groups
   } // for each shape
 }
+
+/**
+ * @brief Unit test for packing and unpacking the weight tensor.
+ */
+TEST_P(fbgemmPackUnpackAcc32Test, TestPackUnpack) {
+  vector<vector<int>> shapes(GetShapes_());
+  matrix_op_t btrans;
+  bool test_ld;
+  tie(btrans, test_ld) = GetParam();
+
+  for (auto shape : shapes) {
+    for (int groups : {1, 3, 4}) {
+      int n = shape[1];
+      int k = shape[2];
+
+      if (k % groups != 0) {
+        continue;
+      }
+      int k_per_group = k / groups;
+
+      // kxn matrix
+      aligned_vector<int8_t> Bint8(k * n);
+      randFill<int8_t>(Bint8, -128, 127);
+
+      // To test ld != n, we halve n (NoTranspose only) and keep the
+      // original n as the leading dimension.
+      int n_adjusted = n;
+      if (test_ld) {
+        if (btrans == matrix_op_t::NoTranspose) {
+          n_adjusted = std::max(n / 2, 1);
+        }
+      }
+
+      // Note that packing for weight is performed during the constructor
+      // stage.
+      PackBMatrix<int8_t> packedWeights(
+          btrans,
+          k,
+          n_adjusted,
+          Bint8.data(),
+          (btrans == matrix_op_t::Transpose) ? k_per_group : n,
+          nullptr,
+          groups);
+
+      // Setup a buffer to get pack -> unpacked results
+      aligned_vector<int8_t> unpack_buf(k * n, 0);
+
+      // Perform unpacking
+      packedWeights.unpack(unpack_buf.data());
+
+      // Sanity check
+      for (int i = 0; i < k; i++) {
+        for (int j = 0; j < n_adjusted; j++) {
+          EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j])
+              << "Pack/Unpack results differ at index (" << i << ", " << j
+              << ", Reference: " << static_cast<int>(Bint8.data()[i * n + j])
+              << ", Pack-Unpacked: "
+              << static_cast<int>(unpack_buf.data()[i * n + j]);
+        }
+      }
+    }
+  }
+}
diff --git a/test/QuantUtilsTest.cc b/test/QuantUtilsTest.cc
new file mode 100644
index 0000000..ddb1f91
--- /dev/null
+++ b/test/QuantUtilsTest.cc
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <algorithm>
+#include <limits>
+#include <random>
+
+#include <gtest/gtest.h>
+
+#include "fbgemm/QuantUtils.h"
+#include "fbgemm/Utils.h"
+
+using namespace std;
+using namespace fbgemm;
+
+// tuple represents K, C, X, G, layout_t
+// layout_t can be KCX or KXC
+class QuantizeGroupwiseTest
+    : public testing::TestWithParam<tuple<int, int, int, int, layout_t>> {};
+
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    QuantizeGroupwiseTest,
+    ::testing::Combine(
+        ::testing::ValuesIn({4, 12, 64}), // K
+        ::testing::ValuesIn({12, 16, 32}), // C
+        ::testing::ValuesIn({1, 10, 15, 30}), // X
+        ::testing::ValuesIn({1, 4}), // G
+        ::testing::ValuesIn({layout_t::KCX, layout_t::KXC})));
+
+template <typename T, layout_t LT>
+void ref_impl(
+    const vector<float>& src,
+    int K,
+    int C,
+    int X,
+    int G,
+    const vector<float>& scales,
+    const vector<int>& zero_points,
+    vector<T>& dst) {
+  int C_per_G = C / G;
+  for (int i = 0; i < K; ++i) {
+    for (int g = 0; g < G; ++g) {
+      for (int c = 0; c < C / G; ++c) {
+        for (int x = 0; x < X; ++x) {
+          float num;
+          if (LT == layout_t::KCX) {
+            num = src[(i * C + g * C_per_G + c) * X + x];
+          } else {
+            num = src[(i * X + x) * C + g * C_per_G + c];
+          }
+          int res = nearbyint(zero_points[g] + num / scales[g]);
+          T final_res = min<T>(
+              max<T>(res, numeric_limits<T>::min()), numeric_limits<T>::max());
+          if (LT == layout_t::KCX) {
+            dst[(i * C + g * C_per_G + c) * X + x] = final_res;
+          } else {
+            dst[(i * X + x) * C + g * C_per_G + c] = final_res;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T, layout_t LT>
+void runTests(
+    const vector<float>& src,
+    int K,
+    int C,
+    int X,
+    int G,
+    const vector<float>& scales,
+    const vector<int>& zero_points,
+    vector<T>& dst,
+    vector<T>& dst_ref) {
+  QuantizeGroupwise<T, LT>(
+      src.data(), K, C, X, G, scales.data(), zero_points.data(), dst.data());
+
+  ref_impl<T, LT>(src, K, C, X, G, scales, zero_points, dst_ref);
+}
+
+/**
+ * There can be off-by-one error in quantized values due to how the mid-point
+ * cases are rounded-off in vectorized vs scalar codes and due to adding of
+ * zero_point before rounding vs after rounding. We ignore such differences
+ * while comparing results.
+ */
+template <typename T>
+::testing::AssertionResult isNear(
+    const vector<T>& res,
+    const vector<T>& res_ref) {
+  bool match = true;
+  if (res.size() == res_ref.size()) {
+    for (int i = 0; i < res.size(); ++i) {
+      if (!(res[i] == res_ref[i] || res[i] == res_ref[i] + 1 ||
+            res[i] == res_ref[i] - 1)) {
+        match = false;
+        break;
+      }
+    }
+  }
+  if (match)
+    return ::testing::AssertionSuccess();
+  else
+    return ::testing::AssertionFailure() << " Quantized results do not match";
+}
+
+/**
+ * Test for QuantizeGroupwise
+ */
+TEST_P(QuantizeGroupwiseTest, quantizeTest) {
+  int K, C, X, G;
+  layout_t layout;
+  tie(K, C, X, G, layout) = GetParam();
+
+  random_device rd;
+  mt19937 gen(rd());
+
+  uniform_real_distribution<float> disFP(0.1, 1.1);
+
+  vector<float> inp(K * C * X);
+  generate(inp.begin(), inp.end(), [&, disFP]() mutable { return disFP(gen); });
+
+  vector<float> scales(G);
+  generate(scales.begin(), scales.end(), [&, disFP]() mutable {
+    return disFP(gen);
+  });
+
+  uniform_int_distribution<> disUInt8(0, 8);
+  vector<int> zero_points_uint8(G);
+  generate(
+      zero_points_uint8.begin(),
+      zero_points_uint8.end(),
+      [&, disUInt8]() mutable { return disUInt8(gen); });
+
+  uniform_int_distribution<> disInt8(-64, 63);
+  vector<int> zero_points_int8(G);
+  generate(
+      zero_points_int8.begin(), zero_points_int8.end(), [&, disInt8]() mutable {
+        return disInt8(gen);
+      });
+
+  uniform_int_distribution<> disInt32(-512, 512);
+  vector<int> zero_points_int32(G);
+  generate(
+      zero_points_int32.begin(),
+      zero_points_int32.end(),
+      [&, disInt32]() mutable { return disInt32(gen); });
+
+  vector<uint8_t> dstuint8(K * C * X);
+  vector<uint8_t> dstuint8_ref(K * C * X);
+
+  vector<int8_t> dstint8(K * C * X);
+  vector<int8_t> dstint8_ref(K * C * X);
+
+  vector<int32_t> dstint32(K * C * X);
+  vector<int32_t> dstint32_ref(K * C * X);
+
+  if (layout == layout_t::KCX) {
+    runTests<uint8_t, layout_t::KCX>(
+        inp, K, C, X, G, scales, zero_points_uint8, dstuint8, dstuint8_ref);
+    runTests<int8_t, layout_t::KCX>(
+        inp, K, C, X, G, scales, zero_points_int8, dstint8, dstint8_ref);
+    runTests<int32_t, layout_t::KCX>(
+        inp, K, C, X, G, scales, zero_points_int32, dstint32, dstint32_ref);
+  } else {
+    runTests<uint8_t, layout_t::KXC>(
+        inp, K, C, X, G, scales, zero_points_uint8, dstuint8, dstuint8_ref);
+    runTests<int8_t, layout_t::KXC>(
+        inp, K, C, X, G, scales, zero_points_int8, dstint8, dstint8_ref);
+    runTests<int32_t, layout_t::KXC>(
+        inp, K, C, X, G, scales, zero_points_int32, dstint32, dstint32_ref);
+  }
+
+  EXPECT_TRUE(isNear(dstuint8, dstuint8_ref));
+  EXPECT_TRUE(isNear(dstint8, dstint8_ref));
+  EXPECT_TRUE(isNear(dstint32, dstint32_ref));
+}
diff --git a/test/UniConvPackingTest.cc b/test/UniConvPackingTest.cc
deleted file mode 100644
index 77552af..0000000
--- a/test/UniConvPackingTest.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <algorithm>
-#include <random>
-#include <iostream>
-
-
-#include <gtest/gtest.h>
-
-#include "QuantizationHelpers.h"
-#include "TestUtils.h"
-#include "bench/BenchUtils.h"
-#include "fbgemm/Fbgemm.h"
-#include "src/RefImplementations.h"
-
-using namespace std;
-using namespace fbgemm;
-
-namespace {
-
-// tuple represents MB, IC, OC, IT, IH, IW, KH/KW, stride, pad
-class convPackingTest
-    : public testing::TestWithParam<
-          tuple<int, int, int, int, int, int, int, int, int, int>> {};
-
-}; // namespace
-
-INSTANTIATE_TEST_CASE_P(
-    InstantiationName,
-    convPackingTest,
-    ::testing::Combine(
-        ::testing::ValuesIn({1, 2}), // MB
-        ::testing::ValuesIn({16, 32}), // IC
-        ::testing::ValuesIn({16, 32}), // OC
-        ::testing::ValuesIn({17}), // IT
-        ::testing::ValuesIn({10, 30, 55}), // IH
-        ::testing::ValuesIn({10, 30, 55}), // IW
-        ::testing::ValuesIn({1, 4, 16}), // G
-        ::testing::ValuesIn({3, 7}), // kernel
-        ::testing::ValuesIn({1, 2}), // stride
-        ::testing::ValuesIn({1, 2}))); // pad
-
-/**
- * Test for conv packing
- */
-TEST_P(convPackingTest, packingTest) {
-  int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad;
-  tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam();
-
-  conv_param_t<2> conv_p_2d(
-      MB,
-      IC,
-      OC,
-      {IH, IW},
-      G,
-      {kernel, kernel},
-      {stride, stride},
-      {pad, pad, pad, pad});
-
-  int kernel_dim_2d = kernel * kernel;
-  aligned_vector<int8_t> Bint8_2d(
-      kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G));
-  PackWeightsForConv<2> packedB_2D(conv_p_2d, Bint8_2d.data());
-
-  switch (ConvFastPath<2, int32_t>(conv_p_2d)) {
-    case optimized_conv_t::depthwise: {
-      ASSERT_NE(packedB_2D.getPackedWFor2DDW(), nullptr)
-          << "2D depthwise packed matrix is null";
-      ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr)
-          << "im2col packed matrix should be null";
-      ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr)
-          << "3D depthwise packed matrix should be null";
-      ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr)
-          << "groupwise packed matrix should be null";
-      break;
-    }
-    case optimized_conv_t::groupwise: {
-      ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr)
-          << "im2col packed matrix should be null";
-      ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr)
-          << "2D depthwise packed matrix is null";
-      ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr)
-          << "3D depthwise packed matrix should be null";
-      ASSERT_NE(packedB_2D.getPackedWForGroupwise(), nullptr)
-          << "Groupwise packed matrix is null";
-      break;
-    }
-    case optimized_conv_t::im2col: {
-      ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr)
-          << "2D depthwise packed matrix is null";
-      ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr)
-          << "3D depthwise packed matrix should be null";
-      ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr)
-          << "groupwise packed matrix should be null";
-      ASSERT_NE(packedB_2D.getPackedWForIm2col(), nullptr)
-          << "im2col packed matrix is null";
-      break;
-    }
-  }
-
-  conv_param_t<3> conv_p_3d(
-      MB,
-      IC,
-      OC,
-      {IT, IH, IW},
-      G,
-      {kernel, kernel, kernel},
-      {stride, stride, stride},
-      {pad, pad, pad, pad, pad, pad});
-
-  int kernel_dim_3d = kernel * kernel * kernel;
-  aligned_vector<int8_t> Bint8_3d(
-      kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G));
-  PackWeightsForConv<3> packedB_3D(conv_p_3d, Bint8_3d.data());
-
-  switch (ConvFastPath<3, int32_t>(conv_p_3d)) {
-    case optimized_conv_t::depthwise: {
-      ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr)
-          << "2D depthwise packed matrix is null";
-      ASSERT_EQ(packedB_3D.getPackedWForIm2col(), nullptr)
-          << "im2col packed matrix should be null";
-      ASSERT_NE(packedB_3D.getPackedWFor3DDW(), nullptr)
-          << "3D depthwise packed matrix should be null";
-      ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr)
-          << "groupwise packed matrix should be null";
-      break;
-    }
-    case optimized_conv_t::groupwise: {
-      ASSERT_TRUE(false) << "groupwise are not supported for 3D";
-      break;
-    }
-    case optimized_conv_t::im2col: {
-      ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr)
-          << "2D depthwise packed matrix is null";
-      ASSERT_EQ(packedB_3D.getPackedWFor3DDW(), nullptr)
-          << "3D depthwise packed matrix should be null";
-      ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr)
-          << "groupwise packed matrix should be null";
-      ASSERT_NE(packedB_3D.getPackedWForIm2col(), nullptr)
-          << "im2col packed matrix is null";
-      break;
-    }
-  }
-}
diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc
new file mode 100644
index 0000000..91bf578
--- /dev/null
+++ b/test/UniConvTest.cc
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <algorithm>
+#include <random>
+#include <iostream>
+#include <stdexcept>
+
+
+#include <gtest/gtest.h>
+
+#include "QuantizationHelpers.h"
+#include "TestUtils.h"
+#include "bench/BenchUtils.h"
+#include "fbgemm/Fbgemm.h"
+#include "src/RefImplementations.h"
+
+using namespace std;
+using namespace fbgemm;
+
+namespace {
+
+// Parameterized fixture used by the TEST_P cases of uniConvTest.
+// The tuple holds, in order: MB, IC, OC, IT, IH, IW, G, kernel (KH/KW),
+// stride, pad.
+class uniConvTest
+    : public testing::TestWithParam<
+          tuple<int, int, int, int, int, int, int, int, int, int>> {};
+
+} // namespace
+
+// Runs every TEST_P of uniConvTest over the full cross product of the value
+// lists below (2 * 2 * 2 * 1 * 3 * 3 * 3 * 3 * 2 * 3 = 3888 parameter tuples).
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    uniConvTest,
+    ::testing::Combine(
+        ::testing::Values(1, 2), // MB
+        ::testing::Values(16, 32), // IC
+        ::testing::Values(16, 32), // OC
+        ::testing::Values(17), // IT
+        ::testing::Values(10, 30, 55), // IH
+        ::testing::Values(10, 30, 55), // IW
+        ::testing::Values(1, 4, 16), // G
+        ::testing::Values(1, 3, 7), // kernel
+        ::testing::Values(1, 2), // stride
+        ::testing::Values(0, 1, 2))); // pad
+
+/**
+ * Checks that PackWeightsForConv holds exactly one packed-weight
+ * representation: the one matching the path reported by ConvFastPath. Every
+ * other getPackedWFor*() accessor is expected to return nullptr. The check is
+ * done for a 2D and for a 3D convolution built from the same test parameters.
+ */
+TEST_P(uniConvTest, packingTest) {
+  int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad;
+  tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam();
+
+  // ---- 2D convolution (IT is not used here) ----
+  conv_param_t<2> conv_p_2d(
+      MB,
+      IC,
+      OC,
+      {IH, IW},
+      G,
+      {kernel, kernel},
+      {stride, stride},
+      {pad, pad, pad, pad});
+
+  // Only the buffer size matters for this test; the weight values are never
+  // inspected.
+  int kernel_dim_2d = kernel * kernel;
+  aligned_vector<int8_t> Bint8_2d(
+      kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G));
+  PackWeightsForConv<2> packedB_2D(conv_p_2d, Bint8_2d.data());
+
+  // In each case: all non-selected representations must be absent and the
+  // selected one must be present. The streamed text is printed on failure, so
+  // it describes the expectation that was violated.
+  switch (ConvFastPath<2, int32_t>(conv_p_2d)) {
+    case optimized_conv_t::depthwise: {
+      ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr)
+          << "im2col packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr)
+          << "3D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr)
+          << "groupwise packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWForPointwise(), nullptr)
+          << "pointwise packed matrix should be null";
+      ASSERT_NE(packedB_2D.getPackedWFor2DDW(), nullptr)
+          << "2D depthwise packed matrix is null";
+      break;
+    }
+    case optimized_conv_t::groupwise: {
+      ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr)
+          << "im2col packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr)
+          << "2D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr)
+          << "3D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWForPointwise(), nullptr)
+          << "pointwise packed matrix should be null";
+      ASSERT_NE(packedB_2D.getPackedWForGroupwise(), nullptr)
+          << "Groupwise packed matrix is null";
+      break;
+    }
+    case optimized_conv_t::pointwise: {
+      ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr)
+          << "im2col packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr)
+          << "2D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr)
+          << "3D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr)
+          << "Groupwise packed matrix should be null";
+      ASSERT_NE(packedB_2D.getPackedWForPointwise(), nullptr)
+          << "pointwise packed matrix is null";
+      break;
+    }
+    case optimized_conv_t::im2col: {
+      ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr)
+          << "2D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr)
+          << "3D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr)
+          << "groupwise packed matrix should be null";
+      ASSERT_EQ(packedB_2D.getPackedWForPointwise(), nullptr)
+          << "pointwise packed matrix should be null";
+      ASSERT_NE(packedB_2D.getPackedWForIm2col(), nullptr)
+          << "im2col packed matrix is null";
+      break;
+    }
+  }
+
+  // ---- 3D convolution: same parameters, with IT as the extra dimension ----
+  conv_param_t<3> conv_p_3d(
+      MB,
+      IC,
+      OC,
+      {IT, IH, IW},
+      G,
+      {kernel, kernel, kernel},
+      {stride, stride, stride},
+      {pad, pad, pad, pad, pad, pad});
+
+  int kernel_dim_3d = kernel * kernel * kernel;
+  aligned_vector<int8_t> Bint8_3d(
+      kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G));
+  PackWeightsForConv<3> packedB_3D(conv_p_3d, Bint8_3d.data());
+
+  switch (ConvFastPath<3, int32_t>(conv_p_3d)) {
+    case optimized_conv_t::depthwise: {
+      ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr)
+          << "2D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_3D.getPackedWForIm2col(), nullptr)
+          << "im2col packed matrix should be null";
+      ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr)
+          << "groupwise packed matrix should be null";
+      ASSERT_EQ(packedB_3D.getPackedWForPointwise(), nullptr)
+          << "pointwise packed matrix should be null";
+      ASSERT_NE(packedB_3D.getPackedWFor3DDW(), nullptr)
+          << "3D depthwise packed matrix is null";
+      break;
+    }
+    case optimized_conv_t::groupwise: {
+      // The groupwise path must never be selected for a 3D convolution.
+      ASSERT_TRUE(false) << "groupwise are not supported for 3D";
+      break;
+    }
+    case optimized_conv_t::pointwise: {
+      ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr)
+          << "2D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_3D.getPackedWFor3DDW(), nullptr)
+          << "3D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr)
+          << "groupwise packed matrix should be null";
+      ASSERT_EQ(packedB_3D.getPackedWForIm2col(), nullptr)
+          << "im2col packed matrix should be null";
+      ASSERT_NE(packedB_3D.getPackedWForPointwise(), nullptr)
+          << "pointwise packed matrix is null";
+      break;
+    }
+    case optimized_conv_t::im2col: {
+      ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr)
+          << "2D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_3D.getPackedWFor3DDW(), nullptr)
+          << "3D depthwise packed matrix should be null";
+      ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr)
+          << "groupwise packed matrix should be null";
+      ASSERT_EQ(packedB_3D.getPackedWForPointwise(), nullptr)
+          << "pointwise packed matrix should be null";
+      ASSERT_NE(packedB_3D.getPackedWForIm2col(), nullptr)
+          << "im2col packed matrix is null";
+      break;
+    }
+  }
+}
+
+/**
+ * Round-trip test for PackWeightsForConv: packs a weight buffer, unpacks it
+ * into a second buffer and expects both buffers to be equal. Done for a 2D
+ * and for a 3D convolution built from the same test parameters.
+ */
+TEST_P(uniConvTest, packUnpackTest) {
+  int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad;
+  tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam();
+
+  // ---- 2D convolution (IT is not used here) ----
+  conv_param_t<2> conv_p_2d(
+      MB,
+      IC,
+      OC,
+      {IH, IW},
+      G,
+      {kernel, kernel},
+      {stride, stride},
+      {pad, pad, pad, pad});
+
+  int kernel_dim_2d = kernel * kernel;
+
+  aligned_vector<int8_t> Bint8_2d(
+      kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G));
+  aligned_vector<int8_t> Bint8_2d_unpacked(
+      kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G));
+
+  // Source and destination are constructed identically, so without distinct
+  // source data the equality check below could hold even if unpack() wrote
+  // nothing. Random weights make the round trip actually observable.
+  randFill<int8_t>(Bint8_2d, -4, 4);
+
+  PackWeightsForConv<2> packedB_2D(conv_p_2d, Bint8_2d.data());
+
+  packedB_2D.unpack(Bint8_2d_unpacked.data());
+
+  ASSERT_EQ(Bint8_2d, Bint8_2d_unpacked)
+      << "Original and unpacked data elements are not the same [2D]";
+
+  // ---- 3D convolution: same parameters, with IT as the extra dimension ----
+  conv_param_t<3> conv_p_3d(
+      MB,
+      IC,
+      OC,
+      {IT, IH, IW},
+      G,
+      {kernel, kernel, kernel},
+      {stride, stride, stride},
+      {pad, pad, pad, pad, pad, pad});
+
+  int kernel_dim_3d = kernel * kernel * kernel;
+
+  aligned_vector<int8_t> Bint8_3d(
+      kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G));
+
+  aligned_vector<int8_t> Bint8_3d_unpacked(
+      kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G));
+
+  // Same reasoning as for the 2D case: give the source non-trivial content.
+  randFill<int8_t>(Bint8_3d, -4, 4);
+
+  PackWeightsForConv<3> packedB_3D(conv_p_3d, Bint8_3d.data());
+
+  packedB_3D.unpack(Bint8_3d_unpacked.data());
+
+  ASSERT_EQ(Bint8_3d, Bint8_3d_unpacked)
+      << "Original and unpacked data elements are not the same [3D]";
+}
+
+/**
+ * Calls fbgemmConv twice with weights that were packed for one fixed set of
+ * convolution parameters: first after changing conv_params.stride[0] behind
+ * the packed weights' back, then again with the original stride restored.
+ */
+TEST(uniConvTest, cornerCases) {
+  int packed_stride = 1;
+  conv_param_t<2> conv_params(
+      1, // mini-batch
+      16, // input channels
+      32, // output channels
+      {28, 28}, // input height/width
+      4, // groups
+      {3, 3}, // kernel height/width
+      {packed_stride, packed_stride}, // strides
+      {1, 1, 1, 1}); // padding
+
+  // Buffers: input activations, weights, int32 accumulation and uint8 output.
+  aligned_vector<uint8_t> activations(
+      conv_params.MB * conv_params.IN_DIM[0] * conv_params.IN_DIM[1] *
+      conv_params.IC);
+  aligned_vector<int8_t> weights(
+      conv_params.K[0] * conv_params.K[1] * conv_params.IC *
+      (conv_params.OC / conv_params.G));
+  aligned_vector<int32_t> acc_buffer(
+      conv_params.MB * conv_params.OUT_DIM[0] * conv_params.OUT_DIM[1] *
+      conv_params.OC);
+  aligned_vector<uint8_t> out_uint8(acc_buffer.size(), 0);
+
+  // Random contents; the fill order is kept as activations, weights,
+  // weight zero point, requantization multiplier.
+  randFill<uint8_t>(activations, 0, 5);
+  int32_t act_zero_point = 4;
+
+  randFill<int8_t>(weights, -4, 4);
+  aligned_vector<int32_t> weight_zero_points(1);
+  randFill(weight_zero_points, -3, -1);
+
+  aligned_vector<float> requant_multipliers(weight_zero_points.size());
+  randFill(requant_multipliers, 0.1234f / 2, 0.1234f * 3 / 2);
+  int32_t out_zero_point = 5;
+
+  // Weights are packed once, for the stride-1 parameters above.
+  PackWeightsForConv<2> packed_weights(conv_params, weights.data());
+
+  vector<int32_t> col_offsets(conv_params.OC);
+
+  DoNothing<> passthrough{};
+  ReQuantizeOutput<false, QuantizationGranularity::TENSOR> requantize(
+      passthrough,
+      requant_multipliers.data(),
+      out_zero_point,
+      act_zero_point,
+      weight_zero_points.data(),
+      nullptr, // row offsets
+      col_offsets.data(),
+      nullptr, // bias
+      conv_params.OC,
+      conv_params.G);
+
+  // Both calls below use identical arguments; only conv_params.stride[0]
+  // differs at the time of the call.
+  const auto run_conv = [&]() {
+    fbgemmConv(
+        conv_params,
+        activations.data(),
+        packed_weights,
+        out_uint8.data(),
+        acc_buffer.data(),
+        requantize,
+        0,
+        1);
+  };
+
+  // Parameters that no longer match the packed weights: if a logic_error is
+  // raised, its message must start with the FBGEMM conv error tag.
+  // NOTE(review): when nothing is thrown this block passes silently; confirm
+  // whether a missing exception should be reported as a failure.
+  conv_params.stride[0] = 2;
+  try {
+    run_conv();
+  } catch (std::logic_error const& err) {
+    const std::string message(err.what());
+    EXPECT_TRUE(message.rfind("[FBGEMM_CONV_ERROR]", 0) == 0);
+  }
+
+  // reset
+  conv_params.stride[0] = packed_stride;
+  // this should run fine
+  run_conv();
+}
author	Young Jin Kim <youki@microsoft.com>	2019-08-01 22:38:23 +0300
committer	Young Jin Kim <youki@microsoft.com>	2019-08-01 22:38:23 +0300
commit	eb8fede25bd048da6fd396654936703a474f0504 (patch)
tree	943fd29e7e173fb1075b9886b0309765f5f4b114
parent	e4ed5196cbec0d0a485578996b09912e92927e02 (diff)
parent	f712cb2328a2b29424bdaeecb9c0731da2cd997b (diff)