From 428a0b6cede232eb5c4e9c3bbd8e9d74d8e34500 Mon Sep 17 00:00:00 2001
From: Jianyu Huang <jianyuhuang@fb.com>
Date: Thu, 8 Nov 2018 11:09:04 -0800
Subject: Sync with internal copy: Asymmetric padding; fbgemm2 -> fbgemm

---
 bench/BenchUtils.cc                          |   4 +-
 bench/BenchUtils.h                           |   4 +-
 bench/Depthwise3DBenchmark.cc                |   2 +-
 bench/DepthwiseBenchmark.cc                  |   2 +-
 bench/FP16Benchmark.cc                       |   2 +-
 bench/I8SpmdmBenchmark.cc                    |   4 +-
 bench/Im2ColFusedRequantizeAcc16Benchmark.cc |  86 ++---
 bench/Im2ColFusedRequantizeAcc32Benchmark.cc |  86 ++---
 bench/PackedFloatInOutBenchmark.cc           |   2 +-
 bench/PackedRequantizeAcc16Benchmark.cc      |   2 +-
 bench/PackedRequantizeAcc32Benchmark.cc      |   2 +-
 include/fbgemm/ConvUtils.h                   |  20 +-
 include/fbgemm/Fbgemm.h                      |   4 +-
 include/fbgemm/FbgemmFP16.h                  |   2 +-
 include/fbgemm/FbgemmI8Spmdm.h               |   4 +-
 include/fbgemm/Types.h                       |   2 +-
 include/fbgemm/Utils.h                       |   4 +-
 src/ExecuteKernel.cc                         |   2 +-
 src/ExecuteKernelGeneric.h                   |   4 +-
 src/ExecuteKernelU8S8.cc                     |  28 +-
 src/ExecuteKernelU8S8.h                      |   4 +-
 src/Fbgemm.cc                                |  46 ++-
 src/FbgemmFP16.cc                            |   2 +-
 src/FbgemmFP16UKernels.cc                    |   4 +-
 src/FbgemmFP16UKernels.h                     |   4 +-
 src/FbgemmI8Depthwise.cc                     |   5 +-
 src/FbgemmI8Depthwise.h                      |   4 +-
 src/FbgemmI8Spmdm.cc                         |   4 +-
 src/GenerateKernel.h                         |   4 +-
 src/GenerateKernelU8S8S32ACC16.cc            |   4 +-
 src/GenerateKernelU8S8S32ACC16_avx512.cc     |   4 +-
 src/GenerateKernelU8S8S32ACC32.cc            |   4 +-
 src/GenerateKernelU8S8S32ACC32_avx512.cc     |   4 +-
 src/PackAMatrix.cc                           |   4 +-
 src/PackAWithIm2Col.cc                       | 211 +++++++++----
 src/PackBMatrix.cc                           |  18 +-
 src/PackMatrix.cc                            |   4 +-
 src/PackWithQuantRowOffset.cc                |   4 +-
 src/PackWithRowOffset.cc                     |   4 +-
 src/RefImplementations.cc                    |   6 +-
 src/RefImplementations.h                     |   4 +-
 src/Utils.cc                                 |   4 +-
 src/Utils_avx512.cc                          |   4 +-
 src/codegen_fp16fp32.cc                      |   2 +-
 test/FP16Test.cc                             |   4 +-
 test/I8DepthwiseTest.cc                      |   4 +-
 test/I8DepthwiseTest.h                       |   4 +-
 test/I8SpmdmTest.cc                          |   2 +-
 test/Im2ColFusedRequantizeTest.cc            | 448 ++++++++++++++++++++-------
 test/PackedRequantizeAcc16Test.cc            |   2 +-
 test/PackedRequantizeTest.cc                 |   2 +-
 test/QuantizationHelpers.cc                  |   4 +-
 test/QuantizationHelpers.h                   |   4 +-
 test/TestUtils.cc                            |   4 +-
 test/TestUtils.h                             |   4 +-
 55 files changed, 734 insertions(+), 372 deletions(-)
diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc
index 5dade2a..7b4cde4 100644
--- a/bench/BenchUtils.cc
+++ b/bench/BenchUtils.cc
@@ -7,7 +7,7 @@
 #include "BenchUtils.h"
 #include <random>
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 std::default_random_engine eng;
 
@@ -41,4 +41,4 @@ void llc_flush(std::vector<char>& llc) {
   }
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
index 5dd452e..5c16a06 100644
--- a/bench/BenchUtils.h
+++ b/bench/BenchUtils.h
@@ -8,11 +8,11 @@
 #include <vector>
 #include "bench/AlignedVec.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <typename T>
 void randFill(aligned_vector<T> &vec, const int low, const int high);
 
 void llc_flush(std::vector<char>& llc);
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index b7c7d44..417ddd1 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -24,7 +24,7 @@
 #include "BenchUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 int main() {
   // Depthwise is memory BW bound so we want to flush LLC.
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index de08ff7..0bf2d73 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -22,7 +22,7 @@
 #include "BenchUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 int main() {
   // From Xray OCR
diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc
index f5ec10f..8fbe878 100644
--- a/bench/FP16Benchmark.cc
+++ b/bench/FP16Benchmark.cc
@@ -21,7 +21,7 @@
 #include "AlignedVec.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 void performance_test() {
   // cache flush
diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc
index f97d152..d361bb5 100644
--- a/bench/I8SpmdmBenchmark.cc
+++ b/bench/I8SpmdmBenchmark.cc
@@ -22,7 +22,7 @@
 #include "BenchUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 int main() {
   const vector<array<int, 3>> shapes = {
@@ -79,7 +79,7 @@ int main() {
         aligned_vector<uint8_t> A(M * K);
         randFill(A, 0, 255);
 
-        fbgemm2::CompressedSparseColumn B_csc(K, N);
+        fbgemm::CompressedSparseColumn B_csc(K, N);
         vector<int32_t> C(M * N);
         vector<int32_t> C_ref(C.size());
 
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
index ca27278..48b744a 100644
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -21,53 +21,53 @@
 #include "BenchUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 void performance_test() {
   vector<conv_param_t<>> shapes = {
       // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w
-      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0}),
-      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0}),
-      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1}),
+      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
+      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
+      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
   };
 
   bool flush = true;
diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
index 8cce235..9201e52 100644
--- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
@@ -21,53 +21,53 @@
 #include "BenchUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 void performance_test() {
   vector<conv_param_t<>> shapes = {
       // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w
-      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0}),
-      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0}),
-      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-      conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1}),
+      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
+      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
+      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
   };
 
   bool flush = true;
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index 4a2eda4..29a7547 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -25,7 +25,7 @@
 #include "BenchUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 void performance_test() {
   vector<vector<int>> shapes = {
diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
index a758f55..4974ba2 100644
--- a/bench/PackedRequantizeAcc16Benchmark.cc
+++ b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -25,7 +25,7 @@
 #include "BenchUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 enum class BenchmarkType {
   BARE_BONE, // no row-offset in input packing, and no output processing
diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
index 27f1433..94bb899 100644
--- a/bench/PackedRequantizeAcc32Benchmark.cc
+++ b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -25,7 +25,7 @@
 #include "BenchUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 void performance_test() {
   vector<vector<int>> shapes = {
diff --git a/include/fbgemm/ConvUtils.h b/include/fbgemm/ConvUtils.h
index 438807f..667998c 100644
--- a/include/fbgemm/ConvUtils.h
+++ b/include/fbgemm/ConvUtils.h
@@ -9,7 +9,7 @@
 #include <array>
 #include <string>
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /**
  * @brief A struct to conveniently store all convolution parameters.
@@ -23,7 +23,9 @@ struct conv_param_t {
   int G; ///< Number of Groups
   std::array<int, SPATIAL_DIM> K; ///< Filter (Kernel) dimensions
   std::array<int, SPATIAL_DIM> stride; //< Strides
-  std::array<int, SPATIAL_DIM> pad; //< Padding (assume symmetric padding)
+  std::array<int, SPATIAL_DIM * 2>
+      pad; //< Padding (first SPATIAL_DIM is for prev/top/left padding, second
+           //SPATIAL_DIM is for next/bottom/right padding)
   std::array<int, SPATIAL_DIM> dilation; //< Kernel dilation
 
   // The following are derived parameters
@@ -42,7 +44,7 @@ struct conv_param_t {
       int g,
       std::array<int, SPATIAL_DIM> k,
       std::array<int, SPATIAL_DIM> strd,
-      std::array<int, SPATIAL_DIM> pd)
+      std::array<int, SPATIAL_DIM * 2> pd)
       : MB(mb),
         IC(ic),
         OC(oc),
@@ -53,7 +55,7 @@ struct conv_param_t {
         pad(pd) {
     for (int d = 0; d < SPATIAL_DIM; ++d) {
       dilation[d] = 1;
-      IN_DIMP[d] = IN_DIM[d] + 2 * pad[d];
+      IN_DIMP[d] = IN_DIM[d] + pad[d] + pad[SPATIAL_DIM + d];
       OUT_DIM[d] = (IN_DIMP[d] - K[d]) / stride[d] + 1;
     }
   }
@@ -89,10 +91,10 @@ struct conv_param_t {
         out += "stride_" + dim_string[3 - SPATIAL_DIM + d] + ":" +
             std::to_string(stride[d]) + ", ";
       }
-      for (int d = 0; d < SPATIAL_DIM; ++d) {
-        out += "pad_" + dim_string[3 - SPATIAL_DIM + d] + ":" +
+      for (int d = 0; d < SPATIAL_DIM * 2; ++d) {
+        out += "pad_" + dim_string[3 - (SPATIAL_DIM % 3) + d] + ":" +
             std::to_string(pad[d]);
-        if (d < SPATIAL_DIM - 1) {
+        if (d < SPATIAL_DIM * 2 - 1) {
           out += ", ";
         }
       }
@@ -106,7 +108,7 @@ struct conv_param_t {
       }
       for (int d = 0; d < SPATIAL_DIM; ++d) {
         out += "pad_" + std::to_string(d) + ":" + std::to_string(pad[d]);
-        if (d < SPATIAL_DIM - 1) {
+        if (d < SPATIAL_DIM * 2 - 1) {
           out += ", ";
         }
       }
@@ -115,4 +117,4 @@ struct conv_param_t {
   }
 };
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index 081c03b..824a966 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -32,7 +32,7 @@ extern double postprocessing_time;
 extern double run_time;
 #endif
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /**
  * @brief Templatized struct for packing parameters for A and B matrices.
@@ -969,4 +969,4 @@ static void* fbgemmAlignedAlloc(size_t __align, size_t __size) {
   return aligned_mem;
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h
index 0428d93..1083fa3 100644
--- a/include/fbgemm/FbgemmFP16.h
+++ b/include/fbgemm/FbgemmFP16.h
@@ -17,7 +17,7 @@
 #include "Types.h"
 #include "Utils.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /// class that performs packing of matrix in
 /// row-major format into
diff --git a/include/fbgemm/FbgemmI8Spmdm.h b/include/fbgemm/FbgemmI8Spmdm.h
index 264b70e..3e040ad 100644
--- a/include/fbgemm/FbgemmI8Spmdm.h
+++ b/include/fbgemm/FbgemmI8Spmdm.h
@@ -23,7 +23,7 @@ extern double spmdm_transpose_Nx32_time;
 extern double spmdm_run_time;
 #endif
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /**
  * @brief A class to represent a matrix in Compressed Sparse Column (CSC)
@@ -98,4 +98,4 @@ class CompressedSparseColumn {
   mutable std::int32_t old_nnz_;
 };
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h
index c5c62dd..0b25c22 100644
--- a/include/fbgemm/Types.h
+++ b/include/fbgemm/Types.h
@@ -10,7 +10,7 @@
 #include <cstdlib>
 #include <cstring>
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 typedef struct __attribute__((aligned(2))) __f16 {
   uint16_t x;
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
index 22e5a16..7283932 100644
--- a/include/fbgemm/Utils.h
+++ b/include/fbgemm/Utils.h
@@ -8,7 +8,7 @@
 #include <string>
 #include <type_traits>
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /**
  * @brief Helper struct to type specialize for uint8 and int8 together.
@@ -120,4 +120,4 @@ void transpose_16x16(
     float* dst,
     int ld_dst);
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/ExecuteKernel.cc b/src/ExecuteKernel.cc
index 0e3d122..3bc7e36 100644
--- a/src/ExecuteKernel.cc
+++ b/src/ExecuteKernel.cc
@@ -9,4 +9,4 @@
 #include "fbgemm/Fbgemm.h"
 #include "fbgemm/Utils.h"
 
-namespace fbgemm2 {} // namespace fbgemm2
+namespace fbgemm {} // namespace fbgemm
diff --git a/src/ExecuteKernelGeneric.h b/src/ExecuteKernelGeneric.h
index e83e943..4649912 100644
--- a/src/ExecuteKernelGeneric.h
+++ b/src/ExecuteKernelGeneric.h
@@ -9,7 +9,7 @@
 #include "fbgemm/Fbgemm.h"
 #include "GenerateKernel.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /**
  * @brief Execute Engine for the macro-kernel and output processing.
@@ -61,4 +61,4 @@ class ExecuteKernel : public CodeGenBase<
                                            ///< the C tile in the macro-kernel.
 };
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index e091a87..b3f8c15 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -14,7 +14,7 @@ double kernel_time = 0.0;
 double postprocessing_time = 0.0;
 #endif
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <typename packingAMatrix, typename cT, typename processOutputType>
 ExecuteKernel<
@@ -327,6 +327,18 @@ template class ExecuteKernel<
     int32_t,
     memCopy<>>;
 
+template class ExecuteKernel<
+    PackAWithIm2Col<uint8_t, int16_t>,
+    PackBMatrix<int8_t, int16_t>,
+    uint8_t,
+    ReQuantizeOutput<false>>;
+
+template class ExecuteKernel<
+    PackAWithIm2Col<uint8_t, int16_t, 3>,
+    PackBMatrix<int8_t, int16_t>,
+    uint8_t,
+    ReQuantizeOutput<false>>;
+
 template class ExecuteKernel<
     PackAWithRowOffset<uint8_t, int32_t>,
     PackBMatrix<int8_t, int32_t>,
@@ -345,6 +357,18 @@ template class ExecuteKernel<
     int32_t,
     memCopy<>>;
 
+template class ExecuteKernel<
+    PackAWithIm2Col<uint8_t, int32_t>,
+    PackBMatrix<int8_t, int32_t>,
+    uint8_t,
+    ReQuantizeOutput<false>>;
+
+template class ExecuteKernel<
+    PackAWithIm2Col<uint8_t, int32_t, 3>,
+    PackBMatrix<int8_t, int32_t>,
+    uint8_t,
+    ReQuantizeOutput<false>>;
+
 template class ExecuteKernel<
     PackAWithQuantRowOffset<uint8_t, int32_t>,
     PackBMatrix<int8_t, int32_t>,
@@ -363,4 +387,4 @@ template class ExecuteKernel<
     int32_t,
     DoNothing<int32_t, int32_t>>;
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/ExecuteKernelU8S8.h b/src/ExecuteKernelU8S8.h
index 0bd7fc5..dfa6577 100644
--- a/src/ExecuteKernelU8S8.h
+++ b/src/ExecuteKernelU8S8.h
@@ -7,7 +7,7 @@
 #pragma once
 #include "ExecuteKernel.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /**
  * @brief Execute Engine of uint 8 and int8 matrix
@@ -70,4 +70,4 @@ class ExecuteKernel<
   int nbSize_; ///< block size in the n dimension.
 };
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index 9195a05..f8f0d34 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -15,9 +15,9 @@ double computing_time = 0.0;
 double run_time = 0.0;
 #endif
 
-using namespace fbgemm2;
+using namespace fbgemm;
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <
     typename packingAMatrix,
@@ -245,6 +245,26 @@ template void fbgemmPacked(
     int thread_id,
     int num_threads);
 
+template void fbgemmPacked(
+    PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA,
+    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+    uint8_t* C,
+    int32_t* C_buffer,
+    uint32_t ldc,
+    const ReQuantizeOutput<false>& outProcess,
+    int thread_id,
+    int num_threads);
+
+template void fbgemmPacked(
+    PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA,
+    PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+    uint8_t* C,
+    int32_t* C_buffer,
+    uint32_t ldc,
+    const ReQuantizeOutput<false>& outProcess,
+    int thread_id,
+    int num_threads);
+
 template void fbgemmPacked(
     PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
         packA,
@@ -360,6 +380,26 @@ template void fbgemmPacked(
     int thread_id,
     int num_threads);
 
+template void fbgemmPacked(
+    PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+    uint8_t* C,
+    int32_t* C_buffer,
+    uint32_t ldc,
+    const ReQuantizeOutput<false>& outProcess,
+    int thread_id,
+    int num_threads);
+
+template void fbgemmPacked(
+    PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA,
+    PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+    uint8_t* C,
+    int32_t* C_buffer,
+    uint32_t ldc,
+    const ReQuantizeOutput<false>& outProcess,
+    int thread_id,
+    int num_threads);
+
 template void fbgemmPacked(
     PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
     PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
@@ -380,4 +420,4 @@ template void fbgemmPacked(
     int thread_id,
     int num_threads);
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/FbgemmFP16.cc b/src/FbgemmFP16.cc
index eff173f..f54feb8 100644
--- a/src/FbgemmFP16.cc
+++ b/src/FbgemmFP16.cc
@@ -14,7 +14,7 @@
 
 using namespace std;
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /// class that performs packing of matrix in
 /// row-major or col-major format into
diff --git a/src/FbgemmFP16UKernels.cc b/src/FbgemmFP16UKernels.cc
index ec1b297..cc1273e 100644
--- a/src/FbgemmFP16UKernels.cc
+++ b/src/FbgemmFP16UKernels.cc
@@ -6,7 +6,7 @@
  */
 #include "FbgemmFP16UKernels.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 void __attribute__ ((noinline)) gemmkernel_1x1_AVX2_fA0fB0fC0(GemmParams *gp)
 {
@@ -2200,4 +2200,4 @@ asm volatile
 );
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/FbgemmFP16UKernels.h b/src/FbgemmFP16UKernels.h
index bf7f247..88a136b 100644
--- a/src/FbgemmFP16UKernels.h
+++ b/src/FbgemmFP16UKernels.h
@@ -11,7 +11,7 @@
 #include <vector>
 #include "fbgemm/Types.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 using fp16 = float16;
 using fp32 = float;
@@ -35,6 +35,6 @@ void __attribute__ ((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams *gp);
 typedef void (* funcptr_fp16) (GemmParams *gp);
 ;
 
-} // namespace fbgemm2
+} // namespace fbgemm
 
 #endif
diff --git a/src/FbgemmI8Depthwise.cc b/src/FbgemmI8Depthwise.cc
index 551e98e..ed64859 100644
--- a/src/FbgemmI8Depthwise.cc
+++ b/src/FbgemmI8Depthwise.cc
@@ -18,7 +18,8 @@
 
 using namespace std;
 
-namespace fbgemm2 {
+namespace fbgemm
+{
 
 static array<array<int, 8>, 8> masks = {{
   {  0,  0,  0,  0,  0,  0,  0,  0,  },
@@ -2767,4 +2768,4 @@ void depthwise_3x3_per_channel_quantization_pad_1(
   }
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/FbgemmI8Depthwise.h b/src/FbgemmI8Depthwise.h
index bc62c84..a60cb58 100644
--- a/src/FbgemmI8Depthwise.h
+++ b/src/FbgemmI8Depthwise.h
@@ -8,7 +8,7 @@
 
 #include <cstdint>
 
-namespace fbgemm2
+namespace fbgemm
 {
 
 // KERNEL_PROD is the product of all kernels.
@@ -102,4 +102,4 @@ void depthwise_3x3x3_pad_1(
     const std::int32_t* col_offsets, const std::int32_t* bias,
     bool fuse_relu = false, int thread_id = 0, int num_threads = 1);
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc
index 723a467..12e1cb2 100644
--- a/src/FbgemmI8Spmdm.cc
+++ b/src/FbgemmI8Spmdm.cc
@@ -25,7 +25,7 @@ double spmdm_run_time = 0.0;
 
 using namespace std;
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 CompressedSparseColumn::CompressedSparseColumn(int num_of_rows, int num_of_cols)
     : num_rows_(num_of_rows),
@@ -505,4 +505,4 @@ void CompressedSparseColumn::SpMDM(
 #endif
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernel.h b/src/GenerateKernel.h
index 30160d1..5a75c33 100644
--- a/src/GenerateKernel.h
+++ b/src/GenerateKernel.h
@@ -11,7 +11,7 @@
 #include <tuple>
 #include "fbgemm/Fbgemm.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 namespace x86 = asmjit::x86;
 
@@ -151,4 +151,4 @@ thread_local std::map<
     typename CodeGenBase<TA, TB, TC, accT>::jit_micro_kernel_fp>
     CodeGenBase<TA, TB, TC, accT>::codeCache_;
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc
index 451592a..b9ab727 100644
--- a/src/GenerateKernelU8S8S32ACC16.cc
+++ b/src/GenerateKernelU8S8S32ACC16.cc
@@ -7,7 +7,7 @@
 #include <iostream>
 #include "GenerateKernel.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 namespace x86 = asmjit::x86;
 
@@ -295,4 +295,4 @@ CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::getOrCreate<inst_set_t::avx2>(
   return fn;
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernelU8S8S32ACC16_avx512.cc b/src/GenerateKernelU8S8S32ACC16_avx512.cc
index cab43ed..eeeaea0 100644
--- a/src/GenerateKernelU8S8S32ACC16_avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC16_avx512.cc
@@ -7,7 +7,7 @@
 #include <iostream>
 #include "GenerateKernel.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 namespace x86 = asmjit::x86;
 
@@ -298,4 +298,4 @@ CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::getOrCreate<inst_set_t::avx512>(
   return fn;
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernelU8S8S32ACC32.cc b/src/GenerateKernelU8S8S32ACC32.cc
index 9529f5d..31c9996 100644
--- a/src/GenerateKernelU8S8S32ACC32.cc
+++ b/src/GenerateKernelU8S8S32ACC32.cc
@@ -7,7 +7,7 @@
 #include <iostream>
 #include "GenerateKernel.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 namespace x86 = asmjit::x86;
 
@@ -312,4 +312,4 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx2>(
   return fn;
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernelU8S8S32ACC32_avx512.cc b/src/GenerateKernelU8S8S32ACC32_avx512.cc
index 251a8b8..0621bb0 100644
--- a/src/GenerateKernelU8S8S32ACC32_avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC32_avx512.cc
@@ -7,7 +7,7 @@
 #include <iostream>
 #include "GenerateKernel.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 namespace x86 = asmjit::x86;
 
@@ -314,4 +314,4 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx512>(
   return fn;
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc
index 8f260ba..cd991ca 100644
--- a/src/PackAMatrix.cc
+++ b/src/PackAMatrix.cc
@@ -10,7 +10,7 @@
 #include <iostream>
 #include "fbgemm/Fbgemm.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <typename T, typename accT>
 PackAMatrix<T, accT>::PackAMatrix(
@@ -162,4 +162,4 @@ void PackAMatrix<T, accT>::printPackedMatrix(std::string name) {
 
 template class PackAMatrix<uint8_t, int32_t>;
 template class PackAMatrix<uint8_t, int16_t>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index 8dde696..71efced 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -13,7 +13,7 @@
 
 #include "fbgemm/Fbgemm.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <typename T, typename accT, int SPATIAL_DIM>
 PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
@@ -82,9 +82,122 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
                               row_interleave_B_ * row_interleave_B_};
   BaseType::packedBlock(block_p);
   T* out = BaseType::getBuf();
+  // accumulate into row offset?
+  bool row_offset_acc = (block.col_start != 0);
+  int32_t* row_offset_buf = getRowOffsetBuffer();
 
-  if (SPATIAL_DIM == 3) { // static if
+  bool point_wise = true;
+  for (int d = 0; d < SPATIAL_DIM; ++d) {
+    if (conv_p_.K[d] != 1 || conv_p_.pad[d] != 0 || conv_p_.stride[d] != 1 ||
+        conv_p_.dilation[d] != 1) {
+      point_wise = false;
+      break;
+    }
+  }
+  for (int d = SPATIAL_DIM; d < SPATIAL_DIM * 2; ++d) {
+    if (conv_p_.pad[d] != 0) {
+      point_wise = false;
+      break;
+    }
+  }
+
+  if (point_wise) {
+    int32_t ld = this->numCols();
     for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+      int buf_idx = i - block.row_start;
+      memcpy(
+          out + buf_idx * BaseType::blockColSize(),
+          sdata_ + i * ld + block.col_start,
+          block.col_size * sizeof(T));
+      // zero fill
+      for (int j = block.col_size; j < block_p.col_size; ++j) {
+        out[buf_idx * BaseType::blockColSize() + j] = 0;
+      }
+      int32_t row_sum =
+          row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
+      __m256i sum_v = _mm256_setzero_si256();
+      __m256i one_epi16_v = _mm256_set1_epi16(1);
+      __m256i one_epi8_v = _mm256_set1_epi8(1);
+      for (int j = block.col_start;
+           j < block.col_start + block.col_size / 32 * 32;
+           j += 32) {
+        __m256i src_v = _mm256_loadu_si256(
+            reinterpret_cast<__m256i const*>(sdata_ + i * ld + j));
+        sum_v = _mm256_add_epi32(
+            sum_v,
+            _mm256_madd_epi16(
+                _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v));
+      }
+      for (int j = block.col_start + block.col_size / 32 * 32;
+           j < block.col_start + block.col_size;
+           ++j) {
+        row_sum += sdata_[i * ld + j];
+      }
+      // alignas(64) std::array<int32_t, 8> temp;
+      alignas(64) std::int32_t temp[8];
+      //_mm256_store_si256(reinterpret_cast<__m256i*>(temp.data()), sum_v);
+      _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v);
+      for (int k = 0; k < 8; ++k) {
+        row_sum += temp[k];
+      }
+      row_offset_buf[i - block.row_start] = row_sum;
+    }
+
+    return;
+  }
+
+  if (SPATIAL_DIM != 2 && SPATIAL_DIM != 3) {
+    assert(false && "unsupported conv dimension");
+  }
+
+  for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+    if (SPATIAL_DIM == 2) { // static if
+      int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
+      int hw = i % (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
+      int w = hw % conv_p_.OUT_DIM[1];
+      int h = hw / conv_p_.OUT_DIM[1];
+      for (int j = block.col_start;
+           j < block.col_start + block.col_size + conv_p_.IC - 1;
+           j += conv_p_.IC) {
+        int j_blk_id = j / conv_p_.IC;
+        // max( j_blk_id * IC, START)  -> min( END, (j_blk_id + 1) * IC )
+        int j_blk_start = std::max(j_blk_id * conv_p_.IC, block.col_start);
+        int j_blk_end = std::min(
+            (j_blk_id + 1) * conv_p_.IC, block.col_start + block.col_size);
+        if (j_blk_start >= j_blk_end) {
+          break;
+        }
+
+        int rs = j / conv_p_.IC;
+        int s = rs % conv_p_.K[1];
+        int r = rs / conv_p_.K[1];
+
+        int h_in = -conv_p_.pad[0] + h * conv_p_.stride[0] + r;
+        int w_in = -conv_p_.pad[1] + w * conv_p_.stride[1] + s;
+
+        if (h_in < 0 || h_in >= conv_p_.IN_DIM[0] || w_in < 0 ||
+            w_in >= conv_p_.IN_DIM[1]) {
+          // Please note that padding for convolution should be filled with
+          // zero_pt
+          std::memset(
+              &out
+                  [(i - block.row_start) * BaseType::blockColSize() +
+                   (j_blk_start - block.col_start)],
+              BaseType::zeroPoint(),
+              sizeof(T) * (j_blk_end - j_blk_start));
+        } else {
+          std::memcpy(
+              &out
+                  [(i - block.row_start) * BaseType::blockColSize() +
+                   j_blk_start - block.col_start],
+              &sdata_
+                  [((n * conv_p_.IN_DIM[0] + h_in) * conv_p_.IN_DIM[1] + w_in) *
+                       conv_p_.IC +
+                   (j_blk_start % conv_p_.IC)],
+              sizeof(T) * (j_blk_end - j_blk_start));
+        }
+      }
+    } else if (SPATIAL_DIM == 3) { // static if
       int n =
           i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1] * conv_p_.OUT_DIM[2]);
       int thw =
@@ -139,72 +252,8 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
               sizeof(T) * (j_blk_end - j_blk_start));
         }
       }
-      // zero fill
-      // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill.
-      if ((block_p.col_start + block_p.col_size) -
-              (block.col_start + block.col_size) >
-          0) {
-        std::memset(
-            &out
-                [(i - block.row_start) * BaseType::blockColSize() +
-                 (block.col_size)],
-            0,
-            sizeof(T) *
-                ((block_p.col_start + block_p.col_size) -
-                 (block.col_start + block.col_size)));
-      }
     }
-    return;
-  }
-
-  assert(SPATIAL_DIM == 2 && "unsupported conv dimension");
 
-  for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
-    int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
-    int hw = i % (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
-    int w = hw % conv_p_.OUT_DIM[1];
-    int h = hw / conv_p_.OUT_DIM[1];
-    for (int j = block.col_start;
-         j < block.col_start + block.col_size + conv_p_.IC - 1;
-         j += conv_p_.IC) {
-      int j_blk_id = j / conv_p_.IC;
-      // max( j_blk_id * IC, START)  -> min( END, (j_blk_id + 1) * IC )
-      int j_blk_start = std::max(j_blk_id * conv_p_.IC, block.col_start);
-      int j_blk_end = std::min(
-          (j_blk_id + 1) * conv_p_.IC, block.col_start + block.col_size);
-      if (j_blk_start >= j_blk_end) {
-        break;
-      }
-
-      int rs = j / conv_p_.IC;
-      int s = rs % conv_p_.K[1];
-      int r = rs / conv_p_.K[1];
-
-      int h_in = -conv_p_.pad[0] + h * conv_p_.stride[0] + r;
-      int w_in = -conv_p_.pad[1] + w * conv_p_.stride[1] + s;
-
-      if (h_in < 0 || h_in >= conv_p_.IN_DIM[0] || w_in < 0 ||
-          w_in >= conv_p_.IN_DIM[1]) {
-        // Please note that padding for convolution should be filled with
-        // zero_pt
-        std::memset(
-            &out
-                [(i - block.row_start) * BaseType::blockColSize() +
-                 (j_blk_start - block.col_start)],
-            BaseType::zeroPoint(),
-            sizeof(T) * (j_blk_end - j_blk_start));
-      } else {
-        std::memcpy(
-            &out
-                [(i - block.row_start) * BaseType::blockColSize() +
-                 j_blk_start - block.col_start],
-            &sdata_
-                [((n * conv_p_.IN_DIM[0] + h_in) * conv_p_.IN_DIM[1] + w_in) *
-                     conv_p_.IC +
-                 (j_blk_start % conv_p_.IC)],
-            sizeof(T) * (j_blk_end - j_blk_start));
-      }
-    }
     // zero fill
     // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill.
     if ((block_p.col_start + block_p.col_size) -
@@ -219,7 +268,33 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
               ((block_p.col_start + block_p.col_size) -
                (block.col_start + block.col_size)));
     }
-  }
+
+    // TODO: skip row_offset computation when B_zero_point is 0
+    int32_t row_sum =
+        row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
+
+    __m256i sum_v = _mm256_setzero_si256();
+    __m256i one_epi16_v = _mm256_set1_epi16(1);
+    __m256i one_epi8_v = _mm256_set1_epi8(1);
+    for (int j = 0; j < block.col_size / 32 * 32; j += 32) {
+      __m256i src_v = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(
+          out + (i - block.row_start) * this->blockColSize() + j));
+      sum_v = _mm256_add_epi32(
+          sum_v,
+          _mm256_madd_epi16(
+              _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v));
+    }
+    for (int j = block.col_size / 32 * 32; j < block.col_size; ++j) {
+      row_sum += out[(i - block.row_start) * this->blockColSize() + j];
+    }
+    alignas(64) int32_t temp[8];
+    _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v);
+    for (int k = 0; k < 8; ++k) {
+      row_sum += temp[k];
+    }
+
+    row_offset_buf[i - block.row_start] = row_sum;
+  } // for each i
 }
 
 template <typename T, typename accT, int SPATIAL_DIM>
@@ -267,4 +342,4 @@ template class PackAWithIm2Col<uint8_t, int16_t>;
 template class PackAWithIm2Col<uint8_t, int32_t, 3>;
 template class PackAWithIm2Col<uint8_t, int16_t, 3>;
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index 878503f..485afb1 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -10,7 +10,7 @@
 #include <iostream>
 #include "fbgemm/Fbgemm.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <typename T, typename accT>
 PackBMatrix<T, accT>::PackBMatrix(
@@ -163,13 +163,17 @@ bool PackBMatrix<T, accT>::equals(const PackBMatrix<T, accT>& that) const {
     return false;
   }
 
-  return memcmp(
-      BaseType::buf_,
-      that.buf_,
-      BaseType::blockRows() * BaseType::brow_ * BaseType::blockCols() *
-          BaseType::bcol_ * sizeof(T)) == 0;
+  for (int i = 0; i < this->numRows(); ++i) {
+    for (int j = 0; j < this->numCols(); ++j) {
+      if (this->buf_[addr(i, j)] != that.buf_[that.addr(i, j)]) {
+        return false;
+      }
+    }
+  }
+
+  return true;
 }
 
 template class PackBMatrix<int8_t, int32_t>;
 template class PackBMatrix<int8_t, int16_t>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc
index 37b4e88..fd4c766 100644
--- a/src/PackMatrix.cc
+++ b/src/PackMatrix.cc
@@ -11,7 +11,7 @@
 #include "fbgemm/ConvUtils.h"
 #include "fbgemm/Fbgemm.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <typename PT, typename inpType, typename accType>
 PackMatrix<PT, inpType, accType>::PackMatrix(
@@ -91,4 +91,4 @@ template class PackMatrix<
 template class PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>;
 
 template class PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackWithQuantRowOffset.cc b/src/PackWithQuantRowOffset.cc
index 5f60faa..15cd737 100644
--- a/src/PackWithQuantRowOffset.cc
+++ b/src/PackWithQuantRowOffset.cc
@@ -13,7 +13,7 @@
 #include <stdexcept>
 #include "fbgemm/Fbgemm.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <typename T, typename accT>
 PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
@@ -255,4 +255,4 @@ int PackAWithQuantRowOffset<T, accT>::rowOffsetBufferSize() {
 
 template class PackAWithQuantRowOffset<uint8_t, int32_t>;
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackWithRowOffset.cc b/src/PackWithRowOffset.cc
index fa1f2b0..dec3f70 100644
--- a/src/PackWithRowOffset.cc
+++ b/src/PackWithRowOffset.cc
@@ -12,7 +12,7 @@
 #include <stdexcept>
 #include "fbgemm/Fbgemm.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <typename T, typename accT>
 PackAWithRowOffset<T, accT>::PackAWithRowOffset(
@@ -211,4 +211,4 @@ int PackAWithRowOffset<T, accT>::rowOffsetBufferSize() {
 template class PackAWithRowOffset<uint8_t, int32_t>;
 template class PackAWithRowOffset<uint8_t, int16_t>;
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index 4b919c1..dc41c27 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -13,7 +13,7 @@
 
 using namespace std;
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 void requantize_u8acc32_ref(
     int M,
@@ -195,7 +195,7 @@ void spmdm_ref(
     int M,
     const uint8_t* A,
     int lda,
-    fbgemm2::CompressedSparseColumn& B,
+    fbgemm::CompressedSparseColumn& B,
     bool accumulation,
     int32_t* C,
     int ldc) {
@@ -746,4 +746,4 @@ void depthwise_3x3x3_pad_1_ref(
   }
 };
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/RefImplementations.h b/src/RefImplementations.h
index 69d060a..9e81ce1 100644
--- a/src/RefImplementations.h
+++ b/src/RefImplementations.h
@@ -12,7 +12,7 @@
 #include "fbgemm/ConvUtils.h"
 #include "fbgemm/FbgemmI8Spmdm.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /**
  * @brief Reference implementation of requantization step.
@@ -283,4 +283,4 @@ void depthwise_3x3x3_pad_1_ref(
     const std::int32_t* col_offsets,
     const std::int32_t* bias);
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/Utils.cc b/src/Utils.cc
index 10ab469..45aafd3 100644
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -15,7 +15,7 @@
 #include <limits>
 #include <stdexcept>
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /**
  * @brief Compare the reference and test result matrix to check the correctness.
@@ -354,4 +354,4 @@ void transpose_simd(
   }
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/Utils_avx512.cc b/src/Utils_avx512.cc
index b6bf413..62a99ba 100644
--- a/src/Utils_avx512.cc
+++ b/src/Utils_avx512.cc
@@ -9,7 +9,7 @@
 
 #include <immintrin.h>
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 inline void transpose_kernel_16x16_avx512(
     const float* src,
@@ -240,4 +240,4 @@ void transpose_16x16(
   transpose_8x8(M - ib, N, &src[ib * ld_src], ld_src, &dst[ib], ld_dst);
 }
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/codegen_fp16fp32.cc b/src/codegen_fp16fp32.cc
index 8e36c85..2b2b022 100644
--- a/src/codegen_fp16fp32.cc
+++ b/src/codegen_fp16fp32.cc
@@ -79,7 +79,7 @@ int main() {
   hdrfile << "#include <tuple>\n";
   hdrfile << "#include <vector>\n";
   hdrfile << "#include \"fbgemm/Types.h\"\n";
-  hdrfile << "using fp16 = fbgemm2::float16;\n";
+  hdrfile << "using fp16 = fbgemm::float16;\n";
   hdrfile << "using fp32 = float;\n";
   hdrfile << "struct GemmParams {uint64_t k; float *A; const fp16 *B;\n"
              "float *beta; uint64_t accum; float *C;  uint64_t ldc;\n"
diff --git a/test/FP16Test.cc b/test/FP16Test.cc
index c346049..b5d8763 100644
--- a/test/FP16Test.cc
+++ b/test/FP16Test.cc
@@ -18,7 +18,7 @@
 #endif
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 namespace {
   // The template parameter is transpose of A and B
@@ -75,7 +75,7 @@ TEST_P(FBGemmFP16Test, Test) {
 
     aligned_vector<float> A(m * k, 0.f);
     aligned_vector<float> B(k * n, 0.f);
-    aligned_vector<float> C(m * n, 0.f);
+    aligned_vector<float> C(m * n, NAN);
 
     // initialize with small numbers
     randFill(A, 0, 4);
diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc
index cfde880..d961612 100644
--- a/test/I8DepthwiseTest.cc
+++ b/test/I8DepthwiseTest.cc
@@ -19,7 +19,7 @@
 
 using namespace std;
 
-namespace fbgemm2
+namespace fbgemm
 {
 
 // From Xray OCR
@@ -445,4 +445,4 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) {
   } // for each shape
 } // Test3x3PerChannelQuantization
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/test/I8DepthwiseTest.h b/test/I8DepthwiseTest.h
index 38dea6f..77db34f 100644
--- a/test/I8DepthwiseTest.h
+++ b/test/I8DepthwiseTest.h
@@ -8,7 +8,7 @@
 
 #include <vector>
 
-namespace fbgemm2
+namespace fbgemm
 {
 
 // From ResNeXt-3D-101
@@ -35,4 +35,4 @@ static std::vector<std::vector<int>> shapes_3d = {
   {   1,   8,    4,   4,  4, 1, },
 };
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/test/I8SpmdmTest.cc b/test/I8SpmdmTest.cc
index cd8a94c..16dd038 100644
--- a/test/I8SpmdmTest.cc
+++ b/test/I8SpmdmTest.cc
@@ -23,7 +23,7 @@
 #include "bench/BenchUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 std::vector<float> densities{0.0001f, 0.001f, 0.01f, 0.1f, 1.0f};
 
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
index 3ac8d28..391b993 100644
--- a/test/Im2ColFusedRequantizeTest.cc
+++ b/test/Im2ColFusedRequantizeTest.cc
@@ -17,54 +17,54 @@
 
 using namespace std;
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 // From Faster-RCNN with ShuffleNet
 static vector<conv_param_t<>> shapes = {
     // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w
-    conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0}),
-    conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}),
-    conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0}),
-    conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}),
-    // conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1}),
-    conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(3, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-    // conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}),
-    conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1}),
+    conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0, 0, 0}),
+    conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0, 0, 0}),
+    conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}),
+    // conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(3, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    // conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+    conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}),
 };
 
 TEST(FBGemmIm2colTest, Acc32Test) {
@@ -74,13 +74,40 @@ TEST(FBGemmIm2colTest, Acc32Test) {
     aligned_vector<int8_t> Bint8(
         conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
     aligned_vector<int32_t> Cint32_ref(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0.0f);
-    aligned_vector<int32_t> Cint32_fb(
         conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+    aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+    aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
+    aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
 
     randFill(Aint8, 0, 80);
     int32_t Aint8_zero_point = 43;
     randFill(Bint8, -16, 16);
+    int32_t Bint8_zero_point = -30;
+
+    float C_multiplier = 0.1234;
+    int32_t C_zero_pt = 5;
+
+    int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
+    int NDim = conv_p.OC;
+    int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
+
+    // computing row offset
+    vector<int32_t> row_offsets(MDim);
+    vector<uint8_t> Aint8_im2col(MDim * KDim);
+    im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
+    row_offsets_u8acc32_ref(
+        MDim, KDim, KDim, Aint8_im2col.data(), row_offsets.data());
+
+    // computing column offset
+    vector<int32_t> col_offsets;
+    col_offsets.resize(NDim);
+    col_offsets_with_zero_pt_s8acc32_ref(
+        KDim,
+        NDim,
+        NDim,
+        Bint8.data(),
+        Bint8_zero_point,
+        col_offsets.data());
 
     conv_ref(
         conv_p,
@@ -89,8 +116,19 @@ TEST(FBGemmIm2colTest, Acc32Test) {
         Bint8.data(),
         Cint32_ref.data());
 
-    int NDim = conv_p.OC;
-    int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
+    requantize_u8acc32_ref(
+        MDim,
+        NDim,
+        NDim,
+        Cint32_ref.data(),
+        Cint8_ref.data(),
+        C_multiplier,
+        C_zero_pt,
+        Aint8_zero_point,
+        Bint8_zero_point,
+        row_offsets.data(),
+        col_offsets.data(),
+        nullptr);
 
     vector<int32_t> row_offset_buf;
     row_offset_buf.resize(
@@ -102,17 +140,24 @@ TEST(FBGemmIm2colTest, Acc32Test) {
     PackBMatrix<int8_t, int32_t> packedB(
         matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
 
-    // no-op output process objects
-    DoNothing<int32_t, int32_t> doNothing32BitObj;
-    memCopy<> memcopyObj(doNothing32BitObj);
+    DoNothing<> doNothingObj{};
+    ReQuantizeOutput<false> outputProcObj(
+        doNothingObj,
+        C_multiplier,
+        C_zero_pt,
+        Aint8_zero_point,
+        Bint8_zero_point,
+        packA.getRowOffsetBuffer(),
+        col_offsets.data(),
+        nullptr);
 
     fbgemmPacked(
         packA,
         packedB,
-        Cint32_fb.data(),
+        Cint8_fb.data(),
         Cint32_fb.data(),
         NDim,
-        memcopyObj,
+        outputProcObj,
         0,
         1);
 
@@ -121,11 +166,11 @@ TEST(FBGemmIm2colTest, Acc32Test) {
       for (int h = 0; h < conv_p.OUT_DIM[0]; ++h) {
         for (int w = 0; w < conv_p.OUT_DIM[1]; ++w) {
           for (int k = 0; k < conv_p.OC; ++k) {
-            int32_t expected = Cint32_ref
+            int32_t expected = Cint8_ref
                 [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) *
                      conv_p.OC +
                  k];
-            int32_t actual = Cint32_fb
+            int32_t actual = Cint8_fb
                 [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) *
                      conv_p.OC +
                  k];
@@ -148,13 +193,40 @@ TEST(FBGemmIm2colTest, Acc16Test) {
     aligned_vector<int8_t> Bint8(
         conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
     aligned_vector<int32_t> Cint32_ref(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0.0f);
-    aligned_vector<int32_t> Cint32_fb(
         conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+    aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+    aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
+    aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
 
     randFill(Aint8, 0, 5);
     int32_t Aint8_zero_point = 4;
     randFill(Bint8, -4, 4);
+    int32_t Bint8_zero_point = -2;
+
+    float C_multiplier = 0.1234;
+    int32_t C_zero_pt = 5;
+
+    int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
+    int NDim = conv_p.OC;
+    int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
+
+    // computing row offset
+    vector<int32_t> row_offsets(MDim);
+    vector<uint8_t> Aint8_im2col(MDim * KDim);
+    im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
+    row_offsets_u8acc32_ref(
+        MDim, KDim, KDim, Aint8_im2col.data(), row_offsets.data());
+
+    // computing column offset
+    vector<int32_t> col_offsets;
+    col_offsets.resize(NDim);
+    col_offsets_with_zero_pt_s8acc32_ref(
+        KDim,
+        NDim,
+        NDim,
+        Bint8.data(),
+        Bint8_zero_point,
+        col_offsets.data());
 
     conv_ref(
         conv_p,
@@ -163,8 +235,19 @@ TEST(FBGemmIm2colTest, Acc16Test) {
         Bint8.data(),
         Cint32_ref.data());
 
-    int NDim = conv_p.OC;
-    int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
+    requantize_u8acc32_ref(
+        MDim,
+        NDim,
+        NDim,
+        Cint32_ref.data(),
+        Cint8_ref.data(),
+        C_multiplier,
+        C_zero_pt,
+        Aint8_zero_point,
+        Bint8_zero_point,
+        row_offsets.data(),
+        col_offsets.data(),
+        nullptr);
 
     vector<int32_t> row_offset_buf;
     row_offset_buf.resize(
@@ -176,17 +259,24 @@ TEST(FBGemmIm2colTest, Acc16Test) {
     PackBMatrix<int8_t, int16_t> packedB(
         matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
 
-    // no-op output process objects
-    DoNothing<int32_t, int32_t> doNothing32BitObj;
-    memCopy<> memcopyObj(doNothing32BitObj);
+    DoNothing<> doNothingObj{};
+    ReQuantizeOutput<false> outputProcObj(
+        doNothingObj,
+        C_multiplier,
+        C_zero_pt,
+        Aint8_zero_point,
+        Bint8_zero_point,
+        packA.getRowOffsetBuffer(),
+        col_offsets.data(),
+        nullptr);
 
     fbgemmPacked(
         packA,
         packedB,
-        Cint32_fb.data(),
+        Cint8_fb.data(),
         Cint32_fb.data(),
         NDim,
-        memcopyObj,
+        outputProcObj,
         0,
         1);
 
@@ -195,11 +285,11 @@ TEST(FBGemmIm2colTest, Acc16Test) {
       for (int h = 0; h < conv_p.OUT_DIM[0]; ++h) {
         for (int w = 0; w < conv_p.OUT_DIM[1]; ++w) {
           for (int k = 0; k < conv_p.OC; ++k) {
-            int32_t expected = Cint32_ref
+            int32_t expected = Cint8_ref
                 [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) *
                      conv_p.OC +
                  k];
-            int32_t actual = Cint32_fb
+            int32_t actual = Cint8_fb
                 [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) *
                      conv_p.OC +
                  k];
@@ -218,44 +308,75 @@ static vector<conv_param_t<3>> shapes_3d = {
     // MB, IC, OC, IT, IH, IW, G, KT, KH, KW, stride_t, stride_h, stride_w,
     // pad_t, pad_h, pad_w
     // conv_param_t<
-    //     3>(1, 3, 64, {32, 112, 112}, 1, {3, 7, 7}, {1, 2, 2}, {1, 3, 3}),
+    //     3>(1, 3, 64, {32, 112, 112}, 1, {3, 7, 7}, {1, 2, 2}, {1, 3, 3, 1, 3,
+    //     3}),
     // conv_param_t<
-    //     3>(1, 64, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 64, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0,
+    //     0}),
     // conv_param_t<
-    //     3>(1, 64, 256, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 64, 256, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0,
+    //     0}),
     // conv_param_t<
-    //     3>(1, 256, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 256, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0,
+    //     0}),
     // conv_param_t<
-    //     3>(1, 256, 128, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 256, 128, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 256, 512, {32, 56, 56}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0}),
+    //     3>(1, 256, 512, {32, 56, 56}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 128, 512, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 128, 512, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 512, 128, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 512, 128, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 512, 256, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 512, 256, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 512, 1024, {16, 28, 28}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0}),
+    //     3>(1, 512, 1024, {16, 28, 28}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 256, 1024, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 256, 1024, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 1024, 256, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 1024, 256, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 1024, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 1024, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 1024, 2048, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0}),
+    //     3>(1, 1024, 2048, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 2048, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
+    //     3>(1, 2048, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0,
+    //     0, 0}),
     // conv_param_t<
-    //     3>(1, 512, 2048, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
-
+    //     3>(1, 512, 2048, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0,
+    //     0}),
+    conv_param_t<3>(
+        1,
+        3,
+        4,
+        {32, 112, 112},
+        1,
+        {3, 7, 7},
+        {1, 2, 2},
+        {1, 3, 3, 1, 3, 3}),
+    conv_param_t<3>(
+        1,
+        3,
+        4,
+        {32, 112, 112},
+        1,
+        {3, 7, 7},
+        {1, 2, 2},
+        {1, 3, 3, 1, 1, 0}),
     conv_param_t<
-        3>(1, 3, 4, {32, 112, 112}, 1, {3, 7, 7}, {1, 2, 2}, {1, 3, 3}),
+        3>(1, 8, 16, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}),
     conv_param_t<
-        3>(1, 8, 16, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}),
-    conv_param_t<
-        3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0}),
+        3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, 0, 0}),
 };
 
 TEST(FBGemmIm2colTest, 3DAcc32Test) {
@@ -267,17 +388,43 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) {
     aligned_vector<int8_t> Bint8(
         conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC, 0);
     aligned_vector<int32_t> Cint32_ref(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2] *
-            conv_p.OC,
-        0.0f);
-    aligned_vector<int32_t> Cint32_fb(
         conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2] *
             conv_p.OC,
         0);
+    aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+    aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
+    aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
 
     randFill(Aint8, 0, 80);
     int32_t Aint8_zero_point = 43;
     randFill(Bint8, -16, 16);
+    int32_t Bint8_zero_point = -30;
+
+    float C_multiplier = 0.1234;
+    int32_t C_zero_pt = 5;
+
+    int MDim =
+        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2];
+    int NDim = conv_p.OC;
+    int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC;
+
+    // computing row offset
+    vector<int32_t> row_offsets(MDim);
+    vector<uint8_t> Aint8_im2col(MDim * KDim);
+    im2col3d_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
+    row_offsets_u8acc32_ref(
+        MDim, KDim, KDim, Aint8_im2col.data(), row_offsets.data());
+
+    // computing column offset
+    vector<int32_t> col_offsets;
+    col_offsets.resize(NDim);
+    col_offsets_with_zero_pt_s8acc32_ref(
+        KDim,
+        NDim,
+        NDim,
+        Bint8.data(),
+        Bint8_zero_point,
+        col_offsets.data());
 
     conv3d_ref(
         conv_p,
@@ -286,8 +433,19 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) {
         Bint8.data(),
         Cint32_ref.data());
 
-    int NDim = conv_p.OC;
-    int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC;
+    requantize_u8acc32_ref(
+        MDim,
+        NDim,
+        NDim,
+        Cint32_ref.data(),
+        Cint8_ref.data(),
+        C_multiplier,
+        C_zero_pt,
+        Aint8_zero_point,
+        Bint8_zero_point,
+        row_offsets.data(),
+        col_offsets.data(),
+        nullptr);
 
     vector<int32_t> row_offset_buf;
     row_offset_buf.resize(
@@ -297,19 +455,33 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) {
         conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
 
     PackBMatrix<int8_t, int32_t> packedB(
-        matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
-
-    // no-op output process objects
-    DoNothing<int32_t, int32_t> doNothing32BitObj;
-    memCopy<> memcopyObj(doNothing32BitObj);
+        matrix_op_t::NoTranspose,
+        KDim,
+        NDim,
+        Bint8.data(),
+        NDim,
+        nullptr,
+        1,
+        Bint8_zero_point);
+
+    DoNothing<> doNothingObj{};
+    ReQuantizeOutput<false> outputProcObj(
+        doNothingObj,
+        C_multiplier,
+        C_zero_pt,
+        Aint8_zero_point,
+        Bint8_zero_point,
+        packA.getRowOffsetBuffer(),
+        col_offsets.data(),
+        nullptr);
 
     fbgemmPacked(
         packA,
         packedB,
-        Cint32_fb.data(),
+        Cint8_fb.data(),
         Cint32_fb.data(),
         NDim,
-        memcopyObj,
+        outputProcObj,
         0,
         1);
 
@@ -319,13 +491,13 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) {
         for (int h = 0; h < conv_p.OUT_DIM[1]; ++h) {
           for (int w = 0; w < conv_p.OUT_DIM[2]; ++w) {
             for (int k = 0; k < conv_p.OC; ++k) {
-              int32_t expected = Cint32_ref
+              int32_t expected = Cint8_ref
                   [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) *
                         conv_p.OUT_DIM[2] +
                     w) *
                        conv_p.OC +
                    k];
-              int32_t actual = Cint32_fb
+              int32_t actual = Cint8_fb
                   [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) *
                         conv_p.OUT_DIM[2] +
                     w) *
@@ -355,14 +527,40 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) {
         conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2] *
             conv_p.OC,
         0.0f);
-    aligned_vector<int32_t> Cint32_fb(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2] *
-            conv_p.OC,
-        0);
+    aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+    aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
+    aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
 
     randFill(Aint8, 0, 5);
     int32_t Aint8_zero_point = 4;
     randFill(Bint8, -4, 4);
+    int32_t Bint8_zero_point = -2;
+
+    float C_multiplier = 0.1234;
+    int32_t C_zero_pt = 5;
+
+    int MDim =
+        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2];
+    int NDim = conv_p.OC;
+    int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC;
+
+    // computing row offset
+    vector<int32_t> row_offsets(MDim);
+    vector<uint8_t> Aint8_im2col(MDim * KDim);
+    im2col3d_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
+    row_offsets_u8acc32_ref(
+        MDim, KDim, KDim, Aint8_im2col.data(), row_offsets.data());
+
+    // computing column offset
+    vector<int32_t> col_offsets;
+    col_offsets.resize(NDim);
+    col_offsets_with_zero_pt_s8acc32_ref(
+        KDim,
+        NDim,
+        NDim,
+        Bint8.data(),
+        Bint8_zero_point,
+        col_offsets.data());
 
     conv3d_ref(
         conv_p,
@@ -371,8 +569,19 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) {
         Bint8.data(),
         Cint32_ref.data());
 
-    int NDim = conv_p.OC;
-    int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC;
+    requantize_u8acc32_ref(
+        MDim,
+        NDim,
+        NDim,
+        Cint32_ref.data(),
+        Cint8_ref.data(),
+        C_multiplier,
+        C_zero_pt,
+        Aint8_zero_point,
+        Bint8_zero_point,
+        row_offsets.data(),
+        col_offsets.data(),
+        nullptr);
 
     vector<int32_t> row_offset_buf;
     row_offset_buf.resize(
@@ -384,17 +593,24 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) {
     PackBMatrix<int8_t, int16_t> packedB(
         matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
 
-    // no-op output process objects
-    DoNothing<int32_t, int32_t> doNothing32BitObj;
-    memCopy<> memcopyObj(doNothing32BitObj);
+    DoNothing<> doNothingObj{};
+    ReQuantizeOutput<false> outputProcObj(
+        doNothingObj,
+        C_multiplier,
+        C_zero_pt,
+        Aint8_zero_point,
+        Bint8_zero_point,
+        packA.getRowOffsetBuffer(),
+        col_offsets.data(),
+        nullptr);
 
     fbgemmPacked(
         packA,
         packedB,
-        Cint32_fb.data(),
+        Cint8_fb.data(),
         Cint32_fb.data(),
         NDim,
-        memcopyObj,
+        outputProcObj,
         0,
         1);
 
@@ -404,13 +620,13 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) {
         for (int h = 0; h < conv_p.OUT_DIM[1]; ++h) {
           for (int w = 0; w < conv_p.OUT_DIM[2]; ++w) {
             for (int k = 0; k < conv_p.OC; ++k) {
-              int32_t expected = Cint32_ref
+              int32_t expected = Cint8_ref
                   [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) *
                         conv_p.OUT_DIM[2] +
                     w) *
                        conv_p.OC +
                    k];
-              int32_t actual = Cint32_fb
+              int32_t actual = Cint8_fb
                   [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) *
                         conv_p.OUT_DIM[2] +
                     w) *
@@ -427,4 +643,4 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) {
   } // for each shape
 } // Acc16Test
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc
index 28e1114..82ae96f 100644
--- a/test/PackedRequantizeAcc16Test.cc
+++ b/test/PackedRequantizeAcc16Test.cc
@@ -23,7 +23,7 @@
 #include "TestUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
                                        matrix_op_t::Transpose};
diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc
index 4b1b5b8..e0c9850 100644
--- a/test/PackedRequantizeTest.cc
+++ b/test/PackedRequantizeTest.cc
@@ -24,7 +24,7 @@
 #include "TestUtils.h"
 
 using namespace std;
-using namespace fbgemm2;
+using namespace fbgemm;
 
 std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
                                        matrix_op_t::Transpose};
diff --git a/test/QuantizationHelpers.cc b/test/QuantizationHelpers.cc
index 354519b..eab08de 100644
--- a/test/QuantizationHelpers.cc
+++ b/test/QuantizationHelpers.cc
@@ -12,7 +12,7 @@
 
 using namespace std;
 
-namespace fbgemm2 {
+namespace fbgemm {
 /*
  * @brief Make sure we won't have overflows from vpmaddubsw instruction.
  */
@@ -54,4 +54,4 @@ template void
 avoidOverflow(int m, int n, int k, const uint8_t* Aint8, int8_t* B);
 template void
 avoidOverflow(int m, int n, int k, const uint8_t* Aint8, float* B);
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/test/QuantizationHelpers.h b/test/QuantizationHelpers.h
index fdc6e02..42c3e08 100644
--- a/test/QuantizationHelpers.h
+++ b/test/QuantizationHelpers.h
@@ -7,7 +7,7 @@
 #pragma once
 #include <cinttypes>
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /*
  * @brief Make sure we won't have overflows from vpmaddubsw instruction.
@@ -15,4 +15,4 @@ namespace fbgemm2 {
 template <typename T>
 void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B);
 
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/test/TestUtils.cc b/test/TestUtils.cc
index 702425e..5cc14ef 100644
--- a/test/TestUtils.cc
+++ b/test/TestUtils.cc
@@ -9,7 +9,7 @@
 #include "fbgemm/Fbgemm.h"
 #include "bench/AlignedVec.h"
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 template <typename T>
 int compare_validate_buffers(
@@ -97,4 +97,4 @@ template void transpose_matrix<float>(float* ref, int n, int k);
 template void transpose_matrix<int32_t>(int32_t* ref, int n, int k);
 template void transpose_matrix<uint8_t>(uint8_t* ref, int n, int k);
 template void transpose_matrix<int8_t>(int8_t* ref, int n, int k);
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/test/TestUtils.h b/test/TestUtils.h
index 559f816..6cc365f 100644
--- a/test/TestUtils.h
+++ b/test/TestUtils.h
@@ -8,7 +8,7 @@
 #include <cmath>
 #include <vector>
 
-namespace fbgemm2 {
+namespace fbgemm {
 
 /*
  * @brief Check and validate the buffers for reference and FBGEMM result.
@@ -37,4 +37,4 @@ bool check_all_zero_entries(const T* test, int m, int n);
  */
 template <typename T>
 void transpose_matrix(T* ref, int n, int k);
-} // namespace fbgemm2
+} // namespace fbgemm
-- 
cgit v1.2.3