diff options
author | Jongsoo Park <jongsoo@fb.com> | 2019-09-04 22:04:31 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-09-04 22:06:43 +0300 |
commit | 624d098f6701a951d02f0ad651ddba106442eb7c (patch) | |
tree | 67249705eaea0b8931051d4b5e199dafc74ceb66 | |
parent | ab2d5278073fce692b4f3b95d164f8cb3bbb1f72 (diff) |
remove dw conv refs and use conv_ref instead (#122)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/122
To prepare for depth-wise convolutions other than 3x3.
The existing reference depth-wise convolution is limited to 3x3, so we should reuse the conv_ref implementation for easier maintenance.
Reviewed By: dskhudia
Differential Revision: D17176591
fbshipit-source-id: 9f6f90a801a0ad95091f1d085e66861f86c3a8f1
-rw-r--r-- | bench/Depthwise3DBenchmark.cc | 130 | ||||
-rw-r--r-- | bench/DepthwiseBenchmark.cc | 125 | ||||
-rw-r--r-- | include/fbgemm/FbgemmI8DepthwiseAvx2.h | 43 | ||||
-rw-r--r-- | src/RefImplementations.cc | 400 | ||||
-rw-r--r-- | src/RefImplementations.h | 120 | ||||
-rw-r--r-- | test/I8DepthwiseTest.cc | 426 | ||||
-rw-r--r-- | test/I8DepthwiseTest.h | 41 |
7 files changed, 403 insertions, 882 deletions
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc index 0efdcac..cd31524 100644 --- a/bench/Depthwise3DBenchmark.cc +++ b/bench/Depthwise3DBenchmark.cc @@ -4,7 +4,6 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ -#include "test/I8DepthwiseTest.h" #include <algorithm> #include <chrono> @@ -19,8 +18,8 @@ #include "AlignedVec.h" #include "BenchUtils.h" -#include "fbgemm/Utils.h" #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" #include "src/RefImplementations.h" using namespace std; @@ -35,6 +34,34 @@ int main() { } #endif + // From ResNeXt-3D-101 + // clang-format off + vector<vector<int>> shapes_3d = { + // NOTE: clang-format wants to use a different formatting but the current + // formatting should be easier to read. + // N, K, T_in, H_in, W_in, stride + { 1, 64, 32, 56, 56, 1, }, + { 1, 128, 16, 28, 28, 1, }, + { 1, 256, 8, 14, 14, 1, }, + { 1, 512, 4, 7, 7, 1, }, + + { 1, 128, 32, 56, 56, 2, }, + { 1, 256, 16, 28, 28, 2, }, + { 1, 512, 8, 14, 14, 2, }, + + { 5, 64, 32, 56, 56, 1, }, + { 5, 128, 16, 28, 28, 1, }, + { 5, 256, 8, 14, 14, 1, }, + { 5, 512, 4, 7, 7, 1, }, + + { 5, 128, 32, 56, 56, 2, }, + { 5, 256, 16, 28, 28, 2, }, + { 5, 512, 8, 14, 14, 2, }, + + { 1, 8, 4, 4, 4, 1, }, + }; + // clang-format on + // Depthwise is memory BW bound so we want to flush LLC. 
bool flush = true; std::vector<char> llc; @@ -61,14 +88,28 @@ int main() { constexpr int K_T = 3, K_H = 3, K_W = 3; constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; + + conv_param_t<3> conv_p( + N, + K, + K, + {T, H, W}, + K, + {K_T, K_H, K_W}, + {stride_t, stride_h, stride_w}, + {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R}); + int T_OUT = conv_p.OUT_DIM[0]; + int H_OUT = conv_p.OUT_DIM[1]; + int W_OUT = conv_p.OUT_DIM[2]; + + int MDim = N * T_OUT * H_OUT * W_OUT; + int KDim = K_T * K_H * K_W * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * T * H * W * K); - aligned_vector<int8_t> B(K * K_T * K_H * K_W); - aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K), - C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -76,50 +117,47 @@ int main() { randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; - float C_multiplier = 255. 
/ (maximum - minimum); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); aligned_vector<int32_t> col_offsets(K); aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3x3ConvMatrix Bp(K, B.data()); @@ -153,7 +191,7 @@ int main() { A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), col_offsets.data(), diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc index e882890..4f36e6c 100644 --- a/bench/DepthwiseBenchmark.cc +++ b/bench/DepthwiseBenchmark.cc @@ -17,8 +17,8 @@ #include "AlignedVec.h" #include "BenchUtils.h" -#include "fbgemm/Utils.h" #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" #include "src/RefImplementations.h" using namespace std; @@ -38,7 +38,7 @@ int main() { vector<vector<int>> shapes = { // NOTE: clang-format wants to use a different formatting but the current // formatting should be easier to read. 
- // N, G, H_in, W_in, stride + // N, K, H_in, W_in, stride { 1, 272, 47, 125, 1, }, { 1, 272, 64, 125, 1, }, { 1, 272, 66, 125, 1, }, @@ -157,19 +157,35 @@ int main() { for (auto shape : shapes) { int N = shape[0]; - int G = shape[1]; + int K = shape[1]; int H = shape[2]; int W = shape[3]; int stride_h = shape[4]; int stride_w = stride_h; constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; + int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2, + PAD_R = (S - 1) / 2; + + conv_param_t<2> conv_p( + N, + K, + K, + {H, W}, + K, + {R, S}, + {stride_h, stride_w}, + {PAD_T, PAD_L, PAD_B, PAD_R}); + int H_OUT = conv_p.OUT_DIM[0]; + int W_OUT = conv_p.OUT_DIM[1]; + + int MDim = N * H_OUT * W_OUT; + int KDim = R * S * K; + int KDimPerGroup = KDim / conv_p.G; - aligned_vector<uint8_t> A(N * H * W * G); - aligned_vector<int8_t> B(G * R * S); - aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size()); + aligned_vector<uint8_t> A(N * H * W * K); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -177,53 +193,54 @@ int main() { randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; - depthwise_3x3_pad_1_ref( - N, - H, - W, - G, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; - float C_multiplier = 255. 
/ (maximum - minimum); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); - aligned_vector<int32_t> col_offsets(G); - aligned_vector<int32_t> bias(G); + aligned_vector<int32_t> col_offsets(K); + aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3_pad_1_ref( - N, - H, - W, - G, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } - Packed3x3ConvMatrix Bp(G, B.data()); + Packed3x3ConvMatrix Bp(K, B.data()); double ttot = 0; double bytes = double(NITER) * - (G * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S)); - double ops = double(NITER) * N * H_OUT * W_OUT * G * R * S * 2; + (K * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S)); + double ops = double(NITER) * N * H_OUT * W_OUT * K * R * S * 2; chrono::time_point<chrono::system_clock> t_begin, t_end; for (int i = 0; i < NWARMUP + NITER; ++i) { llc_flush(); @@ -237,14 +254,14 @@ int main() { N, H, W, - G, + K, stride_h, stride_w, A_zero_point, A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), col_offsets.data(), @@ -264,10 +281,10 @@ int 
main() { for (int n = 0; n < N; ++n) { for (int h = 0; h < H_OUT; ++h) { for (int w = 0; w < W_OUT; ++w) { - for (int g = 0; g < G; ++g) { + for (int g = 0; g < K; ++g) { uint8_t expected = - C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * G + g]; - uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * G + g]; + C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * K + g]; + uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * K + g]; if (expected != actual) { cerr << "Depthwise 3x3 results differ at (" << n << ", " << h << ", " << w << ", " << g << "). expected " << (int)expected @@ -282,9 +299,9 @@ int main() { // Report performance printf( - "N = %d G = %d H = %d W = %d stride = %d with requantization fused\n", + "N = %d K = %d H = %d W = %d stride = %d with requantization fused\n", N, - G, + K, H, W, stride_h); diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h index e7b0ec4..98c4ed7 100644 --- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h +++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h @@ -51,27 +51,8 @@ using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>; using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>; /** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * @params A The input image in NHWK layout - * @params Bp The pre-packed filter - */ -FBGEMM_API void depthwise_3x3_pad_1( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const Packed3x3ConvMatrix& Bp, - std::int32_t* C, - int thread_id = 0, - int num_threads = 1); - -/** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * This version is fused with requantization. + * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with + * requantization. 
* * @col_offsets nullptr if col_offsets are folded into bias */ @@ -96,8 +77,8 @@ FBGEMM_API void depthwise_3x3_pad_1( int num_threads = 1); /** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * This version is fused with requantization and uses per-channel quantization. + * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with + * requantization, and using per-channel quantization. * * @col_offsets nullptr if col_offsets are folded into bias */ @@ -121,22 +102,6 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int thread_id = 0, int num_threads = 1); -FBGEMM_API void depthwise_3x3x3_pad_1( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const Packed3x3x3ConvMatrix& Bp, - std::int32_t* C, - int thread_id = 0, - int num_threads = 1); - /** * @col_offsets nullptr if col_offsets are folded into bias */ diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index da58eba..dc40d44 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -600,406 +600,6 @@ void transposeConvWeights( } } -void depthwise_3x3_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - const int8_t* B, - int32_t* C) { - constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; - - for (int n = 0; n < N; ++n) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int r = 0; r < R; ++r) { - int h_in = -PAD_T + h * stride_h + r; - for (int s = 0; s < S; ++s) { - int w_in = -PAD_L + w * stride_w + s; - int a = h_in < 0 || h_in >= H || w_in < 0 || w_in >= W - ? 
A_zero_point - : A[((n * H + h_in) * W + w_in) * K + k]; - int b = B[(k * R + r) * S + s]; - sum += a * b; - } - } - C[((n * H_OUT + h) * W_OUT + w) * K + k] = sum; - } - } - } - } // for each n -}; - -void depthwise_3x3_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - int32_t B_zero_point, - const int8_t* B, - float C_multiplier, - int32_t C_zero_point, - uint8_t* C, - const int32_t* col_offsets, - const int32_t* bias) { - constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; - - vector<int32_t> C_int32(N * H_OUT * W_OUT * K); - depthwise_3x3_pad_1_ref( - N, H, W, K, stride_h, stride_w, A_zero_point, A, B, C_int32.data()); - - vector<int32_t> row_offsets(N * H_OUT * W_OUT * K); - for (int n = 0; n < N; ++n) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int r = 0; r < R; ++r) { - int h_in = -PAD_T + h * stride_h + r; - for (int s = 0; s < S; ++s) { - int w_in = -PAD_L + w * stride_w + s; - int a = h_in < 0 || h_in >= H || w_in < 0 || w_in >= W - ? A_zero_point - : A[((n * H + h_in) * W + w_in) * K + k]; - sum += a; - } - } - row_offsets[((n * H_OUT + h) * W_OUT + w) * K + k] = sum; - } - } - } - } // for each n - - for (int i = 0; i < N * H_OUT * W_OUT; ++i) { - for (int k = 0; k < K; ++k) { - requantize_u8acc32_ref( - 1, - 1, - 1, - C_int32.data() + i * K + k, - C + i * K + k, - &C_multiplier, - C_zero_point, - A_zero_point, - &B_zero_point, - &row_offsets[i * K + k], - col_offsets + k, - bias ? 
bias + k : nullptr, - 1); - } - } -}; - -void depthwise_3x3_per_channel_quantization_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - const int32_t* B_zero_point, - const int8_t* B, - const float* C_multiplier, - int32_t C_zero_point, - uint8_t* C, - const int32_t* col_offsets, - const int32_t* bias) { - constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; - - vector<int32_t> C_int32(N * H_OUT * W_OUT * K); - depthwise_3x3_pad_1_ref( - N, H, W, K, stride_h, stride_w, A_zero_point, A, B, C_int32.data()); - - vector<int32_t> row_offsets(N * H_OUT * W_OUT * K); - for (int n = 0; n < N; ++n) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int r = 0; r < R; ++r) { - int h_in = -PAD_T + h * stride_h + r; - for (int s = 0; s < S; ++s) { - int w_in = -PAD_L + w * stride_w + s; - int a = h_in < 0 || h_in >= H || w_in < 0 || w_in >= W - ? A_zero_point - : A[((n * H + h_in) * W + w_in) * K + k]; - sum += a; - } - } - row_offsets[((n * H_OUT + h) * W_OUT + w) * K + k] = sum; - } - } - } - } // for each n - - for (int i = 0; i < N * H_OUT * W_OUT; ++i) { - for (int k = 0; k < K; ++k) { - requantize_u8acc32_ref( - 1, - 1, - 1, - C_int32.data() + i * K + k, - C + i * K + k, - &C_multiplier[k], - C_zero_point, - A_zero_point, - &B_zero_point[k], - &row_offsets[i * K + k], - col_offsets + k, - bias ? 
bias + k : nullptr, - 1); - } - } -}; - -void depthwise_3x3x3_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - const int8_t* B, - int32_t* C) { - constexpr int K_T = 3, K_H = 3, K_W = 3; - constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, - PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; - - for (int n = 0; n < N; ++n) { - for (int t = 0; t < T_OUT; ++t) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int k_t = 0; k_t < K_T; ++k_t) { - int t_in = -PAD_P + t * stride_t + k_t; - for (int k_h = 0; k_h < K_H; ++k_h) { - int h_in = -PAD_T + h * stride_h + k_h; - for (int k_w = 0; k_w < K_W; ++k_w) { - int w_in = -PAD_L + w * stride_w + k_w; - int a = t_in < 0 || t_in >= T || h_in < 0 || h_in >= H || - w_in < 0 || w_in >= W - ? 
A_zero_point - : A[(((n * T + t_in) * H + h_in) * W + w_in) * K + k]; - int b = B[((k * K_T + k_t) * K_H + k_h) * K_W + k_w]; - sum += a * b; - } - } - } - C[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k] = sum; - } - } // w - } // h - } // t - } // for each n -}; - -void depthwise_3x3x3_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - int32_t B_zero_point, - const int8_t* B, - float C_multiplier, - int32_t C_zero_point, - uint8_t* C, - const int32_t* col_offsets, - const int32_t* bias) { - constexpr int K_T = 3, K_H = 3, K_W = 3; - constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, - PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; - - vector<int32_t> C_int32(N * T_OUT * H_OUT * W_OUT * K); - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A, - B, - C_int32.data()); - - vector<int32_t> row_offsets(N * T_OUT * H_OUT * W_OUT * K); - for (int n = 0; n < N; ++n) { - for (int t = 0; t < T_OUT; ++t) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int k_t = 0; k_t < K_T; ++k_t) { - int t_in = -PAD_P + t * stride_t + k_t; - for (int k_h = 0; k_h < K_H; ++k_h) { - int h_in = -PAD_T + h * stride_h + k_h; - for (int k_w = 0; k_w < K_W; ++k_w) { - int w_in = -PAD_L + w * stride_w + k_w; - int a = t_in < 0 || t_in >= T || h_in < 0 || h_in >= H || - w_in < 0 || w_in >= W - ? 
A_zero_point - : A[(((n * T + t_in) * H + h_in) * W + w_in) * K + k]; - sum += a; - } - } - } - row_offsets[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k] = - sum; - } - } // w - } // h - } // t - } // for each n - - for (int i = 0; i < N * T_OUT * H_OUT * W_OUT; ++i) { - for (int k = 0; k < K; ++k) { - requantize_u8acc32_ref( - 1, - 1, - 1, - C_int32.data() + i * K + k, - C + i * K + k, - &C_multiplier, - C_zero_point, - A_zero_point, - &B_zero_point, - &row_offsets[i * K + k], - col_offsets + k, - bias ? bias + k : nullptr, - 1); - } - } -}; - -void depthwise_3x3x3_per_channel_quantization_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - const int32_t* B_zero_point, - const int8_t* B, - const float* C_multiplier, - int32_t C_zero_point, - uint8_t* C, - const int32_t* col_offsets, - const int32_t* bias) { - constexpr int K_T = 3, K_H = 3, K_W = 3; - constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, - PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; - - vector<int32_t> C_int32(N * T_OUT * H_OUT * W_OUT * K); - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A, - B, - C_int32.data()); - - vector<int32_t> row_offsets(N * T_OUT * H_OUT * W_OUT * K); - for (int n = 0; n < N; ++n) { - for (int t = 0; t < T_OUT; ++t) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int k_t = 0; k_t < K_T; ++k_t) { - int t_in = -PAD_P + t * stride_t + k_t; - for (int k_h = 0; k_h < K_H; ++k_h) { - int h_in = -PAD_T + h * stride_h + k_h; - for (int k_w = 0; k_w < K_W; ++k_w) { - int w_in = -PAD_L + w * stride_w + k_w; - int a = t_in < 0 || t_in >= T || h_in < 0 || h_in >= H || - w_in < 0 || w_in >= W - ? 
A_zero_point - : A[(((n * T + t_in) * H + h_in) * W + w_in) * K + k]; - sum += a; - } - } - } - row_offsets[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k] = - sum; - } - } // w - } // h - } // t - } // for each n - - for (int i = 0; i < N * T_OUT * H_OUT * W_OUT; ++i) { - for (int k = 0; k < K; ++k) { - requantize_u8acc32_ref( - 1, - 1, - 1, - C_int32.data() + i * K + k, - C + i * K + k, - &C_multiplier[k], - C_zero_point, - A_zero_point, - &B_zero_point[k], - &row_offsets[i * K + k], - col_offsets + k, - bias ? bias + k : nullptr, - 1); - } - } -}; - template void transposeConvWeights( const conv_param_t<2>& conv_p, const std::int8_t* src, diff --git a/src/RefImplementations.h b/src/RefImplementations.h index 082bdf1..a20e348 100644 --- a/src/RefImplementations.h +++ b/src/RefImplementations.h @@ -215,124 +215,4 @@ FBGEMM_API void im2col_ref( std::int32_t A_zero_point, std::uint8_t* Ao); -/* - * @brief Reference implementation of depthwise convolution with a 3x3 filter - * and padding size 1. - */ -FBGEMM_API void depthwise_3x3_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const std::int8_t* B, - std::int32_t* C); - -/* - * @brief Reference implementation of depthwise convolution with a 3x3 filter - * and padding size 1, followed by requantization. (the same scaling factors and - * zero points for each channel). - */ -FBGEMM_API void depthwise_3x3_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - std::int32_t B_zero_point, - const std::int8_t* B, - float C_multiplier, - std::int32_t C_zero_point, - std::uint8_t* C, - const std::int32_t* col_offsets, - const std::int32_t* bias); - -/* - * @brief Reference implementation of depthwise convolution with a 3x3 filter - * and padding size 1, followed by requantization. (different scaling factors - * and zero points for each channel). 
- */ -FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const std::int32_t* B_zero_point, - const std::int8_t* B, - const float* C_multiplier, - std::int32_t C_zero_point, - std::uint8_t* C, - const std::int32_t* col_offsets, - const std::int32_t* bias); - -/* - * @brief Reference implementation of 3D depthwise convolution with a 3x3x3 - * filter and padding size 1. - */ -FBGEMM_API void depthwise_3x3x3_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const std::int8_t* B, - std::int32_t* C); - -/* - * @brief Reference implementation of 3D depthwise convolution with a 3x3x3 - * filter and padding size 1, followed by requantization. - */ -FBGEMM_API void depthwise_3x3x3_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - std::int32_t B_zero_point, - const std::int8_t* B, - float C_multiplier, - std::int32_t C_zero_point, - std::uint8_t* C, - const std::int32_t* col_offsets, - const std::int32_t* bias); - -FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const std::int32_t* B_zero_point, - const std::int8_t* B, - const float* C_multiplier, - std::int32_t C_zero_point, - std::uint8_t* C, - const std::int32_t* col_offsets, - const std::int32_t* bias); - } // namespace fbgemm diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc index ad74ede..00c39c0 100644 --- a/test/I8DepthwiseTest.cc +++ b/test/I8DepthwiseTest.cc @@ -4,7 +4,6 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of 
this source tree. */ -#include "I8DepthwiseTest.h" #include <cmath> #include <cstdio> @@ -68,6 +67,16 @@ static vector<vector<int>> shapes = { { 1, 8, 4, 4, 1, }, }; + +static vector<vector<int>> shapes_3d = { + // NOTE: clang-format wants to use a different formatting but the current + // formatting should be easier to read. + // N, K, T_in, H_in, W_in, stride + { 1, 32, 16, 28, 28, 1, }, + { 1, 128, 8, 14, 14, 2, }, + { 5, 16, 32, 56, 56, 1, }, + { 1, 8, 4, 4, 4, 1, }, +}; // clang-format on namespace { @@ -107,13 +116,29 @@ TEST_P(FBGemmDepthWiseTest, Test3x3) { int stride_h = shape[4]; int stride_w = stride_h; constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; + int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2, + PAD_R = (S - 1) / 2; + + conv_param_t<2> conv_p( + N, + K, + K, + {H, W}, + K, + {R, S}, + {stride_h, stride_w}, + {PAD_T, PAD_L, PAD_B, PAD_R}); + int H_OUT = conv_p.OUT_DIM[0]; + int W_OUT = conv_p.OUT_DIM[1]; + + int MDim = N * H_OUT * W_OUT; + int KDim = R * S * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * H * W * K); - aligned_vector<int8_t> B(K * R * S); - aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * K), C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = a_symmetric ? 0 : 43; @@ -121,46 +146,52 @@ TEST_P(FBGemmDepthWiseTest, Test3x3) { randFill<int8_t>(B, -16, 16); int32_t B_zero_point = b_symmetric ? 
0 : 5; - depthwise_3x3_pad_1_ref( - N, - H, - W, - K, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); - - float C_multiplier = 255. / (maximum - minimum); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; aligned_vector<int32_t> col_offsets(K); aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3_pad_1_ref( - N, - H, - W, - K, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col; + if (!b_symmetric) { + A_im2col.resize(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); + } + + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + if (!b_symmetric) { + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + } + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3ConvMatrix Bp(K, B.data()); @@ -175,7 +206,7 @@ TEST_P(FBGemmDepthWiseTest, Test3x3) { A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), a_symmetric ? 
nullptr : col_offsets.data(), @@ -222,65 +253,81 @@ TEST_P(FBGemmDepthWiseTest, Test3x3x3) { constexpr int K_T = 3, K_H = 3, K_W = 3; constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; + + conv_param_t<3> conv_p( + N, + K, + K, + {T, H, W}, + K, + {K_T, K_H, K_W}, + {stride_t, stride_h, stride_w}, + {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R}); + int T_OUT = conv_p.OUT_DIM[0]; + int H_OUT = conv_p.OUT_DIM[1]; + int W_OUT = conv_p.OUT_DIM[2]; + + int MDim = N * T_OUT * H_OUT * W_OUT; + int KDim = K_T * K_H * K_W * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * T * H * W * K); - aligned_vector<int8_t> B(K * K_T * K_H * K_W); - aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K), - C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = a_symmetric ? 0 : 43; randFill<int8_t>(B, -16, 16); - int32_t B_zero_point = 5; - - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); + int32_t B_zero_point = b_symmetric ? 0 : 5; - float C_multiplier = 255. 
/ (maximum - minimum); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; aligned_vector<int32_t> col_offsets(K); aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col; + if (!b_symmetric) { + A_im2col.resize(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); + } + + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + if (!b_symmetric) { + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + } + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3x3ConvMatrix Bp(K, B.data()); @@ -297,10 +344,10 @@ TEST_P(FBGemmDepthWiseTest, Test3x3x3) { A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), - col_offsets.data(), + a_symmetric ? 
nullptr : col_offsets.data(), bias.data(), false, /* fuse_relu */ 0, @@ -336,14 +383,29 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { int stride_h = shape[4]; int stride_w = stride_h; constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; + int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2, + PAD_R = (S - 1) / 2; + + conv_param_t<2> conv_p( + N, + K, + K, + {H, W}, + K, + {R, S}, + {stride_h, stride_w}, + {PAD_T, PAD_L, PAD_B, PAD_R}); + int H_OUT = conv_p.OUT_DIM[0]; + int W_OUT = conv_p.OUT_DIM[1]; + + int MDim = N * H_OUT * W_OUT; + int KDim = R * S * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * H * W * K); - aligned_vector<int8_t> B(K * R * S); - int32_t C_num_rows = N * H_OUT * W_OUT; - aligned_vector<int32_t> C_ref(C_num_rows * K), C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -359,28 +421,8 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { B_zero_point[k] = 5 + k; } - depthwise_3x3_pad_1_ref( - N, - H, - W, - K, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - aligned_vector<int32_t> C_ref_transpose(C_ref); - transpose_matrix(C_ref.data(), C_num_rows, K); - vector<float> C_multiplier(K); - for (auto k = 0; k < K; ++k) { - auto C_ref_k_begin = C_ref_transpose.begin() + k * C_num_rows; - auto C_ref_k_end = C_ref_k_begin + C_num_rows; - int32_t minimum = *min_element(C_ref_k_begin, C_ref_k_end); - int32_t maximum = *max_element(C_ref_k_begin, C_ref_k_end); - C_multiplier[k] = 255. 
/ (maximum - minimum); - } + aligned_vector<float> C_multiplier(K); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_point = 5; aligned_vector<int32_t> col_offsets(K); @@ -388,23 +430,38 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3_per_channel_quantization_pad_1_ref( - N, - H, - W, - K, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point.data(), - B.data(), - C_multiplier.data(), - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + // im2col to compute row offset later + vector<int32_t> row_offsets(MDim); + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); + + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data() + g, + C_zero_point, + A_zero_point, + B_zero_point.data() + g, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3ConvMatrix Bp(K, B.data()); @@ -459,14 +516,28 @@ TEST(FBGemmDepthWiseTest, Test3x3x3PerChannelQuantization) { constexpr int K_T = 3, K_H = 3, K_W = 3; constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; + + conv_param_t<3> conv_p( + N, + K, + K, + {T, H, W}, + K, + {K_T, K_H, K_W}, + {stride_t, stride_h, stride_w}, + {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R}); + int T_OUT = conv_p.OUT_DIM[0]; + int 
H_OUT = conv_p.OUT_DIM[1]; + int W_OUT = conv_p.OUT_DIM[2]; + + int MDim = N * T_OUT * H_OUT * W_OUT; + int KDim = K_T * K_H * K_W * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * T * H * W * K); - aligned_vector<int8_t> B(K * K_T * K_H * K_W); - int32_t C_num_rows = N * T_OUT * H_OUT * W_OUT; - aligned_vector<int32_t> C_ref(C_num_rows * K), C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -482,30 +553,8 @@ TEST(FBGemmDepthWiseTest, Test3x3x3PerChannelQuantization) { B_zero_point[k] = 5 + k; } - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - aligned_vector<int32_t> C_ref_transpose(C_ref); - transpose_matrix(C_ref.data(), C_num_rows, K); - vector<float> C_multiplier(K); - for (auto k = 0; k < K; ++k) { - auto C_ref_k_begin = C_ref_transpose.begin() + k * C_num_rows; - auto C_ref_k_end = C_ref_k_begin + C_num_rows; - int32_t minimum = *min_element(C_ref_k_begin, C_ref_k_end); - int32_t maximum = *max_element(C_ref_k_begin, C_ref_k_end); - C_multiplier[k] = 255. 
/ (maximum - minimum); - } + aligned_vector<float> C_multiplier(K); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_point = 5; aligned_vector<int32_t> col_offsets(K); @@ -513,25 +562,38 @@ TEST(FBGemmDepthWiseTest, Test3x3x3PerChannelQuantization) { randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3x3_per_channel_quantization_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point.data(), - B.data(), - C_multiplier.data(), - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); + + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data() + g, + C_zero_point, + A_zero_point, + B_zero_point.data() + g, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3x3ConvMatrix Bp(K, B.data()); diff --git a/test/I8DepthwiseTest.h b/test/I8DepthwiseTest.h deleted file mode 100644 index cc8da9d..0000000 --- a/test/I8DepthwiseTest.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ -#pragma once - -#include <vector> - -namespace fbgemm { - -// From ResNeXt-3D-101 -// clang-format off -static std::vector<std::vector<int>> shapes_3d = { - // NOTE: clang-format wants to use a different formatting but the current - // formatting should be easier to read. - // N, K, T_in, H_in, W_in, stride - { 1, 64, 32, 56, 56, 1, }, - { 1, 128, 16, 28, 28, 1, }, - { 1, 256, 8, 14, 14, 1, }, - { 1, 512, 4, 7, 7, 1, }, - - { 1, 128, 32, 56, 56, 2, }, - { 1, 256, 16, 28, 28, 2, }, - { 1, 512, 8, 14, 14, 2, }, - - { 5, 64, 32, 56, 56, 1, }, - { 5, 128, 16, 28, 28, 1, }, - { 5, 256, 8, 14, 14, 1, }, - { 5, 512, 4, 7, 7, 1, }, - - { 5, 128, 32, 56, 56, 2, }, - { 5, 256, 16, 28, 28, 2, }, - { 5, 512, 8, 14, 14, 2, }, - - { 1, 8, 4, 4, 4, 1, }, -}; -// clang-format on - -} // namespace fbgemm |