diff options
author | Daya Khudia <dskhudia@fb.com> | 2019-08-21 02:52:53 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-08-21 02:58:08 +0300 |
commit | 280fa17349b763eb474c423a6d1172f81df29103 (patch) | |
tree | 061974b09a412e0025c11a456451c81efe7e4ab0 | |
parent | a6d1d3eed7ba858d4532fc297b7a4ee984e6e7e3 (diff) |
Per channel support in fbgemmConv (#119)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/119
Some paths in fbgemmConv were missing support for per-channel quantization. This diff adds support for per-channel as well as groupwise quantization.
Reviewed By: jianyuh
Differential Revision: D16894740
fbshipit-source-id: 43a2c08d1c8d1b01775f875224774c39fae280bc
-rw-r--r-- | include/fbgemm/Fbgemm.h | 1 | ||||
-rw-r--r-- | src/FbgemmConv.cc | 171 | ||||
-rw-r--r-- | test/UniConvTest.cc | 269 |
3 files changed, 398 insertions, 43 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index 543f1cb..0b7bf1f 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -1134,6 +1134,7 @@ template < class FBGEMM_API ReQuantizeOutput { public: static constexpr int RELU_FUSED = FUSE_RELU; + static constexpr QuantizationGranularity QGRANType = Q_GRAN; using outType = outT; using inpType = inT; /** diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc index 33d1535..164411d 100644 --- a/src/FbgemmConv.cc +++ b/src/FbgemmConv.cc @@ -94,47 +94,111 @@ int fbgemmConv( std::is_same<typename processOutputType::outType, std::uint8_t>:: value, "For depthwise, only requantized output is supported"); - depthwise_3x3x3_pad_1( - conv_p.MB, // mini batch - conv_p.IN_DIM[0], // T - conv_p.IN_DIM[1], // H - conv_p.IN_DIM[2], // W - conv_p.OC, // output channels - conv_p.stride[0], // stride_t - conv_p.stride[1], // stride_h - conv_p.stride[2], // stride_w - outProcess.getAZeroPoint(), - activations, - B_zero_point[0], - *(packed_weights.getPackedWFor3DDW()), - C_multiplier[0], - outProcess.getCZeroPoint(), - out, - outProcess.getColOffsets(), - outProcess.getBias(), - outProcess.RELU_FUSED, // fuse_relu - thread_id, - num_threads); + + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + depthwise_3x3x3_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // T + conv_p.IN_DIM[1], // H + conv_p.IN_DIM[2], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_t + conv_p.stride[1], // stride_h + conv_p.stride[2], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point[0], + *(packed_weights.getPackedWFor3DDW()), + C_multiplier[0], + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + thread_id, + num_threads); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL || + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + 
depthwise_3x3x3_per_channel_quantization_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // T + conv_p.IN_DIM[1], // H + conv_p.IN_DIM[2], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_t + conv_p.stride[1], // stride_h + conv_p.stride[2], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point, + *(packed_weights.getPackedWFor3DDW()), + C_multiplier, + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + thread_id, + num_threads); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } } else { - depthwise_3x3_pad_1( - conv_p.MB, // mini batch - conv_p.IN_DIM[0], // H - conv_p.IN_DIM[1], // W - conv_p.OC, // output channels - conv_p.stride[0], // stride_h - conv_p.stride[1], // stride_w - outProcess.getAZeroPoint(), - activations, - B_zero_point[0], - *(packed_weights.getPackedWFor2DDW()), - C_multiplier[0], - outProcess.getCZeroPoint(), - out, - outProcess.getColOffsets(), - outProcess.getBias(), - outProcess.RELU_FUSED, // fuse_relu - thread_id, - num_threads); + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + depthwise_3x3_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // H + conv_p.IN_DIM[1], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_h + conv_p.stride[1], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point[0], + *(packed_weights.getPackedWFor2DDW()), + C_multiplier[0], + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + thread_id, + num_threads); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL || + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + // The number of channels == groups for depthwise convolutions + 
depthwise_3x3_per_channel_quantization_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // H + conv_p.IN_DIM[1], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_h + conv_p.stride[1], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point, + *(packed_weights.getPackedWFor2DDW()), + C_multiplier, + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + thread_id, + num_threads); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } } break; } @@ -195,11 +259,32 @@ int fbgemmConv( // All other convolutions go through im2col-based implementation // std::cout << "Im2col path" << std::endl; std::vector<int32_t> row_offset_buf( - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM> - ::rowOffsetBufferSize(blocking_params)); + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>::rowOffsetBufferSize( + blocking_params)); const std::int32_t* b_zero_point = outProcess.getBZeroPoint(); - bool b_symmetric = b_zero_point[0] == 0; + bool b_symmetric = false; + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + b_symmetric = b_zero_point[0] == 0; + } else if ( + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + b_symmetric = + std::all_of(b_zero_point, b_zero_point + conv_p.G, [](int i) { + return i == 0; + }); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL) { + b_symmetric = + std::all_of(b_zero_point, b_zero_point + conv_p.OC, [](int i) { + return i == 0; + }); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM> packA( conv_p, activations, diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc index 91bf578..ce26442 100644 --- a/test/UniConvTest.cc +++ b/test/UniConvTest.cc @@ 
-21,6 +21,55 @@ using namespace std; using namespace fbgemm; +vector<QuantizationGranularity> qGranularityVals{ + QuantizationGranularity::TENSOR, + QuantizationGranularity::GROUP, + QuantizationGranularity::OUT_CHANNEL}; + +static vector<conv_param_t<>> GetShapes_() { + vector<conv_param_t<>> shapes = { + // MB, IC, OC, {IH, IW}, G, {KH, KW}, {stride_h, stride_w}, {pad_t, pad_l, + // pad_b, pad_r} + // Regular + conv_param_t<>(1, 16, 16, {10, 30}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 16, 32, {30, 10}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 1}, {2, 1, 2, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 1}, {1, 2, 1, 2}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {2, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 16, 16, {10, 30}, 1, {3, 5}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 16, 16, {10, 30}, 1, {5, 3}, {1, 1}, {1, 1, 1, 1}), + // groupwise + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 8, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 16, 32, {10, 30}, 8, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {2, 2}, {2, 1, 2, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {1, 2}, {2, 1, 2, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {2, 1}, {2, 1, 2, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 5}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {5, 3}, {1, 1}, {1, 1, 1, 1}), + // DW + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 
3}, {1, 1}, {1, 2, 1, 2}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {2, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {1, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 5}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {5, 3}, {1, 1}, {1, 1, 1, 1}), + // Pointwise + conv_param_t<>(1, 32, 32, {10, 30}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}), + conv_param_t<>(1, 16, 32, {10, 30}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {2, 2}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {1, 2}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {2, 1}, {0, 0, 0, 0}), + }; + return shapes; +} + namespace { // tuple represents MB, IC, OC, IT, IH, IW, KH/KW, stride, pad @@ -28,8 +77,12 @@ class uniConvTest : public testing::TestWithParam< tuple<int, int, int, int, int, int, int, int, int, int>> {}; +class UniConvQGranTest : public testing::TestWithParam< + tuple<QuantizationGranularity, bool, bool>> {}; + }; // namespace +// Combine only allows at most 10 generators. INSTANTIATE_TEST_CASE_P( InstantiationName, uniConvTest, @@ -45,6 +98,13 @@ INSTANTIATE_TEST_CASE_P( ::testing::ValuesIn({1, 2}), // stride ::testing::ValuesIn({0, 1, 2}))); // pad +INSTANTIATE_TEST_CASE_P( + InstantiationName, + UniConvQGranTest, + ::testing::Combine( + ::testing::ValuesIn(qGranularityVals), + ::testing::Bool(), // A symmetric + ::testing::Bool())); // B symmetric /** * Test for conv packing */ @@ -323,3 +383,212 @@ TEST(uniConvTest, cornerCases) { 0, 1); } + +/** + * @brief Unit test for uint8 activations, int8 weights, and 32-bit + * accumulation. 
Output processing: requantization -> nothing + */ +TEST_P(UniConvQGranTest, requantizeTest) { + vector<conv_param_t<>> shapes(GetShapes_()); + QuantizationGranularity q_granularity; + bool a_symmetric, b_symmetric; + tie(q_granularity, a_symmetric, b_symmetric) = GetParam(); + + for (auto conv_p : shapes) { + int R = conv_p.K[0]; + int S = conv_p.K[1]; + int G = conv_p.G; + int OC = conv_p.OC; + int OH = conv_p.OUT_DIM[0]; + int OW = conv_p.OUT_DIM[1]; + int IC_per_G = conv_p.IC / conv_p.G; + int OC_per_G = conv_p.OC / conv_p.G; + + // activations + aligned_vector<uint8_t> Aint8( + conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0); + + // weights + // The weight matrix is in layout G K/G (R S C/G) + aligned_vector<int8_t> Bint8(R * S * conv_p.G * IC_per_G * OC_per_G, 0); + aligned_vector<int8_t> Bint8_tr(R * S * G * IC_per_G * OC_per_G, 0); + + aligned_vector<int32_t> Cint32_ref(conv_p.MB * OH * OW * OC, 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); + + randFill<uint8_t>(Aint8, 0, 5); + int32_t Aint8_zero_point = a_symmetric ? 0 : 4; + + randFill<int8_t>(Bint8, -4, 4); + + // computing column offset + vector<int32_t> col_offsets(G * OC_per_G); + + int ncols_per_quant_group = G * OC_per_G; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = OC_per_G; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + + aligned_vector<int32_t> Bint8_zero_point( + G * OC_per_G / ncols_per_quant_group); + if (b_symmetric) { + randFill(Bint8_zero_point, -3, 3); + } else { + randFill(Bint8_zero_point, 0, 0); + } + + // matrix dimensions after im2col for each GEMM. 
+ // For each group, there is one GEMM of the following dimensions + int MDim = conv_p.MB * OH * OW; + int NDim = OC_per_G; + int KDim = R * S * IC_per_G; + + vector<uint8_t> Aint8_im2col(MDim * KDim * G); + im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); + + vector<int32_t> row_offsets(MDim); + + aligned_vector<float> C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2); + int32_t C_zero_pt = 5; + + // reference implementation + // conv_ref expects weights to be in G (R S C/G) K/G + int8_t* rightBData = Bint8.data(); + transposeConvWeights(conv_p, Bint8.data(), Bint8_tr.data()); + rightBData = Bint8_tr.data(); + for (int g = 0; g < G; ++g) { + col_offsets_with_zero_pt_s8acc32_ref( + R * S * IC_per_G, + OC_per_G, + OC_per_G, + rightBData + g * R * S * IC_per_G * OC_per_G, + Bint8_zero_point.data() + g * OC_per_G / ncols_per_quant_group, + col_offsets.data() + g * OC_per_G, + ncols_per_quant_group); + } + conv_ref( + conv_p, Aint8.data(), Aint8_zero_point, rightBData, Cint32_ref.data()); + + for (int g = 0; g < G; ++g) { + row_offsets_u8acc32_ref( + MDim, + KDim, + KDim * G, + Aint8_im2col.data() + g * KDim, + row_offsets.data()); + + requantize_u8acc32_ref( + MDim, + NDim, + G * NDim, + Cint32_ref.data() + g * NDim, + Cint8_ref.data() + g * NDim, + C_multiplier.data() + g * NDim / ncols_per_quant_group, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, + row_offsets.data(), + col_offsets.data() + g * NDim, + nullptr, + ncols_per_quant_group); + } + + PackWeightsForConv<2> packedWeights(conv_p, Bint8.data()); + + // TODO: Uncomment once we support multiple threads in fbgemmGroupwiseConv + // #ifdef _OPENMP + // #pragma omp parallel + // #endif + { + vector<int32_t> row_offset_buf(rowOffsetBufferSizeGConv(conv_p)); + + DoNothing<> doNothingObj{}; + + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); + + if (q_granularity == 
QuantizationGranularity::TENSOR) { + ReQuantizeOutput<false, QuantizationGranularity::TENSOR> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + nullptr, + G * NDim, + G); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + + } else if (q_granularity == QuantizationGranularity::GROUP) { + ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + nullptr, + G * NDim, + G); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + + } else { + ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + nullptr, + G * NDim, + G); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + } + } // omp parallel + + compare_validate_buffers( + Cint8_ref.data(), + Cint8_fb.data(), + MDim, + NDim * G, + NDim * G, + static_cast<uint8_t>(0)); + } // for each shape +} |