diff options
Diffstat (limited to 'src/FbgemmConv.cc')
-rw-r--r-- | src/FbgemmConv.cc | 247 |
1 file changed, 200 insertions, 47 deletions
diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc index 5db63f6..de833d2 100644 --- a/src/FbgemmConv.cc +++ b/src/FbgemmConv.cc @@ -6,8 +6,9 @@ */ #include <algorithm> -#include <iostream> +#include <numeric> #include <vector> +#include <functional> #include "fbgemm/Fbgemm.h" namespace fbgemm { @@ -33,12 +34,24 @@ bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) { }); } +template <int SPATIAL_DIM> +bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) { + return std::accumulate(conv_p.K.begin(), conv_p.K.end(), 0) == SPATIAL_DIM && + std::accumulate(conv_p.stride.begin(), conv_p.stride.end(), 0) == + SPATIAL_DIM && + std::accumulate(conv_p.dilation.begin(), conv_p.dilation.end(), 0) == + SPATIAL_DIM && + std::accumulate(conv_p.pad.begin(), conv_p.pad.end(), 0) == 0; +} + template <int SPATIAL_DIM, typename ACC_T> optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) { if (takeDepthWiseFastPath<SPATIAL_DIM, ACC_T>(conv_p)) { return optimized_conv_t::depthwise; } else if (fbgemmOptimizedGConv<SPATIAL_DIM>(conv_p)) { return optimized_conv_t::groupwise; + } else if (takePointWiseFastPath<SPATIAL_DIM>(conv_p)) { + return optimized_conv_t::pointwise; } else { return optimized_conv_t::im2col; } @@ -58,58 +71,139 @@ int fbgemmConv( static_assert( SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "Only 2D and 3D convolutions are supported"); + + if (!packed_weights.isPackingCompliant(conv_p)) { + std::string msg = + "[FBGEMM_CONV_ERROR] Convolution parameters " + "mismatch between pre-packed weights and conv invocation! 
"; + msg += packed_weights.mismatchingParams(conv_p); + msg += std::string( + " Please pack weights using the same parameters " + "with which convolution operation is invoked!"); + throw std::logic_error(msg); + } + switch (ConvFastPath<SPATIAL_DIM, ACC_T>(conv_p)) { case optimized_conv_t::depthwise: { // 2D and 3D depthwise fast path // std::cout << "Depthwise fast path" << std::endl; const std::int32_t* B_zero_point = outProcess.getBZeroPoint(); const float* C_multiplier = outProcess.getCMultiplier(); + const float* act_times_w_scale = outProcess.getActWScale(); if (SPATIAL_DIM == 3) { static_assert( std::is_same<typename processOutputType::outType, std::uint8_t>:: value, "For depthwise, only requantized output is supported"); - depthwise_3x3x3_pad_1( - conv_p.MB, // mini batch - conv_p.IN_DIM[0], // T - conv_p.IN_DIM[1], // H - conv_p.IN_DIM[2], // W - conv_p.OC, // output channels - conv_p.stride[0], // stride_t - conv_p.stride[1], // stride_h - conv_p.stride[2], // stride_w - outProcess.getAZeroPoint(), - activations, - B_zero_point[0], - *(packed_weights.getPackedWFor3DDW()), - C_multiplier[0], - outProcess.getCZeroPoint(), - out, - outProcess.getColOffsets(), - outProcess.getBias(), - outProcess.RELU_FUSED, // fuse_relu - thread_id, - num_threads); + + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + depthwise_3x3x3_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // T + conv_p.IN_DIM[1], // H + conv_p.IN_DIM[2], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_t + conv_p.stride[1], // stride_h + conv_p.stride[2], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point[0], + *(packed_weights.getPackedWForDepthwise()), + C_multiplier[0], + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + act_times_w_scale ? 
act_times_w_scale[0] : 1.0f, + thread_id, + num_threads); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL || + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + depthwise_3x3x3_per_channel_quantization_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // T + conv_p.IN_DIM[1], // H + conv_p.IN_DIM[2], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_t + conv_p.stride[1], // stride_h + conv_p.stride[2], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point, + *(packed_weights.getPackedWForDepthwise()), + C_multiplier, + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + outProcess.getActWScale(), // act_scale * weight_scale + thread_id, + num_threads); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } } else { - depthwise_3x3_pad_1( - conv_p.MB, // mini batch - conv_p.IN_DIM[0], // H - conv_p.IN_DIM[1], // W - conv_p.OC, // output channels - conv_p.stride[0], // stride_h - conv_p.stride[1], // stride_w - outProcess.getAZeroPoint(), - activations, - B_zero_point[0], - *(packed_weights.getPackedWFor2DDW()), - C_multiplier[0], - outProcess.getCZeroPoint(), - out, - outProcess.getColOffsets(), - outProcess.getBias(), - outProcess.RELU_FUSED, // fuse_relu - thread_id, - num_threads); + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + depthwise_3x3_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // H + conv_p.IN_DIM[1], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_h + conv_p.stride[1], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point[0], + *(packed_weights.getPackedWForDepthwise()), + C_multiplier[0], + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + 
act_times_w_scale ? act_times_w_scale[0] : 1.0f, + thread_id, + num_threads); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL || + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + // The number of channels == groups for depthwise convolutions + depthwise_3x3_per_channel_quantization_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // H + conv_p.IN_DIM[1], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_h + conv_p.stride[1], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point, + *(packed_weights.getPackedWForDepthwise()), + C_multiplier, + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + outProcess.getActWScale(), // act_scale * weight_scale + thread_id, + num_threads); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } } break; } @@ -134,14 +228,68 @@ int fbgemmConv( num_threads); break; } + case optimized_conv_t::pointwise: { + std::vector<int32_t> row_offset_buf( + PackAWithRowOffset<uint8_t>::rowOffsetBufferSize(blocking_params)); + int image_dim = std::accumulate( + conv_p.IN_DIM.begin(), + conv_p.IN_DIM.end(), + 1, + std::multiplies<int>()); + PackAWithRowOffset<uint8_t, ACC_T> packA( + matrix_op_t::NoTranspose, + conv_p.MB * image_dim, + conv_p.IC, + activations, + conv_p.IC, + nullptr, + conv_p.G, + row_offset_buf.data(), + blocking_params); + + outProcess.setRowOffsets(row_offset_buf.data()); + fbgemmPacked( + packA, + *(packed_weights.getPackedWForPointwise()), + out, + outBuffer, + conv_p.OC, + outProcess, + thread_id, + num_threads, + blocking_params); + break; + } case optimized_conv_t::im2col: { // All other convolutions go through im2col-based implementation // std::cout << "Im2col path" << std::endl; std::vector<int32_t> row_offset_buf( - PackAWithIm2Col<uint8_t, ACC_T, 
SPATIAL_DIM>::rowOffsetBufferSize()); + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>::rowOffsetBufferSize( + blocking_params)); const std::int32_t* b_zero_point = outProcess.getBZeroPoint(); - bool b_symmetric = b_zero_point[0] == 0; + bool b_symmetric = false; + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + b_symmetric = b_zero_point[0] == 0; + } else if ( + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + b_symmetric = + std::all_of(b_zero_point, b_zero_point + conv_p.G, [](int i) { + return i == 0; + }); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL) { + b_symmetric = + std::all_of(b_zero_point, b_zero_point + conv_p.OC, [](int i) { + return i == 0; + }); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM> packA( conv_p, activations, @@ -169,21 +317,25 @@ int fbgemmConv( return 0; } -#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \ +#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, BIAS_TYPE) \ template int fbgemmConv( \ const conv_param_t<SPATIAL_DIM>& conv_p, \ const std::uint8_t* activations, \ PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, \ std::uint8_t* out, \ std::int32_t* outBuffer, \ - ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ int thread_id, \ int num_threads, \ const BlockingFactors* blocking_params); +#define INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, float); \ + INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t); + #define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \ - INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 2); \ - INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 3); + INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2); \ + INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3); 
#define INSTANTIATE_RELU(ACC_T, Q_GRAN) \ INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, true); \ @@ -199,6 +351,7 @@ INSTANTIATE_Q_GRANS(std::int32_t); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_RELU #undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE template bool takeDepthWiseFastPath<2, std::int32_t>( |