fbgemmPacked and fbgemmConv APIs with float bias + tests

Summary: Change the fbgemmPacked and fbgemmConv APIs to accept a float bias.

Reviewed By: jianyuh

Differential Revision: D17244262

fbshipit-source-id: 0531c829190d20e31cb957a3f1861d4a65645cee
author: Daya Khudia <dskhudia@fb.com> 2019-09-11 21:47:58 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-09-11 21:52:07 +0300
commit: ea787e8278744ab4c7d6c4ee42a050bb1c76ef88 (patch)
tree: 3846cfc169c2333cfa23f519f88565b0e0b40416 /src
parent: 637288bff9972c02e72341d6a60fdf9bab1dce7e (diff)
4 files changed, 87 insertions, 53 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 0a4ff55..4ae1b50 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -315,19 +315,23 @@ void ExecuteKernel<
 
 ////////////////////////////////////////////////////////////////////////////////
 // ReQuantizeOutput
-#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \
-  template class ExecuteKernel<                               \
-      PACK_A<uint8_t, ACC_T>,                                 \
-      PackBMatrix<int8_t, ACC_T>,                             \
-      uint8_t,                                                \
-      ReQuantizeOutput<RELU, Q_GRAN>>;
+#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, BIAS_TYPE) \
+  template class ExecuteKernel<                                          \
+      PACK_A<uint8_t, ACC_T>,                                            \
+      PackBMatrix<int8_t, ACC_T>,                                        \
+      uint8_t,                                                           \
+      ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>>;
+
+#define INSTANTIATE_REQUANT_BIAS_T(PACK_A, ACC_T, RELU, Q_GRAN) \
+  INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, float); \
+  INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, int32_t);
 
 #define INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, RELU)     \
-  INSTANTIATE_REQUANT_BASE(                                  \
+  INSTANTIATE_REQUANT_BIAS_T(                                \
       PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
-  INSTANTIATE_REQUANT_BASE(                                  \
+  INSTANTIATE_REQUANT_BIAS_T(                                \
       PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP);  \
-  INSTANTIATE_REQUANT_BASE(                                  \
+  INSTANTIATE_REQUANT_BIAS_T(                                \
       PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
 
 #define INSTANTIATE_REQUANT_RELU(PACK_A, ACC_T)      \
@@ -344,21 +348,27 @@ INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset);
 #undef INSTANTIATE_REQUANT_ACC_T
 #undef INSTANTIATE_REQUANT_RELU
 #undef INSTANTIATE_REQUANT_Q_GRANS
+#undef INSTANTIATE_REQUANT_BIAS_T
 #undef INSTANTIATE_REQUANT_BASE
 
-#define INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
-  template class ExecuteKernel<                                           \
-      PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>,                       \
-      PackBMatrix<int8_t, ACC_T>,                                         \
-      uint8_t,                                                            \
-      ReQuantizeOutput<RELU, Q_GRAN>>;
+#define INSTANTIATE_IM2COL_REQUANT_BASE(            \
+    ACC_T, RELU, SPATIAL_DIM, Q_GRAN, BIAS_TYPE)    \
+  template class ExecuteKernel<                     \
+      PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+      PackBMatrix<int8_t, ACC_T>,                   \
+      uint8_t,                                      \
+      ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>>;
+
+#define INSTANTIATE_IM2COL_REQUANT_BIAS_T(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+  INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, float); \
+  INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, int32_t);
 
 #define INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
-  INSTANTIATE_IM2COL_REQUANT_BASE(                                   \
+  INSTANTIATE_IM2COL_REQUANT_BIAS_T(                                 \
       ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR);    \
-  INSTANTIATE_IM2COL_REQUANT_BASE(                                   \
+  INSTANTIATE_IM2COL_REQUANT_BIAS_T(                                 \
       ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP);     \
-  INSTANTIATE_IM2COL_REQUANT_BASE(                                   \
+  INSTANTIATE_IM2COL_REQUANT_BIAS_T(                                 \
       ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
 
 #define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \
@@ -375,6 +385,7 @@ INSTANTIATE_IM2COL_REQUANT_RELU(int16_t);
 #undef INSTANTIATE_IM2COL_REQUANT_RELU
 #undef INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM
 #undef INSTANTIATE_IM2COL_REQUANT_Q_GRANS
+#undef INSTANTIATE_IM2COL_REQUANT_BIAS_T
 #undef INSTANTIATE_IM2COL_REQUANT_BASE
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index 4f7026f..ade851f 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -237,22 +237,26 @@ bool fbgemmSupportedCPU() {
 
 ////////////////////////////////////////////////////////////////////////////////
 // ReQuantizeOutput
-#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN)               \
+#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, BIAS_TYPE)    \
   template void fbgemmPacked(                                       \
       PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA,    \
       PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
       uint8_t* C,                                                   \
       int32_t* C_buffer,                                            \
       uint32_t ldc,                                                 \
-      const ReQuantizeOutput<RELU, Q_GRAN>& outProcess,             \
+      const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess,  \
       int thread_id,                                                \
       int num_threads,                                              \
       const BlockingFactors* blocking_params);
 
-#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU)                          \
-  INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
-  INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP);  \
-  INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+#define INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, Q_GRAN) \
+  INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, float); \
+  INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, int32_t);
+
+#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU)                            \
+  INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
 
 #define INSTANTIATE_RELU(PACK_A, ACC_T)      \
   INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \
@@ -268,27 +272,34 @@ INSTANTIATE_ACC_T(PackAWithRowOffset);
 #undef INSTANTIATE_ACC_T
 #undef INSTANTIATE_RELU
 #undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BIAS_T
 #undef INSTANTIATE_BASE
 
-#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN)          \
-  template void fbgemmPacked(                                       \
-      PackMatrix<                                                   \
-          PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>,             \
-          uint8_t,                                                  \
-          ACC_T>& packA,                                            \
-      PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
-      uint8_t* C,                                                   \
-      int32_t* C_buffer,                                            \
-      uint32_t ldc,                                                 \
-      const ReQuantizeOutput<RELU, Q_GRAN>& outProcess,             \
-      int thread_id,                                                \
-      int num_threads,                                              \
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, BIAS_TYPE) \
+  template void fbgemmPacked(                                         \
+      PackMatrix<                                                     \
+          PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>,               \
+          uint8_t,                                                    \
+          ACC_T>& packA,                                              \
+      PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB,   \
+      uint8_t* C,                                                     \
+      int32_t* C_buffer,                                              \
+      uint32_t ldc,                                                   \
+      const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess,    \
+      int thread_id,                                                  \
+      int num_threads,                                                \
       const BlockingFactors* blocking_params);
 
-#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM)                          \
-  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
-  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP);  \
-  INSTANTIATE_BASE(                                                            \
+#define INSTANTIATE_BIAS_T(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, float); \
+  INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, int32_t);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM)             \
+  INSTANTIATE_BIAS_T(                                             \
+      ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+  INSTANTIATE_BIAS_T(                                             \
+      ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP);  \
+  INSTANTIATE_BIAS_T(                                             \
       ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
 
 #define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
@@ -305,6 +316,7 @@ INSTANTIATE_RELU(int16_t);
 #undef INSTANTIATE_RELU
 #undef INSTANTIATE_SPATIAL_DIM
 #undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BIAS_T
 #undef INSTANTIATE_BASE
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc
index 6a1e55b..5a486c0 100644
--- a/src/FbgemmConv.cc
+++ b/src/FbgemmConv.cc
@@ -89,6 +89,7 @@ int fbgemmConv(
       // std::cout << "Depthwise fast path" << std::endl;
       const std::int32_t* B_zero_point = outProcess.getBZeroPoint();
       const float* C_multiplier = outProcess.getCMultiplier();
+      const float* act_times_w_scale = outProcess.getActWScale();
       if (SPATIAL_DIM == 3) {
         static_assert(
             std::is_same<typename processOutputType::outType, std::uint8_t>::
@@ -115,7 +116,7 @@ int fbgemmConv(
               outProcess.getColOffsets(),
               outProcess.getBias(),
               outProcess.RELU_FUSED, // fuse_relu
-              1.0f, // act_scale * weight_scale
+              act_times_w_scale ? act_times_w_scale[0] : 1.0f,
               thread_id,
               num_threads);
         } else if (
@@ -141,7 +142,7 @@ int fbgemmConv(
               outProcess.getColOffsets(),
               outProcess.getBias(),
               outProcess.RELU_FUSED, // fuse_relu
-              nullptr, // act_scale * weight_scale
+              outProcess.getActWScale(), // act_scale * weight_scale
               thread_id,
               num_threads);
         } else {
@@ -169,7 +170,7 @@ int fbgemmConv(
               outProcess.getColOffsets(),
               outProcess.getBias(),
               outProcess.RELU_FUSED, // fuse_relu
-              1.0f, // act_scale * weight_scale
+              act_times_w_scale ? act_times_w_scale[0] : 1.0f,
               thread_id,
               num_threads);
         } else if (
@@ -194,7 +195,7 @@ int fbgemmConv(
               outProcess.getColOffsets(),
               outProcess.getBias(),
               outProcess.RELU_FUSED, // fuse_relu
-              nullptr, // act_scale * weight_scale
+              outProcess.getActWScale(), // act_scale * weight_scale
               thread_id,
               num_threads);
         } else {
@@ -316,21 +317,25 @@ int fbgemmConv(
   return 0;
 }
 
-#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM)                 \
+#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, BIAS_TYPE)      \
   template int fbgemmConv(                                                 \
       const conv_param_t<SPATIAL_DIM>& conv_p,                             \
       const std::uint8_t* activations,                                     \
       PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, \
       std::uint8_t* out,                                                   \
       std::int32_t* outBuffer,                                             \
-      ReQuantizeOutput<RELU, Q_GRAN>& outProcess,                          \
+      ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess,               \
       int thread_id,                                                       \
       int num_threads,                                                     \
       const BlockingFactors* blocking_params);
 
+#define INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \
+  INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, float); \
+  INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t);
+
 #define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \
-  INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 2);          \
-  INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 3);
+  INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2);        \
+  INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3);
 
 #define INSTANTIATE_RELU(ACC_T, Q_GRAN)         \
   INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, true); \
@@ -346,6 +351,7 @@ INSTANTIATE_Q_GRANS(std::int32_t);
 #undef INSTANTIATE_Q_GRANS
 #undef INSTANTIATE_RELU
 #undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BIAS_T
 #undef INSTANTIATE_BASE
 
 template bool takeDepthWiseFastPath<2, std::int32_t>(
diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc
index 40f3fba..4ba3549 100644
--- a/src/GroupwiseConvAcc32Avx2.cc
+++ b/src/GroupwiseConvAcc32Avx2.cc
@@ -2204,7 +2204,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) {
 template int rowOffsetBufferSizeGConv<2>(const conv_param_t<2>& conv_param);
 template int rowOffsetBufferSizeGConv<3>(const conv_param_t<3>& conv_param);
 
-#define INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM)                           \
+#define INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, BIAS_TYPE)                \
   template void fbgemmGroupwiseConv(                                          \
       const conv_param_t<SPATIAL_DIM>& conv_param,                            \
       const uint8_t* activations,                                             \
@@ -2213,13 +2213,17 @@ template int rowOffsetBufferSizeGConv<3>(const conv_param_t<3>& conv_param);
       PackWeightMatrixForGConv<int8_t, int32_t, SPATIAL_DIM>& packed_weights, \
       uint8_t* out,                                                           \
       int32_t* outBuffer,                                                     \
-      const ReQuantizeOutput<RELU, Q_GRAN>& outProcess,                       \
+      const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess,            \
       int thread_id,                                                          \
       int num_threads);
 
+#define INSTANTIATE_BIAS_T(RELU, Q_GRAN, SPATIAL_DIM) \
+  INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, float); \
+  INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, int32_t);
+
 #define INSTANTIATE_SPATIAL_DIM(RELU, Q_GRAN) \
-  INSTANTIATE_BASE(RELU, Q_GRAN, 2);          \
-  INSTANTIATE_BASE(RELU, Q_GRAN, 3);
+  INSTANTIATE_BIAS_T(RELU, Q_GRAN, 2);        \
+  INSTANTIATE_BIAS_T(RELU, Q_GRAN, 3);
 
 #define INSTANTIATE_Q_GRANS(RELU)                                 \
   INSTANTIATE_SPATIAL_DIM(RELU, QuantizationGranularity::TENSOR); \
@@ -2231,6 +2235,7 @@ INSTANTIATE_Q_GRANS(true);
 
 #undef INSTANTIATE_Q_GRANS
 #undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BIAS_T
 #undef INSTANTIATE_BASE
 
 template void fbgemmGroupwiseConv(
author	Daya Khudia <dskhudia@fb.com>	2019-09-11 21:47:58 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-09-11 21:52:07 +0300
commit	ea787e8278744ab4c7d6c4ee42a050bb1c76ef88 (patch)
tree	3846cfc169c2333cfa23f519f88565b0e0b40416 /src
parent	637288bff9972c02e72341d6a60fdf9bab1dce7e (diff)