Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jongsoo Park <jongsoo@fb.com>, 2019-02-13 02:50:07 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com>, 2019-02-13 02:55:31 +0300
commit: c4e50e0ed5db7c9aadb6df47faa4227f5286293c (patch)
tree: 963e416174ca1a3d486633c9e61f72760ce025ca /src/ExecuteKernelU8S8.cc
parent: 66df1a0ccd762e525e319cb579810deade551152 (diff)
no need to subtract col offset if a_zp is 0 (#69)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/69
This diff prepares for D14013931, which folds column offsets into bias. In depthwise convolution, we allow passing column_offsets == nullptr, which means the column offsets are folded into bias. We bypass adding column_offset * A_zero_point if either column_offset == nullptr or A_zero_point == 0.
Reviewed By: jianyuh
Differential Revision: D14017772
fbshipit-source-id: ad4a79402f43cbf78dbad68e1bff6d07c19dded0
Diffstat (limited to 'src/ExecuteKernelU8S8.cc')
-rw-r--r-- src/ExecuteKernelU8S8.cc 286
1 files changed, 151 insertions, 135 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index e7f7c70..cdceb63 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -243,116 +243,127 @@ void ExecuteKernel<
////////////////////////////////////////////////////////////////////////////////
// ReQuantizeOutput
-#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \
- template class ExecuteKernel< \
- PACK_A<uint8_t, ACC_T>, \
- PackBMatrix<int8_t, ACC_T>, \
- uint8_t, \
+#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
ReQuantizeOutput<RELU, Q_GRAN>>;
-#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \
- INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
-
-#define INSTANTIATE_RELU(PACK_A, ACC_T) \
- INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \
- INSTANTIATE_Q_GRANS(PACK_A, ACC_T, true);
-
-#define INSTANTIATE_ACC_T(PACK_A) \
- INSTANTIATE_RELU(PACK_A, int32_t); \
- INSTANTIATE_RELU(PACK_A, int16_t);
-
-INSTANTIATE_ACC_T(PackAMatrix);
-INSTANTIATE_ACC_T(PackAWithRowOffset);
-
-#undef INSTANTIATE_ACC_T
-#undef INSTANTIATE_RELU
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
-
-#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
- template class ExecuteKernel< \
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
- PackBMatrix<int8_t, ACC_T>, \
- uint8_t, \
+#define INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, RELU) \
+ INSTANTIATE_REQUANT_BASE( \
+ PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_REQUANT_BASE( \
+ PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_REQUANT_BASE( \
+ PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_REQUANT_RELU(PACK_A, ACC_T) \
+ INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, false); \
+ INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, true);
+
+#define INSTANTIATE_REQUANT_ACC_T(PACK_A) \
+ INSTANTIATE_REQUANT_RELU(PACK_A, int32_t); \
+ INSTANTIATE_REQUANT_RELU(PACK_A, int16_t);
+
+INSTANTIATE_REQUANT_ACC_T(PackAMatrix);
+INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset);
+
+#undef INSTANTIATE_REQUANT_ACC_T
+#undef INSTANTIATE_REQUANT_RELU
+#undef INSTANTIATE_REQUANT_Q_GRANS
+#undef INSTANTIATE_REQUANT_BASE
+
+#define INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
ReQuantizeOutput<RELU, Q_GRAN>>;
-#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE( \
+#define INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_IM2COL_REQUANT_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_IM2COL_REQUANT_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_IM2COL_REQUANT_BASE( \
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
-#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
- INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
- INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+#define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 3);
-#define INSTANTIATE_RELU(ACC_T) \
- INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
- INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+#define INSTANTIATE_IM2COL_REQUANT_RELU(ACC_T) \
+ INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, true);
-INSTANTIATE_RELU(int32_t);
-INSTANTIATE_RELU(int16_t);
+INSTANTIATE_IM2COL_REQUANT_RELU(int32_t);
+INSTANTIATE_IM2COL_REQUANT_RELU(int16_t);
-#undef INSTANTIATE_RELU
-#undef INSTANTIATE_SPATIAL_DIM
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_IM2COL_REQUANT_RELU
+#undef INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM
+#undef INSTANTIATE_IM2COL_REQUANT_Q_GRANS
+#undef INSTANTIATE_IM2COL_REQUANT_BASE
////////////////////////////////////////////////////////////////////////////////
// ReQuantizeForFloat
-#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \
- template class ExecuteKernel< \
- PACK_A<uint8_t, int32_t>, \
- PackBMatrix<int8_t, int32_t>, \
- float, \
+#define INSTANTIATE_REQUANT_FLOAT_BASE(PACK_A, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, int32_t>, \
+ PackBMatrix<int8_t, int32_t>, \
+ float, \
ReQuantizeForFloat<RELU, Q_GRAN>>;
-#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \
- INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
+#define INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_REQUANT_FLOAT_BASE( \
+ PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_REQUANT_FLOAT_BASE( \
+ PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_REQUANT_FLOAT_BASE( \
+ PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
-#define INSTANTIATE_RELU(PACK_A) \
- INSTANTIATE_Q_GRANS(PACK_A, false); \
- INSTANTIATE_Q_GRANS(PACK_A, true);
+#define INSTANTIATE_REQUANT_FLOAT_RELU(PACK_A) \
+ INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, true);
-INSTANTIATE_RELU(PackAWithRowOffset);
-INSTANTIATE_RELU(PackAWithQuantRowOffset);
+INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithRowOffset);
+INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithQuantRowOffset);
-#undef INSTANTIATE_RELU
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_REQUANT_FLOAT_RELU
+#undef INSTANTIATE_REQUANT_FLOAT_Q_GRANS
+#undef INSTANTIATE_REQUANT_FLOAT_BASE
-#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
- template class ExecuteKernel< \
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
- PackBMatrix<int8_t, ACC_T>, \
- float, \
+#define INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ float, \
ReQuantizeForFloat<RELU, Q_GRAN>>;
-#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE( \
+#define INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
-#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
- INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
- INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+#define INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 3);
-#define INSTANTIATE_RELU(ACC_T) \
- INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
- INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+#define INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(ACC_T) \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, true);
-INSTANTIATE_RELU(int32_t);
-INSTANTIATE_RELU(int16_t);
+INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(int32_t);
+INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(int16_t);
-#undef INSTANTIATE_RELU
-#undef INSTANTIATE_SPATIAL_DIM
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU
+#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM
+#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS
+#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
@@ -362,41 +373,46 @@ template class ExecuteKernel<
////////////////////////////////////////////////////////////////////////////////
// DoSpmdmOnInpBuffer
-#define INSTANTIATE_BASE(RELU, Q_GRAN) \
- template class ExecuteKernel< \
- PackAWithRowOffset<uint8_t, int16_t>, \
- PackBMatrix<int8_t, int16_t>, \
- uint8_t, \
+#define INSTANTIATE_SPMDM_BASE(PACK_A, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, int16_t>, \
+ PackBMatrix<int8_t, int16_t>, \
+ uint8_t, \
DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>;
-#define INSTANTIATE_Q_GRANS(RELU) \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
+#define INSTANTIATE_SPMDM_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
-INSTANTIATE_Q_GRANS(false);
-INSTANTIATE_Q_GRANS(true);
+#define INSTANTIATE_SPMDM_RELU(PACK_A) \
+ INSTANTIATE_SPMDM_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_SPMDM_Q_GRANS(PACK_A, true);
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+INSTANTIATE_SPMDM_RELU(PackAMatrix);
+INSTANTIATE_SPMDM_RELU(PackAWithRowOffset);
-#define INSTANTIATE_BASE(RELU, Q_GRAN) \
- template class ExecuteKernel< \
- PackAWithIm2Col<uint8_t, int16_t>, \
- PackBMatrix<int8_t, int16_t>, \
- uint8_t, \
+#undef INSTANTIATE_SPMDM_RELU
+#undef INSTANTIATE_SPMDM_Q_GRANS
+#undef INSTANTIATE_SPMDM_BASE
+
+#define INSTANTIATE_SCONV_BASE(RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, int16_t>, \
+ PackBMatrix<int8_t, int16_t>, \
+ uint8_t, \
DoSConvOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>;
-#define INSTANTIATE_Q_GRANS(RELU) \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
+#define INSTANTIATE_SCONV_Q_GRANS(RELU) \
+ INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
-INSTANTIATE_Q_GRANS(false);
-INSTANTIATE_Q_GRANS(true);
+INSTANTIATE_SCONV_Q_GRANS(false);
+INSTANTIATE_SCONV_Q_GRANS(true);
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_SCONV_Q_GRANS
+#undef INSTANTIATE_SCONV_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
@@ -406,39 +422,39 @@ template class ExecuteKernel<
////////////////////////////////////////////////////////////////////////////////
// memCopy
-#define INSTANTIATE_BASE(PACK_A, ACC_T) \
- template class ExecuteKernel< \
- PACK_A<uint8_t, ACC_T>, \
- PackBMatrix<int8_t, ACC_T>, \
- int32_t, \
+#define INSTANTIATE_MEMCPY_BASE(PACK_A, ACC_T) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
memCopy<>>;
-#define INSTANTIATE_ACC_T(PACK_A) \
- INSTANTIATE_BASE(PACK_A, int32_t) \
- INSTANTIATE_BASE(PACK_A, int16_t)
+#define INSTANTIATE_MEMCPY_ACC_T(PACK_A) \
+ INSTANTIATE_MEMCPY_BASE(PACK_A, int32_t) \
+ INSTANTIATE_MEMCPY_BASE(PACK_A, int16_t)
-INSTANTIATE_ACC_T(PackAMatrix);
-INSTANTIATE_ACC_T(PackAWithRowOffset);
+INSTANTIATE_MEMCPY_ACC_T(PackAMatrix);
+INSTANTIATE_MEMCPY_ACC_T(PackAWithRowOffset);
-#undef INSTANTIATE_ACC_T
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_MEMCPY_ACC_T
+#undef INSTANTIATE_MEMCPY_BASE
-#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \
- template class ExecuteKernel< \
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
- PackBMatrix<int8_t, ACC_T>, \
- int32_t, \
+#define INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, SPATIAL_DIM) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
memCopy<>>;
-#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
- INSTANTIATE_BASE(ACC_T, 2); \
- INSTANTIATE_BASE(ACC_T, 3);
+#define INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(ACC_T) \
+ INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 2); \
+ INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 3);
-INSTANTIATE_SPATIAL_DIM(int32_t);
-INSTANTIATE_SPATIAL_DIM(int16_t);
+INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(int32_t);
+INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(int16_t);
-#undef INSTANTIATE_SPATIAL_DIM
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM
+#undef INSTANTIATE_MEMCPY_IM2COL_BASE
template class ExecuteKernel<
PackAWithQuantRowOffset<uint8_t, int32_t>,