Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'src/ExecuteKernelU8S8.cc')
-rw-r--r-- src/ExecuteKernelU8S8.cc 284
1 files changed, 154 insertions, 130 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 2e2035c..f1ec882 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -240,47 +240,60 @@ void ExecuteKernel<
} // for each j block
}
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<false /* FUSE_RELU*/>>;
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<true>>;
-
-template class ExecuteKernel<
- PackAWithQuantRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
- PackAWithQuantRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<true>>;
-
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<true>>;
-
-template class ExecuteKernel<
- PackAMatrix<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeOutput
+#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithRowOffset<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_Q_GRANS(ACC_T, false); \
+ INSTANTIATE_Q_GRANS(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAMatrix<uint8_t, int16_t>,
@@ -288,110 +301,127 @@ template class ExecuteKernel<
uint8_t,
ReQuantizeOutput<false>>;
-template class ExecuteKernel<
- PackAMatrix<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeForFloat
+#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, int32_t>, \
+ PackBMatrix<int8_t, int32_t>, \
+ float, \
+ ReQuantizeForFloat<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(PACK_A) \
+ INSTANTIATE_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_Q_GRANS(PACK_A, true);
+
+INSTANTIATE_RELU(PackAWithRowOffset);
+INSTANTIATE_RELU(PackAWithQuantRowOffset);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ float, \
+ ReQuantizeForFloat<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
- uint8_t,
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<false>::outType,
- int32_t,
- ReQuantizeOutput<false>>>;
+ float,
+ ReQuantizeForFloat<false /* FUSE_RELU*/>>;
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<true>::outType,
- int32_t,
- ReQuantizeOutput<true>>>;
+////////////////////////////////////////////////////////////////////////////////
+// DoSpmdmOnInpBuffer
+#define INSTANTIATE_BASE(RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithRowOffset<uint8_t, int16_t>, \
+ PackBMatrix<int8_t, int16_t>, \
+ uint8_t, \
+ DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>;
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- float,
- DoSpmdmOnInpBuffer<
- ReQuantizeForFloat<false>::outType,
- int32_t,
- ReQuantizeForFloat<false>>>;
+#define INSTANTIATE_Q_GRANS(RELU) \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+INSTANTIATE_Q_GRANS(false);
+INSTANTIATE_Q_GRANS(true);
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<true>>;
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+ float,
+ DoSpmdmOnInpBuffer<float, int32_t, ReQuantizeForFloat<false>>>;
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// memCopy
+#define INSTANTIATE_BASE(PACK_A, ACC_T) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
+ memCopy<>>;
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t, 3>,
- PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+#define INSTANTIATE_ACC_T(PACK_A) \
+ INSTANTIATE_BASE(PACK_A, int32_t) \
+ INSTANTIATE_BASE(PACK_A, int16_t)
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+INSTANTIATE_ACC_T(PackAMatrix);
+INSTANTIATE_ACC_T(PackAWithRowOffset);
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t, 3>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+#undef INSTANTIATE_ACC_T
+#undef INSTANTIATE_BASE
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
-
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
+#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
+ memCopy<>>;
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t, 3>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
+#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
+ INSTANTIATE_BASE(ACC_T, 2); \
+ INSTANTIATE_BASE(ACC_T, 3);
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+INSTANTIATE_SPATIAL_DIM(int32_t);
+INSTANTIATE_SPATIAL_DIM(int16_t);
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t, 3>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAWithQuantRowOffset<uint8_t, int32_t>,
@@ -400,12 +430,6 @@ template class ExecuteKernel<
memCopy<>>;
template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- float,
- ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
PackAMatrix<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
int32_t,