Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorDaya Khudia <dskhudia@fb.com>2019-09-11 21:47:58 +0300
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>2019-09-11 21:52:07 +0300
commitea787e8278744ab4c7d6c4ee42a050bb1c76ef88 (patch)
tree3846cfc169c2333cfa23f519f88565b0e0b40416 /src
parent637288bff9972c02e72341d6a60fdf9bab1dce7e (diff)
fbgemmPacked and fbgemmConv apis with float bias + tests
Summary: fbgemmPacked and fbgemmConv api changes to take float bias. Reviewed By: jianyuh Differential Revision: D17244262 fbshipit-source-id: 0531c829190d20e31cb957a3f1861d4a65645cee
Diffstat (limited to 'src')
-rw-r--r--src/ExecuteKernelU8S8.cc47
-rw-r--r--src/Fbgemm.cc58
-rw-r--r--src/FbgemmConv.cc22
-rw-r--r--src/GroupwiseConvAcc32Avx2.cc13
4 files changed, 87 insertions(+), 53 deletions(-)
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 0a4ff55..4ae1b50 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -315,19 +315,23 @@ void ExecuteKernel<
////////////////////////////////////////////////////////////////////////////////
// ReQuantizeOutput
-#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \
- template class ExecuteKernel< \
- PACK_A<uint8_t, ACC_T>, \
- PackBMatrix<int8_t, ACC_T>, \
- uint8_t, \
- ReQuantizeOutput<RELU, Q_GRAN>>;
+#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, BIAS_TYPE) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
+ ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>>;
+
+#define INSTANTIATE_REQUANT_BIAS_T(PACK_A, ACC_T, RELU, Q_GRAN) \
+ INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, float); \
+ INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, int32_t);
#define INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, RELU) \
- INSTANTIATE_REQUANT_BASE( \
+ INSTANTIATE_REQUANT_BIAS_T( \
PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_REQUANT_BASE( \
+ INSTANTIATE_REQUANT_BIAS_T( \
PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_REQUANT_BASE( \
+ INSTANTIATE_REQUANT_BIAS_T( \
PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
#define INSTANTIATE_REQUANT_RELU(PACK_A, ACC_T) \
@@ -344,21 +348,27 @@ INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset);
#undef INSTANTIATE_REQUANT_ACC_T
#undef INSTANTIATE_REQUANT_RELU
#undef INSTANTIATE_REQUANT_Q_GRANS
+#undef INSTANTIATE_REQUANT_BIAS_T
#undef INSTANTIATE_REQUANT_BASE
-#define INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
- template class ExecuteKernel< \
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
- PackBMatrix<int8_t, ACC_T>, \
- uint8_t, \
- ReQuantizeOutput<RELU, Q_GRAN>>;
+#define INSTANTIATE_IM2COL_REQUANT_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, Q_GRAN, BIAS_TYPE) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
+ ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>>;
+
+#define INSTANTIATE_IM2COL_REQUANT_BIAS_T(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, float); \
+ INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, int32_t);
#define INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
- INSTANTIATE_IM2COL_REQUANT_BASE( \
+ INSTANTIATE_IM2COL_REQUANT_BIAS_T( \
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
- INSTANTIATE_IM2COL_REQUANT_BASE( \
+ INSTANTIATE_IM2COL_REQUANT_BIAS_T( \
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
- INSTANTIATE_IM2COL_REQUANT_BASE( \
+ INSTANTIATE_IM2COL_REQUANT_BIAS_T( \
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
#define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \
@@ -375,6 +385,7 @@ INSTANTIATE_IM2COL_REQUANT_RELU(int16_t);
#undef INSTANTIATE_IM2COL_REQUANT_RELU
#undef INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM
#undef INSTANTIATE_IM2COL_REQUANT_Q_GRANS
+#undef INSTANTIATE_IM2COL_REQUANT_BIAS_T
#undef INSTANTIATE_IM2COL_REQUANT_BASE
////////////////////////////////////////////////////////////////////////////////
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index 4f7026f..ade851f 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -237,22 +237,26 @@ bool fbgemmSupportedCPU() {
////////////////////////////////////////////////////////////////////////////////
// ReQuantizeOutput
-#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \
+#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, BIAS_TYPE) \
template void fbgemmPacked( \
PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \
PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
uint8_t* C, \
int32_t* C_buffer, \
uint32_t ldc, \
- const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \
+ const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \
int thread_id, \
int num_threads, \
const BlockingFactors* blocking_params);
-#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \
- INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+#define INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, Q_GRAN) \
+ INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, float); \
+ INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, int32_t);
+
+#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \
+ INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
#define INSTANTIATE_RELU(PACK_A, ACC_T) \
INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \
@@ -268,27 +272,34 @@ INSTANTIATE_ACC_T(PackAWithRowOffset);
#undef INSTANTIATE_ACC_T
#undef INSTANTIATE_RELU
#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BIAS_T
#undef INSTANTIATE_BASE
-#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
- template void fbgemmPacked( \
- PackMatrix< \
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
- uint8_t, \
- ACC_T>& packA, \
- PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
- uint8_t* C, \
- int32_t* C_buffer, \
- uint32_t ldc, \
- const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \
- int thread_id, \
- int num_threads, \
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, BIAS_TYPE) \
+ template void fbgemmPacked( \
+ PackMatrix< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ uint8_t, \
+ ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ uint8_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \
+ int thread_id, \
+ int num_threads, \
const BlockingFactors* blocking_params);
-#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE( \
+#define INSTANTIATE_BIAS_T(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, float); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, int32_t);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BIAS_T( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BIAS_T( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BIAS_T( \
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
@@ -305,6 +316,7 @@ INSTANTIATE_RELU(int16_t);
#undef INSTANTIATE_RELU
#undef INSTANTIATE_SPATIAL_DIM
#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BIAS_T
#undef INSTANTIATE_BASE
////////////////////////////////////////////////////////////////////////////////
diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc
index 6a1e55b..5a486c0 100644
--- a/src/FbgemmConv.cc
+++ b/src/FbgemmConv.cc
@@ -89,6 +89,7 @@ int fbgemmConv(
// std::cout << "Depthwise fast path" << std::endl;
const std::int32_t* B_zero_point = outProcess.getBZeroPoint();
const float* C_multiplier = outProcess.getCMultiplier();
+ const float* act_times_w_scale = outProcess.getActWScale();
if (SPATIAL_DIM == 3) {
static_assert(
std::is_same<typename processOutputType::outType, std::uint8_t>::
@@ -115,7 +116,7 @@ int fbgemmConv(
outProcess.getColOffsets(),
outProcess.getBias(),
outProcess.RELU_FUSED, // fuse_relu
- 1.0f, // act_scale * weight_scale
+ act_times_w_scale ? act_times_w_scale[0] : 1.0f,
thread_id,
num_threads);
} else if (
@@ -141,7 +142,7 @@ int fbgemmConv(
outProcess.getColOffsets(),
outProcess.getBias(),
outProcess.RELU_FUSED, // fuse_relu
- nullptr, // act_scale * weight_scale
+ outProcess.getActWScale(), // act_scale * weight_scale
thread_id,
num_threads);
} else {
@@ -169,7 +170,7 @@ int fbgemmConv(
outProcess.getColOffsets(),
outProcess.getBias(),
outProcess.RELU_FUSED, // fuse_relu
- 1.0f, // act_scale * weight_scale
+ act_times_w_scale ? act_times_w_scale[0] : 1.0f,
thread_id,
num_threads);
} else if (
@@ -194,7 +195,7 @@ int fbgemmConv(
outProcess.getColOffsets(),
outProcess.getBias(),
outProcess.RELU_FUSED, // fuse_relu
- nullptr, // act_scale * weight_scale
+ outProcess.getActWScale(), // act_scale * weight_scale
thread_id,
num_threads);
} else {
@@ -316,21 +317,25 @@ int fbgemmConv(
return 0;
}
-#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \
+#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, BIAS_TYPE) \
template int fbgemmConv( \
const conv_param_t<SPATIAL_DIM>& conv_p, \
const std::uint8_t* activations, \
PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, \
std::uint8_t* out, \
std::int32_t* outBuffer, \
- ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \
+ ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \
int thread_id, \
int num_threads, \
const BlockingFactors* blocking_params);
+#define INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, float); \
+ INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t);
+
#define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \
- INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 2); \
- INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 3);
+ INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2); \
+ INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3);
#define INSTANTIATE_RELU(ACC_T, Q_GRAN) \
INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, true); \
@@ -346,6 +351,7 @@ INSTANTIATE_Q_GRANS(std::int32_t);
#undef INSTANTIATE_Q_GRANS
#undef INSTANTIATE_RELU
#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BIAS_T
#undef INSTANTIATE_BASE
template bool takeDepthWiseFastPath<2, std::int32_t>(
diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc
index 40f3fba..4ba3549 100644
--- a/src/GroupwiseConvAcc32Avx2.cc
+++ b/src/GroupwiseConvAcc32Avx2.cc
@@ -2204,7 +2204,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) {
template int rowOffsetBufferSizeGConv<2>(const conv_param_t<2>& conv_param);
template int rowOffsetBufferSizeGConv<3>(const conv_param_t<3>& conv_param);
-#define INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM) \
+#define INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, BIAS_TYPE) \
template void fbgemmGroupwiseConv( \
const conv_param_t<SPATIAL_DIM>& conv_param, \
const uint8_t* activations, \
@@ -2213,13 +2213,17 @@ template int rowOffsetBufferSizeGConv<3>(const conv_param_t<3>& conv_param);
PackWeightMatrixForGConv<int8_t, int32_t, SPATIAL_DIM>& packed_weights, \
uint8_t* out, \
int32_t* outBuffer, \
- const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \
+ const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \
int thread_id, \
int num_threads);
+#define INSTANTIATE_BIAS_T(RELU, Q_GRAN, SPATIAL_DIM) \
+ INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, float); \
+ INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, int32_t);
+
#define INSTANTIATE_SPATIAL_DIM(RELU, Q_GRAN) \
- INSTANTIATE_BASE(RELU, Q_GRAN, 2); \
- INSTANTIATE_BASE(RELU, Q_GRAN, 3);
+ INSTANTIATE_BIAS_T(RELU, Q_GRAN, 2); \
+ INSTANTIATE_BIAS_T(RELU, Q_GRAN, 3);
#define INSTANTIATE_Q_GRANS(RELU) \
INSTANTIATE_SPATIAL_DIM(RELU, QuantizationGranularity::TENSOR); \
@@ -2231,6 +2235,7 @@ INSTANTIATE_Q_GRANS(true);
#undef INSTANTIATE_Q_GRANS
#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BIAS_T
#undef INSTANTIATE_BASE
template void fbgemmGroupwiseConv(