diff options
author | Daya Khudia <dskhudia@fb.com> | 2019-09-11 21:47:58 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-09-11 21:52:07 +0300 |
commit | ea787e8278744ab4c7d6c4ee42a050bb1c76ef88 (patch) | |
tree | 3846cfc169c2333cfa23f519f88565b0e0b40416 /src | |
parent | 637288bff9972c02e72341d6a60fdf9bab1dce7e (diff) |
fbgemmPacked and fbgemmConv APIs with float bias + tests
Summary: Change the fbgemmPacked and fbgemmConv APIs to accept a float bias; the explicit instantiations now cover both float and int32_t bias types.
Reviewed By: jianyuh
Differential Revision: D17244262
fbshipit-source-id: 0531c829190d20e31cb957a3f1861d4a65645cee
Diffstat (limited to 'src')
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 47 | ||||
-rw-r--r-- | src/Fbgemm.cc | 58 | ||||
-rw-r--r-- | src/FbgemmConv.cc | 22 | ||||
-rw-r--r-- | src/GroupwiseConvAcc32Avx2.cc | 13 |
4 files changed, 87 insertions, 53 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 0a4ff55..4ae1b50 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -315,19 +315,23 @@ void ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PACK_A<uint8_t, ACC_T>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ - ReQuantizeOutput<RELU, Q_GRAN>>; +#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, BIAS_TYPE) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ + ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>>; + +#define INSTANTIATE_REQUANT_BIAS_T(PACK_A, ACC_T, RELU, Q_GRAN) \ + INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, float); \ + INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, int32_t); #define INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, RELU) \ - INSTANTIATE_REQUANT_BASE( \ + INSTANTIATE_REQUANT_BIAS_T( \ PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_REQUANT_BASE( \ + INSTANTIATE_REQUANT_BIAS_T( \ PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_REQUANT_BASE( \ + INSTANTIATE_REQUANT_BIAS_T( \ PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_REQUANT_RELU(PACK_A, ACC_T) \ @@ -344,21 +348,27 @@ INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset); #undef INSTANTIATE_REQUANT_ACC_T #undef INSTANTIATE_REQUANT_RELU #undef INSTANTIATE_REQUANT_Q_GRANS +#undef INSTANTIATE_REQUANT_BIAS_T #undef INSTANTIATE_REQUANT_BASE -#define INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ - ReQuantizeOutput<RELU, Q_GRAN>>; +#define INSTANTIATE_IM2COL_REQUANT_BASE( \ + ACC_T, RELU, SPATIAL_DIM, Q_GRAN, BIAS_TYPE) \ + template class 
ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ + ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>>; + +#define INSTANTIATE_IM2COL_REQUANT_BIAS_T(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, float); \ + INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, int32_t); #define INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ - INSTANTIATE_IM2COL_REQUANT_BASE( \ + INSTANTIATE_IM2COL_REQUANT_BIAS_T( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ - INSTANTIATE_IM2COL_REQUANT_BASE( \ + INSTANTIATE_IM2COL_REQUANT_BIAS_T( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ - INSTANTIATE_IM2COL_REQUANT_BASE( \ + INSTANTIATE_IM2COL_REQUANT_BIAS_T( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \ @@ -375,6 +385,7 @@ INSTANTIATE_IM2COL_REQUANT_RELU(int16_t); #undef INSTANTIATE_IM2COL_REQUANT_RELU #undef INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM #undef INSTANTIATE_IM2COL_REQUANT_Q_GRANS +#undef INSTANTIATE_IM2COL_REQUANT_BIAS_T #undef INSTANTIATE_IM2COL_REQUANT_BASE //////////////////////////////////////////////////////////////////////////////// diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index 4f7026f..ade851f 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -237,22 +237,26 @@ bool fbgemmSupportedCPU() { //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ +#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, BIAS_TYPE) \ template void fbgemmPacked( \ PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ uint8_t* C, \ int32_t* C_buffer, \ uint32_t ldc, \ - const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& 
outProcess, \ int thread_id, \ int num_threads, \ const BlockingFactors* blocking_params); -#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, Q_GRAN) \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, float); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, int32_t); + +#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ + INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_RELU(PACK_A, ACC_T) \ INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \ @@ -268,27 +272,34 @@ INSTANTIATE_ACC_T(PackAWithRowOffset); #undef INSTANTIATE_ACC_T #undef INSTANTIATE_RELU #undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE -#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ - template void fbgemmPacked( \ - PackMatrix< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - uint8_t, \ - ACC_T>& packA, \ - PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ - uint8_t* C, \ - int32_t* C_buffer, \ - uint32_t ldc, \ - const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ - int thread_id, \ - int num_threads, \ +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, BIAS_TYPE) \ + template void fbgemmPacked( \ + PackMatrix< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + uint8_t, \ + ACC_T>& packA, \ + PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ + int thread_id, \ + int num_threads, \ const BlockingFactors* blocking_params); 
-#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE( \ +#define INSTANTIATE_BIAS_T(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, float); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, int32_t); + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BIAS_T( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BIAS_T( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BIAS_T( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ @@ -305,6 +316,7 @@ INSTANTIATE_RELU(int16_t); #undef INSTANTIATE_RELU #undef INSTANTIATE_SPATIAL_DIM #undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE //////////////////////////////////////////////////////////////////////////////// diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc index 6a1e55b..5a486c0 100644 --- a/src/FbgemmConv.cc +++ b/src/FbgemmConv.cc @@ -89,6 +89,7 @@ int fbgemmConv( // std::cout << "Depthwise fast path" << std::endl; const std::int32_t* B_zero_point = outProcess.getBZeroPoint(); const float* C_multiplier = outProcess.getCMultiplier(); + const float* act_times_w_scale = outProcess.getActWScale(); if (SPATIAL_DIM == 3) { static_assert( std::is_same<typename processOutputType::outType, std::uint8_t>:: @@ -115,7 +116,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu - 1.0f, // act_scale * weight_scale + act_times_w_scale ? 
act_times_w_scale[0] : 1.0f, thread_id, num_threads); } else if ( @@ -141,7 +142,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu - nullptr, // act_scale * weight_scale + outProcess.getActWScale(), // act_scale * weight_scale thread_id, num_threads); } else { @@ -169,7 +170,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu - 1.0f, // act_scale * weight_scale + act_times_w_scale ? act_times_w_scale[0] : 1.0f, thread_id, num_threads); } else if ( @@ -194,7 +195,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu - nullptr, // act_scale * weight_scale + outProcess.getActWScale(), // act_scale * weight_scale thread_id, num_threads); } else { @@ -316,21 +317,25 @@ int fbgemmConv( return 0; } -#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \ +#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, BIAS_TYPE) \ template int fbgemmConv( \ const conv_param_t<SPATIAL_DIM>& conv_p, \ const std::uint8_t* activations, \ PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, \ std::uint8_t* out, \ std::int32_t* outBuffer, \ - ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ int thread_id, \ int num_threads, \ const BlockingFactors* blocking_params); +#define INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, float); \ + INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t); + #define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \ - INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 2); \ - INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 3); + INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2); \ + INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3); #define INSTANTIATE_RELU(ACC_T, Q_GRAN) \ INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, true); \ @@ -346,6 +351,7 @@ INSTANTIATE_Q_GRANS(std::int32_t); #undef 
INSTANTIATE_Q_GRANS #undef INSTANTIATE_RELU #undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE template bool takeDepthWiseFastPath<2, std::int32_t>( diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc index 40f3fba..4ba3549 100644 --- a/src/GroupwiseConvAcc32Avx2.cc +++ b/src/GroupwiseConvAcc32Avx2.cc @@ -2204,7 +2204,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) { template int rowOffsetBufferSizeGConv<2>(const conv_param_t<2>& conv_param); template int rowOffsetBufferSizeGConv<3>(const conv_param_t<3>& conv_param); -#define INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM) \ +#define INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, BIAS_TYPE) \ template void fbgemmGroupwiseConv( \ const conv_param_t<SPATIAL_DIM>& conv_param, \ const uint8_t* activations, \ @@ -2213,13 +2213,17 @@ template int rowOffsetBufferSizeGConv<3>(const conv_param_t<3>& conv_param); PackWeightMatrixForGConv<int8_t, int32_t, SPATIAL_DIM>& packed_weights, \ uint8_t* out, \ int32_t* outBuffer, \ - const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ int thread_id, \ int num_threads); +#define INSTANTIATE_BIAS_T(RELU, Q_GRAN, SPATIAL_DIM) \ + INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, float); \ + INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, int32_t); + #define INSTANTIATE_SPATIAL_DIM(RELU, Q_GRAN) \ - INSTANTIATE_BASE(RELU, Q_GRAN, 2); \ - INSTANTIATE_BASE(RELU, Q_GRAN, 3); + INSTANTIATE_BIAS_T(RELU, Q_GRAN, 2); \ + INSTANTIATE_BIAS_T(RELU, Q_GRAN, 3); #define INSTANTIATE_Q_GRANS(RELU) \ INSTANTIATE_SPATIAL_DIM(RELU, QuantizationGranularity::TENSOR); \ @@ -2231,6 +2235,7 @@ INSTANTIATE_Q_GRANS(true); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE template void fbgemmGroupwiseConv( |