diff options
Diffstat (limited to 'src/FbgemmConv.cc')
-rw-r--r-- | src/FbgemmConv.cc | 247 |
1 file changed, 200 insertions, 47 deletions
diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc index 5db63f6..de833d2 100644 --- a/src/FbgemmConv.cc +++ b/src/FbgemmConv.cc @@ -6,8 +6,9 @@ */ #include <algorithm> -#include <iostream> +#include <numeric> #include <vector> +#include <functional> #include "fbgemm/Fbgemm.h" namespace fbgemm { @@ -33,12 +34,24 @@ bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) { }); } +template <int SPATIAL_DIM> +bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) { + return std::accumulate(conv_p.K.begin(), conv_p.K.end(), 0) == SPATIAL_DIM && + std::accumulate(conv_p.stride.begin(), conv_p.stride.end(), 0) == + SPATIAL_DIM && + std::accumulate(conv_p.dilation.begin(), conv_p.dilation.end(), 0) == + SPATIAL_DIM && + std::accumulate(conv_p.pad.begin(), conv_p.pad.end(), 0) == 0; +} + template <int SPATIAL_DIM, typename ACC_T> optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) { if (takeDepthWiseFastPath<SPATIAL_DIM, ACC_T>(conv_p)) { return optimized_conv_t::depthwise; } else if (fbgemmOptimizedGConv<SPATIAL_DIM>(conv_p)) { return optimized_conv_t::groupwise; + } else if (takePointWiseFastPath<SPATIAL_DIM>(conv_p)) { + return optimized_conv_t::pointwise; } else { return optimized_conv_t::im2col; } @@ -58,58 +71,139 @@ int fbgemmConv( static_assert( SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "Only 2D and 3D convolutions are supported"); + + if (!packed_weights.isPackingCompliant(conv_p)) { + std::string msg = + "[FBGEMM_CONV_ERROR] Convolution parameters " + "mismatch between pre-packed weights and conv invocation! 
"; + msg += packed_weights.mismatchingParams(conv_p); + msg += std::string( + " Please pack weights using the same parameters " + "with which convolution operation is invoked!"); + throw std::logic_error(msg); + } + switch (ConvFastPath<SPATIAL_DIM, ACC_T>(conv_p)) { case optimized_conv_t::depthwise: { // 2D and 3D depthwise fast path // std::cout << "Depthwise fast path" << std::endl; const std::int32_t* B_zero_point = outProcess.getBZeroPoint(); const float* C_multiplier = outProcess.getCMultiplier(); + const float* act_times_w_scale = outProcess.getActWScale(); if (SPATIAL_DIM == 3) { static_assert( std::is_same<typename processOutputType::outType, std::uint8_t>:: value, "For depthwise, only requantized output is supported"); - depthwise_3x3x3_pad_1( - conv_p.MB, // mini batch - conv_p.IN_DIM[0], // T - conv_p.IN_DIM[1], // H - conv_p.IN_DIM[2], // W - conv_p.OC, // output channels - conv_p.stride[0], // stride_t - conv_p.stride[1], // stride_h - conv_p.stride[2], // stride_w - outProcess.getAZeroPoint(), - activations, - B_zero_point[0], - *(packed_weights.getPackedWFor3DDW()), - C_multiplier[0], - outProcess.getCZeroPoint(), - out, - outProcess.getColOffsets(), - outProcess.getBias(), - outProcess.RELU_FUSED, // fuse_relu - thread_id, - num_threads); + + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + depthwise_3x3x3_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // T + conv_p.IN_DIM[1], // H + conv_p.IN_DIM[2], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_t + conv_p.stride[1], // stride_h + conv_p.stride[2], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point[0], + *(packed_weights.getPackedWForDepthwise()), + C_multiplier[0], + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + act_times_w_scale ? 
act_times_w_scale[0] : 1.0f, + thread_id, + num_threads); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL || + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + depthwise_3x3x3_per_channel_quantization_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // T + conv_p.IN_DIM[1], // H + conv_p.IN_DIM[2], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_t + conv_p.stride[1], // stride_h + conv_p.stride[2], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point, + *(packed_weights.getPackedWForDepthwise()), + C_multiplier, + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + outProcess.getActWScale(), // act_scale * weight_scale + thread_id, + num_threads); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } } else { - depthwise_3x3_pad_1( - conv_p.MB, // mini batch - conv_p.IN_DIM[0], // H - conv_p.IN_DIM[1], // W - conv_p.OC, // output channels - conv_p.stride[0], // stride_h - conv_p.stride[1], // stride_w - outProcess.getAZeroPoint(), - activations, - B_zero_point[0], - *(packed_weights.getPackedWFor2DDW()), - C_multiplier[0], - outProcess.getCZeroPoint(), - out, - outProcess.getColOffsets(), - outProcess.getBias(), - outProcess.RELU_FUSED, // fuse_relu - thread_id, - num_threads); + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + depthwise_3x3_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // H + conv_p.IN_DIM[1], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_h + conv_p.stride[1], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point[0], + *(packed_weights.getPackedWForDepthwise()), + C_multiplier[0], + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + 
act_times_w_scale ? act_times_w_scale[0] : 1.0f, + thread_id, + num_threads); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL || + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + // The number of channels == groups for depthwise convolutions + depthwise_3x3_per_channel_quantization_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // H + conv_p.IN_DIM[1], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_h + conv_p.stride[1], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point, + *(packed_weights.getPackedWForDepthwise()), + C_multiplier, + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + outProcess.getActWScale(), // act_scale * weight_scale + thread_id, + num_threads); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } } break; } @@ -134,14 +228,68 @@ int fbgemmConv( num_threads); break; } + case optimized_conv_t::pointwise: { + std::vector<int32_t> row_offset_buf( + PackAWithRowOffset<uint8_t>::rowOffsetBufferSize(blocking_params)); + int image_dim = std::accumulate( + conv_p.IN_DIM.begin(), + conv_p.IN_DIM.end(), + 1, + std::multiplies<int>()); + PackAWithRowOffset<uint8_t, ACC_T> packA( + matrix_op_t::NoTranspose, + conv_p.MB * image_dim, + conv_p.IC, + activations, + conv_p.IC, + nullptr, + conv_p.G, + row_offset_buf.data(), + blocking_params); + + outProcess.setRowOffsets(row_offset_buf.data()); + fbgemmPacked( + packA, + *(packed_weights.getPackedWForPointwise()), + out, + outBuffer, + conv_p.OC, + outProcess, + thread_id, + num_threads, + blocking_params); + break; + } case optimized_conv_t::im2col: { // All other convolutions go through im2col-based implementation // std::cout << "Im2col path" << std::endl; std::vector<int32_t> row_offset_buf( - PackAWithIm2Col<uint8_t, ACC_T, 
SPATIAL_DIM>::rowOffsetBufferSize()); + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>::rowOffsetBufferSize( + blocking_params)); const std::int32_t* b_zero_point = outProcess.getBZeroPoint(); - bool b_symmetric = b_zero_point[0] == 0; + bool b_symmetric = false; + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + b_symmetric = b_zero_point[0] == 0; + } else if ( + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + b_symmetric = + std::all_of(b_zero_point, b_zero_point + conv_p.G, [](int i) { + return i == 0; + }); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL) { + b_symmetric = + std::all_of(b_zero_point, b_zero_point + conv_p.OC, [](int i) { + return i == 0; + }); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM> packA( conv_p, activations, @@ -169,21 +317,25 @@ int fbgemmConv( return 0; } -#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \ +#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, BIAS_TYPE) \ template int fbgemmConv( \ const conv_param_t<SPATIAL_DIM>& conv_p, \ const std::uint8_t* activations, \ PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, \ std::uint8_t* out, \ std::int32_t* outBuffer, \ - ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ int thread_id, \ int num_threads, \ const BlockingFactors* blocking_params); +#define INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, float); \ + INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t); + #define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \ - INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 2); \ - INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 3); + INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2); \ + INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3); 
#define INSTANTIATE_RELU(ACC_T, Q_GRAN) \ INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, true); \ @@ -199,6 +351,7 @@ INSTANTIATE_Q_GRANS(std::int32_t); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_RELU #undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE template bool takeDepthWiseFastPath<2, std::int32_t>( |