diff options
author | Daya Khudia <dskhudia@fb.com> | 2019-08-21 02:52:53 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-08-21 02:58:08 +0300 |
commit | 280fa17349b763eb474c423a6d1172f81df29103 (patch) | |
tree | 061974b09a412e0025c11a456451c81efe7e4ab0 | |
parent | a6d1d3eed7ba858d4532fc297b7a4ee984e6e7e3 (diff) |
Per channel support in fbgemmConv (#119)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/119
Some paths in fbgemmConv were missing support for per-channel quantization. This diff adds support for per-channel as well as groupwise quantization.
Reviewed By: jianyuh
Differential Revision: D16894740
fbshipit-source-id: 43a2c08d1c8d1b01775f875224774c39fae280bc
-rw-r--r-- | include/fbgemm/Fbgemm.h | 1 | ||||
-rw-r--r-- | src/FbgemmConv.cc | 171 | ||||
-rw-r--r-- | test/UniConvTest.cc | 269 |
3 files changed, 398 insertions, 43 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index 543f1cb..0b7bf1f 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -1134,6 +1134,7 @@ template < class FBGEMM_API ReQuantizeOutput { public: static constexpr int RELU_FUSED = FUSE_RELU; + static constexpr QuantizationGranularity QGRANType = Q_GRAN; using outType = outT; using inpType = inT; /** diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc index 33d1535..164411d 100644 --- a/src/FbgemmConv.cc +++ b/src/FbgemmConv.cc @@ -94,47 +94,111 @@ int fbgemmConv( std::is_same<typename processOutputType::outType, std::uint8_t>:: value, "For depthwise, only requantized output is supported"); - depthwise_3x3x3_pad_1( - conv_p.MB, // mini batch - conv_p.IN_DIM[0], // T - conv_p.IN_DIM[1], // H - conv_p.IN_DIM[2], // W - conv_p.OC, // output channels - conv_p.stride[0], // stride_t - conv_p.stride[1], // stride_h - conv_p.stride[2], // stride_w - outProcess.getAZeroPoint(), - activations, - B_zero_point[0], - *(packed_weights.getPackedWFor3DDW()), - C_multiplier[0], - outProcess.getCZeroPoint(), - out, - outProcess.getColOffsets(), - outProcess.getBias(), - outProcess.RELU_FUSED, // fuse_relu - thread_id, - num_threads); + + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + depthwise_3x3x3_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // T + conv_p.IN_DIM[1], // H + conv_p.IN_DIM[2], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_t + conv_p.stride[1], // stride_h + conv_p.stride[2], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point[0], + *(packed_weights.getPackedWFor3DDW()), + C_multiplier[0], + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + thread_id, + num_threads); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL || + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + 
depthwise_3x3x3_per_channel_quantization_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // T + conv_p.IN_DIM[1], // H + conv_p.IN_DIM[2], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_t + conv_p.stride[1], // stride_h + conv_p.stride[2], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point, + *(packed_weights.getPackedWFor3DDW()), + C_multiplier, + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + thread_id, + num_threads); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } } else { - depthwise_3x3_pad_1( - conv_p.MB, // mini batch - conv_p.IN_DIM[0], // H - conv_p.IN_DIM[1], // W - conv_p.OC, // output channels - conv_p.stride[0], // stride_h - conv_p.stride[1], // stride_w - outProcess.getAZeroPoint(), - activations, - B_zero_point[0], - *(packed_weights.getPackedWFor2DDW()), - C_multiplier[0], - outProcess.getCZeroPoint(), - out, - outProcess.getColOffsets(), - outProcess.getBias(), - outProcess.RELU_FUSED, // fuse_relu - thread_id, - num_threads); + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + depthwise_3x3_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // H + conv_p.IN_DIM[1], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_h + conv_p.stride[1], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point[0], + *(packed_weights.getPackedWFor2DDW()), + C_multiplier[0], + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + thread_id, + num_threads); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL || + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + // The number of channels == groups for depthwise convolutions + 
depthwise_3x3_per_channel_quantization_pad_1( + conv_p.MB, // mini batch + conv_p.IN_DIM[0], // H + conv_p.IN_DIM[1], // W + conv_p.OC, // output channels + conv_p.stride[0], // stride_h + conv_p.stride[1], // stride_w + outProcess.getAZeroPoint(), + activations, + B_zero_point, + *(packed_weights.getPackedWFor2DDW()), + C_multiplier, + outProcess.getCZeroPoint(), + out, + outProcess.getColOffsets(), + outProcess.getBias(), + outProcess.RELU_FUSED, // fuse_relu + thread_id, + num_threads); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } } break; } @@ -195,11 +259,32 @@ int fbgemmConv( // All other convolutions go through im2col-based implementation // std::cout << "Im2col path" << std::endl; std::vector<int32_t> row_offset_buf( - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM> - ::rowOffsetBufferSize(blocking_params)); + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>::rowOffsetBufferSize( + blocking_params)); const std::int32_t* b_zero_point = outProcess.getBZeroPoint(); - bool b_symmetric = b_zero_point[0] == 0; + bool b_symmetric = false; + if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) { + b_symmetric = b_zero_point[0] == 0; + } else if ( + processOutputType::QGRANType == QuantizationGranularity::GROUP) { + b_symmetric = + std::all_of(b_zero_point, b_zero_point + conv_p.G, [](int i) { + return i == 0; + }); + } else if ( + processOutputType::QGRANType == + QuantizationGranularity::OUT_CHANNEL) { + b_symmetric = + std::all_of(b_zero_point, b_zero_point + conv_p.OC, [](int i) { + return i == 0; + }); + } else { + std::string msg = + "[FBGEMM_CONV_ERROR] This quantization granularity is " + "not supported"; + throw std::runtime_error(msg); + } PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM> packA( conv_p, activations, diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc index 91bf578..ce26442 100644 --- a/test/UniConvTest.cc +++ b/test/UniConvTest.cc @@ 
-21,6 +21,55 @@ using namespace std; using namespace fbgemm; +vector<QuantizationGranularity> qGranularityVals{ + QuantizationGranularity::TENSOR, + QuantizationGranularity::GROUP, + QuantizationGranularity::OUT_CHANNEL}; + +static vector<conv_param_t<>> GetShapes_() { + vector<conv_param_t<>> shapes = { + // MB, IC, OC, {IH, IW}, G, {KH, KW}, {stride_h, stride_w}, {pad_t, pad_l, + // pad_b, pad_r} + // Regular + conv_param_t<>(1, 16, 16, {10, 30}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 16, 32, {30, 10}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 1}, {2, 1, 2, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 1}, {1, 2, 1, 2}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {2, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 16, 16, {10, 30}, 1, {3, 5}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 16, 16, {10, 30}, 1, {5, 3}, {1, 1}, {1, 1, 1, 1}), + // groupwise + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 16, {10, 30}, 8, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 16, 32, {10, 30}, 8, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {2, 2}, {2, 1, 2, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {1, 2}, {2, 1, 2, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {2, 1}, {2, 1, 2, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 5}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 8, {5, 3}, {1, 1}, {1, 1, 1, 1}), + // DW + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 
3}, {1, 1}, {1, 2, 1, 2}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {2, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {1, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 5}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 32, 32, {10, 30}, 32, {5, 3}, {1, 1}, {1, 1, 1, 1}), + // Pointwise + conv_param_t<>(1, 32, 32, {10, 30}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}), + conv_param_t<>(1, 16, 32, {10, 30}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {2, 2}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {1, 2}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {2, 1}, {0, 0, 0, 0}), + }; + return shapes; +} + namespace { // tuple represents MB, IC, OC, IT, IH, IW, KH/KW, stride, pad @@ -28,8 +77,12 @@ class uniConvTest : public testing::TestWithParam< tuple<int, int, int, int, int, int, int, int, int, int>> {}; +class UniConvQGranTest : public testing::TestWithParam< + tuple<QuantizationGranularity, bool, bool>> {}; + }; // namespace +// Combine only allows at most 10 generators. INSTANTIATE_TEST_CASE_P( InstantiationName, uniConvTest, @@ -45,6 +98,13 @@ INSTANTIATE_TEST_CASE_P( ::testing::ValuesIn({1, 2}), // stride ::testing::ValuesIn({0, 1, 2}))); // pad +INSTANTIATE_TEST_CASE_P( + InstantiationName, + UniConvQGranTest, + ::testing::Combine( + ::testing::ValuesIn(qGranularityVals), + ::testing::Bool(), // A symmetric + ::testing::Bool())); // B symmetric /** * Test for conv packing */ @@ -323,3 +383,212 @@ TEST(uniConvTest, cornerCases) { 0, 1); } + +/** + * @brief Unit test for uint8 activations, int8 weights, and 32-bit + * accumulation. 
Output processing: requantization -> nothing + */ +TEST_P(UniConvQGranTest, requantizeTest) { + vector<conv_param_t<>> shapes(GetShapes_()); + QuantizationGranularity q_granularity; + bool a_symmetric, b_symmetric; + tie(q_granularity, a_symmetric, b_symmetric) = GetParam(); + + for (auto conv_p : shapes) { + int R = conv_p.K[0]; + int S = conv_p.K[1]; + int G = conv_p.G; + int OC = conv_p.OC; + int OH = conv_p.OUT_DIM[0]; + int OW = conv_p.OUT_DIM[1]; + int IC_per_G = conv_p.IC / conv_p.G; + int OC_per_G = conv_p.OC / conv_p.G; + + // activations + aligned_vector<uint8_t> Aint8( + conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0); + + // weights + // The weight matrix is in layout G K/G (R S C/G) + aligned_vector<int8_t> Bint8(R * S * conv_p.G * IC_per_G * OC_per_G, 0); + aligned_vector<int8_t> Bint8_tr(R * S * G * IC_per_G * OC_per_G, 0); + + aligned_vector<int32_t> Cint32_ref(conv_p.MB * OH * OW * OC, 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); + + randFill<uint8_t>(Aint8, 0, 5); + int32_t Aint8_zero_point = a_symmetric ? 0 : 4; + + randFill<int8_t>(Bint8, -4, 4); + + // computing column offset + vector<int32_t> col_offsets(G * OC_per_G); + + int ncols_per_quant_group = G * OC_per_G; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = OC_per_G; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + + aligned_vector<int32_t> Bint8_zero_point( + G * OC_per_G / ncols_per_quant_group); + if (b_symmetric) { + randFill(Bint8_zero_point, -3, 3); + } else { + randFill(Bint8_zero_point, 0, 0); + } + + // matrix dimensions after im2col for each GEMM. 
+ // For each group, there is one GEMM of the following dimensions + int MDim = conv_p.MB * OH * OW; + int NDim = OC_per_G; + int KDim = R * S * IC_per_G; + + vector<uint8_t> Aint8_im2col(MDim * KDim * G); + im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); + + vector<int32_t> row_offsets(MDim); + + aligned_vector<float> C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2); + int32_t C_zero_pt = 5; + + // reference implementation + // conv_ref expects weights to be in G (R S C/G) K/G + int8_t* rightBData = Bint8.data(); + transposeConvWeights(conv_p, Bint8.data(), Bint8_tr.data()); + rightBData = Bint8_tr.data(); + for (int g = 0; g < G; ++g) { + col_offsets_with_zero_pt_s8acc32_ref( + R * S * IC_per_G, + OC_per_G, + OC_per_G, + rightBData + g * R * S * IC_per_G * OC_per_G, + Bint8_zero_point.data() + g * OC_per_G / ncols_per_quant_group, + col_offsets.data() + g * OC_per_G, + ncols_per_quant_group); + } + conv_ref( + conv_p, Aint8.data(), Aint8_zero_point, rightBData, Cint32_ref.data()); + + for (int g = 0; g < G; ++g) { + row_offsets_u8acc32_ref( + MDim, + KDim, + KDim * G, + Aint8_im2col.data() + g * KDim, + row_offsets.data()); + + requantize_u8acc32_ref( + MDim, + NDim, + G * NDim, + Cint32_ref.data() + g * NDim, + Cint8_ref.data() + g * NDim, + C_multiplier.data() + g * NDim / ncols_per_quant_group, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, + row_offsets.data(), + col_offsets.data() + g * NDim, + nullptr, + ncols_per_quant_group); + } + + PackWeightsForConv<2> packedWeights(conv_p, Bint8.data()); + + // TODO: Uncomment once we support multiple threads in fbgemmGroupwiseConv + // #ifdef _OPENMP + // #pragma omp parallel + // #endif + { + vector<int32_t> row_offset_buf(rowOffsetBufferSizeGConv(conv_p)); + + DoNothing<> doNothingObj{}; + + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); + + if (q_granularity == 
QuantizationGranularity::TENSOR) { + ReQuantizeOutput<false, QuantizationGranularity::TENSOR> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + nullptr, + G * NDim, + G); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + + } else if (q_granularity == QuantizationGranularity::GROUP) { + ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + nullptr, + G * NDim, + G); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + + } else { + ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + nullptr, + G * NDim, + G); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + } + } // omp parallel + + compare_validate_buffers( + Cint8_ref.data(), + Cint8_fb.data(), + MDim, + NDim * G, + NDim * G, + static_cast<uint8_t>(0)); + } // for each shape +} |