diff options
author | Daya Khudia <dskhudia@fb.com> | 2019-09-11 21:47:58 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-09-11 21:52:07 +0300 |
commit | ea787e8278744ab4c7d6c4ee42a050bb1c76ef88 (patch) | |
tree | 3846cfc169c2333cfa23f519f88565b0e0b40416 | |
parent | 637288bff9972c02e72341d6a60fdf9bab1dce7e (diff) |
fbgemmPacked and fbgemmConv APIs with float bias + tests
Summary: Change the fbgemmPacked and fbgemmConv APIs to accept a float bias.
Reviewed By: jianyuh
Differential Revision: D17244262
fbshipit-source-id: 0531c829190d20e31cb957a3f1861d4a65645cee
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 47 | ||||
-rw-r--r-- | src/Fbgemm.cc | 58 | ||||
-rw-r--r-- | src/FbgemmConv.cc | 22 | ||||
-rw-r--r-- | src/GroupwiseConvAcc32Avx2.cc | 13 | ||||
-rw-r--r-- | test/RequantizeOnlyTest.cc | 169 | ||||
-rw-r--r-- | test/UniConvTest.cc | 266 |
6 files changed, 453 insertions, 122 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 0a4ff55..4ae1b50 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -315,19 +315,23 @@ void ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PACK_A<uint8_t, ACC_T>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ - ReQuantizeOutput<RELU, Q_GRAN>>; +#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, BIAS_TYPE) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ + ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>>; + +#define INSTANTIATE_REQUANT_BIAS_T(PACK_A, ACC_T, RELU, Q_GRAN) \ + INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, float); \ + INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN, int32_t); #define INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, RELU) \ - INSTANTIATE_REQUANT_BASE( \ + INSTANTIATE_REQUANT_BIAS_T( \ PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_REQUANT_BASE( \ + INSTANTIATE_REQUANT_BIAS_T( \ PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_REQUANT_BASE( \ + INSTANTIATE_REQUANT_BIAS_T( \ PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_REQUANT_RELU(PACK_A, ACC_T) \ @@ -344,21 +348,27 @@ INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset); #undef INSTANTIATE_REQUANT_ACC_T #undef INSTANTIATE_REQUANT_RELU #undef INSTANTIATE_REQUANT_Q_GRANS +#undef INSTANTIATE_REQUANT_BIAS_T #undef INSTANTIATE_REQUANT_BASE -#define INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ - ReQuantizeOutput<RELU, Q_GRAN>>; +#define INSTANTIATE_IM2COL_REQUANT_BASE( \ + ACC_T, RELU, SPATIAL_DIM, Q_GRAN, BIAS_TYPE) \ + template class 
ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ + ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>>; + +#define INSTANTIATE_IM2COL_REQUANT_BIAS_T(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, float); \ + INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, int32_t); #define INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ - INSTANTIATE_IM2COL_REQUANT_BASE( \ + INSTANTIATE_IM2COL_REQUANT_BIAS_T( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ - INSTANTIATE_IM2COL_REQUANT_BASE( \ + INSTANTIATE_IM2COL_REQUANT_BIAS_T( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ - INSTANTIATE_IM2COL_REQUANT_BASE( \ + INSTANTIATE_IM2COL_REQUANT_BIAS_T( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \ @@ -375,6 +385,7 @@ INSTANTIATE_IM2COL_REQUANT_RELU(int16_t); #undef INSTANTIATE_IM2COL_REQUANT_RELU #undef INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM #undef INSTANTIATE_IM2COL_REQUANT_Q_GRANS +#undef INSTANTIATE_IM2COL_REQUANT_BIAS_T #undef INSTANTIATE_IM2COL_REQUANT_BASE //////////////////////////////////////////////////////////////////////////////// diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index 4f7026f..ade851f 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -237,22 +237,26 @@ bool fbgemmSupportedCPU() { //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ +#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, BIAS_TYPE) \ template void fbgemmPacked( \ PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ uint8_t* C, \ int32_t* C_buffer, \ uint32_t ldc, \ - const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& 
outProcess, \ int thread_id, \ int num_threads, \ const BlockingFactors* blocking_params); -#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, Q_GRAN) \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, float); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, int32_t); + +#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ + INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_RELU(PACK_A, ACC_T) \ INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \ @@ -268,27 +272,34 @@ INSTANTIATE_ACC_T(PackAWithRowOffset); #undef INSTANTIATE_ACC_T #undef INSTANTIATE_RELU #undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE -#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ - template void fbgemmPacked( \ - PackMatrix< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - uint8_t, \ - ACC_T>& packA, \ - PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ - uint8_t* C, \ - int32_t* C_buffer, \ - uint32_t ldc, \ - const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ - int thread_id, \ - int num_threads, \ +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, BIAS_TYPE) \ + template void fbgemmPacked( \ + PackMatrix< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + uint8_t, \ + ACC_T>& packA, \ + PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ + int thread_id, \ + int num_threads, \ const BlockingFactors* blocking_params); 
-#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE( \ +#define INSTANTIATE_BIAS_T(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, float); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, int32_t); + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BIAS_T( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BIAS_T( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BIAS_T( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ @@ -305,6 +316,7 @@ INSTANTIATE_RELU(int16_t); #undef INSTANTIATE_RELU #undef INSTANTIATE_SPATIAL_DIM #undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE //////////////////////////////////////////////////////////////////////////////// diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc index 6a1e55b..5a486c0 100644 --- a/src/FbgemmConv.cc +++ b/src/FbgemmConv.cc @@ -89,6 +89,7 @@ int fbgemmConv( // std::cout << "Depthwise fast path" << std::endl; const std::int32_t* B_zero_point = outProcess.getBZeroPoint(); const float* C_multiplier = outProcess.getCMultiplier(); + const float* act_times_w_scale = outProcess.getActWScale(); if (SPATIAL_DIM == 3) { static_assert( std::is_same<typename processOutputType::outType, std::uint8_t>:: @@ -115,7 +116,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu - 1.0f, // act_scale * weight_scale + act_times_w_scale ? 
act_times_w_scale[0] : 1.0f, thread_id, num_threads); } else if ( @@ -141,7 +142,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu - nullptr, // act_scale * weight_scale + outProcess.getActWScale(), // act_scale * weight_scale thread_id, num_threads); } else { @@ -169,7 +170,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu - 1.0f, // act_scale * weight_scale + act_times_w_scale ? act_times_w_scale[0] : 1.0f, thread_id, num_threads); } else if ( @@ -194,7 +195,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu - nullptr, // act_scale * weight_scale + outProcess.getActWScale(), // act_scale * weight_scale thread_id, num_threads); } else { @@ -316,21 +317,25 @@ int fbgemmConv( return 0; } -#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \ +#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, BIAS_TYPE) \ template int fbgemmConv( \ const conv_param_t<SPATIAL_DIM>& conv_p, \ const std::uint8_t* activations, \ PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, \ std::uint8_t* out, \ std::int32_t* outBuffer, \ - ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ int thread_id, \ int num_threads, \ const BlockingFactors* blocking_params); +#define INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, float); \ + INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t); + #define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \ - INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 2); \ - INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 3); + INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2); \ + INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3); #define INSTANTIATE_RELU(ACC_T, Q_GRAN) \ INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, true); \ @@ -346,6 +351,7 @@ INSTANTIATE_Q_GRANS(std::int32_t); #undef 
INSTANTIATE_Q_GRANS #undef INSTANTIATE_RELU #undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE template bool takeDepthWiseFastPath<2, std::int32_t>( diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc index 40f3fba..4ba3549 100644 --- a/src/GroupwiseConvAcc32Avx2.cc +++ b/src/GroupwiseConvAcc32Avx2.cc @@ -2204,7 +2204,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) { template int rowOffsetBufferSizeGConv<2>(const conv_param_t<2>& conv_param); template int rowOffsetBufferSizeGConv<3>(const conv_param_t<3>& conv_param); -#define INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM) \ +#define INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, BIAS_TYPE) \ template void fbgemmGroupwiseConv( \ const conv_param_t<SPATIAL_DIM>& conv_param, \ const uint8_t* activations, \ @@ -2213,13 +2213,17 @@ template int rowOffsetBufferSizeGConv<3>(const conv_param_t<3>& conv_param); PackWeightMatrixForGConv<int8_t, int32_t, SPATIAL_DIM>& packed_weights, \ uint8_t* out, \ int32_t* outBuffer, \ - const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ int thread_id, \ int num_threads); +#define INSTANTIATE_BIAS_T(RELU, Q_GRAN, SPATIAL_DIM) \ + INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, float); \ + INSTANTIATE_BASE(RELU, Q_GRAN, SPATIAL_DIM, int32_t); + #define INSTANTIATE_SPATIAL_DIM(RELU, Q_GRAN) \ - INSTANTIATE_BASE(RELU, Q_GRAN, 2); \ - INSTANTIATE_BASE(RELU, Q_GRAN, 3); + INSTANTIATE_BIAS_T(RELU, Q_GRAN, 2); \ + INSTANTIATE_BIAS_T(RELU, Q_GRAN, 3); #define INSTANTIATE_Q_GRANS(RELU) \ INSTANTIATE_SPATIAL_DIM(RELU, QuantizationGranularity::TENSOR); \ @@ -2231,6 +2235,7 @@ INSTANTIATE_Q_GRANS(true); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE template void fbgemmGroupwiseConv( diff --git a/test/RequantizeOnlyTest.cc b/test/RequantizeOnlyTest.cc new file mode 100644 index 0000000..2f73d49 --- 
/dev/null +++ b/test/RequantizeOnlyTest.cc @@ -0,0 +1,169 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include <algorithm> +#include <functional> +#include <iostream> +#include <random> +#include <stdexcept> +#include <string> + +#include <gtest/gtest.h> + +#include "TestUtils.h" +#include "bench/BenchUtils.h" +#include "fbgemm/Fbgemm.h" + +using namespace std; +using namespace fbgemm; + +vector<QuantizationGranularity> qGranularityVals{ + QuantizationGranularity::TENSOR, + QuantizationGranularity::OUT_CHANNEL}; + +namespace { + +// tuple represents #rows, #cols, fuse_relu, quantization_granularity, bias_type +class FloatRequantizeTest + : public testing::TestWithParam< + tuple<int, int, bool, QuantizationGranularity>> {}; + +}; // namespace + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + FloatRequantizeTest, + ::testing::Combine( + ::testing::ValuesIn({1, 2, 3, 4}), // number of rows + ::testing::ValuesIn( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 20, 32}), // number of + // cols + ::testing::Bool(), // fuse relu + ::testing::ValuesIn(qGranularityVals))); // requantization granularity + +/** + * Test for float bias + */ +TEST_P(FloatRequantizeTest, floatBiasTest) { + int rows, cols; + bool fuse_relu; + QuantizationGranularity q_gran; + tie(rows, cols, fuse_relu, q_gran) = GetParam(); + + int numElements = rows * cols; + + aligned_vector<float> act_times_w_scale(cols); + randFill<float>(act_times_w_scale, -8, 8); + + float out_scale = 2.0f; + + aligned_vector<float> C_multiplier(cols); + transform( + act_times_w_scale.begin(), + act_times_w_scale.end(), + C_multiplier.begin(), + [&out_scale](float i) { return i / out_scale; }); + + aligned_vector<int32_t> Bint8_zero_point(cols); + randFill<int32_t>(Bint8_zero_point, -8, 8); + + aligned_vector<int32_t> row_offset_buf(rows); + 
randFill<int32_t>(row_offset_buf, -8, 8); + + aligned_vector<int32_t> col_offsets(cols); + randFill<int32_t>(col_offsets, -8, 8); + + // quantized bias + aligned_vector<int32_t> bias_q(cols); + randFill<int32_t>(bias_q, -8, 8); + + // floating point bias + aligned_vector<float> bias_f(cols); + if (q_gran == QuantizationGranularity::TENSOR) { + transform( + bias_q.begin(), + bias_q.end(), + bias_f.begin(), + [&act_times_w_scale](float i) { return i * act_times_w_scale[0]; }); + } else if (q_gran == QuantizationGranularity::OUT_CHANNEL) { + transform( + act_times_w_scale.begin(), + act_times_w_scale.end(), + bias_q.begin(), + bias_f.begin(), + multiplies<float>()); + + } else { + FAIL(); + } + + aligned_vector<int32_t> input(numElements); + randFill<int32_t>(input, -8, 8); + + aligned_vector<uint8_t> output_q_bias(numElements); + aligned_vector<uint8_t> output_f_bias(numElements); + + int32_t C_zero_point = 3; + int32_t Aint8_zero_point = 3; + + block_type_t block{0, rows, 0, cols}; + + DoNothing<> doNothingObj{}; + +#define TESTCODE(FUSE_RELU, Q_GRAN) \ + ReQuantizeOutput<FUSE_RELU, Q_GRAN> reqObj_q( \ + doNothingObj, \ + C_multiplier.data(), \ + C_zero_point, \ + Aint8_zero_point, \ + Bint8_zero_point.data(), \ + row_offset_buf.data(), \ + col_offsets.data(), \ + bias_q.data(), \ + cols); \ + ReQuantizeOutput<FUSE_RELU, Q_GRAN, float> reqObj_f( \ + doNothingObj, \ + C_multiplier.data(), \ + C_zero_point, \ + Aint8_zero_point, \ + Bint8_zero_point.data(), \ + row_offset_buf.data(), \ + col_offsets.data(), \ + bias_f.data(), \ + cols, \ + 1, \ + act_times_w_scale.data()); \ + reqObj_q.f<inst_set_t::avx2>( \ + output_q_bias.data(), input.data(), block, cols, cols); \ + reqObj_f.f<inst_set_t::avx2>( \ + output_f_bias.data(), input.data(), block, cols, cols); + + if (fuse_relu) { + if (q_gran == QuantizationGranularity::TENSOR) { + TESTCODE(true, QuantizationGranularity::TENSOR) + + } else if (q_gran == QuantizationGranularity::OUT_CHANNEL) { + TESTCODE(true, 
QuantizationGranularity::OUT_CHANNEL) + + } else { + FAIL(); + } + + } else { + if (q_gran == QuantizationGranularity::TENSOR) { + TESTCODE(false, QuantizationGranularity::TENSOR) + + } else if (q_gran == QuantizationGranularity::OUT_CHANNEL) { + TESTCODE(false, QuantizationGranularity::OUT_CHANNEL) + + } else { + FAIL(); + } + } +#undef TESTCODE + ASSERT_EQ(output_q_bias, output_f_bias) + << "Requantization with quantized bias and float bias differs"; +} diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc index 21bdcb5..83674ce 100644 --- a/test/UniConvTest.cc +++ b/test/UniConvTest.cc @@ -88,8 +88,11 @@ class uniConvTest : public testing::TestWithParam< tuple<int, int, int, int, int, int, int, int, int, int>> {}; -class UniConvQGranTest : public testing::TestWithParam< - tuple<QuantizationGranularity, bool, bool>> {}; +// tuple represents QuantizationGranularity, A symmetric, B symmetric, +// test_bias, test_float_bias +class UniConvQGranTest + : public testing::TestWithParam< + tuple<QuantizationGranularity, bool, bool, bool, bool>> {}; }; // namespace @@ -115,7 +118,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Combine( ::testing::ValuesIn(qGranularityVals), ::testing::Bool(), // A symmetric - ::testing::Bool())); // B symmetric + ::testing::Bool(), // B symmetric + ::testing::Bool(), // test_bias + ::testing::Bool())); // test_float_bias /** * Test for conv packing */ @@ -403,7 +408,9 @@ TEST_P(UniConvQGranTest, requantizeTest) { vector<conv_param_t<>> shapes(GetShapes_()); QuantizationGranularity q_granularity; bool a_symmetric, b_symmetric; - tie(q_granularity, a_symmetric, b_symmetric) = GetParam(); + bool test_bias, test_float_bias; + tie(q_granularity, a_symmetric, b_symmetric, test_bias, test_float_bias) = + GetParam(); for (auto conv_p : shapes) { int R = conv_p.K[0]; @@ -463,10 +470,51 @@ TEST_P(UniConvQGranTest, requantizeTest) { vector<int32_t> row_offsets(MDim); + // activation_scale * weight_scale + aligned_vector<float> 
act_times_w_scale(Bint8_zero_point.size()); + randFill(act_times_w_scale, 0.1234f / 2, 0.1234f * 3 / 2); + + float out_scale = 2.0f; aligned_vector<float> C_multiplier(Bint8_zero_point.size()); - randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2); + transform( + act_times_w_scale.begin(), + act_times_w_scale.end(), + C_multiplier.begin(), + [&out_scale](float i) { return i / out_scale; }); + int32_t C_zero_pt = 5; + // initialize bias + aligned_vector<int32_t> bias_int32(OC); + aligned_vector<float> bias_fp32(OC); + if (test_bias) { + randFill(bias_int32, -8, 8); + } + + // floating point bias + if (test_float_bias) { + if (q_granularity == QuantizationGranularity::TENSOR) { + transform( + bias_int32.begin(), + bias_int32.end(), + bias_fp32.begin(), + [&act_times_w_scale](float i) { return i * act_times_w_scale[0]; }); + } else if (q_granularity == QuantizationGranularity::GROUP) { + for (int g = 0; g < G; ++g) { + for (int c = 0; c < OC_per_G; ++c) { + bias_fp32[g * OC_per_G + c] = act_times_w_scale[g] * + static_cast<float>(bias_int32[g * OC_per_G + c]); + } + } + } else { // OUT_CHANNEL + transform( + act_times_w_scale.begin(), + act_times_w_scale.end(), + bias_int32.begin(), + bias_fp32.begin(), + multiplies<float>()); + } + } // reference implementation // conv_ref expects weights to be in G (R S C/G) K/G int8_t* rightBData = Bint8.data(); @@ -505,7 +553,7 @@ TEST_P(UniConvQGranTest, requantizeTest) { Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * NDim, - nullptr, + test_bias ? 
bias_int32.data() + g * NDim : nullptr, ncols_per_quant_group); } @@ -524,73 +572,153 @@ TEST_P(UniConvQGranTest, requantizeTest) { int tid = fbgemm_get_thread_num(); if (q_granularity == QuantizationGranularity::TENSOR) { - ReQuantizeOutput<false, QuantizationGranularity::TENSOR> reqObj( - doNothingObj, - C_multiplier.data(), - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point.data(), - nullptr, /* row offset buffer */ - col_offsets.data(), - nullptr, - G * NDim, - G); - - fbgemmConv( - conv_p, - Aint8.data(), - packedWeights, - Cint8_fb.data(), - Cint32_fb.data(), - reqObj, - tid, - num_threads); + if (test_float_bias) { + ReQuantizeOutput<false, QuantizationGranularity::TENSOR, float> + reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + test_bias ? bias_fp32.data() : nullptr, + G * NDim, + G, + act_times_w_scale.data()); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + + } else { + ReQuantizeOutput<false, QuantizationGranularity::TENSOR> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + test_bias ? 
bias_int32.data() : nullptr, + G * NDim, + G); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + } } else if (q_granularity == QuantizationGranularity::GROUP) { - ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj( - doNothingObj, - C_multiplier.data(), - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point.data(), - nullptr, /* row offset buffer */ - col_offsets.data(), - nullptr, - G * NDim, - G); - - fbgemmConv( - conv_p, - Aint8.data(), - packedWeights, - Cint8_fb.data(), - Cint32_fb.data(), - reqObj, - tid, - num_threads); + if (test_float_bias) { + ReQuantizeOutput<false, QuantizationGranularity::GROUP, float> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + test_bias ? bias_fp32.data() : nullptr, + G * NDim, + G, + act_times_w_scale.data()); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + + } else { + ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + test_bias ? 
bias_int32.data() : nullptr, + G * NDim, + G); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + } } else { - ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj( - doNothingObj, - C_multiplier.data(), - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point.data(), - nullptr, /* row offset buffer */ - col_offsets.data(), - nullptr, - G * NDim, - G); - - fbgemmConv( - conv_p, - Aint8.data(), - packedWeights, - Cint8_fb.data(), - Cint32_fb.data(), - reqObj, - tid, - num_threads); + if (test_float_bias) { + ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL, float> + reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + test_bias ? bias_fp32.data() : nullptr, + G * NDim, + G, + act_times_w_scale.data()); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + + } else { + ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, /* row offset buffer */ + col_offsets.data(), + test_bias ? bias_int32.data() : nullptr, + G * NDim, + G); + + fbgemmConv( + conv_p, + Aint8.data(), + packedWeights, + Cint8_fb.data(), + Cint32_fb.data(), + reqObj, + tid, + num_threads); + } } } // omp parallel |