diff options
author | Jongsoo Park <jongsoo@fb.com> | 2019-02-20 19:50:37 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-02-20 19:59:51 +0300 |
commit | 9ebc998c2bf0da81ab7afa1478df395d79429cf8 (patch) | |
tree | c664731e03c05f239f28bc3eb6a8010b65696f93 | |
parent | a5f1f5308130f21b376f5368e87eb74b071493fa (diff) |
optimize PackAWithIm2Col for symmetric b quant
Summary: Add an optional b_symmetric argument to PackAWithIm2Col and skip the row offset computation when it is true
Reviewed By: jianyuh
Differential Revision: D14119128
fbshipit-source-id: fa079347562b7f75727b3a1414e9bdda3f9c65dd
-rw-r--r-- | include/fbgemm/Fbgemm.h | 12 | ||||
-rw-r--r-- | src/PackAWithIm2Col.cc | 80 | ||||
-rw-r--r-- | test/Im2ColFusedRequantizeTest.cc | 67 |
3 files changed, 101 insertions, 58 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index 2ac10f7..4f3c92e 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -524,13 +524,15 @@ class FBGEMM_API PackAWithIm2Col * buffer and owns it. Otherwise, this class doesn't own * the buffer. The buffer will be populated when pack * function is called. + * @params b_symmetric if true we skip row offset computation */ PackAWithIm2Col( const conv_param_t<SPATIAL_DIM>& conv_param, const T* sdata, inpType* pmat = nullptr, - std::int32_t zero_pt = 0, - std::int32_t* row_offset = nullptr); + std::int32_t a_zero_pt = 0, + std::int32_t* row_offset = nullptr, + bool b_symmetric = false); /** * Activation matrices are not constant so cannot amortize the cost of @@ -578,9 +580,9 @@ class FBGEMM_API PackAWithIm2Col private: const conv_param_t<SPATIAL_DIM> conv_p_; const T* sdata_; - std::int32_t zero_pt_; - std::int32_t* row_offset_; - bool rowOffsetAllocatedHere; + std::int32_t a_zero_pt_; + std::int32_t* row_offset_{nullptr}; + bool rowOffsetAllocatedHere{false}; std::int32_t row_interleave_B_; }; diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index f096d7a..b9e310f 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -21,8 +21,9 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( const conv_param_t<SPATIAL_DIM>& conv_p, const T* sdata, inpType* pmat, - int32_t zero_pt, - int32_t* row_offset) + int32_t a_zero_pt, + int32_t* row_offset, + bool b_symmetric) : PackMatrix<PackAWithIm2Col<T, accT, SPATIAL_DIM>, T, accT>( conv_p.MB * std::accumulate( @@ -40,7 +41,7 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( conv_p.G), conv_p_(conv_p), sdata_(sdata), - zero_pt_(zero_pt) { + a_zero_pt_(a_zero_pt) { static_assert( SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension "); if (cpuinfo_has_x86_avx512f()) { @@ -70,13 +71,15 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( fbgemmAlignedAlloc(64, BaseType::brow_ * BaseType::bcol_ * 
sizeof(T))); // aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T))); } - if (row_offset) { - rowOffsetAllocatedHere = false; - row_offset_ = row_offset; - } else { - rowOffsetAllocatedHere = true; - row_offset_ = static_cast<int32_t*>( - fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(int32_t))); + if (!b_symmetric) { + if (row_offset) { + rowOffsetAllocatedHere = false; + row_offset_ = row_offset; + } else { + rowOffsetAllocatedHere = true; + row_offset_ = static_cast<int32_t*>( + fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(int32_t))); + } } } @@ -115,20 +118,35 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) { "PackAWithIm2Col<T, accT>::pack only works for T == uint8_t"); if (point_wise) { int32_t ld = this->numCols(); - for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { - int buf_idx = i - block.row_start; - memcpy( - out + buf_idx * BaseType::blockColSize(), - sdata_ + i * ld + block.col_start, - block.col_size * sizeof(T)); - // zero fill - for (int j = block.col_size; j < block_p.col_size; ++j) { - out[buf_idx * BaseType::blockColSize() + j] = 0; + if (row_offset_buf) { + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + int buf_idx = i - block.row_start; + memcpy( + out + buf_idx * BaseType::blockColSize(), + sdata_ + i * ld + block.col_start, + block.col_size * sizeof(T)); + // zero fill + for (int j = block.col_size; j < block_p.col_size; ++j) { + out[buf_idx * BaseType::blockColSize() + j] = 0; + } + int32_t row_sum = + row_offset_acc ? 
row_offset_buf[i - block.row_start] : 0; + row_sum += + reduceAvx2(sdata_ + i * ld + block.col_start, block.col_size); + row_offset_buf[i - block.row_start] = row_sum; + } + } else { + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + int buf_idx = i - block.row_start; + memcpy( + out + buf_idx * BaseType::blockColSize(), + sdata_ + i * ld + block.col_start, + block.col_size * sizeof(T)); + // zero fill + for (int j = block.col_size; j < block_p.col_size; ++j) { + out[buf_idx * BaseType::blockColSize() + j] = 0; + } } - int32_t row_sum = - row_offset_acc ? row_offset_buf[i - block.row_start] : 0; - row_sum += reduceAvx2(sdata_ + i * ld + block.col_start, block.col_size); - row_offset_buf[i - block.row_start] = row_sum; } return; @@ -168,7 +186,7 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) { std::memset( out + (i - block.row_start) * BaseType::blockColSize() + (j_blk_start - block.col_start), - zero_pt_, + a_zero_pt_, sizeof(T) * (j_blk_end - j_blk_start)); } else { std::memcpy( @@ -220,7 +238,7 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) { &out [(i - block.row_start) * BaseType::blockColSize() + (j_blk_start - block.col_start)], - zero_pt_, + a_zero_pt_, sizeof(T) * (j_blk_end - j_blk_start)); } else { std::memcpy( @@ -252,11 +270,13 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) { (block.col_start + block.col_size))); } - // TODO: skip row_offset computation when B_zero_point is 0 - int32_t row_sum = row_offset_acc ? row_offset_buf[i - block.row_start] : 0; - row_sum += reduceAvx2( - out + (i - block.row_start) * this->blockColSize(), block.col_size); - row_offset_buf[i - block.row_start] = row_sum; + if (row_offset_buf) { + int32_t row_sum = + row_offset_acc ? 
row_offset_buf[i - block.row_start] : 0; + row_sum += reduceAvx2( + out + (i - block.row_start) * this->blockColSize(), block.col_size); + row_offset_buf[i - block.row_start] = row_sum; + } } // for each i } diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc index feae002..1dfc756 100644 --- a/test/Im2ColFusedRequantizeTest.cc +++ b/test/Im2ColFusedRequantizeTest.cc @@ -30,13 +30,15 @@ vector<QuantizationGranularity> qGranularityVals{ namespace { class fbgemmIm2colTest - : public testing::TestWithParam<QuantizationGranularity> {}; + : public testing::TestWithParam<tuple<QuantizationGranularity, bool>> {}; }; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, fbgemmIm2colTest, - ::testing::ValuesIn(qGranularityVals)); + ::testing::Combine( + ::testing::ValuesIn(qGranularityVals), + ::testing::Bool())); // From Faster-RCNN with ShuffleNet static vector<conv_param_t<>> shapes = { @@ -52,7 +54,7 @@ static vector<conv_param_t<>> shapes = { }; template <typename ACC_T, QuantizationGranularity Q_GRAN> -static void Im2colTest() { +static void Im2colTest(bool b_symmetric) { for (auto conv_p : shapes) { for (int groups : {1, 4}) { if (conv_p.IC % groups != 0 || conv_p.OC % groups != 0) { @@ -89,6 +91,9 @@ static void Im2colTest() { randFill<int8_t>(Bint8, -4, 4); randFill(Bint8_zero_point, -3, -1); } + if (b_symmetric) { + randFill(Bint8_zero_point, 0, 0); + } aligned_vector<float> C_multiplier(Bint8_zero_point.size()); randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); @@ -169,7 +174,8 @@ static void Im2colTest() { Aint8.data(), nullptr, Aint8_zero_point, - row_offset_buf.data()); + row_offset_buf.data(), + b_symmetric); DoNothing<> doNothingObj{}; ReQuantizeOutput<false, Q_GRAN> outputProcObj( @@ -223,24 +229,28 @@ static void Im2colTest() { } TEST_P(fbgemmIm2colTest, Acc32Test) { - QuantizationGranularity q_granularity = GetParam(); + QuantizationGranularity q_granularity; + bool b_symmetric; + tie(q_granularity, b_symmetric) = 
GetParam(); if (q_granularity == QuantizationGranularity::TENSOR) { - Im2colTest<int32_t, QuantizationGranularity::TENSOR>(); + Im2colTest<int32_t, QuantizationGranularity::TENSOR>(b_symmetric); } else if (q_granularity == QuantizationGranularity::GROUP) { - Im2colTest<int32_t, QuantizationGranularity::GROUP>(); + Im2colTest<int32_t, QuantizationGranularity::GROUP>(b_symmetric); } else { - Im2colTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(); + Im2colTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(b_symmetric); } } TEST_P(fbgemmIm2colTest, Acc16Test) { - QuantizationGranularity q_granularity = GetParam(); + QuantizationGranularity q_granularity; + bool b_symmetric; + tie(q_granularity, b_symmetric) = GetParam(); if (q_granularity == QuantizationGranularity::TENSOR) { - Im2colTest<int16_t, QuantizationGranularity::TENSOR>(); + Im2colTest<int16_t, QuantizationGranularity::TENSOR>(b_symmetric); } else if (q_granularity == QuantizationGranularity::GROUP) { - Im2colTest<int16_t, QuantizationGranularity::GROUP>(); + Im2colTest<int16_t, QuantizationGranularity::GROUP>(b_symmetric); } else { - Im2colTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(); + Im2colTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(b_symmetric); } } @@ -453,7 +463,10 @@ void SConvTest() { } TEST_P(fbgemmIm2colTest, SConvTest) { - QuantizationGranularity q_granularity = GetParam(); + QuantizationGranularity q_granularity; + bool b_symmetric; + tie(q_granularity, b_symmetric) = GetParam(); + // b_symmetric ignored for now if (q_granularity == QuantizationGranularity::TENSOR) { SConvTest<QuantizationGranularity::TENSOR>(); } else if (q_granularity == QuantizationGranularity::GROUP) { @@ -539,7 +552,7 @@ static vector<conv_param_t<3>> shapes_3d = { }; template <typename ACC_T, QuantizationGranularity Q_GRAN> -static void Im2col3DTest() { +static void Im2col3DTest(bool b_symmetric) { for (auto conv_p : shapes_3d) { for (int groups : {1, 4}) { if (conv_p.IC % groups != 0 || conv_p.OC % 
groups != 0) { @@ -578,6 +591,9 @@ static void Im2col3DTest() { randFill<int8_t>(Bint8, -4, 4); randFill(Bint8_zero_point, -3, -1); } + if (b_symmetric) { + randFill(Bint8_zero_point, 0, 0); + } aligned_vector<float> C_multiplier(Bint8_zero_point.size()); randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); @@ -659,7 +675,8 @@ static void Im2col3DTest() { Aint8.data(), nullptr, Aint8_zero_point, - row_offset_buf.data()); + row_offset_buf.data(), + b_symmetric); DoNothing<> doNothingObj{}; ReQuantizeOutput<false, Q_GRAN> outputProcObj( @@ -719,23 +736,27 @@ static void Im2col3DTest() { } TEST_P(fbgemmIm2colTest, 3DAcc32Test) { - QuantizationGranularity q_granularity = GetParam(); + QuantizationGranularity q_granularity; + bool b_symmetric; + tie(q_granularity, b_symmetric) = GetParam(); if (q_granularity == QuantizationGranularity::TENSOR) { - Im2col3DTest<int32_t, QuantizationGranularity::TENSOR>(); + Im2col3DTest<int32_t, QuantizationGranularity::TENSOR>(b_symmetric); } else if (q_granularity == QuantizationGranularity::GROUP) { - Im2col3DTest<int32_t, QuantizationGranularity::GROUP>(); + Im2col3DTest<int32_t, QuantizationGranularity::GROUP>(b_symmetric); } else { - Im2col3DTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(); + Im2col3DTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(b_symmetric); } } TEST_P(fbgemmIm2colTest, 3DAcc16Test) { - QuantizationGranularity q_granularity = GetParam(); + QuantizationGranularity q_granularity; + bool b_symmetric; + tie(q_granularity, b_symmetric) = GetParam(); if (q_granularity == QuantizationGranularity::TENSOR) { - Im2col3DTest<int16_t, QuantizationGranularity::TENSOR>(); + Im2col3DTest<int16_t, QuantizationGranularity::TENSOR>(b_symmetric); } else if (q_granularity == QuantizationGranularity::GROUP) { - Im2col3DTest<int16_t, QuantizationGranularity::GROUP>(); + Im2col3DTest<int16_t, QuantizationGranularity::GROUP>(b_symmetric); } else { - Im2col3DTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(); + 
Im2col3DTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(b_symmetric); } } |