diff options
-rw-r--r-- | bench/GEMMsTunableBenchmark.cc | 2 | ||||
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 76 | ||||
-rw-r--r-- | src/Fbgemm.cc | 8 | ||||
-rw-r--r-- | src/GroupwiseConvAcc32Avx2.cc | 3 | ||||
-rw-r--r-- | src/PackAMatrix.cc | 8 | ||||
-rw-r--r-- | src/PackAWithIm2Col.cc | 24 | ||||
-rw-r--r-- | src/PackAWithQuantRowOffset.cc | 3 | ||||
-rw-r--r-- | src/PackAWithRowOffset.cc | 22 | ||||
-rw-r--r-- | src/PackBMatrix.cc | 8 | ||||
-rw-r--r-- | src/PackMatrix.cc | 14 | ||||
-rw-r--r-- | src/QuantUtils.cc | 10 | ||||
-rw-r--r-- | src/QuantUtilsAvx2.cc | 13 |
12 files changed, 122 insertions, 69 deletions
diff --git a/bench/GEMMsTunableBenchmark.cc b/bench/GEMMsTunableBenchmark.cc index 04eed8a..a65b51f 100644 --- a/bench/GEMMsTunableBenchmark.cc +++ b/bench/GEMMsTunableBenchmark.cc @@ -291,6 +291,8 @@ int main(int /* unused */, char** /* unused */) { assert(0 && "architecture not supported"); return 0; } + } else { + throw std::runtime_error("Failed to initialize cpuinfo!"); } set<vector<int>> incorrect_configs; diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 4175d65..f7292fd 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -45,11 +45,19 @@ ExecuteKernel< outputProcess_(outputProcess), thread_id_(thread_id), num_threads_(num_threads) { + if (!cpuinfo_initialize()) { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } if (params) { - mbSize_ = params->MCB; - nbSize_ = params->NCB; - nrMinSize_ = params->NR_MIN; - nrSize_ = params->NR; + if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { + mbSize_ = params->MCB; + nbSize_ = params->NCB; + nrMinSize_ = params->NR_MIN; + nrSize_ = params->NR; + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecure"); + } } else { if (fbgemmHasAvx512Support()) { mbSize_ = PackingTraits< @@ -110,28 +118,24 @@ void ExecuteKernel< typename BaseType::jit_micro_kernel_fp fn; - if (cpuinfo_initialize()) { - if (fbgemmHasAvx512Support()) { - fn = BaseType::template getOrCreate<inst_set_t::avx512>( - accum, - packed_rows_A, - packedB_.blockColSize(), - packedA_.numPackedCols(), - nbSize_); - } else if (fbgemmHasAvx2Support()) { - fn = BaseType::template getOrCreate<inst_set_t::avx2>( - accum, - packed_rows_A, - packedB_.blockColSize(), - packedA_.numPackedCols(), - nbSize_); - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecture"); - return; - } + if (fbgemmHasAvx512Support()) { + fn = BaseType::template getOrCreate<inst_set_t::avx512>( + accum, + packed_rows_A, + packedB_.blockColSize(), + packedA_.numPackedCols(), + 
nbSize_); + } else if (fbgemmHasAvx2Support()) { + fn = BaseType::template getOrCreate<inst_set_t::avx2>( + accum, + packed_rows_A, + packedB_.blockColSize(), + packedA_.numPackedCols(), + nbSize_); } else { - throw std::runtime_error("Failed to initialize cpuinfo!"); + // TODO: Have default slower path + assert(0 && "unsupported architecture"); + return; } #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN @@ -144,20 +148,16 @@ void ExecuteKernel< if (jb == bColBlocks - 1) { int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_; if (nc != nbSize_) { - if (cpuinfo_initialize()) { - if (fbgemmHasAvx512Support()) { - fn = BaseType::template getOrCreate<inst_set_t::avx512>( - accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); - } else if (fbgemmHasAvx2Support()) { - fn = BaseType::template getOrCreate<inst_set_t::avx2>( - accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecture"); - return; - } + if (fbgemmHasAvx512Support()) { + fn = BaseType::template getOrCreate<inst_set_t::avx512>( + accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); + } else if (fbgemmHasAvx2Support()) { + fn = BaseType::template getOrCreate<inst_set_t::avx2>( + accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); } else { - throw std::runtime_error("Failed to initialize cpuinfo!"); + // TODO: Have default slower path + assert(0 && "unsupported architecture"); + return; } } } diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index a40f38a..523f556 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -50,9 +50,11 @@ void fbgemmPacked( // Run time CPU detection if (cpuinfo_initialize()) { if (blocking_params) { - MCB = blocking_params->MCB; - KCB = blocking_params->KCB; - MR = blocking_params->MR; + if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { + MCB = blocking_params->MCB; + KCB = blocking_params->KCB; + MR = blocking_params->MR; + } } else { if (fbgemmHasAvx512Support()) { 
MCB = PackingTraits< diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc index 0e1b85b..0032e72 100644 --- a/src/GroupwiseConvAcc32Avx2.cc +++ b/src/GroupwiseConvAcc32Avx2.cc @@ -1789,6 +1789,9 @@ void fbgemmGroupwiseConv( int num_threads) { typedef ReQuantizeOutput<FUSE_RELU, Q_GRAN> processOutputType; + if (!cpuinfo_initialize()) { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } if (!fbgemmOptimizedGConv<SPATIAL_DIM>(conv_param) || (!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { return fbgemmGroupwiseConvBase_< diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc index 89ec13e..87adaba 100644 --- a/src/PackAMatrix.cc +++ b/src/PackAMatrix.cc @@ -31,10 +31,18 @@ PackAMatrix<T, accT>::PackAMatrix( trans_(trans), smat_(smat), ld_(ld) { + if (!cpuinfo_initialize()) { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } if (params) { + if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { BaseType::brow_ = params->MCB; BaseType::bcol_ = params->KCB; row_interleave_B_ = params->ROW_INTERLEAVE; + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecure"); + } } else { if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index fb4556c..e55dd4e 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -46,11 +46,19 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( a_zero_pt_(a_zero_pt) { static_assert( SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension "); + if (!cpuinfo_initialize()) { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } if (params) { + if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { BaseType::brow_ = params->MCB; BaseType::bcol_ = params->KCB; row_interleave_B_ = params->ROW_INTERLEAVE; + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecure"); + } } else { if 
(fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; @@ -470,19 +478,19 @@ template <typename T, typename accT, int SPATIAL_DIM> int PackAWithIm2Col<T, accT, SPATIAL_DIM>::rowOffsetBufferSize( const BlockingFactors* params) { if (cpuinfo_initialize()) { - if (params){ + if (params) { return params->MCB; } else { - if (fbgemmHasAvx512Support()) { - return PackingTraits<T, accT, inst_set_t::avx512>::MCB; - } else if (fbgemmHasAvx2Support()) { + if (fbgemmHasAvx512Support()) { + return PackingTraits<T, accT, inst_set_t::avx512>::MCB; + } else if (fbgemmHasAvx2Support()) { return PackingTraits<T, accT, inst_set_t::avx2>::MCB; } else { - // TODO: Have default slower path - assert(0 && "unsupported architecture"); - return -1; + // TODO: Have default slower path + assert(0 && "unsupported architecture"); + return -1; + } } - } } else { throw std::runtime_error("Failed to initialize cpuinfo!"); } diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc index 175425f..7572a51 100644 --- a/src/PackAWithQuantRowOffset.cc +++ b/src/PackAWithQuantRowOffset.cc @@ -42,6 +42,9 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( scale_(scale), zero_pt_(zero_pt), row_offset_(row_offset) { + if (!cpuinfo_initialize()) { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } rowOffsetAllocatedHere = false; if (params) { if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc index 139a6d3..b791817 100644 --- a/src/PackAWithRowOffset.cc +++ b/src/PackAWithRowOffset.cc @@ -36,11 +36,19 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( smat_(smat), ld_(ld), row_offset_(row_offset) { + if (!cpuinfo_initialize()) { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } rowOffsetAllocatedHere = false; if (params) { + if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { BaseType::brow_ = params->MCB; BaseType::bcol_ = 
params->KCB; row_interleave_B_ = params->ROW_INTERLEAVE; + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecure"); + } } else { if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; @@ -179,19 +187,19 @@ template <typename T, typename accT> int PackAWithRowOffset<T, accT>::rowOffsetBufferSize( const BlockingFactors* params) { if (cpuinfo_initialize()) { - if (params){ + if (params) { return params->MCB; } else { - if (fbgemmHasAvx512Support()) { + if (fbgemmHasAvx512Support()) { return PackingTraits<T, accT, inst_set_t::avx512>::MCB; } else if (fbgemmHasAvx2Support()) { return PackingTraits<T, accT, inst_set_t::avx2>::MCB; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecture"); - return -1; + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecture"); + return -1; + } } - } } else { throw std::runtime_error("Failed to initialize cpuinfo!"); } diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index 472c802..970a741 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -185,10 +185,18 @@ PackBMatrix<T, accT>::PackBMatrix( trans_(trans), smat_(smat), ld_(ld) { + if (!cpuinfo_initialize()) { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } if (params) { + if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { BaseType::brow_ = params->KCB; BaseType::bcol_ = params->NCB; row_interleave_ = params->ROW_INTERLEAVE; + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecure"); + } } else { if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc index e93b97c..c9a68a6 100644 --- a/src/PackMatrix.cc +++ b/src/PackMatrix.cc @@ -33,11 +33,19 @@ int PackMatrix<PT, inpType, accType>::packedBufferSize( int rows, int cols, const BlockingFactors* params) { + if (!cpuinfo_initialize()) { + throw 
std::runtime_error("Failed to initialize cpuinfo!"); + } int MCB, KCB, NCB; if (params) { - MCB = params->MCB; - NCB = params->NCB; - KCB = params->KCB; + if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { + MCB = params->MCB; + NCB = params->NCB; + KCB = params->KCB; + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecure"); + } } else { if (fbgemmHasAvx512Support()) { MCB = PackingTraits<inpType, accType, inst_set_t::avx512>::MCB; diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc index 82f8c2a..1ab00d1 100644 --- a/src/QuantUtils.cc +++ b/src/QuantUtils.cc @@ -176,7 +176,7 @@ void Quantize<uint8_t>( uint8_t* dst, int len, const TensorQuantizationParams& qparams) { - bool avx2_support = fbgemmHasAvx2Support(); + bool avx2_support = cpuinfo_initialize() && fbgemmHasAvx2Support(); bool fma_support = cpuinfo_has_x86_fma3(); if (avx2_support && fma_support && qparams.precision == 8) { // fast path @@ -221,7 +221,8 @@ void Requantize<uint8_t>( uint8_t* dst, const int len, const RequantizationParams& params) { - if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) { + if (params.target_qparams.precision == 8 && cpuinfo_initialize() && + fbgemmHasAvx2Support()) { RequantizeAvx2(src, dst, len, params); } else { for (int i = 0; i < len; ++i) { @@ -237,7 +238,7 @@ void RequantizeFixedPoint( int len, const RequantizationParams& params) { if (std::is_same<T, uint8_t>::value && params.target_qparams.precision == 8 && - fbgemmHasAvx2Support()) { + cpuinfo_initialize() && fbgemmHasAvx2Support()) { RequantizeFixedPointAvx2(src, dst, len, params); } else { for (int i = 0; i < len; ++i) { @@ -267,7 +268,8 @@ void RequantizeFixedPoint<uint8_t>( uint8_t* dst, const int len, const RequantizationParams& params) { - if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) { + if (params.target_qparams.precision == 8 && cpuinfo_initialize() && + fbgemmHasAvx2Support()) { RequantizeFixedPointAvx2(src, dst, len, params); } 
else { for (int i = 0; i < len; ++i) { diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc index 875c9e1..7f43ced 100644 --- a/src/QuantUtilsAvx2.cc +++ b/src/QuantUtilsAvx2.cc @@ -142,16 +142,17 @@ void RequantizeAvx2( int len, const RequantizationParams& params) { DoNothing<> doNothingObj{}; + int32_t Bq_zero_point[] = { 0 }; ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj( doNothingObj, &params.real_multiplier, params.target_qparams.zero_point, - 0, - nullptr, - nullptr, - nullptr, - nullptr, - len); + 0, // Aq_zero_point + Bq_zero_point, // Bq_zero_point + nullptr, // row_offsets + nullptr, // col_offsets + nullptr, // bias + len); // ncol requantizeObj.f<inst_set_t::avx2>(dst, src, {0, 1, 0, len}, 0, 0); } |