diff options
-rw-r--r-- | CMakeLists.txt | 3 | ||||
-rw-r--r-- | include/fbgemm/Utils.h | 10 | ||||
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 20 | ||||
-rw-r--r-- | src/Fbgemm.cc | 4 | ||||
-rw-r--r-- | src/GenerateKernel.h | 4 | ||||
-rw-r--r-- | src/GroupwiseConv.h | 4 | ||||
-rw-r--r-- | src/GroupwiseConvAcc32Avx2.cc | 10 | ||||
-rw-r--r-- | src/PackAMatrix.cc | 4 | ||||
-rw-r--r-- | src/PackAWithIm2Col.cc | 8 | ||||
-rw-r--r-- | src/PackAWithQuantRowOffset.cc | 8 | ||||
-rw-r--r-- | src/PackAWithRowOffset.cc | 8 | ||||
-rw-r--r-- | src/PackBMatrix.cc | 4 | ||||
-rw-r--r-- | src/PackMatrix.cc | 4 | ||||
-rw-r--r-- | src/QuantUtils.cc | 8 | ||||
-rw-r--r-- | src/Utils.cc | 14 |
15 files changed, 67 insertions, 46 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 607657e..494e261 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,8 @@ set_target_properties(fbgemm_generic fbgemm_avx2 fbgemm_avx512 PROPERTIES target_compile_options(fbgemm_avx2 PRIVATE "-m64" "-mavx2" "-mfma" "-masm=intel") target_compile_options(fbgemm_avx512 PRIVATE - "-m64" "-mavx2" "-mfma" "-mavx512f" "-masm=intel") + "-m64" "-mavx2" "-mfma" "-mavx512f" "-mavx512bw" "-mavx512dq" + "-mavx512vl" "-masm=intel") if(NOT TARGET asmjit) #Download asmjit from github if ASMJIT_SRC_DIR is not specified. diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index 775e18c..2813629 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -77,4 +77,14 @@ void transpose_simd( float* dst, int ld_dst); +/** + * @brief Are we running on a AVX512 supported cpu? + */ +FBGEMM_API bool fbgemmHasAvx512Support(); + +/** + * @brief Are we running on a AVX2 supported cpu? + */ +FBGEMM_API bool fbgemmHasAvx2Support(); + } // namespace fbgemm diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 0dfc531..f2b028d 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -42,7 +42,7 @@ ExecuteKernel< outputProcess_(outputProcess), thread_id_(thread_id), num_threads_(num_threads) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { mbSize_ = PackingTraits< int8_t, typename packingAMatrix::accType, @@ -55,7 +55,7 @@ ExecuteKernel< int8_t, typename packingAMatrix::accType, inst_set_t::avx512>::NR; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { mbSize_ = PackingTraits< int8_t, typename packingAMatrix::accType, @@ -101,14 +101,14 @@ void ExecuteKernel< typename BaseType::jit_micro_kernel_fp fn; if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx512>( accum, packed_rows_A, packedB_.blockColSize(), packedA_.numPackedCols(), nbSize_); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx2>( accum, packed_rows_A, @@ -135,10 +135,10 @@ void ExecuteKernel< int nc = ((packedB_.lastBcol() - 1) / nrSize_ + 1) * nrSize_; if (nc != nbSize_) { if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx512>( accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx2>( accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); } else { @@ -203,7 +203,7 @@ void ExecuteKernel< int32_t nSize = C_buffer_start == C_tile_ ? jb * nbSize_ : packedB_.numCols(); if (nSize) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { // TODO: avx512 path // Currently use avx2 code outputProcess_.template f<inst_set_t::avx2>( @@ -212,7 +212,7 @@ void ExecuteKernel< {row_start_A, packed_rows_A, NDim * group, nSize}, ldc_, ldc_); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { outputProcess_.template f<inst_set_t::avx2>( matC_, C_buffer_row_start, @@ -228,7 +228,7 @@ void ExecuteKernel< if (C_buffer_start == C_tile_) { // When C_tile_ scratchpad was used to avoid accessing memory past // C_buffer_ . - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { // TODO: avx512 path // Currently use avx2 code outputProcess_.template f<inst_set_t::avx2>( @@ -240,7 +240,7 @@ void ExecuteKernel< packedB_.lastBcol()}, ldc_, leadingDim); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { outputProcess_.template f<inst_set_t::avx2>( matC_, C_tile_, diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index c7a7e33..a90dd2d 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -48,7 +48,7 @@ void fbgemmPacked( // Run time CPU detection if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { MCB = PackingTraits< typename packingAMatrix::inpType, typename packingAMatrix::accType, @@ -61,7 +61,7 @@ void fbgemmPacked( typename packingAMatrix::inpType, typename packingAMatrix::accType, inst_set_t::avx512>::MR; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { MCB = PackingTraits< typename packingAMatrix::inpType, typename packingAMatrix::accType, diff --git a/src/GenerateKernel.h b/src/GenerateKernel.h index b24ef23..548ed7e 100644 --- a/src/GenerateKernel.h +++ b/src/GenerateKernel.h @@ -60,9 +60,9 @@ class CodeGenBase { } { // vector width in bits if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { vectorWidth_ = 512; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { vectorWidth_ = 256; } else { // TODO: Have default path diff --git a/src/GroupwiseConv.h b/src/GroupwiseConv.h index 6a0a8a5..4e539f2 100644 --- a/src/GroupwiseConv.h +++ b/src/GroupwiseConv.h @@ -51,10 +51,10 @@ class GenConvKernel { x86::ymm8} { // vector width in bits if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { // TODO: change this to 512 once we have avx512f version vectorWidth_ = 256; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { vectorWidth_ = 256; } else { // TODO: Have default path diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc index 26eb486..0e1b85b 100644 --- a/src/GroupwiseConvAcc32Avx2.cc +++ b/src/GroupwiseConvAcc32Avx2.cc @@ -1681,7 +1681,7 @@ void fbgemmGroupwiseConvBase_( // current group memcpy(rowOffsetBuf, rowOffsetForCurG, ih_iw * sizeof(int32_t)); - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { // Currently use avx2 code outProcess.template f<inst_set_t::avx2>( out, @@ -1689,7 +1689,7 @@ void fbgemmGroupwiseConvBase_( {i * oh_ow, oh_ow, (g + j) * K_per_G, K_per_G}, K_per_G * G, K_per_G * G); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { outProcess.template f<inst_set_t::avx2>( out, currOutBuf + j * K_per_G, @@ -1790,7 +1790,7 @@ void fbgemmGroupwiseConv( typedef ReQuantizeOutput<FUSE_RELU, Q_GRAN> processOutputType; if (!fbgemmOptimizedGConv<SPATIAL_DIM>(conv_param) || - (!cpuinfo_has_x86_avx512f() && !cpuinfo_has_x86_avx2())) { + (!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { return fbgemmGroupwiseConvBase_< packed_W, outType, @@ -2150,7 +2150,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) { // row offset buffer should be a able to hold row offsets for however // number of groups we process at a time. if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { int bufferSize = conv_param.OUT_DIM[0] * conv_param.OUT_DIM[1]; int C_per_G = conv_param.IC / conv_param.G; int K_per_G = conv_param.OC / conv_param.G; @@ -2160,7 +2160,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) { } else { return conv_param.G * bufferSize; } - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { int bufferSize = conv_param.OUT_DIM[0] * conv_param.OUT_DIM[1]; int C_per_G = conv_param.IC / conv_param.G; int K_per_G = conv_param.OC / conv_param.G; diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc index f52684a..db019db 100644 --- a/src/PackAMatrix.cc +++ b/src/PackAMatrix.cc @@ -25,12 +25,12 @@ PackAMatrix<T, accT>::PackAMatrix( trans_(trans), smat_(smat), ld_(ld) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index f75092e..93408da 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -44,12 +44,12 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( a_zero_pt_(a_zero_pt) { static_assert( SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension "); - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = @@ -461,9 +461,9 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::printPackedMatrix( template <typename T, typename accT, int SPATIAL_DIM> int PackAWithIm2Col<T, accT, SPATIAL_DIM>::rowOffsetBufferSize() { if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { return PackingTraits<T, accT, inst_set_t::avx512>::MCB; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { return PackingTraits<T, accT, inst_set_t::avx2>::MCB; } else { // TODO: Have default slower path diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc index 02e701e..2929ebb 100644 --- a/src/PackAWithQuantRowOffset.cc +++ b/src/PackAWithQuantRowOffset.cc @@ -42,12 +42,12 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( row_offset_(row_offset) { rowOffsetAllocatedHere = false; - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = @@ -181,9 +181,9 @@ void PackAWithQuantRowOffset<T, accT>::printPackedMatrix(std::string name) { template <typename T, typename accT> int PackAWithQuantRowOffset<T, accT>::rowOffsetBufferSize() { if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { return PackingTraits<T, accT, inst_set_t::avx512>::MCB; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { return PackingTraits<T, accT, inst_set_t::avx2>::MCB; } else { assert(0 && "unsupported architecture"); diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc index cb0dcaa..7777f1a 100644 --- a/src/PackAWithRowOffset.cc +++ b/src/PackAWithRowOffset.cc @@ -36,12 +36,12 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( row_offset_(row_offset) { rowOffsetAllocatedHere = false; - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = @@ -171,9 +171,9 @@ void PackAWithRowOffset<T, accT>::printPackedMatrix(std::string name) { template <typename T, typename accT> int PackAWithRowOffset<T, accT>::rowOffsetBufferSize() { if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { return PackingTraits<T, accT, inst_set_t::avx512>::MCB; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { return PackingTraits<T, accT, inst_set_t::avx2>::MCB; } else { // TODO: Have default slower path diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index 8d76e29..659d1df 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -25,12 +25,12 @@ PackBMatrix<T, accT>::PackBMatrix( trans_(trans), smat_(smat), ld_(ld) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB; row_interleave_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::NCB; row_interleave_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE; diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc index 0177a07..316fc06 100644 --- a/src/PackMatrix.cc +++ b/src/PackMatrix.cc @@ -28,7 +28,7 @@ PackMatrix<PT, inpType, accType>::PackMatrix( template <typename PT, typename inpType, typename accType> int PackMatrix<PT, inpType, accType>::packedBufferSize(int rows, int cols) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { if (isA()) { return PackingTraits<inpType, accType, inst_set_t::avx512>::MCB * PackingTraits<inpType, accType, inst_set_t::avx512>::KCB; @@ -38,7 +38,7 @@ int PackMatrix<PT, inpType, accType>::packedBufferSize(int rows, int cols) { return (((rows + rowBlock - 1) / rowBlock) * rowBlock) * (((cols + colBlock - 1) / colBlock) * colBlock); } - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { if (isA()) { return PackingTraits<inpType, accType, inst_set_t::avx2>::MCB * PackingTraits<inpType, accType, inst_set_t::avx2>::KCB; diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc index 1d057e2..82f8c2a 100644 --- a/src/QuantUtils.cc +++ b/src/QuantUtils.cc @@ -176,7 +176,7 @@ void Quantize<uint8_t>( uint8_t* dst, int len, const TensorQuantizationParams& qparams) { - bool avx2_support = cpuinfo_has_x86_avx2(); + bool avx2_support = fbgemmHasAvx2Support(); bool fma_support = cpuinfo_has_x86_fma3(); if (avx2_support && fma_support && qparams.precision == 8) { // fast path @@ -221,7 +221,7 @@ void Requantize<uint8_t>( uint8_t* dst, const int len, const RequantizationParams& params) { - if (params.target_qparams.precision == 8 && cpuinfo_has_x86_avx2()) { + if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) { RequantizeAvx2(src, dst, len, params); } else { for (int i = 0; i < len; ++i) { @@ -237,7 +237,7 @@ void RequantizeFixedPoint( int len, const RequantizationParams& params) { if (std::is_same<T, uint8_t>::value && params.target_qparams.precision == 8 && - cpuinfo_has_x86_avx2()) { + fbgemmHasAvx2Support()) { RequantizeFixedPointAvx2(src, dst, len, params); } else { for (int i = 0; i < len; ++i) { @@ -267,7 +267,7 @@ void RequantizeFixedPoint<uint8_t>( uint8_t* dst, const int len, const RequantizationParams& params) { - if (params.target_qparams.precision == 8 && cpuinfo_has_x86_avx2()) { + if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) { RequantizeFixedPointAvx2(src, dst, len, params); } else { for (int i = 0; i < len; ++i) { diff --git a/src/Utils.cc b/src/Utils.cc index fcd8ade..0fa620d 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -179,9 +179,9 @@ void transpose_simd( int ld_dst) { // Run time CPU detection if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { internal::transpose_16x16(M, N, src, ld_src, dst, ld_dst); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { internal::transpose_8x8(M, N, src, ld_src, dst, ld_dst); } else { transpose_ref(M, N, src, ld_src, dst, ld_dst); @@ -192,4 +192,14 @@ void transpose_simd( } } +bool fbgemmHasAvx512Support() { + return ( + cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && + cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()); +} + +bool fbgemmHasAvx2Support() { + return (cpuinfo_has_x86_avx2()); +} + } // namespace fbgemm |