diff options
author | Jianyu Huang <jianyuhuang@fb.com> | 2019-03-06 21:18:56 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-03-06 21:21:45 +0300 |
commit | 66b41357561f2ff9895d2b4638273f07c49dbe29 (patch) | |
tree | 1d0014e483a9fe5beb0cce3b1394b0eb550c12ba /src | |
parent | 2eb84b8912f8340d8ffc54a3ef7653291f64f6f8 (diff) |
Add Avx512BW/VL/DQ check (#84)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/84
Add AVX512BW Check:
AVX-512 Byte and Word Instructions add support for 8-bit and 16-bit integer operations such as vpmaddubsw.
Similarly, add AVX512VL/DQ check.
Reviewed By: jspark1105
Differential Revision: D14321050
fbshipit-source-id: bd34745fd488ce4efe3248aeb78c54e1c2d91d47
Diffstat (limited to 'src')
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 20 | ||||
-rw-r--r-- | src/Fbgemm.cc | 4 | ||||
-rw-r--r-- | src/GenerateKernel.h | 4 | ||||
-rw-r--r-- | src/GroupwiseConv.h | 4 | ||||
-rw-r--r-- | src/GroupwiseConvAcc32Avx2.cc | 10 | ||||
-rw-r--r-- | src/PackAMatrix.cc | 4 | ||||
-rw-r--r-- | src/PackAWithIm2Col.cc | 8 | ||||
-rw-r--r-- | src/PackAWithQuantRowOffset.cc | 8 | ||||
-rw-r--r-- | src/PackAWithRowOffset.cc | 8 | ||||
-rw-r--r-- | src/PackBMatrix.cc | 4 | ||||
-rw-r--r-- | src/PackMatrix.cc | 4 | ||||
-rw-r--r-- | src/QuantUtils.cc | 8 | ||||
-rw-r--r-- | src/Utils.cc | 14 |
13 files changed, 55 insertions, 45 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 0dfc531..f2b028d 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -42,7 +42,7 @@ ExecuteKernel< outputProcess_(outputProcess), thread_id_(thread_id), num_threads_(num_threads) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { mbSize_ = PackingTraits< int8_t, typename packingAMatrix::accType, @@ -55,7 +55,7 @@ ExecuteKernel< int8_t, typename packingAMatrix::accType, inst_set_t::avx512>::NR; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { mbSize_ = PackingTraits< int8_t, typename packingAMatrix::accType, @@ -101,14 +101,14 @@ void ExecuteKernel< typename BaseType::jit_micro_kernel_fp fn; if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx512>( accum, packed_rows_A, packedB_.blockColSize(), packedA_.numPackedCols(), nbSize_); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx2>( accum, packed_rows_A, @@ -135,10 +135,10 @@ void ExecuteKernel< int nc = ((packedB_.lastBcol() - 1) / nrSize_ + 1) * nrSize_; if (nc != nbSize_) { if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx512>( accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx2>( accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); } else { @@ -203,7 +203,7 @@ void ExecuteKernel< int32_t nSize = C_buffer_start == C_tile_ ? 
jb * nbSize_ : packedB_.numCols(); if (nSize) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { // TODO: avx512 path // Currently use avx2 code outputProcess_.template f<inst_set_t::avx2>( @@ -212,7 +212,7 @@ void ExecuteKernel< {row_start_A, packed_rows_A, NDim * group, nSize}, ldc_, ldc_); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { outputProcess_.template f<inst_set_t::avx2>( matC_, C_buffer_row_start, @@ -228,7 +228,7 @@ void ExecuteKernel< if (C_buffer_start == C_tile_) { // When C_tile_ scratchpad was used to avoid accessing memory past // C_buffer_ . - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { // TODO: avx512 path // Currently use avx2 code outputProcess_.template f<inst_set_t::avx2>( @@ -240,7 +240,7 @@ void ExecuteKernel< packedB_.lastBcol()}, ldc_, leadingDim); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { outputProcess_.template f<inst_set_t::avx2>( matC_, C_tile_, diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index c7a7e33..a90dd2d 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -48,7 +48,7 @@ void fbgemmPacked( // Run time CPU detection if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { MCB = PackingTraits< typename packingAMatrix::inpType, typename packingAMatrix::accType, @@ -61,7 +61,7 @@ void fbgemmPacked( typename packingAMatrix::inpType, typename packingAMatrix::accType, inst_set_t::avx512>::MR; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { MCB = PackingTraits< typename packingAMatrix::inpType, typename packingAMatrix::accType, diff --git a/src/GenerateKernel.h b/src/GenerateKernel.h index b24ef23..548ed7e 100644 --- a/src/GenerateKernel.h +++ b/src/GenerateKernel.h @@ -60,9 +60,9 @@ class CodeGenBase { } { // vector width in bits if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { vectorWidth_ = 512; - } else if 
(cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { vectorWidth_ = 256; } else { // TODO: Have default path diff --git a/src/GroupwiseConv.h b/src/GroupwiseConv.h index 6a0a8a5..4e539f2 100644 --- a/src/GroupwiseConv.h +++ b/src/GroupwiseConv.h @@ -51,10 +51,10 @@ class GenConvKernel { x86::ymm8} { // vector width in bits if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { // TODO: change this to 512 once we have avx512f version vectorWidth_ = 256; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { vectorWidth_ = 256; } else { // TODO: Have default path diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc index 26eb486..0e1b85b 100644 --- a/src/GroupwiseConvAcc32Avx2.cc +++ b/src/GroupwiseConvAcc32Avx2.cc @@ -1681,7 +1681,7 @@ void fbgemmGroupwiseConvBase_( // current group memcpy(rowOffsetBuf, rowOffsetForCurG, ih_iw * sizeof(int32_t)); - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { // Currently use avx2 code outProcess.template f<inst_set_t::avx2>( out, @@ -1689,7 +1689,7 @@ void fbgemmGroupwiseConvBase_( {i * oh_ow, oh_ow, (g + j) * K_per_G, K_per_G}, K_per_G * G, K_per_G * G); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { outProcess.template f<inst_set_t::avx2>( out, currOutBuf + j * K_per_G, @@ -1790,7 +1790,7 @@ void fbgemmGroupwiseConv( typedef ReQuantizeOutput<FUSE_RELU, Q_GRAN> processOutputType; if (!fbgemmOptimizedGConv<SPATIAL_DIM>(conv_param) || - (!cpuinfo_has_x86_avx512f() && !cpuinfo_has_x86_avx2())) { + (!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { return fbgemmGroupwiseConvBase_< packed_W, outType, @@ -2150,7 +2150,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) { // row offset buffer should be a able to hold row offsets for however // number of groups we process at a time. 
if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { int bufferSize = conv_param.OUT_DIM[0] * conv_param.OUT_DIM[1]; int C_per_G = conv_param.IC / conv_param.G; int K_per_G = conv_param.OC / conv_param.G; @@ -2160,7 +2160,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) { } else { return conv_param.G * bufferSize; } - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { int bufferSize = conv_param.OUT_DIM[0] * conv_param.OUT_DIM[1]; int C_per_G = conv_param.IC / conv_param.G; int K_per_G = conv_param.OC / conv_param.G; diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc index f52684a..db019db 100644 --- a/src/PackAMatrix.cc +++ b/src/PackAMatrix.cc @@ -25,12 +25,12 @@ PackAMatrix<T, accT>::PackAMatrix( trans_(trans), smat_(smat), ld_(ld) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index f75092e..93408da 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -44,12 +44,12 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( a_zero_pt_(a_zero_pt) { static_assert( SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension "); - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if 
(cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = @@ -461,9 +461,9 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::printPackedMatrix( template <typename T, typename accT, int SPATIAL_DIM> int PackAWithIm2Col<T, accT, SPATIAL_DIM>::rowOffsetBufferSize() { if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { return PackingTraits<T, accT, inst_set_t::avx512>::MCB; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { return PackingTraits<T, accT, inst_set_t::avx2>::MCB; } else { // TODO: Have default slower path diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc index 02e701e..2929ebb 100644 --- a/src/PackAWithQuantRowOffset.cc +++ b/src/PackAWithQuantRowOffset.cc @@ -42,12 +42,12 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( row_offset_(row_offset) { rowOffsetAllocatedHere = false; - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = @@ -181,9 +181,9 @@ void PackAWithQuantRowOffset<T, accT>::printPackedMatrix(std::string name) { template <typename T, typename accT> int PackAWithQuantRowOffset<T, accT>::rowOffsetBufferSize() { if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { return PackingTraits<T, accT, inst_set_t::avx512>::MCB; - } else if (cpuinfo_has_x86_avx2()) { + } else if 
(fbgemmHasAvx2Support()) { return PackingTraits<T, accT, inst_set_t::avx2>::MCB; } else { assert(0 && "unsupported architecture"); diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc index cb0dcaa..7777f1a 100644 --- a/src/PackAWithRowOffset.cc +++ b/src/PackAWithRowOffset.cc @@ -36,12 +36,12 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( row_offset_(row_offset) { rowOffsetAllocatedHere = false; - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = @@ -171,9 +171,9 @@ void PackAWithRowOffset<T, accT>::printPackedMatrix(std::string name) { template <typename T, typename accT> int PackAWithRowOffset<T, accT>::rowOffsetBufferSize() { if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { return PackingTraits<T, accT, inst_set_t::avx512>::MCB; - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { return PackingTraits<T, accT, inst_set_t::avx2>::MCB; } else { // TODO: Have default slower path diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index 8d76e29..659d1df 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -25,12 +25,12 @@ PackBMatrix<T, accT>::PackBMatrix( trans_(trans), smat_(smat), ld_(ld) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB; row_interleave_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (cpuinfo_has_x86_avx2()) { + } else 
if (fbgemmHasAvx2Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::NCB; row_interleave_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE; diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc index 0177a07..316fc06 100644 --- a/src/PackMatrix.cc +++ b/src/PackMatrix.cc @@ -28,7 +28,7 @@ PackMatrix<PT, inpType, accType>::PackMatrix( template <typename PT, typename inpType, typename accType> int PackMatrix<PT, inpType, accType>::packedBufferSize(int rows, int cols) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { if (isA()) { return PackingTraits<inpType, accType, inst_set_t::avx512>::MCB * PackingTraits<inpType, accType, inst_set_t::avx512>::KCB; @@ -38,7 +38,7 @@ int PackMatrix<PT, inpType, accType>::packedBufferSize(int rows, int cols) { return (((rows + rowBlock - 1) / rowBlock) * rowBlock) * (((cols + colBlock - 1) / colBlock) * colBlock); } - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { if (isA()) { return PackingTraits<inpType, accType, inst_set_t::avx2>::MCB * PackingTraits<inpType, accType, inst_set_t::avx2>::KCB; diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc index 1d057e2..82f8c2a 100644 --- a/src/QuantUtils.cc +++ b/src/QuantUtils.cc @@ -176,7 +176,7 @@ void Quantize<uint8_t>( uint8_t* dst, int len, const TensorQuantizationParams& qparams) { - bool avx2_support = cpuinfo_has_x86_avx2(); + bool avx2_support = fbgemmHasAvx2Support(); bool fma_support = cpuinfo_has_x86_fma3(); if (avx2_support && fma_support && qparams.precision == 8) { // fast path @@ -221,7 +221,7 @@ void Requantize<uint8_t>( uint8_t* dst, const int len, const RequantizationParams& params) { - if (params.target_qparams.precision == 8 && cpuinfo_has_x86_avx2()) { + if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) { RequantizeAvx2(src, dst, len, params); } else { for (int i = 0; i < len; ++i) { @@ -237,7 +237,7 @@ void 
RequantizeFixedPoint( int len, const RequantizationParams& params) { if (std::is_same<T, uint8_t>::value && params.target_qparams.precision == 8 && - cpuinfo_has_x86_avx2()) { + fbgemmHasAvx2Support()) { RequantizeFixedPointAvx2(src, dst, len, params); } else { for (int i = 0; i < len; ++i) { @@ -267,7 +267,7 @@ void RequantizeFixedPoint<uint8_t>( uint8_t* dst, const int len, const RequantizationParams& params) { - if (params.target_qparams.precision == 8 && cpuinfo_has_x86_avx2()) { + if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) { RequantizeFixedPointAvx2(src, dst, len, params); } else { for (int i = 0; i < len; ++i) { diff --git a/src/Utils.cc b/src/Utils.cc index fcd8ade..0fa620d 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -179,9 +179,9 @@ void transpose_simd( int ld_dst) { // Run time CPU detection if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx512f()) { + if (fbgemmHasAvx512Support()) { internal::transpose_16x16(M, N, src, ld_src, dst, ld_dst); - } else if (cpuinfo_has_x86_avx2()) { + } else if (fbgemmHasAvx2Support()) { internal::transpose_8x8(M, N, src, ld_src, dst, ld_dst); } else { transpose_ref(M, N, src, ld_src, dst, ld_dst); @@ -192,4 +192,14 @@ void transpose_simd( } } +bool fbgemmHasAvx512Support() { + return ( + cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && + cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()); +} + +bool fbgemmHasAvx2Support() { + return (cpuinfo_has_x86_avx2()); +} + } // namespace fbgemm |