Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt3
-rw-r--r--include/fbgemm/Utils.h10
-rw-r--r--src/ExecuteKernelU8S8.cc20
-rw-r--r--src/Fbgemm.cc4
-rw-r--r--src/GenerateKernel.h4
-rw-r--r--src/GroupwiseConv.h4
-rw-r--r--src/GroupwiseConvAcc32Avx2.cc10
-rw-r--r--src/PackAMatrix.cc4
-rw-r--r--src/PackAWithIm2Col.cc8
-rw-r--r--src/PackAWithQuantRowOffset.cc8
-rw-r--r--src/PackAWithRowOffset.cc8
-rw-r--r--src/PackBMatrix.cc4
-rw-r--r--src/PackMatrix.cc4
-rw-r--r--src/QuantUtils.cc8
-rw-r--r--src/Utils.cc14
15 files changed, 67 insertions, 46 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 607657e..494e261 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,7 +100,8 @@ set_target_properties(fbgemm_generic fbgemm_avx2 fbgemm_avx512 PROPERTIES
target_compile_options(fbgemm_avx2 PRIVATE
"-m64" "-mavx2" "-mfma" "-masm=intel")
target_compile_options(fbgemm_avx512 PRIVATE
- "-m64" "-mavx2" "-mfma" "-mavx512f" "-masm=intel")
+ "-m64" "-mavx2" "-mfma" "-mavx512f" "-mavx512bw" "-mavx512dq"
+ "-mavx512vl" "-masm=intel")
if(NOT TARGET asmjit)
#Download asmjit from github if ASMJIT_SRC_DIR is not specified.
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
index 775e18c..2813629 100644
--- a/include/fbgemm/Utils.h
+++ b/include/fbgemm/Utils.h
@@ -77,4 +77,14 @@ void transpose_simd(
float* dst,
int ld_dst);
+/**
+ * @brief Are we running on an AVX512-supported CPU?
+ */
+FBGEMM_API bool fbgemmHasAvx512Support();
+
+/**
+ * @brief Are we running on an AVX2-supported CPU?
+ */
+FBGEMM_API bool fbgemmHasAvx2Support();
+
} // namespace fbgemm
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 0dfc531..f2b028d 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -42,7 +42,7 @@ ExecuteKernel<
outputProcess_(outputProcess),
thread_id_(thread_id),
num_threads_(num_threads) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
mbSize_ = PackingTraits<
int8_t,
typename packingAMatrix::accType,
@@ -55,7 +55,7 @@ ExecuteKernel<
int8_t,
typename packingAMatrix::accType,
inst_set_t::avx512>::NR;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
mbSize_ = PackingTraits<
int8_t,
typename packingAMatrix::accType,
@@ -101,14 +101,14 @@ void ExecuteKernel<
typename BaseType::jit_micro_kernel_fp fn;
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
fn = BaseType::template getOrCreate<inst_set_t::avx512>(
accum,
packed_rows_A,
packedB_.blockColSize(),
packedA_.numPackedCols(),
nbSize_);
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
fn = BaseType::template getOrCreate<inst_set_t::avx2>(
accum,
packed_rows_A,
@@ -135,10 +135,10 @@ void ExecuteKernel<
int nc = ((packedB_.lastBcol() - 1) / nrSize_ + 1) * nrSize_;
if (nc != nbSize_) {
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
fn = BaseType::template getOrCreate<inst_set_t::avx512>(
accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
fn = BaseType::template getOrCreate<inst_set_t::avx2>(
accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
} else {
@@ -203,7 +203,7 @@ void ExecuteKernel<
int32_t nSize =
C_buffer_start == C_tile_ ? jb * nbSize_ : packedB_.numCols();
if (nSize) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
// TODO: avx512 path
// Currently use avx2 code
outputProcess_.template f<inst_set_t::avx2>(
@@ -212,7 +212,7 @@ void ExecuteKernel<
{row_start_A, packed_rows_A, NDim * group, nSize},
ldc_,
ldc_);
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_buffer_row_start,
@@ -228,7 +228,7 @@ void ExecuteKernel<
if (C_buffer_start == C_tile_) {
// When C_tile_ scratchpad was used to avoid accessing memory past
// C_buffer_ .
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
// TODO: avx512 path
// Currently use avx2 code
outputProcess_.template f<inst_set_t::avx2>(
@@ -240,7 +240,7 @@ void ExecuteKernel<
packedB_.lastBcol()},
ldc_,
leadingDim);
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_tile_,
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index c7a7e33..a90dd2d 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -48,7 +48,7 @@ void fbgemmPacked(
// Run time CPU detection
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
MCB = PackingTraits<
typename packingAMatrix::inpType,
typename packingAMatrix::accType,
@@ -61,7 +61,7 @@ void fbgemmPacked(
typename packingAMatrix::inpType,
typename packingAMatrix::accType,
inst_set_t::avx512>::MR;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
MCB = PackingTraits<
typename packingAMatrix::inpType,
typename packingAMatrix::accType,
diff --git a/src/GenerateKernel.h b/src/GenerateKernel.h
index b24ef23..548ed7e 100644
--- a/src/GenerateKernel.h
+++ b/src/GenerateKernel.h
@@ -60,9 +60,9 @@ class CodeGenBase {
} {
// vector width in bits
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
vectorWidth_ = 512;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
vectorWidth_ = 256;
} else {
// TODO: Have default path
diff --git a/src/GroupwiseConv.h b/src/GroupwiseConv.h
index 6a0a8a5..4e539f2 100644
--- a/src/GroupwiseConv.h
+++ b/src/GroupwiseConv.h
@@ -51,10 +51,10 @@ class GenConvKernel {
x86::ymm8} {
// vector width in bits
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
// TODO: change this to 512 once we have avx512f version
vectorWidth_ = 256;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
vectorWidth_ = 256;
} else {
// TODO: Have default path
diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc
index 26eb486..0e1b85b 100644
--- a/src/GroupwiseConvAcc32Avx2.cc
+++ b/src/GroupwiseConvAcc32Avx2.cc
@@ -1681,7 +1681,7 @@ void fbgemmGroupwiseConvBase_(
// current group
memcpy(rowOffsetBuf, rowOffsetForCurG, ih_iw * sizeof(int32_t));
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
// Currently use avx2 code
outProcess.template f<inst_set_t::avx2>(
out,
@@ -1689,7 +1689,7 @@ void fbgemmGroupwiseConvBase_(
{i * oh_ow, oh_ow, (g + j) * K_per_G, K_per_G},
K_per_G * G,
K_per_G * G);
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
outProcess.template f<inst_set_t::avx2>(
out,
currOutBuf + j * K_per_G,
@@ -1790,7 +1790,7 @@ void fbgemmGroupwiseConv(
typedef ReQuantizeOutput<FUSE_RELU, Q_GRAN> processOutputType;
if (!fbgemmOptimizedGConv<SPATIAL_DIM>(conv_param) ||
- (!cpuinfo_has_x86_avx512f() && !cpuinfo_has_x86_avx2())) {
+ (!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) {
return fbgemmGroupwiseConvBase_<
packed_W,
outType,
@@ -2150,7 +2150,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) {
// row offset buffer should be able to hold row offsets for however
// many groups we process at a time.
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
int bufferSize = conv_param.OUT_DIM[0] * conv_param.OUT_DIM[1];
int C_per_G = conv_param.IC / conv_param.G;
int K_per_G = conv_param.OC / conv_param.G;
@@ -2160,7 +2160,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) {
} else {
return conv_param.G * bufferSize;
}
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
int bufferSize = conv_param.OUT_DIM[0] * conv_param.OUT_DIM[1];
int C_per_G = conv_param.IC / conv_param.G;
int K_per_G = conv_param.OC / conv_param.G;
diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc
index f52684a..db019db 100644
--- a/src/PackAMatrix.cc
+++ b/src/PackAMatrix.cc
@@ -25,12 +25,12 @@ PackAMatrix<T, accT>::PackAMatrix(
trans_(trans),
smat_(smat),
ld_(ld) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
row_interleave_B_ =
PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
row_interleave_B_ =
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index f75092e..93408da 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -44,12 +44,12 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
a_zero_pt_(a_zero_pt) {
static_assert(
SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension ");
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
row_interleave_B_ =
PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
row_interleave_B_ =
@@ -461,9 +461,9 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::printPackedMatrix(
template <typename T, typename accT, int SPATIAL_DIM>
int PackAWithIm2Col<T, accT, SPATIAL_DIM>::rowOffsetBufferSize() {
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
} else {
// TODO: Have default slower path
diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc
index 02e701e..2929ebb 100644
--- a/src/PackAWithQuantRowOffset.cc
+++ b/src/PackAWithQuantRowOffset.cc
@@ -42,12 +42,12 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
row_offset_(row_offset) {
rowOffsetAllocatedHere = false;
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
row_interleave_B_ =
PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
row_interleave_B_ =
@@ -181,9 +181,9 @@ void PackAWithQuantRowOffset<T, accT>::printPackedMatrix(std::string name) {
template <typename T, typename accT>
int PackAWithQuantRowOffset<T, accT>::rowOffsetBufferSize() {
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
} else {
assert(0 && "unsupported architecture");
diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc
index cb0dcaa..7777f1a 100644
--- a/src/PackAWithRowOffset.cc
+++ b/src/PackAWithRowOffset.cc
@@ -36,12 +36,12 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset(
row_offset_(row_offset) {
rowOffsetAllocatedHere = false;
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
row_interleave_B_ =
PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
row_interleave_B_ =
@@ -171,9 +171,9 @@ void PackAWithRowOffset<T, accT>::printPackedMatrix(std::string name) {
template <typename T, typename accT>
int PackAWithRowOffset<T, accT>::rowOffsetBufferSize() {
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
} else {
// TODO: Have default slower path
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index 8d76e29..659d1df 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -25,12 +25,12 @@ PackBMatrix<T, accT>::PackBMatrix(
trans_(trans),
smat_(smat),
ld_(ld) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB;
row_interleave_ =
PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::NCB;
row_interleave_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE;
diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc
index 0177a07..316fc06 100644
--- a/src/PackMatrix.cc
+++ b/src/PackMatrix.cc
@@ -28,7 +28,7 @@ PackMatrix<PT, inpType, accType>::PackMatrix(
template <typename PT, typename inpType, typename accType>
int PackMatrix<PT, inpType, accType>::packedBufferSize(int rows, int cols) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
if (isA()) {
return PackingTraits<inpType, accType, inst_set_t::avx512>::MCB *
PackingTraits<inpType, accType, inst_set_t::avx512>::KCB;
@@ -38,7 +38,7 @@ int PackMatrix<PT, inpType, accType>::packedBufferSize(int rows, int cols) {
return (((rows + rowBlock - 1) / rowBlock) * rowBlock) *
(((cols + colBlock - 1) / colBlock) * colBlock);
}
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
if (isA()) {
return PackingTraits<inpType, accType, inst_set_t::avx2>::MCB *
PackingTraits<inpType, accType, inst_set_t::avx2>::KCB;
diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc
index 1d057e2..82f8c2a 100644
--- a/src/QuantUtils.cc
+++ b/src/QuantUtils.cc
@@ -176,7 +176,7 @@ void Quantize<uint8_t>(
uint8_t* dst,
int len,
const TensorQuantizationParams& qparams) {
- bool avx2_support = cpuinfo_has_x86_avx2();
+ bool avx2_support = fbgemmHasAvx2Support();
bool fma_support = cpuinfo_has_x86_fma3();
if (avx2_support && fma_support && qparams.precision == 8) {
// fast path
@@ -221,7 +221,7 @@ void Requantize<uint8_t>(
uint8_t* dst,
const int len,
const RequantizationParams& params) {
- if (params.target_qparams.precision == 8 && cpuinfo_has_x86_avx2()) {
+ if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) {
RequantizeAvx2(src, dst, len, params);
} else {
for (int i = 0; i < len; ++i) {
@@ -237,7 +237,7 @@ void RequantizeFixedPoint(
int len,
const RequantizationParams& params) {
if (std::is_same<T, uint8_t>::value && params.target_qparams.precision == 8 &&
- cpuinfo_has_x86_avx2()) {
+ fbgemmHasAvx2Support()) {
RequantizeFixedPointAvx2(src, dst, len, params);
} else {
for (int i = 0; i < len; ++i) {
@@ -267,7 +267,7 @@ void RequantizeFixedPoint<uint8_t>(
uint8_t* dst,
const int len,
const RequantizationParams& params) {
- if (params.target_qparams.precision == 8 && cpuinfo_has_x86_avx2()) {
+ if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) {
RequantizeFixedPointAvx2(src, dst, len, params);
} else {
for (int i = 0; i < len; ++i) {
diff --git a/src/Utils.cc b/src/Utils.cc
index fcd8ade..0fa620d 100644
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -179,9 +179,9 @@ void transpose_simd(
int ld_dst) {
// Run time CPU detection
if (cpuinfo_initialize()) {
- if (cpuinfo_has_x86_avx512f()) {
+ if (fbgemmHasAvx512Support()) {
internal::transpose_16x16(M, N, src, ld_src, dst, ld_dst);
- } else if (cpuinfo_has_x86_avx2()) {
+ } else if (fbgemmHasAvx2Support()) {
internal::transpose_8x8(M, N, src, ld_src, dst, ld_dst);
} else {
transpose_ref(M, N, src, ld_src, dst, ld_dst);
@@ -192,4 +192,14 @@ void transpose_simd(
}
}
+bool fbgemmHasAvx512Support() {
+ return (
+ cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() &&
+ cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl());
+}
+
+bool fbgemmHasAvx2Support() {
+ return (cpuinfo_has_x86_avx2());
+}
+
} // namespace fbgemm