15 files changed, 67 insertions, 46 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 607657e..494e261 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,7 +100,8 @@ set_target_properties(fbgemm_generic fbgemm_avx2 fbgemm_avx512 PROPERTIES
 target_compile_options(fbgemm_avx2 PRIVATE
   "-m64" "-mavx2" "-mfma" "-masm=intel")
 target_compile_options(fbgemm_avx512 PRIVATE
-  "-m64" "-mavx2" "-mfma" "-mavx512f" "-masm=intel")
+  "-m64" "-mavx2" "-mfma" "-mavx512f" "-mavx512bw" "-mavx512dq"
+  "-mavx512vl" "-masm=intel")
 
 if(NOT TARGET asmjit)
   #Download asmjit from github if ASMJIT_SRC_DIR is not specified.
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
index 775e18c..2813629 100644
--- a/include/fbgemm/Utils.h
+++ b/include/fbgemm/Utils.h
@@ -77,4 +77,14 @@ void transpose_simd(
     float* dst,
     int ld_dst);
 
+/**
+ * @brief True only if cpuinfo reports AVX512 F, BW, DQ and VL (all four).
+ */
+FBGEMM_API bool fbgemmHasAvx512Support();
+
+/**
+ * @brief Are we running on an AVX2-capable CPU (as reported by cpuinfo)?
+ */
+FBGEMM_API bool fbgemmHasAvx2Support();
+
 } // namespace fbgemm
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 0dfc531..f2b028d 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -42,7 +42,7 @@ ExecuteKernel<
       outputProcess_(outputProcess),
       thread_id_(thread_id),
       num_threads_(num_threads) {
-  if (cpuinfo_has_x86_avx512f()) {
+  if (fbgemmHasAvx512Support()) {
     mbSize_ = PackingTraits<
         int8_t,
         typename packingAMatrix::accType,
@@ -55,7 +55,7 @@ ExecuteKernel<
         int8_t,
         typename packingAMatrix::accType,
         inst_set_t::avx512>::NR;
-  } else if (cpuinfo_has_x86_avx2()) {
+  } else if (fbgemmHasAvx2Support()) {
     mbSize_ = PackingTraits<
         int8_t,
         typename packingAMatrix::accType,
@@ -101,14 +101,14 @@ void ExecuteKernel<
   typename BaseType::jit_micro_kernel_fp fn;
 
   if (cpuinfo_initialize()) {
-    if (cpuinfo_has_x86_avx512f()) {
+    if (fbgemmHasAvx512Support()) {
       fn = BaseType::template getOrCreate<inst_set_t::avx512>(
           accum,
           packed_rows_A,
           packedB_.blockColSize(),
           packedA_.numPackedCols(),
           nbSize_);
-    } else if (cpuinfo_has_x86_avx2()) {
+    } else if (fbgemmHasAvx2Support()) {
       fn = BaseType::template getOrCreate<inst_set_t::avx2>(
           accum,
           packed_rows_A,
@@ -135,10 +135,10 @@ void ExecuteKernel<
       int nc = ((packedB_.lastBcol() - 1) / nrSize_ + 1) * nrSize_;
       if (nc != nbSize_) {
         if (cpuinfo_initialize()) {
-          if (cpuinfo_has_x86_avx512f()) {
+          if (fbgemmHasAvx512Support()) {
             fn = BaseType::template getOrCreate<inst_set_t::avx512>(
                 accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
-          } else if (cpuinfo_has_x86_avx2()) {
+          } else if (fbgemmHasAvx2Support()) {
             fn = BaseType::template getOrCreate<inst_set_t::avx2>(
                 accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
           } else {
@@ -203,7 +203,7 @@ void ExecuteKernel<
       int32_t nSize =
           C_buffer_start == C_tile_ ? jb * nbSize_ : packedB_.numCols();
       if (nSize) {
-        if (cpuinfo_has_x86_avx512f()) {
+        if (fbgemmHasAvx512Support()) {
           // TODO: avx512 path
           // Currently use avx2 code
           outputProcess_.template f<inst_set_t::avx2>(
@@ -212,7 +212,7 @@ void ExecuteKernel<
               {row_start_A, packed_rows_A, NDim * group, nSize},
               ldc_,
               ldc_);
-        } else if (cpuinfo_has_x86_avx2()) {
+        } else if (fbgemmHasAvx2Support()) {
           outputProcess_.template f<inst_set_t::avx2>(
               matC_,
               C_buffer_row_start,
@@ -228,7 +228,7 @@ void ExecuteKernel<
       if (C_buffer_start == C_tile_) {
         // When C_tile_ scratchpad was used to avoid accessing memory past
         // C_buffer_ .
-        if (cpuinfo_has_x86_avx512f()) {
+        if (fbgemmHasAvx512Support()) {
           // TODO: avx512 path
           // Currently use avx2 code
           outputProcess_.template f<inst_set_t::avx2>(
@@ -240,7 +240,7 @@ void ExecuteKernel<
                packedB_.lastBcol()},
               ldc_,
               leadingDim);
-        } else if (cpuinfo_has_x86_avx2()) {
+        } else if (fbgemmHasAvx2Support()) {
           outputProcess_.template f<inst_set_t::avx2>(
               matC_,
               C_tile_,
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index c7a7e33..a90dd2d 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -48,7 +48,7 @@ void fbgemmPacked(
 
   // Run time CPU detection
   if (cpuinfo_initialize()) {
-    if (cpuinfo_has_x86_avx512f()) {
+    if (fbgemmHasAvx512Support()) {
       MCB = PackingTraits<
           typename packingAMatrix::inpType,
           typename packingAMatrix::accType,
@@ -61,7 +61,7 @@ void fbgemmPacked(
           typename packingAMatrix::inpType,
           typename packingAMatrix::accType,
           inst_set_t::avx512>::MR;
-    } else if (cpuinfo_has_x86_avx2()) {
+    } else if (fbgemmHasAvx2Support()) {
       MCB = PackingTraits<
           typename packingAMatrix::inpType,
           typename packingAMatrix::accType,
diff --git a/src/GenerateKernel.h b/src/GenerateKernel.h
index b24ef23..548ed7e 100644
--- a/src/GenerateKernel.h
+++ b/src/GenerateKernel.h
@@ -60,9 +60,9 @@ class CodeGenBase {
         } {
     // vector width in bits
     if (cpuinfo_initialize()) {
-      if (cpuinfo_has_x86_avx512f()) {
+      if (fbgemmHasAvx512Support()) {
         vectorWidth_ = 512;
-      } else if (cpuinfo_has_x86_avx2()) {
+      } else if (fbgemmHasAvx2Support()) {
         vectorWidth_ = 256;
       } else {
         // TODO: Have default path
diff --git a/src/GroupwiseConv.h b/src/GroupwiseConv.h
index 6a0a8a5..4e539f2 100644
--- a/src/GroupwiseConv.h
+++ b/src/GroupwiseConv.h
@@ -51,10 +51,10 @@ class GenConvKernel {
                     x86::ymm8} {
     // vector width in bits
     if (cpuinfo_initialize()) {
-      if (cpuinfo_has_x86_avx512f()) {
+      if (fbgemmHasAvx512Support()) {
         // TODO: change this to 512 once we have avx512f version
         vectorWidth_ = 256;
-      } else if (cpuinfo_has_x86_avx2()) {
+      } else if (fbgemmHasAvx2Support()) {
         vectorWidth_ = 256;
       } else {
         // TODO: Have default path
diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc
index 26eb486..0e1b85b 100644
--- a/src/GroupwiseConvAcc32Avx2.cc
+++ b/src/GroupwiseConvAcc32Avx2.cc
@@ -1681,7 +1681,7 @@ void fbgemmGroupwiseConvBase_(
             // current group
             memcpy(rowOffsetBuf, rowOffsetForCurG, ih_iw * sizeof(int32_t));
 
-            if (cpuinfo_has_x86_avx512f()) {
+            if (fbgemmHasAvx512Support()) {
               // Currently use avx2 code
               outProcess.template f<inst_set_t::avx2>(
                   out,
@@ -1689,7 +1689,7 @@ void fbgemmGroupwiseConvBase_(
                   {i * oh_ow, oh_ow, (g + j) * K_per_G, K_per_G},
                   K_per_G * G,
                   K_per_G * G);
-            } else if (cpuinfo_has_x86_avx2()) {
+            } else if (fbgemmHasAvx2Support()) {
               outProcess.template f<inst_set_t::avx2>(
                   out,
                   currOutBuf + j * K_per_G,
@@ -1790,7 +1790,7 @@ void fbgemmGroupwiseConv(
   typedef ReQuantizeOutput<FUSE_RELU, Q_GRAN> processOutputType;
 
   if (!fbgemmOptimizedGConv<SPATIAL_DIM>(conv_param) ||
-      (!cpuinfo_has_x86_avx512f() && !cpuinfo_has_x86_avx2())) {
+      (!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) {
     return fbgemmGroupwiseConvBase_<
         packed_W,
         outType,
@@ -2150,7 +2150,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) {
   // row offset buffer should be a able to hold row offsets for however
   // number of groups we process at a time.
   if (cpuinfo_initialize()) {
-    if (cpuinfo_has_x86_avx512f()) {
+    if (fbgemmHasAvx512Support()) {
       int bufferSize = conv_param.OUT_DIM[0] * conv_param.OUT_DIM[1];
       int C_per_G = conv_param.IC / conv_param.G;
       int K_per_G = conv_param.OC / conv_param.G;
@@ -2160,7 +2160,7 @@ int rowOffsetBufferSizeGConv(const conv_param_t<SPATIAL_DIM>& conv_param) {
       } else {
         return conv_param.G * bufferSize;
       }
-    } else if (cpuinfo_has_x86_avx2()) {
+    } else if (fbgemmHasAvx2Support()) {
       int bufferSize = conv_param.OUT_DIM[0] * conv_param.OUT_DIM[1];
       int C_per_G = conv_param.IC / conv_param.G;
       int K_per_G = conv_param.OC / conv_param.G;
diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc
index f52684a..db019db 100644
--- a/src/PackAMatrix.cc
+++ b/src/PackAMatrix.cc
@@ -25,12 +25,12 @@ PackAMatrix<T, accT>::PackAMatrix(
       trans_(trans),
       smat_(smat),
       ld_(ld) {
-  if (cpuinfo_has_x86_avx512f()) {
+  if (fbgemmHasAvx512Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
     row_interleave_B_ =
         PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-  } else if (cpuinfo_has_x86_avx2()) {
+  } else if (fbgemmHasAvx2Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
     row_interleave_B_ =
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index f75092e..93408da 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -44,12 +44,12 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
       a_zero_pt_(a_zero_pt) {
   static_assert(
       SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension ");
-  if (cpuinfo_has_x86_avx512f()) {
+  if (fbgemmHasAvx512Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
     row_interleave_B_ =
         PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-  } else if (cpuinfo_has_x86_avx2()) {
+  } else if (fbgemmHasAvx2Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
     row_interleave_B_ =
@@ -461,9 +461,9 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::printPackedMatrix(
 template <typename T, typename accT, int SPATIAL_DIM>
 int PackAWithIm2Col<T, accT, SPATIAL_DIM>::rowOffsetBufferSize() {
   if (cpuinfo_initialize()) {
-    if (cpuinfo_has_x86_avx512f()) {
+    if (fbgemmHasAvx512Support()) {
       return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
-    } else if (cpuinfo_has_x86_avx2()) {
+    } else if (fbgemmHasAvx2Support()) {
       return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
     } else {
       // TODO: Have default slower path
diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc
index 02e701e..2929ebb 100644
--- a/src/PackAWithQuantRowOffset.cc
+++ b/src/PackAWithQuantRowOffset.cc
@@ -42,12 +42,12 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
       row_offset_(row_offset) {
   rowOffsetAllocatedHere = false;
 
-  if (cpuinfo_has_x86_avx512f()) {
+  if (fbgemmHasAvx512Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
     row_interleave_B_ =
         PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-  } else if (cpuinfo_has_x86_avx2()) {
+  } else if (fbgemmHasAvx2Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
     row_interleave_B_ =
@@ -181,9 +181,9 @@ void PackAWithQuantRowOffset<T, accT>::printPackedMatrix(std::string name) {
 template <typename T, typename accT>
 int PackAWithQuantRowOffset<T, accT>::rowOffsetBufferSize() {
   if (cpuinfo_initialize()) {
-    if (cpuinfo_has_x86_avx512f()) {
+    if (fbgemmHasAvx512Support()) {
       return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
-    } else if (cpuinfo_has_x86_avx2()) {
+    } else if (fbgemmHasAvx2Support()) {
       return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
     } else {
       assert(0 && "unsupported architecture");
diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc
index cb0dcaa..7777f1a 100644
--- a/src/PackAWithRowOffset.cc
+++ b/src/PackAWithRowOffset.cc
@@ -36,12 +36,12 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset(
       row_offset_(row_offset) {
   rowOffsetAllocatedHere = false;
 
-  if (cpuinfo_has_x86_avx512f()) {
+  if (fbgemmHasAvx512Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
     row_interleave_B_ =
         PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-  } else if (cpuinfo_has_x86_avx2()) {
+  } else if (fbgemmHasAvx2Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
     row_interleave_B_ =
@@ -171,9 +171,9 @@ void PackAWithRowOffset<T, accT>::printPackedMatrix(std::string name) {
 template <typename T, typename accT>
 int PackAWithRowOffset<T, accT>::rowOffsetBufferSize() {
   if (cpuinfo_initialize()) {
-    if (cpuinfo_has_x86_avx512f()) {
+    if (fbgemmHasAvx512Support()) {
       return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
-    } else if (cpuinfo_has_x86_avx2()) {
+    } else if (fbgemmHasAvx2Support()) {
       return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
     } else {
       // TODO: Have default slower path
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index 8d76e29..659d1df 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -25,12 +25,12 @@ PackBMatrix<T, accT>::PackBMatrix(
       trans_(trans),
       smat_(smat),
       ld_(ld) {
-  if (cpuinfo_has_x86_avx512f()) {
+  if (fbgemmHasAvx512Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB;
     row_interleave_ =
         PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
-  } else if (cpuinfo_has_x86_avx2()) {
+  } else if (fbgemmHasAvx2Support()) {
     BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
     BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::NCB;
     row_interleave_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE;
diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc
index 0177a07..316fc06 100644
--- a/src/PackMatrix.cc
+++ b/src/PackMatrix.cc
@@ -28,7 +28,7 @@ PackMatrix<PT, inpType, accType>::PackMatrix(
 
 template <typename PT, typename inpType, typename accType>
 int PackMatrix<PT, inpType, accType>::packedBufferSize(int rows, int cols) {
-  if (cpuinfo_has_x86_avx512f()) {
+  if (fbgemmHasAvx512Support()) {
     if (isA()) {
       return PackingTraits<inpType, accType, inst_set_t::avx512>::MCB *
           PackingTraits<inpType, accType, inst_set_t::avx512>::KCB;
@@ -38,7 +38,7 @@ int PackMatrix<PT, inpType, accType>::packedBufferSize(int rows, int cols) {
       return (((rows + rowBlock - 1) / rowBlock) * rowBlock) *
           (((cols + colBlock - 1) / colBlock) * colBlock);
     }
-  } else if (cpuinfo_has_x86_avx2()) {
+  } else if (fbgemmHasAvx2Support()) {
     if (isA()) {
       return PackingTraits<inpType, accType, inst_set_t::avx2>::MCB *
           PackingTraits<inpType, accType, inst_set_t::avx2>::KCB;
diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc
index 1d057e2..82f8c2a 100644
--- a/src/QuantUtils.cc
+++ b/src/QuantUtils.cc
@@ -176,7 +176,7 @@ void Quantize<uint8_t>(
     uint8_t* dst,
     int len,
     const TensorQuantizationParams& qparams) {
-  bool avx2_support = cpuinfo_has_x86_avx2();
+  bool avx2_support = fbgemmHasAvx2Support();
   bool fma_support = cpuinfo_has_x86_fma3();
   if (avx2_support && fma_support && qparams.precision == 8) {
     // fast path
@@ -221,7 +221,7 @@ void Requantize<uint8_t>(
     uint8_t* dst,
     const int len,
     const RequantizationParams& params) {
-  if (params.target_qparams.precision == 8 && cpuinfo_has_x86_avx2()) {
+  if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) {
     RequantizeAvx2(src, dst, len, params);
   } else {
     for (int i = 0; i < len; ++i) {
@@ -237,7 +237,7 @@ void RequantizeFixedPoint(
     int len,
     const RequantizationParams& params) {
   if (std::is_same<T, uint8_t>::value && params.target_qparams.precision == 8 &&
-      cpuinfo_has_x86_avx2()) {
+      fbgemmHasAvx2Support()) {
     RequantizeFixedPointAvx2(src, dst, len, params);
   } else {
     for (int i = 0; i < len; ++i) {
@@ -267,7 +267,7 @@ void RequantizeFixedPoint<uint8_t>(
     uint8_t* dst,
     const int len,
     const RequantizationParams& params) {
-  if (params.target_qparams.precision == 8 && cpuinfo_has_x86_avx2()) {
+  if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) {
     RequantizeFixedPointAvx2(src, dst, len, params);
   } else {
     for (int i = 0; i < len; ++i) {
diff --git a/src/Utils.cc b/src/Utils.cc
index fcd8ade..0fa620d 100644
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -179,9 +179,9 @@ void transpose_simd(
     int ld_dst) {
   // Run time CPU detection
   if (cpuinfo_initialize()) {
-    if (cpuinfo_has_x86_avx512f()) {
+    if (fbgemmHasAvx512Support()) {
       internal::transpose_16x16(M, N, src, ld_src, dst, ld_dst);
-    } else if (cpuinfo_has_x86_avx2()) {
+    } else if (fbgemmHasAvx2Support()) {
       internal::transpose_8x8(M, N, src, ld_src, dst, ld_dst);
     } else {
       transpose_ref(M, N, src, ld_src, dst, ld_dst);
@@ -192,4 +192,14 @@ void transpose_simd(
   }
 }
 
+bool fbgemmHasAvx512Support() { // true only when all four AVX512 subsets queried below are present
+  return (
+      cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() &&
+      cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl());
+} // NOTE(review): no cpuinfo_initialize() call here; presumably callers initialize cpuinfo first - confirm
+
+bool fbgemmHasAvx2Support() { // thin wrapper: forwards cpuinfo's AVX2 query unchanged
+  return (cpuinfo_has_x86_avx2());
+} // NOTE(review): no cpuinfo_initialize() call here; presumably callers initialize cpuinfo first - confirm
+
 } // namespace fbgemm