Merge pull request #71 from kpuatamazon/master

Improve compiler support
author: Kenneth Heafield <kpu@users.noreply.github.com> 2020-03-17 14:02:44 +0300
committer: GitHub <noreply@github.com> 2020-03-17 14:02:44 +0300
commit: 8033cdf974f69599edde6ba9126bff01b91f2435 (patch)
tree: 3dd60dbf7b979db3eeb46083477d00433c78c63a
parent: 261a5fbcf7558fc3c2ac22b33fe0c2930d440fc3 (diff)
parent: 79a3be9e7b78a4cbb231b5fc4e23dc0593f20240 (diff)
10 files changed, 128 insertions, 66 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bbb9b83..32c19ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,7 +47,8 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 add_library(intgemm STATIC intgemm.cc)
 
-if (OPENMP)
+option(USE_OPENMP "Use OpenMP" OFF)
+if (USE_OPENMP)
   message(STATUS "Compiling with OpenMP")
   find_package(OpenMP)
   if (NOT ${OpenMP_CXX_FOUND})
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 335fa0d..25866bb 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -201,7 +201,9 @@ struct AVX2_8bit {
   INTGEMM_AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
-
+ private:
+  INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2)
+ public:
   INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2)
 
   // Currently A is prepared by quantization but this could theoretically change.
diff --git a/avx512_gemm.h b/avx512_gemm.h
index cdbfff5..6286ccc 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -224,26 +224,48 @@ struct AVX512_8bit {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
+ private:
+  /* g++ (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0 does not carry target attributes
+   * to the hidden function it creates in implementing #pragma omp parallel for.
+   * So intrinstics were not working inside the for loop when compiled with
+   * OMP. Also, passing register types across #pragma omp parallel for
+   * generated an internal compiler error.
+   * The problem does not occur in g++-8 (Ubuntu 8.3.0-6ubuntu1~18.04.1) 8.3.0.
+   * As a workaround, I split into #pragma omp parallel with boring types
+   * passed across the boundary then call this function with target attributes.
+   */
+  INTGEMM_AVX512BW static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) {
+    const __m512i neg127 = _mm512_set1_epi32(-127);
+    const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
+    const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
+#pragma omp for
+    for (std::size_t i = 0; i < count; i += kBatch) {
+      __m512i asint = avx512f::QuantizerGrab(input + i, quant_mult_reg);
+      asint = _mm512_max_epi32(asint, neg127);
+      // There doesn't seem to be an unmasked version.
+      _mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint);
+    }
+  }
+
+ public:
   // Technically output can be unaligned in Quantize.
   // But then it will need to be aligned for Multiply.
   // Convert to 8-bit signed integers.
   /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
   INTGEMM_AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
     assert(reinterpret_cast<uintptr_t>(input) % sizeof(__m512i) == 0);
-    const __m512i neg127 = _mm512_set1_epi32(-127);
-    const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
     const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
-    const float *fast_input_end = input + (size & ~(kBatch - 1));
-    int8_t *fast_output_end = output + (size & ~(kBatch - 1));
-#pragma omp parallel for
-    for (const float *input_it = input; input_it < fast_input_end; input_it += kBatch) {
-      __m512i asint = avx512f::QuantizerGrab(input_it, quant_mult_reg);
-      asint = _mm512_max_epi32(asint, neg127);
-      // There doesn't seem to be an unmasked version.
-      _mm512_mask_cvtsepi32_storeu_epi8(output + (input_it - input), 0xffff, asint);
+    std::size_t fast_size = (size & ~(kBatch - 1));
+    const float *fast_input_end = input + fast_size;
+    int8_t *fast_output_end = output + fast_size;
+#pragma omp parallel
+    {
+      QuantizeThread(input, output, quant_mult, fast_size);
     }
     std::size_t overhang = size & (kBatch - 1);
     if (!overhang) return; // We needed a branch anyway for the empty case.
+    const __m512i neg127 = _mm512_set1_epi32(-127);
+    const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
     __m512i asint = avx512f::QuantizerGrab(fast_input_end, quant_mult_reg);
     asint = _mm512_max_epi32(asint, neg127);
     _mm512_mask_cvtsepi32_storeu_epi8(fast_output_end, (1 << overhang) - 1, asint);
diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc
index 515536e..8f0816f 100644
--- a/benchmarks/biasmultiply.cc
+++ b/benchmarks/biasmultiply.cc
@@ -8,7 +8,6 @@ using namespace intgemm;
 
 template <class Routine>
 void testOld(Index /*rows*/, Index /*cols*/) {
-
 }
 
 template <class Routine>
diff --git a/compile_test_avx512vnni.cc b/compile_test_avx512vnni.cc
index 611cc53..deb0f88 100644
--- a/compile_test_avx512vnni.cc
+++ b/compile_test_avx512vnni.cc
@@ -19,11 +19,13 @@ bool Foo() {
 }
 
 int main() {
-  return Foo() &&
-#ifdef __INTEL_COMPILER
-    _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)
+  return Foo()
+#if defined(__GNUC__) || defined(__clang__)
+    // uses cpuid
+#elif defined(__INTEL_COMPILER)
+    && _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)
 #else
-    __builtin_cpu_supports("avx512vnni")
+    && __builtin_cpu_supports("avx512vnni")
 #endif
     ;
 }
diff --git a/intgemm.cc b/intgemm.cc
index c069424..8838cdb 100644
--- a/intgemm.cc
+++ b/intgemm.cc
@@ -40,12 +40,18 @@ const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::
 
 float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512f::MaxAbsolute, avx512f::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute);
 
+constexpr const char *const Unsupported_16bit::kName;
+constexpr const char *const Unsupported_8bit::kName;
 constexpr const char *const SSE2_16bit::kName;
 constexpr const char *const SSSE3_8bit::kName;
 constexpr const char *const AVX2_8bit::kName;
 constexpr const char *const AVX2_16bit::kName;
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 constexpr const char *const AVX512_8bit::kName;
-constexpr const char *const AVX512VNNI_8bit::kName;
 constexpr const char *const AVX512_16bit::kName;
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+constexpr const char *const AVX512VNNI_8bit::kName;
+#endif
 
 }
diff --git a/intgemm.h b/intgemm.h
index 6d4d95f..0c315fc 100644
--- a/intgemm.h
+++ b/intgemm.h
@@ -51,7 +51,7 @@
 #include "avx512_gemm.h"
 #include "avx512vnni_gemm.h"
 
-#if defined(__GNUC__) && defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
+#if defined(__GNUC__) || defined(__clang__)
 #include "cpuid.h"
 #endif
 
@@ -124,7 +124,7 @@ struct Unsupported_8bit {
 typedef Unsupported_16bit AVX512_16bit;
 typedef Unsupported_8bit AVX512_8bit;
 namespace avx512f {
-static inline float MaxAbsolute(const float *begin, const float *end) {
+static inline float MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
   throw UnsupportedCPU();
 }
 } //namespace
@@ -135,26 +135,6 @@ static inline float MaxAbsolute(const float *begin, const float *end) {
 typedef Unsupported_8bit AVX512VNNI_8bit;
 #endif
 
-
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-// gcc 5.4.0 bizarrely supports avx512bw targets but not __builtin_cpu_supports("avx512bw").  So implement it manually.
-inline bool CheckAVX512BW() {
-  __builtin_cpu_init ();
-#ifdef __INTEL_COMPILER
-  return _may_i_use_cpu_feature(_FEATURE_AVX512BW)
-#elif __GNUC__
-  unsigned int m = __get_cpuid_max(0, NULL);
-  if (m < 7) return false;
-  unsigned int eax, ebx, ecx, edx;
-  __cpuid_count(7, 0, eax, ebx, ecx, edx);
-  const unsigned int avx512bw_bit = (1 << 30);
-  return ebx & avx512bw_bit;
-#else
-  return __builtin_cpu_supports("avx512bw");
-#endif
-}
-#endif
-
 /* Returns:
  * axx512vnni if the CPU supports AVX512VNNI
  *
@@ -172,24 +152,59 @@ template <class T> T ChooseCPU(T
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
     avx512vnni
 #endif
-    , T avx512bw, T avx2, T ssse3, T sse2, T unsupported) {
-  __builtin_cpu_init ();
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-  if (
-#ifdef __INTEL_COMPILER
-      _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)
-#else
-      __builtin_cpu_supports("avx512vnni")
+    , T 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+    avx512bw
 #endif
-      ) {
-    return avx512vnni;
+    , T avx2, T ssse3, T sse2, T unsupported) {
+  /* If intgemm is compiled by gcc 6.4.1 then dlopened into an executable
+   * compiled by gcc 7.3.0, there will be a undefined symbol __cpu_info.
+   * Work around this by calling the intrinsics more directly instead of
+   * __builtin_cpu_supports.
+   *
+   * clang 6.0.0-1ubuntu2 supports vnni but doesn't have
+   *   __builtin_cpu_supports("avx512vnni")
+   * so use the hand-coded CPUID for clang.
+   */
+#if defined(__GNUC__) || defined(__clang__)
+  unsigned int m = __get_cpuid_max(0, NULL);
+  unsigned int eax, ebx, ecx, edx;
+  if (m >= 7) {
+    __cpuid_count(7, 0, eax, ebx, ecx, edx);
+#  ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+    if (ecx & (1 << 11)) return avx512vnni;
+#  endif
+#  ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+    if (ebx & (1 << 30)) return avx512bw;
+#  endif
+    if (ebx & (1 << 5)) return avx2;
   }
-#endif
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-  if (CheckAVX512BW()) {
-    return avx512bw;
+  if (m >= 1) {
+    __cpuid_count(1, 0, eax, ebx, ecx, edx);
+    if (ecx & (1 << 9)) return ssse3;
+    if (edx & (1 << 26)) return sse2;
   }
-#endif
+  return unsupported;
+#else // not gcc or clang.
+  __builtin_cpu_init();
+#  ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+  if (
+#    ifdef __INTEL_COMPILER
+      _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)
+#    else
+      __builtin_cpu_supports("avx512vnni")
+#    endif
+      ) return vnni;
+#  endif
+#  ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+  if (
+#    ifdef __INTEL_COMPILER
+      _may_i_use_cpu_feature(_FEATURE_AVX512BW)
+#    else
+      __builtin_cpu_supports("avx512bw")
+#    endif
+      ) return avx512bw;
+#  endif
   if (__builtin_cpu_supports("avx2")) {
     return avx2;
   } else if (__builtin_cpu_supports("ssse3")) {
@@ -199,6 +214,7 @@ template <class T> T ChooseCPU(T
   } else {
     return unsupported;
   }
+#endif
 }
 
 struct TileInfo {
diff --git a/multiply.h b/multiply.h
index a313c16..84d6737 100644
--- a/multiply.h
+++ b/multiply.h
@@ -60,19 +60,30 @@ static inline INTGEMM_AVX512F float MaxFloat32(__m512 a) {
 #endif
 
 // Quantize function used for SSSE3 and AVX2.
+// Separate function for thread to work around gcc 7 bug that doesn't imbue
+// target attributes across #pragma omp parallel.
+#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \
+target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \
+  name::QuantizeTile8 q(quant_mult); \
+  _Pragma("omp for") \
+  for (std::size_t i = 0; i < count; i += sizeof(Register)) { \
+    *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+  } \
+}
+
 #define INTGEMM_QUANTIZE(target, Register, name) \
 target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
   assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
   assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
-  name::QuantizeTile8 q(quant_mult); \
   const std::size_t kBatch = sizeof(Register); \
   const std::size_t fast_end = size & ~(kBatch - 1); \
-  _Pragma("omp parallel for") \
-  for (std::size_t i = 0; i < fast_end; i += kBatch) { \
-    *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+  _Pragma("omp parallel") \
+  { \
+    QuantizeThread(input, output, quant_mult, fast_end); \
   } \
   std::size_t overhang = size & (kBatch - 1); \
   if (!overhang) return; \
+  name::QuantizeTile8 q(quant_mult); \
   /* Each does size(Register) / 32 == kBatch / 4 floats at a time.
    * If we're allowed to read one of them, then we can read the whole register.  */ \
   const float *inputs[4]; \
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index 2cf341e..fd3ab8c 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -116,6 +116,9 @@ struct SSSE3_8bit {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
+ private:
+  INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3)
+ public:
   INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3)
 
   // Version with unsigned int + 127
diff --git a/test/kernels/multiply_sat_test.cc b/test/kernels/multiply_sat_test.cc
index 36a9b9f..edea772 100644
--- a/test/kernels/multiply_sat_test.cc
+++ b/test/kernels/multiply_sat_test.cc
@@ -21,13 +21,13 @@ void kernel_multiply_sat_test() {
   std::iota(input1.begin(), input1.end(), -int(VECTOR_LENGTH / 2));
   std::iota(input2.begin(), input2.end(), -int(VECTOR_LENGTH / 3));
 
-  for (std::size_t shift = 0; shift <= 2 * 8 * sizeof(Type_); ++shift) {
-    *output.template as<vec_t>() = kernels::multiply_sat<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>(), shift);
-    for (std::size_t i = 0; i < output.size(); ++i) {
-      auto ref = (int64_t(input1[i]) * input2[i]) >> shift;
-      auto ref_sat = Type_(std::min<int64_t>(std::numeric_limits<Type_>::max(), std::max<int64_t>(std::numeric_limits<Type_>::min(), ref)));
-      CHECK(output[i] == ref_sat);
-    }
+  // TODO: try all shifts.  The shift must be an immediate.
+  std::size_t shift = 1;
+  *output.template as<vec_t>() = kernels::multiply_sat<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>(), shift);
+  for (std::size_t i = 0; i < output.size(); ++i) {
+    auto ref = (int64_t(input1[i]) * input2[i]) >> shift;
+    auto ref_sat = Type_(std::min<int64_t>(std::numeric_limits<Type_>::max(), std::max<int64_t>(std::numeric_limits<Type_>::min(), ref)));
+    CHECK(output[i] == ref_sat);
   }
 }
author	Kenneth Heafield <kpu@users.noreply.github.com>	2020-03-17 14:02:44 +0300
committer	GitHub <noreply@github.com>	2020-03-17 14:02:44 +0300
commit	8033cdf974f69599edde6ba9126bff01b91f2435 (patch)
tree	3dd60dbf7b979db3eeb46083477d00433c78c63a
parent	261a5fbcf7558fc3c2ac22b33fe0c2930d440fc3 (diff)
parent	79a3be9e7b78a4cbb231b5fc4e23dc0593f20240 (diff)