Merge branch 'kpu:master' into master

author: Nikolay Bogoychev <nheart@gmail.com> 2022-02-15 01:05:40 +0300
committer: GitHub <noreply@github.com> 2022-02-15 01:05:40 +0300
commit: a05a2e51ab524bcee954a39ee72005193f3adf7c (patch)
tree: faf4a2d6208b60899e3db1b832d8fbd3f3ce7a31
parent: e4b82c15a368f21903657a2d3fb3259cd0f502c8 (diff)
parent: fc3a614351ce6e667197307d97f45db5265c96af (diff)
10 files changed, 191 insertions, 52 deletions
diff --git a/.github/workflows/ubuntu-no-cpuid-environment.yml b/.github/workflows/ubuntu-no-cpuid-environment.yml
new file mode 100644
index 0000000..dc1862f
--- /dev/null
+++ b/.github/workflows/ubuntu-no-cpuid-environment.yml
@@ -0,0 +1,25 @@
+name: Ubuntu No CPUID Environment Variable
+
+on:
+  push:
+    branches: [master, static]
+  pull_request:
+    branches: [master, static]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: cmake
+      run: |
+        cmake -E make_directory build
+        cd build
+        cmake -DINTGEMM_CPUID_ENVIRONMENT=OFF ..
+    - name: Compile
+      working-directory: build
+      run: cmake --build . -j2
+    - name: Test
+      working-directory: build
+      run: ctest -j2
diff --git a/.github/workflows/ubuntu-noexceptions.yml b/.github/workflows/ubuntu-noexceptions.yml
new file mode 100644
index 0000000..371df05
--- /dev/null
+++ b/.github/workflows/ubuntu-noexceptions.yml
@@ -0,0 +1,25 @@
+name: Ubuntu no exceptions
+
+on:
+  push:
+    branches: [master, static]
+  pull_request:
+    branches: [master, static]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: cmake
+      run: |
+        cmake -E make_directory build
+        cd build
+        cmake -DCMAKE_CXX_FLAGS=-fno-exceptions ..
+    - name: Compile
+      working-directory: build
+      run: cmake --build . -j2
+    - name: Test
+      working-directory: build
+      run: ctest -j2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index af27542..c9f78fa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,11 @@ if (WORMHOLE)
   target_compile_definitions(intgemm PUBLIC INTGEMM_WORMHOLE)
 endif()
 
+option(INTGEMM_CPUID_ENVIRONMENT "Allow INTGEMM_CPUID environment variable to downgrade CPU model, which is mainly for testing." ON)
+if (INTGEMM_CPUID_ENVIRONMENT)
+  target_compile_definitions(intgemm PRIVATE INTGEMM_CPUID_ENVIRONMENT)
+endif()
+
 if(INTGEMM_DONT_BUILD_TESTS)
   return()
 endif()
diff --git a/compile_test/avx2.cc b/compile_test/avx2.cc
index 8460fc0..9ed534e 100644
--- a/compile_test/avx2.cc
+++ b/compile_test/avx2.cc
@@ -1,7 +1,15 @@
 // Some compilers don't have AVX2 support.  Test for them.
 #include <immintrin.h>
 
-#if defined(_MSC_VER)
+// clang-cl bug doesn't include these headers when pretending to be MSVC
+// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
+#if defined(_MSC_VER) && defined(__clang__)
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#include <smmintrin.h>
+#endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
 #define INTGEMM_AVX2
 #else
 #define INTGEMM_AVX2 __attribute__ ((target ("avx2")))
diff --git a/compile_test/avx512bw.cc b/compile_test/avx512bw.cc
index 2cd4c6a..2361f75 100644
--- a/compile_test/avx512bw.cc
+++ b/compile_test/avx512bw.cc
@@ -1,7 +1,18 @@
 // Some compilers don't have AVX512BW support.  Test for them.
 #include <immintrin.h>
 
-#if defined(_MSC_VER)
+// clang-cl bug doesn't include these headers when pretending to be MSVC
+// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
+#if defined(_MSC_VER) && defined(__clang__)
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#include <smmintrin.h>
+#include <avx512fintrin.h>
+#include <avx512dqintrin.h>
+#include <avx512bwintrin.h>
+#endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
 #define INTGEMM_AVX512BW
 #elif defined(__INTEL_COMPILER)
 #define INTGEMM_AVX512BW __attribute__ ((target ("avx512f")))
diff --git a/compile_test/avx512vnni.cc b/compile_test/avx512vnni.cc
index 1485cde..59035e4 100644
--- a/compile_test/avx512vnni.cc
+++ b/compile_test/avx512vnni.cc
@@ -1,6 +1,18 @@
 #include <immintrin.h>
 
-#if defined(_MSC_VER)
+// clang-cl bug doesn't include these headers when pretending to be MSVC
+// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
+#if defined(_MSC_VER) && defined(__clang__)
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#include <smmintrin.h>
+#include <avx512fintrin.h>
+#include <avx512dqintrin.h>
+#include <avx512bwintrin.h>
+#include <avx512vnniintrin.h>
+#endif
+
+#if defined(_MSC_VER) && !defined(__clang__)
 #elif defined(__INTEL_COMPILER)
 __attribute__ ((target ("avx512f")))
 #else
diff --git a/intgemm/aligned.h b/intgemm/aligned.h
index 6fda369..6b55ff2 100644
--- a/intgemm/aligned.h
+++ b/intgemm/aligned.h
@@ -2,9 +2,15 @@
 #include <cstdlib>
 #include <new>
 #ifdef _MSC_VER
+// Ensure _HAS_EXCEPTIONS is defined
+#include <vcruntime.h>
 #include <malloc.h>
 #endif
 
+#if !((defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS))
+#include <cstdlib>
+#endif
+
 // Aligned simple vector.
 
 namespace intgemm {
@@ -17,10 +23,20 @@ template <class T> class AlignedVector {
       : size_(size) {
 #ifdef _MSC_VER
       mem_ = static_cast<T*>(_aligned_malloc(size * sizeof(T), alignment));
-      if (!mem_) throw std::bad_alloc();
-#else      
+      if (!mem_) {
+#  if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
+        throw std::bad_alloc();
+#  else
+        std::abort();
+#  endif
+      }
+#else
       if (posix_memalign(reinterpret_cast<void **>(&mem_), alignment, size * sizeof(T))) {
+#  if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
         throw std::bad_alloc();
+#  else
+        std::abort();
+#  endif
       }
 #endif
     }
@@ -31,6 +47,8 @@ template <class T> class AlignedVector {
     }
 
     AlignedVector &operator=(AlignedVector &&from) {
+      if (this == &from) return *this;
+      release();
       mem_ = from.mem_;
       size_ = from.size_;
       from.mem_ = nullptr;
@@ -41,13 +59,7 @@ template <class T> class AlignedVector {
     AlignedVector(const AlignedVector&) = delete;
     AlignedVector& operator=(const AlignedVector&) = delete;
 
-    ~AlignedVector() {
-#ifdef _MSC_VER
-      _aligned_free(mem_);
-#else
-      std::free(mem_);
-#endif
-    }
+    ~AlignedVector() { release(); }
 
     std::size_t size() const { return size_; }
 
@@ -65,6 +77,14 @@ template <class T> class AlignedVector {
   private:
     T *mem_;
     std::size_t size_;
+
+    void release() {
+#ifdef _MSC_VER
+      _aligned_free(mem_);
+#else
+      std::free(mem_);
+#endif
+    }
 };
 
 } // namespace intgemm
diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc
index 9b38e08..d6c26b9 100644
--- a/intgemm/intgemm.cc
+++ b/intgemm/intgemm.cc
@@ -1,10 +1,20 @@
+#if defined(WASM)
+// No header for CPUID since it's hard-coded.
+#elif defined(__INTEL_COMPILER)
+#include <immintrin.h>
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#else
+// Assume GCC and clang style.
+#include <cpuid.h>
+#endif
+
 #include "intgemm.h"
 #include "stats.h"
 
+#include <stdio.h>
 #include <stdlib.h>
 
-#include <iostream>
-
 namespace intgemm {
 
 namespace {
@@ -77,40 +87,58 @@ CPUType RealCPUID() {
 #endif
 }
 
+#ifdef INTGEMM_CPUID_ENVIRONMENT
 CPUType EnvironmentCPUID() {
-#if defined(_MSC_VER)
+#  if defined(_MSC_VER)
   char env_override[11];
   size_t len = 0;
   if (getenv_s(&len, env_override, sizeof(env_override), "INTGEMM_CPUID")) return CPUType::AVX512VNNI;
   if (!len) return CPUType::AVX512VNNI;
-#else
+#  else
   const char *env_override = getenv("INTGEMM_CPUID");
   if (!env_override) return CPUType::AVX512VNNI; /* This will be capped to actual ID */
-#endif
+#  endif
   if (!strcmp(env_override, "AVX512VNNI")) return CPUType::AVX512VNNI;
   if (!strcmp(env_override, "AVX512BW")) return CPUType::AVX512BW;
   if (!strcmp(env_override, "AVX2")) return CPUType::AVX2;
   if (!strcmp(env_override, "SSSE3")) return CPUType::SSSE3;
   if (!strcmp(env_override, "SSE2")) return CPUType::SSE2;
-  std::cerr << "Unrecognized INTGEMM_CPUID " << env_override << std::endl;
+  fprintf(stderr, "Ignoring unrecognized INTGEMM_CPUID %s\n", env_override);
   return CPUType::AVX512VNNI;
 }
+#endif
 
 } // namespace
 
 CPUType GetCPUID() {
-  static const CPUType kLocalCPU = std::min(RealCPUID(), EnvironmentCPUID());
+  static const CPUType kLocalCPU =
+#ifdef INTGEMM_CPUID_ENVIRONMENT
+    std::min(RealCPUID(), EnvironmentCPUID());
+#else
+    RealCPUID();
+#endif
   return kLocalCPU;
 }
 
 const CPUType kCPU = GetCPUID();
 
-float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
+void UnsupportedCPUError() {
+#if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
   throw UnsupportedCPU();
+#else
+  fprintf(stderr, "intgemm does not support this CPU.\n");
+  abort();
+#endif
+}
+
+float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
+  UnsupportedCPUError();
+  return 0.0f;
 }
 
 MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/, bool /*absolute*/) {
-  throw UnsupportedCPU();
+  UnsupportedCPUError();
+  return MeanStd();
 }
 
 void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512BW::Kernels16::Quantize, AVX512BW::Kernels16::Quantize, AVX2::Kernels16::Quantize, SSE2::Kernels16::Quantize, SSE2::Kernels16::Quantize, Unsupported_16bit::Quantize);
diff --git a/intgemm/intgemm.h b/intgemm/intgemm.h
index 977210d..2528fdb 100644
--- a/intgemm/intgemm.h
+++ b/intgemm/intgemm.h
@@ -41,7 +41,6 @@
 
 #include <cstdint>
 
-#include "intgemm/intgemm_config.h"
 #include "types.h"
 #include "sse2_gemm.h"
 #include "ssse3_gemm.h"
@@ -49,77 +48,68 @@
 #include "avx512_gemm.h"
 #include "avx512vnni_gemm.h"
 
-#if defined(WASM)
-// No header for CPUID since it's hard-coded.
-#elif defined(__INTEL_COMPILER)
-#include <immintrin.h>
-#elif defined(_MSC_VER)
-#include <intrin.h>
-#else
-// Assume GCC and clang style.
-#include <cpuid.h>
-#endif
-
 /* Dispatch to functions based on runtime CPUID.  This adds one call-by-variable to each call. */
 
 namespace intgemm {
 
+void UnsupportedCPUError();
+
 struct Unsupported_16bit {
   static void Quantize(const float *, int16_t *, float, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void PrepareB(const float *, int16_t *, float, Index, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void PrepareBQuantizedTransposed(const int16_t *, int16_t *, Index, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void PrepareBTransposed(const float *, int16_t *, float, Index, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   template <typename Callback>
   static void Multiply(const int16_t *, const int16_t *, Index, Index, Index, Callback) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   constexpr static const char *const kName = "16-bit Unsupported";
 };
 
 struct Unsupported_8bit {
   static void Quantize(const float *, int8_t *, float, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void QuantizeU(const float *, uint8_t *, float, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void PrepareA(const float *, int8_t *, float, Index, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void PrepareBQuantizedTransposed(const int8_t *, int8_t *, Index, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void PrepareBTransposed(const float *, int8_t *, float, Index, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void PrepareB(const float *, int8_t *, float, Index, Index) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   template<class Callback>
   static void PrepareBias(const int8_t *, Index, Index, Callback) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   template <typename Callback>
   static void Multiply(const int8_t *, const int8_t *, Index, Index, Index, Callback) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
   template<class Callback>
   static void Multiply8Shift(const uint8_t *, const int8_t *, Index, Index, Index, Callback) {
-    throw UnsupportedCPU();
+    UnsupportedCPUError();
   }
 
   constexpr static const char *const kName = "8-bit Unsupported";
diff --git a/intgemm/types.h b/intgemm/types.h
index 81b38af..44fb4e2 100644
--- a/intgemm/types.h
+++ b/intgemm/types.h
@@ -1,13 +1,28 @@
 #pragma once
+#include "intgemm/intgemm_config.h"
+
 #include <exception>
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 #include <immintrin.h>
 #endif
 #include <emmintrin.h>
 
-#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
-/* MSVC does not appear to have target attributes but is also fine with just
- * using intrinsics anywhere.
+// clang-cl bug doesn't include these headers when pretending to be MSVC
+// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
+#if defined(_MSC_VER) && defined(__clang__)
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#include <smmintrin.h>
+#include <avx512fintrin.h>
+#include <avx512dqintrin.h>
+#include <avx512bwintrin.h>
+#include <avx512vnniintrin.h>
+#endif
+
+#if (defined(_MSC_VER) && !defined(__clang__)) || defined(__INTEL_COMPILER)
+/* Real MSVC does not appear to have target attributes but is also fine with
+ * just using intrinsics anywhere.  clang-cl pretending to be MSVC requires
+ * target attributes, so it's excluded from the above.
  *
  * The Intel compiler has a bug whereby constructors with target attributes do
  * not link.  Like this program doesn't compile with icpc:
author	Nikolay Bogoychev <nheart@gmail.com>	2022-02-15 01:05:40 +0300
committer	GitHub <noreply@github.com>	2022-02-15 01:05:40 +0300
commit	a05a2e51ab524bcee954a39ee72005193f3adf7c (patch)
tree	faf4a2d6208b60899e3db1b832d8fbd3f3ce7a31
parent	e4b82c15a368f21903657a2d3fb3259cd0f502c8 (diff)
parent	fc3a614351ce6e667197307d97f45db5265c96af (diff)