Fix compilation on icc 19.1.0.166 (build 20191121)

author: Kenneth Heafield <github@kheafield.com> 2021-01-04 23:36:22 +0300
committer: Kenneth Heafield <github@kheafield.com> 2021-01-04 23:36:22 +0300
commit: 65276ad59ab9cd5b2bc623c2411f481f79aa7c5c (patch)
tree: dee1cdb23b97d60f15d55ba52b933987834ef838
parent: 1318506945c0dfc0af8f24be15be31323140c40a (diff)
5 files changed, 42 insertions, 29 deletions
diff --git a/intgemm/avx512_gemm.h b/intgemm/avx512_gemm.h
index a69b2dc..90f67ee 100644
--- a/intgemm/avx512_gemm.h
+++ b/intgemm/avx512_gemm.h
@@ -391,7 +391,7 @@ struct Kernels8 {
         Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
 
         auto total = PermuteSummer(pack0123, pack4567);
-        callback_impl(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
+        callback_impl.Run(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
       }
     }
   }
diff --git a/intgemm/avx512vnni_gemm.h b/intgemm/avx512vnni_gemm.h
index 747bdf9..28e8c14 100644
--- a/intgemm/avx512vnni_gemm.h
+++ b/intgemm/avx512vnni_gemm.h
@@ -75,7 +75,7 @@ struct Kernels8 : public AVX512BW::Kernels8 {
         Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
         Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
         auto total = PermuteSummer(pack0123, pack4567);
-        callback_impl(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
+        callback_impl.Run(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
       }
     }
   }
@@ -116,7 +116,7 @@ struct Kernels8 : public AVX512BW::Kernels8 {
         Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
         Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
         auto total = PermuteSummer(pack0123, pack4567);
-        callback_impl(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
+        callback_impl.Run(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
       }
     }
   }
@@ -153,7 +153,7 @@ struct Kernels8 : public AVX512BW::Kernels8 {
       Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
       Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
       auto total = PermuteSummer(pack0123, pack4567);
-      callback_impl(total, callbacks::OutputBufferInfo(0, B0_colidx, 1, B_cols));
+      callback_impl.Run(total, callbacks::OutputBufferInfo(0, B0_colidx, 1, B_cols));
     }
   }
 
diff --git a/intgemm/callbacks/implementations.inl b/intgemm/callbacks/implementations.inl
index 47d2aa4..9a8f9e1 100644
--- a/intgemm/callbacks/implementations.inl
+++ b/intgemm/callbacks/implementations.inl
@@ -1,13 +1,13 @@
 /* This file is included multiple times, once per architecture. */
 #if defined(CALLBACKS_THIS_IS_SSE2)
   #define CPU_NAME SSE2
-  #define CPU_ATTR INTGEMM_SSE2
+  #define INTGEMM_TARGET INTGEMM_SSE2
 #elif defined(CALLBACKS_THIS_IS_AVX2)
   #define CPU_NAME AVX2
-  #define CPU_ATTR INTGEMM_AVX2
+  #define INTGEMM_TARGET INTGEMM_AVX2
 #elif defined(CALLBACKS_THIS_IS_AVX512BW)
   #define CPU_NAME AVX512BW
-  #define CPU_ATTR INTGEMM_AVX512BW
+  #define INTGEMM_TARGET INTGEMM_AVX512BW
 #else
   #error "Only SSE2, AVX2 and AVX512BW are supported"
 #endif
@@ -22,6 +22,13 @@
   #define vd vector_t<CPUType::AVX2, double>
 #endif
 
+/* Intel compiler 19.1.0.166 20191121 fails to link constructors with target attributes */
+#ifdef __INTEL_COMPILER
+#define INTGEMM_TARGET_CONSTRUCTOR
+#else
+#define INTGEMM_TARGET_CONSTRUCTOR INTGEMM_TARGET
+#endif
+
 namespace intgemm {
 namespace callbacks {
 
@@ -42,9 +49,9 @@ namespace callbacks {
 template <typename... Configs>
 class CallbackImpl<CPUType::CPU_NAME, std::tuple<Configs...>> {
 public:
-  CPU_ATTR CallbackImpl(const std::tuple<Configs...>& configs) : callbacks(init_callbacks(configs, make_sequence<sizeof...(Configs)>())) {}
+  explicit CallbackImpl(const std::tuple<Configs...>& configs) : callbacks(init_callbacks(configs, make_sequence<sizeof...(Configs)>())) {}
 
-  CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) {
+  INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
     run_callbacks(input, info, callbacks, make_sequence<sizeof...(Configs)>());
   }
 
@@ -60,11 +67,11 @@ private:
 
 #define RUN_CALLBACKS_PIPELINE_IMPL(vtype) \
   template <unsigned FirstIndex> \
-  CPU_ATTR static inline void run_callbacks(vtype input, const OutputBufferInfo& info, CallbacksTupleType& tuple, sequence<FirstIndex>) { \
+  INTGEMM_TARGET static inline void run_callbacks(vtype input, const OutputBufferInfo& info, CallbacksTupleType& tuple, sequence<FirstIndex>) { \
     std::get<FirstIndex>(tuple)(input, info); \
   } \
   template <unsigned FirstIndex, unsigned SecondIndex, unsigned... RestIndices> \
-  CPU_ATTR static inline void run_callbacks(vtype input, const OutputBufferInfo& info, CallbacksTupleType& tuple, sequence<FirstIndex, SecondIndex, RestIndices...>) { \
+  INTGEMM_TARGET static inline void run_callbacks(vtype input, const OutputBufferInfo& info, CallbacksTupleType& tuple, sequence<FirstIndex, SecondIndex, RestIndices...>) { \
     auto output = std::get<FirstIndex>(tuple)(input, info); \
     run_callbacks(output, info, tuple, sequence<SecondIndex, RestIndices...>()); \
   }
@@ -81,8 +88,8 @@ private:
  */
 template <> class CallbackImpl<CPUType::CPU_NAME, Dummy> {
 public:
-  CPU_ATTR CallbackImpl(const Dummy&) {}
-  CPU_ATTR void operator()(vi, const OutputBufferInfo&) {}
+  explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const Dummy&) {}
+  INTGEMM_TARGET void Run(vi, const OutputBufferInfo&) {}
 };
 
 /*
@@ -91,9 +98,9 @@ public:
 template <typename Type>
 class CallbackImpl<CPUType::CPU_NAME, Write<Type>> {
 public:
-  CPU_ATTR CallbackImpl(const Write<Type>& config) : config(config) {}
+  explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const Write<Type>& config) : config(config) {}
 
-  CPU_ATTR void operator()(vector_t<CPUType::CPU_NAME, Type> input, const OutputBufferInfo& info) {
+  INTGEMM_TARGET void Run(vector_t<CPUType::CPU_NAME, Type> input, const OutputBufferInfo& info) {
     kernels::write(input, config.output_addr, info.row_idx * info.cols + info.col_idx);
   }
 
@@ -106,11 +113,11 @@ private:
  */
 template <> class CallbackImpl<CPUType::CPU_NAME, Unquantize> {
 public:
-  CPU_ATTR CallbackImpl(const Unquantize& config) : config(config) {
+  explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const Unquantize& config) : config(config) {
     unquant_mult = set1_ps<vf>(config.unquant_mult);
   }
 
-  CPU_ATTR vf operator()(vi input, const OutputBufferInfo&) {
+  INTGEMM_TARGET vf Run(vi input, const OutputBufferInfo&) {
     return kernels::unquantize(input, unquant_mult);
   }
 
@@ -124,11 +131,11 @@ private:
  */
 template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndWrite> {
 public:
-  CPU_ATTR CallbackImpl(const UnquantizeAndWrite& config) : config(config) {
+  explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndWrite& config) : config(config) {
     unquant_mult = set1_ps<vf>(config.unquant_mult);
   }
 
-  CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) {
+  INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
     // Workaround gcc 5 internal compiler error that can't read register members in debug.
     vf mult_reg;
 #if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
@@ -150,9 +157,9 @@ private:
  */
 template <> class CallbackImpl<CPUType::CPU_NAME, AddBiasAndWrite> {
 public:
-  CPU_ATTR CallbackImpl(const AddBiasAndWrite& config) : config(config) {}
+  explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const AddBiasAndWrite& config) : config(config) {}
 
-  CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) {
+  INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
     auto result = kernels::add_bias(input, config.bias_addr, info.col_idx);
     kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
   }
@@ -166,11 +173,11 @@ private:
  */
 template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndAddBiasAndWrite> {
 public:
-  CPU_ATTR CallbackImpl(const UnquantizeAndAddBiasAndWrite& config) : config(config) {
+  explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndAddBiasAndWrite& config) : config(config) {
     unquant_mult = set1_ps<vf>(config.unquant_mult);
   }
 
-  CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) {
+  INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
     // Workaround gcc 5 internal compiler error that can't read register members in debug.
     vf mult_reg;
 #if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
@@ -191,7 +198,7 @@ private:
 }
 
 #undef CPU_NAME
-#undef CPU_ATTR
+#undef INTGEMM_TARGET
 #undef vi
 #undef vf
 #undef vd
diff --git a/intgemm/multiply.h b/intgemm/multiply.h
index 84c0655..8d411f3 100644
--- a/intgemm/multiply.h
+++ b/intgemm/multiply.h
@@ -110,14 +110,14 @@ INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
 
 template <typename Callback>
 INTGEMM_SSE2 static inline void RunCallback(Callback& callback_impl, dvector_t<CPUType::SSE2, int> total, Index row_idx, Index col_idx, Index rows, Index cols) {
-  callback_impl(total.first, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
-  callback_impl(total.second, callbacks::OutputBufferInfo(row_idx, col_idx + 4, rows, cols));
+  callback_impl.Run(total.first, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
+  callback_impl.Run(total.second, callbacks::OutputBufferInfo(row_idx, col_idx + 4, rows, cols));
 }
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template <typename Callback>
 INTGEMM_AVX2 static inline void RunCallback(Callback& callback_impl, vector_t<CPUType::AVX2, int> total, Index row_idx, Index col_idx, Index rows, Index cols) {
-  callback_impl(total, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
+  callback_impl.Run(total, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
 }
 #endif
 
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 5395d40..186b0f9 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -20,7 +20,10 @@
 
 namespace intgemm {
 
-INTGEMM_SSE2 TEST_CASE("Transpose 16", "[transpose]") {
+#ifndef __INTEL_COMPILER
+INTGEMM_SSE2
+#endif
+TEST_CASE("Transpose 16", "[transpose]") {
   if (kCPU < CPUType::SSE2) return;
   const unsigned N = 8;
   AlignedVector<int16_t> input(N * N);
@@ -38,7 +41,10 @@ INTGEMM_SSE2 TEST_CASE("Transpose 16", "[transpose]") {
   }
 }
 
-INTGEMM_SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
+#ifndef __INTEL_COMPILER
+INTGEMM_SSSE3
+#endif
+TEST_CASE("Transpose 8", "[transpose]") {
   if (kCPU < CPUType::SSSE3) return;
   const unsigned N = 16;
   AlignedVector<int8_t> input(N * N);
author	Kenneth Heafield <github@kheafield.com>	2021-01-04 23:36:22 +0300
committer	Kenneth Heafield <github@kheafield.com>	2021-01-04 23:36:22 +0300
commit	65276ad59ab9cd5b2bc623c2411f481f79aa7c5c (patch)
tree	dee1cdb23b97d60f15d55ba52b933987834ef838
parent	1318506945c0dfc0af8f24be15be31323140c40a (diff)