github.com/marian-nmt/intgemm/intgemm.git
author    Kenneth Heafield <kheafiel@amazon.com>  2020-03-06 15:56:51 +0300
committer Kenneth Heafield <kheafiel@amazon.com>  2020-03-06 15:56:51 +0300
commit    37a20ecd90cd8d28e70740156ecffbd692b376dc (patch)
tree      425189169a558f1c6e424c1cb8af19fdc5ce6376
parent    8a6d47f126f0ab16a97e99e61f24cba4028f57cd (diff)
Fix OpenMP compilation on gcc 7.4.0
The #pragma omp parallel is implemented by creating another function for the thread to launch. gcc 7.4.0 fails to carry the target attributes to that new function, so intrinsics were not working. Passing register types across the boundary causes an internal compiler error. These for loops need constants, like -127, initialized in registers, and simply moving the constant initialization into the loop body generated code that reinitializes them every iteration (no cross-loop constant extraction). The workaround is to split #pragma omp parallel so that it launches a function carrying the target attributes, which initializes the constants and then uses #pragma omp for to divvy up the work.
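To make the workaround concrete, here is a minimal standalone sketch of the pattern (not the repository's code: QuantizeSketch, QuantizeThreadSketch, and the raw quantization arithmetic are illustrative, and count is assumed to be a multiple of the register width). The target attribute stays on the function that owns the intrinsics and the register constants; #pragma omp parallel only passes plain scalar types into it, and #pragma omp for inside the attributed function divides the iterations among the threads.

#include <cstddef>
#include <cstdint>
#include <immintrin.h>

// Hypothetical AVX-512 quantization kernel. Because the target attribute is
// attached here, the intrinsics and the register-typed constants are valid
// even though the function is entered from an OpenMP parallel region.
__attribute__((target("avx512f,avx512bw")))
static void QuantizeThreadSketch(const float *input, int8_t *output,
                                 float quant_mult, std::size_t count) {
  const __m512i neg127 = _mm512_set1_epi32(-127);   // initialized once per thread
  const __m512 mult = _mm512_set1_ps(quant_mult);
  const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
#pragma omp for
  for (std::size_t i = 0; i < count; i += kBatch) {
    __m512 scaled = _mm512_mul_ps(_mm512_loadu_ps(input + i), mult);
    __m512i asint = _mm512_max_epi32(_mm512_cvtps_epi32(scaled), neg127);
    _mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint);
  }
}

static void QuantizeSketch(const float *input, int8_t *output,
                           float quant_mult, std::size_t count) {
  // Only plain types cross this boundary. gcc 7.4.0 outlines the body of the
  // parallel region into a hidden function that loses the target attributes,
  // so nothing inside this region may need them directly.
#pragma omp parallel
  {
    QuantizeThreadSketch(input, output, quant_mult, count);
  }
}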
-rw-r--r--  avx2_gemm.h     4
-rw-r--r--  avx512_gemm.h   42
-rw-r--r--  multiply.h      19
-rw-r--r--  ssse3_gemm.h    3
4 files changed, 53 insertions, 15 deletions
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 335fa0d..25866bb 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -201,7 +201,9 @@ struct AVX2_8bit {
INTGEMM_AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
-
+ private:
+ INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2)
+ public:
INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2)
// Currently A is prepared by quantization but this could theoretically change.
diff --git a/avx512_gemm.h b/avx512_gemm.h
index 267dc6d..c592e1c 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -224,26 +224,48 @@ struct AVX512_8bit {
Quantize(input, output, quant_mult, rows * cols);
}
+ private:
+ /* g++ (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0 does not carry target attributes
+ * to the hidden function it creates in implementing #pragma omp parallel for.
+ * So intrinsics were not working inside the for loop when compiled with
+ * OMP. Also, passing register types across #pragma omp parallel for
+ * generated an internal compiler error.
+ * The problem does not occur in g++-8 (Ubuntu 8.3.0-6ubuntu1~18.04.1) 8.3.0.
+ * As a workaround, I split into #pragma omp parallel with boring types
+ * passed across the boundary then call this function with target attributes.
+ */
+ INTGEMM_AVX512BW static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) {
+ const __m512i neg127 = _mm512_set1_epi32(-127);
+ const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
+ const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
+#pragma omp for
+ for (std::size_t i = 0; i < count; i += kBatch) {
+ __m512i asint = avx512f::QuantizerGrab(input + i, quant_mult_reg);
+ asint = _mm512_max_epi32(asint, neg127);
+ // There doesn't seem to be an unmasked version.
+ _mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint);
+ }
+ }
+
+ public:
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// Convert to 8-bit signed integers.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
assert(reinterpret_cast<uintptr_t>(input) % sizeof(__m512i) == 0);
- const __m512i neg127 = _mm512_set1_epi32(-127);
- const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
- const float *fast_input_end = input + (size & ~(kBatch - 1));
- int8_t *fast_output_end = output + (size & ~(kBatch - 1));
-#pragma omp parallel for
- for (const float *input_it = input; input_it < fast_input_end; input_it += kBatch) {
- __m512i asint = avx512f::QuantizerGrab(input_it, quant_mult_reg);
- asint = _mm512_max_epi32(asint, neg127);
- // There doesn't seem to be an unmasked version.
- _mm512_mask_cvtsepi32_storeu_epi8(output + (input_it - input), 0xffff, asint);
+ std::size_t fast_size = (size & ~(kBatch - 1));
+ const float *fast_input_end = input + fast_size;
+ int8_t *fast_output_end = output + fast_size;
+#pragma omp parallel
+ {
+ QuantizeThread(input, output, quant_mult, fast_size);
}
std::size_t overhang = size & (kBatch - 1);
if (!overhang) return; // We needed a branch anyway for the empty case.
+ const __m512i neg127 = _mm512_set1_epi32(-127);
+ const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
__m512i asint = avx512f::QuantizerGrab(fast_input_end, quant_mult_reg);
asint = _mm512_max_epi32(asint, neg127);
_mm512_mask_cvtsepi32_storeu_epi8(fast_output_end, (1 << overhang) - 1, asint);
diff --git a/multiply.h b/multiply.h
index a9766d3..1c39c98 100644
--- a/multiply.h
+++ b/multiply.h
@@ -60,19 +60,30 @@ static inline INTGEMM_AVX512F float MaxFloat32(__m512 a) {
#endif
// Quantize function used for SSSE3 and AVX2.
+// Separate function for the thread to work around a gcc 7 bug that fails to
+// carry target attributes across #pragma omp parallel.
+#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \
+target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \
+ name::QuantizeTile8 q(quant_mult); \
+ _Pragma("omp for") \
+ for (std::size_t i = 0; i < count; i += sizeof(Register)) { \
+ *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+ } \
+}
+
#define INTGEMM_QUANTIZE(target, Register, name) \
target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
- name::QuantizeTile8 q(quant_mult); \
const std::size_t kBatch = sizeof(Register); \
const std::size_t fast_end = size & ~(kBatch - 1); \
- _Pragma("omp parallel for") \
- for (std::size_t i = 0; i < fast_end; i += kBatch) { \
- *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+ _Pragma("omp parallel") \
+ { \
+ QuantizeThread(input, output, quant_mult, fast_end); \
} \
std::size_t overhang = size & (kBatch - 1); \
if (!overhang) return; \
+ name::QuantizeTile8 q(quant_mult); \
/* Each does size(Register) / 32 == kBatch / 4 floats at a time.
* If we're allowed to read one of them, then we can read the whole register. */ \
const float *inputs[4]; \
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index 2cf341e..fd3ab8c 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -116,6 +116,9 @@ struct SSSE3_8bit {
Quantize(input, output, quant_mult, rows * cols);
}
+ private:
+ INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3)
+ public:
INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3)
// Version with unsigned int + 127