Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/intgemm/intgemm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKenneth Heafield <kpu@users.noreply.github.com>2020-03-17 14:02:44 +0300
committerGitHub <noreply@github.com>2020-03-17 14:02:44 +0300
commit8033cdf974f69599edde6ba9126bff01b91f2435 (patch)
tree3dd60dbf7b979db3eeb46083477d00433c78c63a
parent261a5fbcf7558fc3c2ac22b33fe0c2930d440fc3 (diff)
parent79a3be9e7b78a4cbb231b5fc4e23dc0593f20240 (diff)
Merge pull request #71 from kpuatamazon/master
Improve compiler support
-rw-r--r--CMakeLists.txt3
-rw-r--r--avx2_gemm.h4
-rw-r--r--avx512_gemm.h42
-rw-r--r--benchmarks/biasmultiply.cc1
-rw-r--r--compile_test_avx512vnni.cc10
-rw-r--r--intgemm.cc8
-rw-r--r--intgemm.h90
-rw-r--r--multiply.h19
-rw-r--r--ssse3_gemm.h3
-rw-r--r--test/kernels/multiply_sat_test.cc14
10 files changed, 128 insertions, 66 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bbb9b83..32c19ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,7 +47,8 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
add_library(intgemm STATIC intgemm.cc)
-if (OPENMP)
+option(USE_OPENMP "Use OpenMP" OFF)
+if (USE_OPENMP)
message(STATUS "Compiling with OpenMP")
find_package(OpenMP)
if (NOT ${OpenMP_CXX_FOUND})
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 335fa0d..25866bb 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -201,7 +201,9 @@ struct AVX2_8bit {
INTGEMM_AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
-
+ private:
+ INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2)
+ public:
INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2)
// Currently A is prepared by quantization but this could theoretically change.
diff --git a/avx512_gemm.h b/avx512_gemm.h
index cdbfff5..6286ccc 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -224,26 +224,48 @@ struct AVX512_8bit {
Quantize(input, output, quant_mult, rows * cols);
}
+ private:
+ /* g++ (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0 does not carry target attributes
+ * to the hidden function it creates in implementing #pragma omp parallel for.
+ * So intrinsics were not working inside the for loop when compiled with
+ * OMP. Also, passing register types across #pragma omp parallel for
+ * generated an internal compiler error.
+ * The problem does not occur in g++-8 (Ubuntu 8.3.0-6ubuntu1~18.04.1) 8.3.0.
+ * As a workaround, I split into #pragma omp parallel with boring types
+ * passed across the boundary then call this function with target attributes.
+ */
+ INTGEMM_AVX512BW static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) {
+ const __m512i neg127 = _mm512_set1_epi32(-127);
+ const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
+ const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
+#pragma omp for
+ for (std::size_t i = 0; i < count; i += kBatch) {
+ __m512i asint = avx512f::QuantizerGrab(input + i, quant_mult_reg);
+ asint = _mm512_max_epi32(asint, neg127);
+ // There doesn't seem to be an unmasked version.
+ _mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint);
+ }
+ }
+
+ public:
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// Convert to 8-bit signed integers.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
assert(reinterpret_cast<uintptr_t>(input) % sizeof(__m512i) == 0);
- const __m512i neg127 = _mm512_set1_epi32(-127);
- const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
- const float *fast_input_end = input + (size & ~(kBatch - 1));
- int8_t *fast_output_end = output + (size & ~(kBatch - 1));
-#pragma omp parallel for
- for (const float *input_it = input; input_it < fast_input_end; input_it += kBatch) {
- __m512i asint = avx512f::QuantizerGrab(input_it, quant_mult_reg);
- asint = _mm512_max_epi32(asint, neg127);
- // There doesn't seem to be an unmasked version.
- _mm512_mask_cvtsepi32_storeu_epi8(output + (input_it - input), 0xffff, asint);
+ std::size_t fast_size = (size & ~(kBatch - 1));
+ const float *fast_input_end = input + fast_size;
+ int8_t *fast_output_end = output + fast_size;
+#pragma omp parallel
+ {
+ QuantizeThread(input, output, quant_mult, fast_size);
}
std::size_t overhang = size & (kBatch - 1);
if (!overhang) return; // We needed a branch anyway for the empty case.
+ const __m512i neg127 = _mm512_set1_epi32(-127);
+ const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
__m512i asint = avx512f::QuantizerGrab(fast_input_end, quant_mult_reg);
asint = _mm512_max_epi32(asint, neg127);
_mm512_mask_cvtsepi32_storeu_epi8(fast_output_end, (1 << overhang) - 1, asint);
diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc
index 515536e..8f0816f 100644
--- a/benchmarks/biasmultiply.cc
+++ b/benchmarks/biasmultiply.cc
@@ -8,7 +8,6 @@ using namespace intgemm;
template <class Routine>
void testOld(Index /*rows*/, Index /*cols*/) {
-
}
template <class Routine>
diff --git a/compile_test_avx512vnni.cc b/compile_test_avx512vnni.cc
index 611cc53..deb0f88 100644
--- a/compile_test_avx512vnni.cc
+++ b/compile_test_avx512vnni.cc
@@ -19,11 +19,13 @@ bool Foo() {
}
int main() {
- return Foo() &&
-#ifdef __INTEL_COMPILER
- _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)
+ return Foo()
+#if defined(__GNUC__) || defined(__clang__)
+ // uses cpuid
+#elif defined(__INTEL_COMPILER)
+ && _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)
#else
- __builtin_cpu_supports("avx512vnni")
+ && __builtin_cpu_supports("avx512vnni")
#endif
;
}
diff --git a/intgemm.cc b/intgemm.cc
index c069424..8838cdb 100644
--- a/intgemm.cc
+++ b/intgemm.cc
@@ -40,12 +40,18 @@ const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::
float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512f::MaxAbsolute, avx512f::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute);
+constexpr const char *const Unsupported_16bit::kName;
+constexpr const char *const Unsupported_8bit::kName;
constexpr const char *const SSE2_16bit::kName;
constexpr const char *const SSSE3_8bit::kName;
constexpr const char *const AVX2_8bit::kName;
constexpr const char *const AVX2_16bit::kName;
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
constexpr const char *const AVX512_8bit::kName;
-constexpr const char *const AVX512VNNI_8bit::kName;
constexpr const char *const AVX512_16bit::kName;
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+constexpr const char *const AVX512VNNI_8bit::kName;
+#endif
}
diff --git a/intgemm.h b/intgemm.h
index 6d4d95f..0c315fc 100644
--- a/intgemm.h
+++ b/intgemm.h
@@ -51,7 +51,7 @@
#include "avx512_gemm.h"
#include "avx512vnni_gemm.h"
-#if defined(__GNUC__) && defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
+#if defined(__GNUC__) || defined(__clang__)
#include "cpuid.h"
#endif
@@ -124,7 +124,7 @@ struct Unsupported_8bit {
typedef Unsupported_16bit AVX512_16bit;
typedef Unsupported_8bit AVX512_8bit;
namespace avx512f {
-static inline float MaxAbsolute(const float *begin, const float *end) {
+static inline float MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
throw UnsupportedCPU();
}
} //namespace
@@ -135,26 +135,6 @@ static inline float MaxAbsolute(const float *begin, const float *end) {
typedef Unsupported_8bit AVX512VNNI_8bit;
#endif
-
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-// gcc 5.4.0 bizarrely supports avx512bw targets but not __builtin_cpu_supports("avx512bw"). So implement it manually.
-inline bool CheckAVX512BW() {
- __builtin_cpu_init ();
-#ifdef __INTEL_COMPILER
- return _may_i_use_cpu_feature(_FEATURE_AVX512BW)
-#elif __GNUC__
- unsigned int m = __get_cpuid_max(0, NULL);
- if (m < 7) return false;
- unsigned int eax, ebx, ecx, edx;
- __cpuid_count(7, 0, eax, ebx, ecx, edx);
- const unsigned int avx512bw_bit = (1 << 30);
- return ebx & avx512bw_bit;
-#else
- return __builtin_cpu_supports("avx512bw");
-#endif
-}
-#endif
-
/* Returns:
 * avx512vnni if the CPU supports AVX512VNNI
*
@@ -172,24 +152,59 @@ template <class T> T ChooseCPU(T
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
avx512vnni
#endif
- , T avx512bw, T avx2, T ssse3, T sse2, T unsupported) {
- __builtin_cpu_init ();
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
- if (
-#ifdef __INTEL_COMPILER
- _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)
-#else
- __builtin_cpu_supports("avx512vnni")
+ , T
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+ avx512bw
#endif
- ) {
- return avx512vnni;
+ , T avx2, T ssse3, T sse2, T unsupported) {
+ /* If intgemm is compiled by gcc 6.4.1 then dlopened into an executable
+ * compiled by gcc 7.3.0, there will be an undefined symbol __cpu_info.
+ * Work around this by calling the intrinsics more directly instead of
+ * __builtin_cpu_supports.
+ *
+ * clang 6.0.0-1ubuntu2 supports vnni but doesn't have
+ * __builtin_cpu_supports("avx512vnni")
+ * so use the hand-coded CPUID for clang.
+ */
+#if defined(__GNUC__) || defined(__clang__)
+ unsigned int m = __get_cpuid_max(0, NULL);
+ unsigned int eax, ebx, ecx, edx;
+ if (m >= 7) {
+ __cpuid_count(7, 0, eax, ebx, ecx, edx);
+# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+ if (ecx & (1 << 11)) return avx512vnni;
+# endif
+# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+ if (ebx & (1 << 30)) return avx512bw;
+# endif
+ if (ebx & (1 << 5)) return avx2;
}
-#endif
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- if (CheckAVX512BW()) {
- return avx512bw;
+ if (m >= 1) {
+ __cpuid_count(1, 0, eax, ebx, ecx, edx);
+ if (ecx & (1 << 9)) return ssse3;
+ if (edx & (1 << 26)) return sse2;
}
-#endif
+ return unsupported;
+#else // not gcc or clang.
+ __builtin_cpu_init();
+# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+ if (
+# ifdef __INTEL_COMPILER
+ _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)
+# else
+ __builtin_cpu_supports("avx512vnni")
+# endif
+ ) return vnni;
+# endif
+# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+ if (
+# ifdef __INTEL_COMPILER
+ _may_i_use_cpu_feature(_FEATURE_AVX512BW)
+# else
+ __builtin_cpu_supports("avx512bw")
+# endif
+ ) return avx512bw;
+# endif
if (__builtin_cpu_supports("avx2")) {
return avx2;
} else if (__builtin_cpu_supports("ssse3")) {
@@ -199,6 +214,7 @@ template <class T> T ChooseCPU(T
} else {
return unsupported;
}
+#endif
}
struct TileInfo {
diff --git a/multiply.h b/multiply.h
index a313c16..84d6737 100644
--- a/multiply.h
+++ b/multiply.h
@@ -60,19 +60,30 @@ static inline INTGEMM_AVX512F float MaxFloat32(__m512 a) {
#endif
// Quantize function used for SSSE3 and AVX2.
+// Separate function for thread to work around gcc 7 bug that doesn't imbue
+// target attributes across #pragma omp parallel.
+#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \
+target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \
+ name::QuantizeTile8 q(quant_mult); \
+ _Pragma("omp for") \
+ for (std::size_t i = 0; i < count; i += sizeof(Register)) { \
+ *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+ } \
+}
+
#define INTGEMM_QUANTIZE(target, Register, name) \
target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
- name::QuantizeTile8 q(quant_mult); \
const std::size_t kBatch = sizeof(Register); \
const std::size_t fast_end = size & ~(kBatch - 1); \
- _Pragma("omp parallel for") \
- for (std::size_t i = 0; i < fast_end; i += kBatch) { \
- *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+ _Pragma("omp parallel") \
+ { \
+ QuantizeThread(input, output, quant_mult, fast_end); \
} \
std::size_t overhang = size & (kBatch - 1); \
if (!overhang) return; \
+ name::QuantizeTile8 q(quant_mult); \
/* Each does size(Register) / 32 == kBatch / 4 floats at a time.
* If we're allowed to read one of them, then we can read the whole register. */ \
const float *inputs[4]; \
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index 2cf341e..fd3ab8c 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -116,6 +116,9 @@ struct SSSE3_8bit {
Quantize(input, output, quant_mult, rows * cols);
}
+ private:
+ INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3)
+ public:
INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3)
// Version with unsigned int + 127
diff --git a/test/kernels/multiply_sat_test.cc b/test/kernels/multiply_sat_test.cc
index 36a9b9f..edea772 100644
--- a/test/kernels/multiply_sat_test.cc
+++ b/test/kernels/multiply_sat_test.cc
@@ -21,13 +21,13 @@ void kernel_multiply_sat_test() {
std::iota(input1.begin(), input1.end(), -int(VECTOR_LENGTH / 2));
std::iota(input2.begin(), input2.end(), -int(VECTOR_LENGTH / 3));
- for (std::size_t shift = 0; shift <= 2 * 8 * sizeof(Type_); ++shift) {
- *output.template as<vec_t>() = kernels::multiply_sat<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>(), shift);
- for (std::size_t i = 0; i < output.size(); ++i) {
- auto ref = (int64_t(input1[i]) * input2[i]) >> shift;
- auto ref_sat = Type_(std::min<int64_t>(std::numeric_limits<Type_>::max(), std::max<int64_t>(std::numeric_limits<Type_>::min(), ref)));
- CHECK(output[i] == ref_sat);
- }
+ // TODO: try all shifts. The shift must be an immediate.
+ std::size_t shift = 1;
+ *output.template as<vec_t>() = kernels::multiply_sat<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>(), shift);
+ for (std::size_t i = 0; i < output.size(); ++i) {
+ auto ref = (int64_t(input1[i]) * input2[i]) >> shift;
+ auto ref_sat = Type_(std::min<int64_t>(std::numeric_limits<Type_>::max(), std::max<int64_t>(std::numeric_limits<Type_>::min(), ref)));
+ CHECK(output[i] == ref_sat);
}
}