diff options
author | Tue Ly <lntue@google.com> | 2022-04-07 23:02:10 +0300 |
---|---|---|
committer | Tue Ly <lntue@google.com> | 2022-04-08 21:12:24 +0300 |
commit | c5f8a0a1e929ea41bd3e0d781c6b394a3f619427 (patch) | |
tree | 9edde00f445c547d31f78d29be562c11d9834cc7 /libc | |
parent | d49c0ba5abdeb8f6ebe081d96e0fdf44de46e7f8 (diff) |
[libc] Add support for x86-64 targets that do not have FMA instructions.
Make FMA flag checks more accurate for x86-64 targets, and refactor
polyeval to use multiply and add instead when FMA instructions are not
available.
Reviewed By: michaelrj, sivachandra
Differential Revision: https://reviews.llvm.org/D123335
Diffstat (limited to 'libc')
-rw-r--r-- | libc/src/__support/FPUtil/CMakeLists.txt | 26 | ||||
-rw-r--r-- | libc/src/__support/FPUtil/FMA.h | 5 | ||||
-rw-r--r-- | libc/src/__support/FPUtil/PolyEval.h | 37 | ||||
-rw-r--r-- | libc/src/__support/FPUtil/aarch64/FMA.h | 4 | ||||
-rw-r--r-- | libc/src/__support/FPUtil/generic/CMakeLists.txt | 6 | ||||
-rw-r--r-- | libc/src/__support/FPUtil/generic/FMA.h | 1 | ||||
-rw-r--r-- | libc/src/__support/FPUtil/multiply_add.h | 41 | ||||
-rw-r--r-- | libc/src/__support/FPUtil/x86_64/FMA.h | 4 | ||||
-rw-r--r-- | libc/src/__support/architectures.h | 10 | ||||
-rw-r--r-- | libc/src/math/CMakeLists.txt | 6 | ||||
-rw-r--r-- | libc/src/math/generic/CMakeLists.txt | 11 | ||||
-rw-r--r-- | libc/src/math/generic/expm1f.cpp | 6 | ||||
-rw-r--r-- | libc/src/math/generic/log10f.cpp | 2 | ||||
-rw-r--r-- | libc/src/math/generic/log1pf.cpp | 4 | ||||
-rw-r--r-- | libc/src/math/generic/logf.cpp | 2 |
15 files changed, 121 insertions, 44 deletions
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt index f1cd0b587d96..7f1cecc25f03 100644 --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -12,7 +12,6 @@ add_header_library( NearestIntegerOperations.h NormalFloat.h PlatformDefs.h - PolyEval.h UInt.h XFloat.h DEPENDS @@ -34,4 +33,29 @@ add_header_library( libc.src.__support.FPUtil.generic.sqrt ) +add_header_library( + fma + HDRS + FMA.h + DEPENDS + .fputil + libc.src.__support.FPUtil.generic.fma +) + +add_header_library( + multiply_add + HDRS + multiply_add.h + DEPENDS + .fma +) + +add_header_library( + polyeval + HDRS + PolyEval.h + DEPENDS + .multiply_add +) + add_subdirectory(generic) diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h index c735c069fd4d..6823dd0f897b 100644 --- a/libc/src/__support/FPUtil/FMA.h +++ b/libc/src/__support/FPUtil/FMA.h @@ -11,11 +11,16 @@ #include "src/__support/architectures.h" +#if defined(LIBC_TARGET_HAS_FMA) + #if defined(LLVM_LIBC_ARCH_X86_64) #include "x86_64/FMA.h" #elif defined(LLVM_LIBC_ARCH_AARCH64) #include "aarch64/FMA.h" +#endif + #else +// FMA instructions are not available #include "generic/FMA.h" #include "src/__support/CPP/TypeTraits.h" diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h index 368ee3848ddc..c9e818accd1b 100644 --- a/libc/src/__support/FPUtil/PolyEval.h +++ b/libc/src/__support/FPUtil/PolyEval.h @@ -9,19 +9,15 @@ #ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H -#include "src/__support/CPP/TypeTraits.h" -#include "src/__support/architectures.h" +#include "multiply_add.h" // Evaluate polynomial using Horner's Scheme: // With polyeval(x, a_0, a_1, ..., a_n) = a_n * x^n + ... + a_1 * x + a_0, we // evaluated it as: a_0 + x * (a_1 + x * ( ... (a_(n-1) + x * a_n) ... ) ) ). -// We will use fma instructions if available. +// We will use FMA instructions if available. // Example: to evaluate x^3 + 2*x^2 + 3*x + 4, call // polyeval( x, 4.0, 3.0, 2.0, 1.0 ) -#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64) -#include "FMA.h" - namespace __llvm_libc { namespace fputil { @@ -29,35 +25,10 @@ template <typename T> static inline T polyeval(T x, T a0) { return a0; } template <typename T, typename... Ts> INLINE_FMA static inline T polyeval(T x, T a0, Ts... a) { - return fma(x, polyeval(x, a...), a0); + return multiply_add(x, polyeval(x, a...), a0); } } // namespace fputil } // namespace __llvm_libc -#ifdef LLVM_LIBC_ARCH_X86_64 - -// [DISABLED] There is a regression with using vectorized version for polyeval -// compared to the naive Horner's scheme with fma. Need further investigation -// #include "x86_64/PolyEval.h" - -#endif // LLVM_LIBC_ARCH_X86_64 - -#else - -namespace __llvm_libc { -namespace fputil { - -template <typename T> static inline T polyeval(T x, T a0) { return a0; } - -template <typename T, typename... Ts> -static inline T polyeval(T x, T a0, Ts... a) { - return x * polyeval(x, a...) + a0; -} - -} // namespace fputil -} // namespace __llvm_libc - -#endif - -#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_FMA_H +#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H diff --git a/libc/src/__support/FPUtil/aarch64/FMA.h b/libc/src/__support/FPUtil/aarch64/FMA.h index c236c9a90b74..ed637c848658 100644 --- a/libc/src/__support/FPUtil/aarch64/FMA.h +++ b/libc/src/__support/FPUtil/aarch64/FMA.h @@ -15,6 +15,10 @@ #error "Invalid include" #endif +#if !defined(LIBC_TARGET_HAS_FMA) +#error "FMA instructions are not supported" +#endif + #include "src/__support/CPP/TypeTraits.h" namespace __llvm_libc { diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt index bf69e7dd961c..a755e7670ce6 100644 --- a/libc/src/__support/FPUtil/generic/CMakeLists.txt +++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt @@ -4,3 +4,9 @@ add_header_library( sqrt.h sqrt_80_bit_long_double.h ) + +add_header_library( + fma + HDRS + FMA.h +) diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index efdd8b712975..78b640c2c1a1 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMA_H #include "src/__support/CPP/TypeTraits.h" +#include "src/__support/FPUtil/FPBits.h" namespace __llvm_libc { namespace fputil { diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h new file mode 100644 index 000000000000..8f5da22a53cb --- /dev/null +++ b/libc/src/__support/FPUtil/multiply_add.h @@ -0,0 +1,41 @@ +//===-- Common header for multiply-add implementations ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H +#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H + +#include "src/__support/architectures.h" + +namespace __llvm_libc { +namespace fputil { + +// Implement a simple wrapper for multiply-add operation: +// multiply_add(x, y, z) = x*y + z +// which uses FMA instructions to speed up if available. + +template <typename T> static inline T multiply_add(T x, T y, T z) { + return x * y + z; +} + +#if defined(LIBC_TARGET_HAS_FMA) +// FMA instructions are available. +#include "FMA.h" + +template <> inline float multiply_add<float>(float x, float y, float z) { + return fma(x, y, z); +} + +template <> inline double multiply_add<double>(double x, double y, double z) { + return fma(x, y, z); +} +#endif // LIBC_TARGET_HAS_FMA + +} // namespace fputil +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H diff --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h index 70ebe382e841..08de6da34410 100644 --- a/libc/src/__support/FPUtil/x86_64/FMA.h +++ b/libc/src/__support/FPUtil/x86_64/FMA.h @@ -15,6 +15,10 @@ #error "Invalid include" #endif +#if !defined(LIBC_TARGET_HAS_FMA) +#error "FMA instructions are not supported" +#endif + #include "src/__support/CPP/TypeTraits.h" #include <immintrin.h> diff --git a/libc/src/__support/architectures.h b/libc/src/__support/architectures.h index 14eb1a586463..70eeb99107b2 100644 --- a/libc/src/__support/architectures.h +++ b/libc/src/__support/architectures.h @@ -37,7 +37,15 @@ #define LLVM_LIBC_ARCH_ANY_ARM #endif -#if defined(LLVM_LIBC_ARCH_X86_64) +#if defined(LLVM_LIBC_ARCH_AARCH64) +#define LIBC_TARGET_HAS_FMA +#elif defined(LLVM_LIBC_ARCH_X86_64) +#if (defined(__AVX2__) || defined(__FMA__)) +#define LIBC_TARGET_HAS_FMA +#endif +#endif + +#if (defined(LLVM_LIBC_ARCH_X86_64) && defined(LIBC_TARGET_HAS_FMA)) #define INLINE_FMA __attribute__((target("fma"))) #else #define INLINE_FMA diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 9737f4a362dc..8fc550d1f968 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -48,8 +48,9 @@ add_entrypoint_object( fmaf.h DEPENDS libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.fma COMPILE_OPTIONS - -O2 + -O3 -mfma ) @@ -61,8 +62,9 @@ add_entrypoint_object( fma.h DEPENDS libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.fma COMPILE_OPTIONS - -O2 + -O3 -mfma ) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 73957cf78910..6a96b5510804 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -478,6 +478,7 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.polyeval libc.include.math COMPILE_OPTIONS -O3 @@ -492,6 +493,7 @@ add_entrypoint_object( ../exp2f.h DEPENDS libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.polyeval libc.include.math COMPILE_OPTIONS -O3 @@ -507,6 +509,8 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval libc.include.math COMPILE_OPTIONS -O3 @@ -674,6 +678,8 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma @@ -688,6 +694,8 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma @@ -702,6 +710,7 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma @@ -716,6 +725,8 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp index b0544b76e09a..76232d6ab6a9 100644 --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -83,7 +83,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { // = x otherwise. // To simplify the rounding decision and make it more efficient, we use // fma(x, x, x) ~ x + x^2 instead. - return fputil::fma(x, x, x); + return fputil::multiply_add(x, x, x); } // 2^-25 <= |x| < 2^-4 @@ -96,7 +96,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { fputil::polyeval(xd, 0x1p-1, 0x1.55555555557ddp-3, 0x1.55555555552fap-5, 0x1.111110fcd58b7p-7, 0x1.6c16c1717660bp-10, 0x1.a0241f0006d62p-13, 0x1.a01e3f8d3c06p-16); - return static_cast<float>(fputil::fma(r, xsq, xd)); + return static_cast<float>(fputil::multiply_add(r, xsq, xd)); } // For -18 < x < 89, to compute expm1(x), we perform the following range @@ -132,7 +132,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { double exp_lo = fputil::polyeval(xd, 0x1.0p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1, 0x1.555566668e5e7p-3, 0x1.55555555ef243p-5); - return static_cast<float>(fputil::fma(exp_hi_mid, exp_lo, -1.0)); + return static_cast<float>(fputil::multiply_add(exp_hi_mid, exp_lo, -1.0)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp index 59ca6590b640..878ae68f85eb 100644 --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -170,7 +170,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) { double d = static_cast<float>(xbits) - static_cast<float>(f); d *= ONE_OVER_F[f_index]; - double extra_factor = fputil::fma(m, LOG10_2, LOG10_F[f_index]); + double extra_factor = fputil::multiply_add(m, LOG10_2, LOG10_F[f_index]); double r = fputil::polyeval(d, extra_factor, 0x1.bcb7b1526e4c5p-2, -0x1.bcb7b1518a5e9p-3, 0x1.287a72a6f716p-3, diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index 7d1e71e91990..6e8c6781e6ef 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -66,7 +66,7 @@ INLINE_FMA static inline float log(double x) { double d = static_cast<double>(xbits) - static_cast<double>(f); d *= ONE_OVER_F[f_index]; - double extra_factor = fputil::fma(m, LOG_2, LOG_F[f_index]); + double extra_factor = fputil::multiply_add(m, LOG_2, LOG_F[f_index]); double r = fputil::polyeval(d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2, 0x1.5555513bc679ap-2, @@ -161,7 +161,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { // > fpminimax(log(1 + x)/x, 5, [|D...|], [-2^-8; 2^-8]); r = fputil::polyeval(xd, -0x1p-1, 0x1.5555555515551p-2, -0x1.ffffffff82bdap-3, 0x1.999b33348d3aep-3, -0x1.5556cae3adcc3p-3); - return static_cast<float>(fputil::fma(r, xd * xd, xd)); + return static_cast<float>(fputil::multiply_add(r, xd * xd, xd)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp index 3e712378b64c..747f8c73c27c 100644 --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -120,7 +120,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { d *= ONE_OVER_F[f_index]; double extra_factor = - fputil::fma(static_cast<double>(m), LOG_2, LOG_F[f_index]); + fputil::multiply_add(static_cast<double>(m), LOG_2, LOG_F[f_index]); double r = __llvm_libc::fputil::polyeval( d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2, |