From 869a46df2980818644db4823fb1d29e9d525b645 Mon Sep 17 00:00:00 2001
From: Michael Jones
Date: Tue, 12 Apr 2022 19:36:55 +0100
Subject: Cycles fp consistency for Apple Silicon CPUs

Propagate the FP settings from the main thread to all the worker threads
(the FP settings include the FZ setting, among other things). This
guarantees consistent execution of floating-point math regardless of
whether it is executed in a TBB thread arena or on the main thread.

Add FZ (flush-to-zero) mode for arm64/aarch64, in parallel to the way it
is already done for Intel processors. Currently, compiling for an Arm
target does not set this mode at all, so the code potentially runs slower
and its results may not match those produced on Intel x86.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D14454
---
 intern/cycles/util/simd.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'intern/cycles/util')

diff --git a/intern/cycles/util/simd.h b/intern/cycles/util/simd.h
index 15dda4e76a8..6772025d1de 100644
--- a/intern/cycles/util/simd.h
+++ b/intern/cycles/util/simd.h
@@ -32,6 +32,12 @@
 # define SIMD_SET_FLUSH_TO_ZERO \
     _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
     _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define _MM_FLUSH_ZERO_ON 24
+#define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r" (__fpcr))
+#define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : :"ri" (__fpcr))
+# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
+# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
 #else
 # define SIMD_SET_FLUSH_TO_ZERO
 #endif
@@ -104,6 +110,21 @@ static struct PosInfTy {
 static struct StepTy {
 } step ccl_attr_maybe_unused;
 
+#endif
+#if defined(__aarch64__) || defined(_M_ARM64)
+__forceinline int set_fz(uint32_t flag) {
+  uint64_t old_fpcr, new_fpcr;
+  __get_fpcr(old_fpcr);
+  new_fpcr = old_fpcr | (1ULL << flag);
+  __set_fpcr(new_fpcr);
+  __get_fpcr(old_fpcr);
+  return old_fpcr == new_fpcr;
+}
+__forceinline int get_fz(uint32_t flag) {
+  uint64_t cur_fpcr;
+  __get_fpcr(cur_fpcr);
+  return (cur_fpcr & (1ULL<< flag)) > 0 ? 1 : 0 ;
+}
 #endif
 
 /* Utilities used by Neon */
-- 
cgit v1.2.3