From d19e35873f67c90b251ca38e007a83aa1eada211 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 18 Nov 2021 14:25:05 +0100 Subject: Cycles: several small fixes and additions for MSL This patch contains many small leftover fixes and additions that are required for Metal-enablement: - Address space fixes and a few other small compile fixes - Addition of missing functionality to the Metal adapter headers - Addition of various scattered `__KERNEL_METAL__` blocks (e.g. for atomic support & maths functions) Ref T92212 Differential Revision: https://developer.blender.org/D13263 --- intern/cycles/util/atomic.h | 56 +++++++++++++++ intern/cycles/util/debug.cpp | 12 ++++ intern/cycles/util/debug.h | 14 ++++ intern/cycles/util/half.h | 36 ++++++++-- intern/cycles/util/math.h | 75 ++++++++++++++++---- intern/cycles/util/math_float2.h | 5 ++ intern/cycles/util/math_float3.h | 144 +++++++++++++++++++++----------------- intern/cycles/util/math_float4.h | 145 ++++++++++++++++++++------------------- intern/cycles/util/math_int2.h | 4 ++ intern/cycles/util/math_int3.h | 40 ++++++----- intern/cycles/util/math_matrix.h | 2 +- intern/cycles/util/path.cpp | 2 +- intern/cycles/util/transform.h | 23 +++++-- intern/cycles/util/types.h | 4 +- 14 files changed, 384 insertions(+), 178 deletions(-) (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h index faba411c769..afc3fd019df 100644 --- a/intern/cycles/util/atomic.h +++ b/intern/cycles/util/atomic.h @@ -63,6 +63,62 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest, # endif /* __KERNEL_CUDA__ */ +# ifdef __KERNEL_METAL__ + +// global address space versions +ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_source, + const float operand) +{ + volatile ccl_global atomic_int *source = (ccl_global atomic_int *)_source; + union { + int int_value; + float float_value; + } new_value, prev_value; + prev_value.int_value = 
atomic_load_explicit(source, memory_order_relaxed); + do { + new_value.float_value = prev_value.float_value + operand; + } while (!atomic_compare_exchange_weak_explicit(source, + &prev_value.int_value, + new_value.int_value, + memory_order_relaxed, + memory_order_relaxed)); + + return new_value.float_value; +} + +# define atomic_fetch_and_add_uint32(p, x) \ + atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed) +# define atomic_fetch_and_sub_uint32(p, x) \ + atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed) +# define atomic_fetch_and_inc_uint32(p) \ + atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed) +# define atomic_fetch_and_dec_uint32(p) \ + atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed) +# define atomic_fetch_and_or_uint32(p, x) \ + atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed) + +ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest, + const float old_val, + const float new_val) +{ + int prev_value; + prev_value = __float_as_int(old_val); + atomic_compare_exchange_weak_explicit((ccl_global atomic_int *)dest, + &prev_value, + __float_as_int(new_val), + memory_order_relaxed, + memory_order_relaxed); + return __int_as_float(prev_value); +} + +# define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed) +# define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed) + +# define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup +# define ccl_barrier(flags) threadgroup_barrier(flags) + +# endif /* __KERNEL_METAL__ */ + #endif /* __KERNEL_GPU__ */ #endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp index 7d5b6d4e54e..717e55a2c9a 100644 --- a/intern/cycles/util/debug.cpp +++ b/intern/cycles/util/debug.cpp @@ -64,6 +64,11 @@ DebugFlags::HIP::HIP() : adaptive_compile(false) reset(); } +DebugFlags::Metal::Metal() : 
adaptive_compile(false) +{ + reset(); +} + void DebugFlags::CUDA::reset() { if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) @@ -76,6 +81,12 @@ void DebugFlags::HIP::reset() adaptive_compile = true; } +void DebugFlags::Metal::reset() +{ + if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL) + adaptive_compile = true; +} + DebugFlags::OptiX::OptiX() { reset(); @@ -97,6 +108,7 @@ void DebugFlags::reset() cpu.reset(); cuda.reset(); optix.reset(); + metal.reset(); } CCL_NAMESPACE_END diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h index 548c67600e5..1e431fde68a 100644 --- a/intern/cycles/util/debug.h +++ b/intern/cycles/util/debug.h @@ -116,6 +116,17 @@ class DebugFlags { bool use_debug; }; + /* Descriptor of Metal feature-set to be used. */ + struct Metal { + Metal(); + + /* Reset flags to their defaults. */ + void reset(); + + /* Whether adaptive feature based runtime compile is enabled or not.*/ + bool adaptive_compile; + }; + /* Get instance of debug flags registry. */ static DebugFlags &get() { @@ -138,6 +149,9 @@ class DebugFlags { /* Requested HIP flags. */ HIP hip; + /* Requested Metal flags. 
*/ + Metal metal; + private: DebugFlags(); diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h index 016975e3c25..555f17259bd 100644 --- a/intern/cycles/util/half.h +++ b/intern/cycles/util/half.h @@ -28,8 +28,27 @@ CCL_NAMESPACE_BEGIN /* Half Floats */ +#if defined(__KERNEL_METAL__) + +ccl_device_inline float half_to_float(half h_in) +{ + float f; + union { + half h; + uint16_t s; + } val; + val.h = h_in; + + *((ccl_private int *)&f) = ((val.s & 0x8000) << 16) | (((val.s & 0x7c00) + 0x1C000) << 13) | + ((val.s & 0x03FF) << 13); + + return f; +} + +#else + /* CUDA has its own half data type, no need to define then */ -#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) +# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -53,11 +72,12 @@ class half { private: unsigned short v; }; -#endif +# endif struct half4 { half x, y, z, w; }; +#endif /* Conversion to/from half float for image textures * @@ -66,7 +86,9 @@ struct half4 { ccl_device_inline half float_to_half_image(float f) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) +#if defined(__KERNEL_METAL__) + return half(f); +#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(f); #else const uint u = __float_as_uint(f); @@ -92,7 +114,9 @@ ccl_device_inline half float_to_half_image(float f) ccl_device_inline float half_to_float_image(half h) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) +#if defined(__KERNEL_METAL__) + return half_to_float(h); +#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __half2float(h); #else const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13); @@ -125,7 +149,9 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h) ccl_device_inline half float_to_half_display(const float f) { -#if defined(__KERNEL_CUDA__) || 
defined(__KERNEL_HIP__) +#if defined(__KERNEL_METAL__) + return half(f); +#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(f); #else const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f); diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index 2e13eecd002..6cfeb1aa917 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -30,9 +30,11 @@ # include #endif -#include -#include -#include +#if !defined(__KERNEL_METAL__) +# include +# include +# include +#endif /* !defined(__KERNEL_METAL__) */ #include "util/types.h" @@ -174,6 +176,7 @@ ccl_device_inline float max4(float a, float b, float c, float d) return max(max(a, b), max(c, d)); } +#if !defined(__KERNEL_METAL__) /* Int/Float conversion */ ccl_device_inline int as_int(uint i) @@ -206,7 +209,7 @@ ccl_device_inline uint as_uint(float f) return u.i; } -#ifndef __HIP__ +# ifndef __HIP__ ccl_device_inline int __float_as_int(float f) { union { @@ -246,28 +249,33 @@ ccl_device_inline float __uint_as_float(uint i) u.i = i; return u.f; } -#endif +# endif ccl_device_inline int4 __float4_as_int4(float4 f) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(f.m128)); -#else +# else return make_int4( __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w)); -#endif +# endif } ccl_device_inline float4 __int4_as_float4(int4 i) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_castsi128_ps(i.m128)); -#else +# else return make_float4( __int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w)); -#endif +# endif } +#endif /* !defined(__KERNEL_METAL__) */ +#if defined(__KERNEL_METAL__) +# define isnan_safe(v) isnan(v) +# define isfinite_safe(v) isfinite(v) +#else template ccl_device_inline uint pointer_pack_to_uint_0(T *ptr) { return ((uint64_t)ptr) & 0xFFFFFFFF; @@ -311,12 +319,14 @@ ccl_device_inline bool isfinite_safe(float f) unsigned int x = 
__float_as_uint(f); return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u); } +#endif ccl_device_inline float ensure_finite(float v) { return isfinite_safe(v) ? v : 0.0f; } +#if !defined(__KERNEL_METAL__) ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); @@ -346,15 +356,17 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x) return result; } -#ifndef __KERNEL_CUDA__ +#endif /* !defined(__KERNEL_METAL__) */ + +#if defined(__KERNEL_CUDA__) ccl_device_inline float saturatef(float a) { - return clamp(a, 0.0f, 1.0f); + return __saturatef(a); } -#else +#elif !defined(__KERNEL_METAL__) ccl_device_inline float saturatef(float a) { - return __saturatef(a); + return clamp(a, 0.0f, 1.0f); } #endif /* __KERNEL_CUDA__ */ @@ -491,6 +503,7 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN +#if !defined(__KERNEL_METAL__) /* Interpolation */ template A lerp(const A &a, const A &b, const B &t) @@ -498,6 +511,8 @@ template A lerp(const A &a, const A &b, const B &t) return (A)(a * ((B)1 - t) + b * t); } +#endif /* __KERNEL_METAL__ */ + /* Triangle */ ccl_device_inline float triangle_area(ccl_private const float3 &v1, @@ -627,7 +642,11 @@ ccl_device_inline float safe_sqrtf(float f) ccl_device_inline float inversesqrtf(float f) { +#if defined(__KERNEL_METAL__) + return (f > 0.0f) ? rsqrt(f) : 0.0f; +#else return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f; +#endif } ccl_device float safe_asinf(float a) @@ -715,10 +734,30 @@ ccl_device float bits_to_01(uint bits) return bits * (1.0f / (float)0xFFFFFFFF); } +#if !defined(__KERNEL_GPU__) +# if defined(__GNUC__) +# define popcount(x) __builtin_popcount(x) +# else +ccl_device_inline uint popcount(uint x) +{ + /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. 
*/ + uint i = x & 0xaaaaaaaa; + i = i - ((i >> 1) & 0x55555555); + i = (i & 0x33333333) + ((i >> 2) & 0x33333333); + i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; + return i & 1; +} +# endif +#elif !defined(__KERNEL_METAL__) +# define popcount(x) __popc(x) +#endif + ccl_device_inline uint count_leading_zeros(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __clz(x); +#elif defined(__KERNEL_METAL__) + return clz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -735,6 +774,8 @@ ccl_device_inline uint count_trailing_zeros(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return (__ffs(x) - 1); +#elif defined(__KERNEL_METAL__) + return ctz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -751,6 +792,8 @@ ccl_device_inline uint find_first_set(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __ffs(x); +#elif defined(__KERNEL_METAL__) + return (x != 0) ? ctz(x) + 1 : 0; #else # ifdef _MSC_VER return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0; @@ -849,6 +892,8 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x) return x; #elif defined(__KERNEL_CUDA__) return __brev(x); +#elif defined(__KERNEL_METAL__) + return reverse_bits(x); #elif __has_builtin(__builtin_bitreverse32) return __builtin_bitreverse32(x); #else diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h index 87141d5bc37..8ff75c6c20a 100644 --- a/intern/cycles/util/math_float2.h +++ b/intern/cycles/util/math_float2.h @@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ +#if !defined(__KERNEL_METAL__) ccl_device_inline float2 operator-(const float2 &a); ccl_device_inline float2 operator*(const float2 &a, const float2 &b); ccl_device_inline float2 operator*(const float2 &a, float f); @@ -63,6 +64,7 @@ ccl_device_inline float2 fabs(const float2 &a); ccl_device_inline float2 as_float2(const float4 &a); ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t); ccl_device_inline float2 floor(const float2 &a); +#endif /* !__KERNEL_METAL__ */ ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b); @@ -80,6 +82,7 @@ ccl_device_inline float2 one_float2() return make_float2(1.0f, 1.0f); } +#if !defined(__KERNEL_METAL__) ccl_device_inline float2 operator-(const float2 &a) { return make_float2(-a.x, -a.y); @@ -259,6 +262,8 @@ ccl_device_inline float2 floor(const float2 &a) return make_float2(floorf(a.x), floorf(a.y)); } +#endif /* !__KERNEL_METAL__ */ + ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b) { return (b != 0.0f) ? a / b : zero_float2(); diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index 031aac1b5d4..1a0213f2a6d 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ +#if !defined(__KERNEL_METAL__) ccl_device_inline float3 operator-(const float3 &a); ccl_device_inline float3 operator*(const float3 &a, const float3 &b); ccl_device_inline float3 operator*(const float3 &a, const float f); @@ -62,19 +63,20 @@ ccl_device_inline float3 rcp(const float3 &a); ccl_device_inline float3 sqrt(const float3 &a); ccl_device_inline float3 floor(const float3 &a); ccl_device_inline float3 ceil(const float3 &a); +ccl_device_inline float3 reflect(const float3 incident, const float3 normal); +#endif /* !defined(__KERNEL_METAL__) */ ccl_device_inline float min3(float3 a); ccl_device_inline float max3(float3 a); ccl_device_inline float len(const float3 a); ccl_device_inline float len_squared(const float3 a); -ccl_device_inline float3 reflect(const float3 incident, const float3 normal); ccl_device_inline float3 project(const float3 v, const float3 v_proj); ccl_device_inline float3 saturate3(float3 a); ccl_device_inline float3 safe_normalize(const float3 a); -ccl_device_inline float3 normalize_len(const float3 a, float *t); -ccl_device_inline float3 safe_normalize_len(const float3 a, float *t); +ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t); +ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t); ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b); ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b); ccl_device_inline float3 interp(float3 a, float3 b, float t); @@ -103,49 +105,58 @@ ccl_device_inline float3 one_float3() return make_float3(1.0f, 1.0f, 1.0f); } +#if defined(__KERNEL_METAL__) + +ccl_device_inline float3 rcp(float3 a) +{ + return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); +} + +#else + ccl_device_inline float3 operator-(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -#else +# else return make_float3(-a.x, -a.y, 
-a.z); -#endif +# endif } ccl_device_inline float3 operator*(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, b.m128)); -#else +# else return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); -#endif +# endif } ccl_device_inline float3 operator*(const float3 &a, const float f) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f))); -#else +# else return make_float3(a.x * f, a.y * f, a.z * f); -#endif +# endif } ccl_device_inline float3 operator*(const float f, const float3 &a) { -#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE__) return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); -#else +# else return make_float3(a.x * f, a.y * f, a.z * f); -#endif +# endif } ccl_device_inline float3 operator/(const float f, const float3 &a) { -#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE__) return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); -#else +# else return make_float3(f / a.x, f / a.y, f / a.z); -#endif +# endif } ccl_device_inline float3 operator/(const float3 &a, const float f) @@ -156,11 +167,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f) ccl_device_inline float3 operator/(const float3 &a, const float3 &b) { -#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE__) return float3(_mm_div_ps(a.m128, b.m128)); -#else +# else return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); -#endif +# endif } ccl_device_inline float3 operator+(const float3 &a, const float f) @@ -170,11 +181,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f) ccl_device_inline float3 operator+(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_add_ps(a.m128, b.m128)); -#else +# else return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); -#endif +# endif } ccl_device_inline float3 operator-(const float3 &a, const float f) @@ -184,11 +195,11 @@ ccl_device_inline float3 operator-(const float3 
&a, const float f) ccl_device_inline float3 operator-(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_sub_ps(a.m128, b.m128)); -#else +# else return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); -#endif +# endif } ccl_device_inline float3 operator+=(float3 &a, const float3 &b) @@ -250,11 +261,11 @@ ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f) ccl_device_inline bool operator==(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; -#else +# else return (a.x == b.x && a.y == b.y && a.z == b.z); -#endif +# endif } ccl_device_inline bool operator!=(const float3 &a, const float3 &b) @@ -269,20 +280,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b) ccl_device_inline float dot(const float3 &a, const float3 &b) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); -#else +# else return a.x * b.x + a.y * b.y + a.z * b.z; -#endif +# endif } ccl_device_inline float dot_xy(const float3 &a, const float3 &b) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b)); -#else +# else return a.x * b.x + a.y * b.y; -#endif +# endif } ccl_device_inline float3 cross(const float3 &a, const float3 &b) @@ -293,30 +304,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b) ccl_device_inline float3 normalize(const float3 &a) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); return float3(_mm_div_ps(a.m128, norm)); -#else +# else return a / len(a); -#endif +# endif } ccl_device_inline float3 min(const float3 &a, const float3 &b) { 
-#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_min_ps(a.m128, b.m128)); -#else +# else return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif +# endif } ccl_device_inline float3 max(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_max_ps(a.m128, b.m128)); -#else +# else return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -#endif +# endif } ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx) @@ -326,43 +337,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 & ccl_device_inline float3 fabs(const float3 &a) { -#ifdef __KERNEL_SSE__ -# ifdef __KERNEL_NEON__ +# ifdef __KERNEL_SSE__ +# ifdef __KERNEL_NEON__ return float3(vabsq_f32(a.m128)); -# else +# else __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return float3(_mm_and_ps(a.m128, mask)); -# endif -#else +# endif +# else return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); -#endif +# endif } ccl_device_inline float3 sqrt(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_sqrt_ps(a)); -#else +# else return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z)); -#endif +# endif } ccl_device_inline float3 floor(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_floor_ps(a)); -#else +# else return make_float3(floorf(a.x), floorf(a.y), floorf(a.z)); -#endif +# endif } ccl_device_inline float3 ceil(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_ceil_ps(a)); -#else +# else return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z)); -#endif +# endif } ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t) @@ -372,13 +383,14 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t) ccl_device_inline float3 rcp(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ /* Don't use _mm_rcp_ps due to poor precision. 
*/ return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); -#else +# else return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); -#endif +# endif } +#endif /* !__KERNEL_METAL__ */ ccl_device_inline float min3(float3 a) { @@ -404,6 +416,7 @@ ccl_device_inline float len_squared(const float3 a) return dot(a, a); } +#if !defined(__KERNEL_METAL__) ccl_device_inline float3 reflect(const float3 incident, const float3 normal) { float3 unit_normal = normalize(normal); @@ -425,6 +438,7 @@ ccl_device_inline float3 faceforward(const float3 vector, { return (dot(reference, incident) < 0.0f) ? vector : -vector; } +#endif ccl_device_inline float3 project(const float3 v, const float3 v_proj) { @@ -505,7 +519,11 @@ ccl_device_inline float average(const float3 a) ccl_device_inline bool isequal_float3(const float3 a, const float3 b) { +#if defined(__KERNEL_METAL__) + return all(a == b); +#else return a == b; +#endif } ccl_device_inline float3 pow3(float3 v, float e) diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h index c76959ee7ff..1203a10cca4 100644 --- a/intern/cycles/util/math_float4.h +++ b/intern/cycles/util/math_float4.h @@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ +#if !defined(__KERNEL_METAL__) ccl_device_inline float4 operator-(const float4 &a); ccl_device_inline float4 operator*(const float4 &a, const float4 &b); ccl_device_inline float4 operator*(const float4 &a, float f); @@ -65,6 +66,7 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 & ccl_device_inline float4 fabs(const float4 &a); ccl_device_inline float4 floor(const float4 &a); ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t); +#endif /* !__KERNEL_METAL__*/ ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b); @@ -110,32 +112,33 @@ ccl_device_inline float4 one_float4() return make_float4(1.0f, 1.0f, 1.0f, 1.0f); } +#if !defined(__KERNEL_METAL__) ccl_device_inline float4 operator-(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); return float4(_mm_xor_ps(a.m128, mask)); -#else +# else return make_float4(-a.x, -a.y, -a.z, -a.w); -#endif +# endif } ccl_device_inline float4 operator*(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_mul_ps(a.m128, b.m128)); -#else +# else return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); -#endif +# endif } ccl_device_inline float4 operator*(const float4 &a, float f) { -#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE__) return a * make_float4(f); -#else +# else return make_float4(a.x * f, a.y * f, a.z * f, a.w * f); -#endif +# endif } ccl_device_inline float4 operator*(float f, const float4 &a) @@ -150,11 +153,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f) ccl_device_inline float4 operator/(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_div_ps(a.m128, b.m128)); -#else +# else return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); -#endif +# endif } ccl_device_inline float4 operator+(const float4 &a, const float f) @@ -164,11 
+167,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f) ccl_device_inline float4 operator+(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_add_ps(a.m128, b.m128)); -#else +# else return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -#endif +# endif } ccl_device_inline float4 operator-(const float4 &a, const float f) @@ -178,11 +181,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f) ccl_device_inline float4 operator-(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_sub_ps(a.m128, b.m128)); -#else +# else return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); -#endif +# endif } ccl_device_inline float4 operator+=(float4 &a, const float4 &b) @@ -212,38 +215,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f) ccl_device_inline int4 operator<(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128))); -#else +# else return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); -#endif +# endif } ccl_device_inline int4 operator>=(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128))); -#else +# else return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); -#endif +# endif } ccl_device_inline int4 operator<=(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128))); -#else +# else return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); -#endif +# endif } ccl_device_inline bool operator==(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; -#else +# else return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); -#endif +# 
endif } ccl_device_inline float distance(const float4 &a, const float4 &b) @@ -253,16 +256,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b) ccl_device_inline float dot(const float4 &a, const float4 &b) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) __m128 t = vmulq_f32(a, b); return vaddvq_f32(t); -# else +# else return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); -# endif -#else +# endif +# else return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); -#endif +# endif } ccl_device_inline float len_squared(const float4 &a) @@ -272,21 +275,21 @@ ccl_device_inline float len_squared(const float4 &a) ccl_device_inline float4 rcp(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ /* Don't use _mm_rcp_ps due to poor precision. */ return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); -#else +# else return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); -#endif +# endif } ccl_device_inline float4 sqrt(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_sqrt_ps(a.m128)); -#else +# else return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); -#endif +# endif } ccl_device_inline float4 sqr(const float4 &a) @@ -296,39 +299,39 @@ ccl_device_inline float4 sqr(const float4 &a) ccl_device_inline float4 cross(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) - (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b)); -#else +# else return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); -#endif +# endif } ccl_device_inline bool is_zero(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return a == make_float4(0.0f); -#else +# else return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); -#endif +# endif } 
ccl_device_inline float4 reduce_add(const float4 &a) { -#if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +# if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) return float4(vdupq_n_f32(vaddvq_f32(a))); -# elif defined(__KERNEL_SSE3__) +# elif defined(__KERNEL_SSE3__) float4 h(_mm_hadd_ps(a.m128, a.m128)); return float4(_mm_hadd_ps(h.m128, h.m128)); -# else +# else float4 h(shuffle<1, 0, 3, 2>(a) + a); return shuffle<2, 3, 0, 1>(h) + h; -# endif -#else +# endif +# else float sum = (a.x + a.y) + (a.z + a.w); return make_float4(sum, sum, sum, sum); -#endif +# endif } ccl_device_inline float average(const float4 &a) @@ -354,20 +357,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a) ccl_device_inline float4 min(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_min_ps(a.m128, b.m128)); -#else +# else return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif +# endif } ccl_device_inline float4 max(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_max_ps(a.m128, b.m128)); -#else +# else return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif +# endif } ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx) @@ -377,24 +380,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 & ccl_device_inline float4 fabs(const float4 &a) { -#if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +# if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) return float4(vabsq_f32(a)); -# else +# else return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); -# endif -#else +# endif +# else return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); -#endif +# endif } ccl_device_inline float4 floor(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_floor_ps(a)); -#else +# 
else return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); -#endif +# endif } ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) @@ -402,6 +405,8 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) return a + t * (b - a); } +#endif /* !__KERNEL_METAL__*/ + #ifdef __KERNEL_SSE__ template __forceinline const float4 shuffle(const float4 &b) diff --git a/intern/cycles/util/math_int2.h b/intern/cycles/util/math_int2.h index 5b04be92152..39dc3b28f11 100644 --- a/intern/cycles/util/math_int2.h +++ b/intern/cycles/util/math_int2.h @@ -27,17 +27,20 @@ CCL_NAMESPACE_BEGIN * Declaration. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline bool operator==(const int2 a, const int2 b); ccl_device_inline int2 operator+(const int2 &a, const int2 &b); ccl_device_inline int2 operator+=(int2 &a, const int2 &b); ccl_device_inline int2 operator-(const int2 &a, const int2 &b); ccl_device_inline int2 operator*(const int2 &a, const int2 &b); ccl_device_inline int2 operator/(const int2 &a, const int2 &b); +#endif /* !__KERNEL_METAL__ */ /******************************************************************************* * Definition. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline bool operator==(const int2 a, const int2 b) { return (a.x == b.x && a.y == b.y); @@ -67,6 +70,7 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b) { return make_int2(a.x / b.x, a.y / b.y); } +#endif /* !__KERNEL_METAL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h index 128f2cb53b8..a09c68ef49a 100644 --- a/intern/cycles/util/math_int3.h +++ b/intern/cycles/util/math_int3.h @@ -27,49 +27,52 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ +#if !defined(__KERNEL_METAL__) ccl_device_inline int3 min(int3 a, int3 b); ccl_device_inline int3 max(int3 a, int3 b); ccl_device_inline int3 clamp(const int3 &a, int mn, int mx); ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx); +#endif /* !defined(__KERNEL_METAL__) */ /******************************************************************************* * Definition. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline int3 min(int3 a, int3 b) { -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) +# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) return int3(_mm_min_epi32(a.m128, b.m128)); -#else +# else return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif +# endif } ccl_device_inline int3 max(int3 a, int3 b) { -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) +# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) return int3(_mm_max_epi32(a.m128, b.m128)); -#else +# else return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -#endif +# endif } ccl_device_inline int3 clamp(const int3 &a, int mn, int mx) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return min(max(a, make_int3(mn)), make_int3(mx)); -#else +# else return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); -#endif +# endif } ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return min(max(a, mn), make_int3(mx)); -#else +# else return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)); -#endif +# endif } ccl_device_inline bool operator==(const int3 &a, const int3 &b) @@ -89,21 +92,22 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b) ccl_device_inline int3 operator+(const int3 &a, const int3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int3(_mm_add_epi32(a.m128, b.m128)); -#else +# else return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); -#endif +# endif } ccl_device_inline int3 operator-(const int3 &a, 
const int3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int3(_mm_sub_epi32(a.m128, b.m128)); -#else +# else return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); -#endif +# endif } +#endif /* !__KERNEL_METAL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/math_matrix.h b/intern/cycles/util/math_matrix.h index bff7ddb4cee..c1be71517e3 100644 --- a/intern/cycles/util/math_matrix.h +++ b/intern/cycles/util/math_matrix.h @@ -162,7 +162,7 @@ ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A, { for (int row = 0; row < n; row++) { for (int col = 0; col <= row; col++) { - MATHS(A, row, col, 1) += v[row] * v[col] * weight; + atomic_add_and_fetch_float(&MATHS(A, row, col, 1), v[row] * v[col] * weight); } } } diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp index 5704c4ef8ef..7437ad7651a 100644 --- a/intern/cycles/util/path.cpp +++ b/intern/cycles/util/path.cpp @@ -313,7 +313,7 @@ static char *path_specials(const string &sub) if (env_shader_path != NULL && sub == "shader") { return env_shader_path; } - else if (env_shader_path != NULL && sub == "source") { + else if (env_source_path != NULL && sub == "source") { return env_source_path; } return NULL; diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h index 7bfe747fcfb..1d78dfd1385 100644 --- a/intern/cycles/util/transform.h +++ b/intern/cycles/util/transform.h @@ -53,6 +53,15 @@ typedef struct DecomposedTransform { /* Functions */ +#ifdef __KERNEL_METAL__ +/* transform_point specialized for ccl_global */ +ccl_device_inline float3 transform_point(ccl_global const Transform *t, const float3 a) +{ + ccl_global const float3x3 &b(*(ccl_global const float3x3 *)t); + return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w); +} +#endif + ccl_device_inline float3 transform_point(ccl_private const Transform *t, const float3 a) { /* TODO(sergey): Disabled for now, causes crashes in certain cases. 
*/ @@ -73,6 +82,9 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f tmp += w; return float3(tmp.m128); +#elif defined(__KERNEL_METAL__) + ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t); + return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w); #else float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z + t->x.w, a.x * t->y.x + a.y * t->y.y + a.z * t->y.z + t->y.w, @@ -99,6 +111,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con tmp = madd(shuffle<2>(aa), z, tmp); return float3(tmp.m128); +#elif defined(__KERNEL_METAL__) + ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t); + return (a * b).xyz; #else float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z, a.x * t->y.x + a.y * t->y.y + a.z * t->y.z, @@ -450,8 +465,8 @@ ccl_device_inline void transform_compose(ccl_private Transform *tfm, } /* Interpolate from array of decomposed transforms. */ -ccl_device void transform_motion_array_interpolate(Transform *tfm, - const DecomposedTransform *motion, +ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm, + ccl_global const DecomposedTransform *motion, uint numsteps, float time) { @@ -460,8 +475,8 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm, int step = min((int)(time * maxstep), maxstep - 1); float t = time * maxstep - step; - const DecomposedTransform *a = motion + step; - const DecomposedTransform *b = motion + step + 1; + ccl_global const DecomposedTransform *a = motion + step; + ccl_global const DecomposedTransform *b = motion + step + 1; /* Interpolate rotation, translation and scale. 
*/ DecomposedTransform decomp; diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h index 697dc2b44ea..58a6d134819 100644 --- a/intern/cycles/util/types.h +++ b/intern/cycles/util/types.h @@ -17,7 +17,9 @@ #ifndef __UTIL_TYPES_H__ #define __UTIL_TYPES_H__ -#include <stdlib.h> +#if !defined(__KERNEL_METAL__) +# include <stdlib.h> +#endif /* Standard Integer Types */ -- cgit v1.2.3