Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/util')
-rw-r--r-- intern/cycles/util/atomic.h 56
-rw-r--r-- intern/cycles/util/debug.cpp 12
-rw-r--r-- intern/cycles/util/debug.h 14
-rw-r--r-- intern/cycles/util/defines.h 4
-rw-r--r-- intern/cycles/util/half.h 36
-rw-r--r-- intern/cycles/util/ies.cpp 6
-rw-r--r-- intern/cycles/util/math.h 77
-rw-r--r-- intern/cycles/util/math_float2.h 5
-rw-r--r-- intern/cycles/util/math_float3.h 170
-rw-r--r-- intern/cycles/util/math_float4.h 145
-rw-r--r-- intern/cycles/util/math_int2.h 4
-rw-r--r-- intern/cycles/util/math_int3.h 40
-rw-r--r-- intern/cycles/util/math_matrix.h 2
-rw-r--r-- intern/cycles/util/path.cpp 4
-rw-r--r-- intern/cycles/util/progress.h 2
-rw-r--r-- intern/cycles/util/ssef.h 2
-rw-r--r-- intern/cycles/util/transform.h 23
-rw-r--r-- intern/cycles/util/types.h 4
-rw-r--r-- intern/cycles/util/types_float3.h 35
19 files changed, 455 insertions, 186 deletions
diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h
index faba411c769..afc3fd019df 100644
--- a/intern/cycles/util/atomic.h
+++ b/intern/cycles/util/atomic.h
@@ -63,6 +63,62 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
# endif /* __KERNEL_CUDA__ */
+# ifdef __KERNEL_METAL__
+
+// global address space versions
+/* Atomically add `operand` to the float at `_source` and return the updated value.
+ *
+ * Implemented as a compare-and-swap loop on the value's integer bit pattern. */
+ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_source,
+ const float operand)
+{
+ /* View the float storage as an atomic int so the integer atomics can be used on it. */
+ volatile ccl_global atomic_int *source = (ccl_global atomic_int *)_source;
+ /* The union converts between the float value and the bit pattern that is exchanged. */
+ union {
+ int int_value;
+ float float_value;
+ } new_value, prev_value;
+ prev_value.int_value = atomic_load_explicit(source, memory_order_relaxed);
+ /* Recompute the sum and retry until the exchange reports success.
+ * NOTE(review): relies on a failed exchange writing the currently stored bits back into
+ * `prev_value.int_value` (C++-style compare-exchange semantics) -- confirm for Metal. */
+ do {
+ new_value.float_value = prev_value.float_value + operand;
+ } while (!atomic_compare_exchange_weak_explicit(source,
+ &prev_value.int_value,
+ new_value.int_value,
+ memory_order_relaxed,
+ memory_order_relaxed));
+
+ return new_value.float_value;
+}
+
+/* Relaxed 32-bit unsigned atomics. `p` is reinterpreted as a pointer to `device atomic_uint`.
+ * Macro arguments are parenthesized so that expression arguments (for example `c ? a : b`)
+ * are fully evaluated before the cast is applied. */
+# define atomic_fetch_and_add_uint32(p, x) \
+ atomic_fetch_add_explicit((device atomic_uint *)(p), (x), memory_order_relaxed)
+# define atomic_fetch_and_sub_uint32(p, x) \
+ atomic_fetch_sub_explicit((device atomic_uint *)(p), (x), memory_order_relaxed)
+# define atomic_fetch_and_inc_uint32(p) \
+ atomic_fetch_add_explicit((device atomic_uint *)(p), 1, memory_order_relaxed)
+# define atomic_fetch_and_dec_uint32(p) \
+ atomic_fetch_sub_explicit((device atomic_uint *)(p), 1, memory_order_relaxed)
+# define atomic_fetch_and_or_uint32(p, x) \
+ atomic_fetch_or_explicit((device atomic_uint *)(p), (x), memory_order_relaxed)
+
+/* Compare-and-swap on a float, performed on its integer bit pattern.
+ *
+ * Attempts to replace the value at `dest` with `new_val` if it currently holds the bits of
+ * `old_val`, and returns `prev_value` reinterpreted as a float.
+ *
+ * NOTE(review): the result of the exchange call is ignored and the weak variant is used. If
+ * weak exchanges can fail spuriously here (as in C++), the return value may equal `old_val`
+ * even though nothing was stored -- confirm callers tolerate this. */
+ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
+ const float old_val,
+ const float new_val)
+{
+ int prev_value;
+ /* Expected value, as raw bits. */
+ prev_value = __float_as_int(old_val);
+ atomic_compare_exchange_weak_explicit((ccl_global atomic_int *)dest,
+ &prev_value,
+ __float_as_int(new_val),
+ memory_order_relaxed,
+ memory_order_relaxed);
+ return __int_as_float(prev_value);
+}
+
+/* Store and load wrappers that always use relaxed memory order. */
+# define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
+# define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)
+
+/* Fence flag and barrier names mapped onto `mem_flags::mem_threadgroup` and
+ * `threadgroup_barrier()`. */
+# define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
+# define ccl_barrier(flags) threadgroup_barrier(flags)
+
+# endif /* __KERNEL_METAL__ */
+
#endif /* __KERNEL_GPU__ */
#endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp
index 7d5b6d4e54e..717e55a2c9a 100644
--- a/intern/cycles/util/debug.cpp
+++ b/intern/cycles/util/debug.cpp
@@ -64,6 +64,11 @@ DebugFlags::HIP::HIP() : adaptive_compile(false)
reset();
}
+/* Construct the Metal flags: adaptive compile starts disabled, then reset() is applied. */
+DebugFlags::Metal::Metal() : adaptive_compile(false)
+{
+ reset();
+}
+
void DebugFlags::CUDA::reset()
{
if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
@@ -76,6 +81,12 @@ void DebugFlags::HIP::reset()
adaptive_compile = true;
}
+/* Enable adaptive compilation when the CYCLES_METAL_ADAPTIVE_COMPILE environment variable is
+ * set (to any value). The flag is left untouched when the variable is absent. */
+void DebugFlags::Metal::reset()
+{
+ if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
+ adaptive_compile = true;
+}
+
DebugFlags::OptiX::OptiX()
{
reset();
@@ -97,6 +108,7 @@ void DebugFlags::reset()
cpu.reset();
cuda.reset();
optix.reset();
+ metal.reset();
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h
index 548c67600e5..1e431fde68a 100644
--- a/intern/cycles/util/debug.h
+++ b/intern/cycles/util/debug.h
@@ -116,6 +116,17 @@ class DebugFlags {
bool use_debug;
};
+ /* Descriptor of Metal feature-set to be used. */
+ struct Metal {
+ /* Construct the flags. */
+ Metal();
+
+ /* Reset flags to their defaults. */
+ void reset();
+
+ /* Whether adaptive feature based runtime compile is enabled or not. */
+ bool adaptive_compile;
+ };
+
/* Get instance of debug flags registry. */
static DebugFlags &get()
{
@@ -138,6 +149,9 @@ class DebugFlags {
/* Requested HIP flags. */
HIP hip;
+ /* Requested Metal flags. */
+ Metal metal;
+
private:
DebugFlags();
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index a778bef52b2..a2e8d83adb7 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -44,6 +44,7 @@
# if defined(_WIN32) && !defined(FREE_WINDOWS)
# define ccl_device_inline static __forceinline
# define ccl_device_forceinline static __forceinline
+# define ccl_device_inline_method __forceinline
# define ccl_align(...) __declspec(align(__VA_ARGS__))
# ifdef __KERNEL_64_BIT__
# define ccl_try_align(...) __declspec(align(__VA_ARGS__))
@@ -58,6 +59,7 @@
# else /* _WIN32 && !FREE_WINDOWS */
# define ccl_device_inline static inline __attribute__((always_inline))
# define ccl_device_forceinline static inline __attribute__((always_inline))
+# define ccl_device_inline_method __attribute__((always_inline))
# define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
# ifndef FREE_WINDOWS64
# define __forceinline inline __attribute__((always_inline))
@@ -70,7 +72,7 @@
/* Address spaces for GPU. */
# define ccl_global
-# define ccl_static_constant static const
+# define ccl_inline_constant inline constexpr
# define ccl_constant const
# define ccl_private
diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h
index 016975e3c25..555f17259bd 100644
--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -28,8 +28,27 @@ CCL_NAMESPACE_BEGIN
/* Half Floats */
+#if defined(__KERNEL_METAL__)
+
+/* Convert a `half` to `float` by rearranging its bits.
+ *
+ * The 16 bits are read through a union and the sign, exponent and mantissa fields are moved to
+ * their positions in a 32-bit pattern, which is written into `f` through an int pointer. */
+ccl_device_inline float half_to_float(half h_in)
+{
+ float f;
+ union {
+ half h;
+ uint16_t s;
+ } val;
+ val.h = h_in;
+
+ /* 0x8000 selects the sign bit, 0x7c00 the exponent bits and 0x03FF the mantissa bits.
+ * Adding 0x1C000 (112 << 10) rebiases the exponent before it is shifted into place.
+ * NOTE(review): the rebias is unconditional, so a zero exponent field is not special-cased
+ * (half bits 0x0000 give float bits 0x38000000) -- confirm this is acceptable for callers. */
+ *((ccl_private int *)&f) = ((val.s & 0x8000) << 16) | (((val.s & 0x7c00) + 0x1C000) << 13) |
+ ((val.s & 0x03FF) << 13);
+
+ return f;
+}
+
+#else
+
/* CUDA has its own half data type, no need to define then */
-#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
+# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
* unsigned shorts. */
class half {
@@ -53,11 +72,12 @@ class half {
private:
unsigned short v;
};
-#endif
+# endif
struct half4 {
half x, y, z, w;
};
+#endif
/* Conversion to/from half float for image textures
*
@@ -66,7 +86,9 @@ struct half4 {
ccl_device_inline half float_to_half_image(float f)
{
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+ return half(f);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
return __float2half(f);
#else
const uint u = __float_as_uint(f);
@@ -92,7 +114,9 @@ ccl_device_inline half float_to_half_image(float f)
ccl_device_inline float half_to_float_image(half h)
{
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+ return half_to_float(h);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
return __half2float(h);
#else
const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
@@ -125,7 +149,9 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h)
ccl_device_inline half float_to_half_display(const float f)
{
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+ return half(f);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
return __float2half(f);
#else
const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
diff --git a/intern/cycles/util/ies.cpp b/intern/cycles/util/ies.cpp
index 5e879478df5..e924d660407 100644
--- a/intern/cycles/util/ies.cpp
+++ b/intern/cycles/util/ies.cpp
@@ -23,10 +23,10 @@
CCL_NAMESPACE_BEGIN
-// NOTE: For some reason gcc-7.2 does not instantiate this versio of allocator
-// gere (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8.
+// NOTE: For some reason gcc-7.2 does not instantiate this version of the
+// allocator here (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8.
//
-// TODO(sergey): Get to the root of this issue, or confirm this i a compiler
+// TODO(sergey): Get to the root of this issue, or confirm this is a compiler
// issue.
template class GuardedAllocator<char>;
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index e4c7df6e44a..6cfeb1aa917 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -30,9 +30,11 @@
# include <hip/hip_vector_types.h>
#endif
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
+#if !defined(__KERNEL_METAL__)
+# include <float.h>
+# include <math.h>
+# include <stdio.h>
+#endif /* !defined(__KERNEL_METAL__) */
#include "util/types.h"
@@ -174,6 +176,7 @@ ccl_device_inline float max4(float a, float b, float c, float d)
return max(max(a, b), max(c, d));
}
+#if !defined(__KERNEL_METAL__)
/* Int/Float conversion */
ccl_device_inline int as_int(uint i)
@@ -206,7 +209,7 @@ ccl_device_inline uint as_uint(float f)
return u.i;
}
-#ifndef __HIP__
+# ifndef __HIP__
ccl_device_inline int __float_as_int(float f)
{
union {
@@ -246,28 +249,33 @@ ccl_device_inline float __uint_as_float(uint i)
u.i = i;
return u.f;
}
-#endif
+# endif
ccl_device_inline int4 __float4_as_int4(float4 f)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(f.m128));
-#else
+# else
return make_int4(
__float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
-#endif
+# endif
}
ccl_device_inline float4 __int4_as_float4(int4 i)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float4(_mm_castsi128_ps(i.m128));
-#else
+# else
return make_float4(
__int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
-#endif
+# endif
}
+#endif /* !defined(__KERNEL_METAL__) */
+#if defined(__KERNEL_METAL__)
+# define isnan_safe(v) isnan(v)
+# define isfinite_safe(v) isfinite(v)
+#else
template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
{
return ((uint64_t)ptr) & 0xFFFFFFFF;
@@ -311,12 +319,14 @@ ccl_device_inline bool isfinite_safe(float f)
unsigned int x = __float_as_uint(f);
return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u);
}
+#endif
ccl_device_inline float ensure_finite(float v)
{
return isfinite_safe(v) ? v : 0.0f;
}
+#if !defined(__KERNEL_METAL__)
ccl_device_inline int clamp(int a, int mn, int mx)
{
return min(max(a, mn), mx);
@@ -346,15 +356,17 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x)
return result;
}
-#ifndef __KERNEL_CUDA__
+#endif /* !defined(__KERNEL_METAL__) */
+
+#if defined(__KERNEL_CUDA__)
ccl_device_inline float saturatef(float a)
{
- return clamp(a, 0.0f, 1.0f);
+ return __saturatef(a);
}
-#else
+#elif !defined(__KERNEL_METAL__)
ccl_device_inline float saturatef(float a)
{
- return __saturatef(a);
+ return clamp(a, 0.0f, 1.0f);
}
#endif /* __KERNEL_CUDA__ */
@@ -491,6 +503,7 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
+#if !defined(__KERNEL_METAL__)
/* Interpolation */
template<class A, class B> A lerp(const A &a, const A &b, const B &t)
@@ -498,6 +511,8 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t)
return (A)(a * ((B)1 - t) + b * t);
}
+#endif /* !defined(__KERNEL_METAL__) */
+
/* Triangle */
ccl_device_inline float triangle_area(ccl_private const float3 &v1,
@@ -627,7 +642,11 @@ ccl_device_inline float safe_sqrtf(float f)
ccl_device_inline float inversesqrtf(float f)
{
+#if defined(__KERNEL_METAL__)
+ return (f > 0.0f) ? rsqrt(f) : 0.0f;
+#else
return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
+#endif
}
ccl_device float safe_asinf(float a)
@@ -715,10 +734,30 @@ ccl_device float bits_to_01(uint bits)
return bits * (1.0f / (float)0xFFFFFFFF);
}
+#if !defined(__KERNEL_GPU__)
+# if defined(__GNUC__)
+# define popcount(x) __builtin_popcount(x)
+# else
+/* Portable population count for compilers without `__builtin_popcount`.
+ *
+ * Returns the number of set bits in `x` (0..32), matching the `popcount` macro used for GCC
+ * compatible compilers above. */
+ccl_device_inline uint popcount(uint x)
+{
+ /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
+ /* Start from all bits of `x`; masking with 0xaaaaaaaa here would drop every even bit. */
+ uint i = x;
+ /* Sum adjacent bits into 2-bit fields, then 2-bit fields into 4-bit fields. */
+ i = i - ((i >> 1) & 0x55555555);
+ i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+ /* Sum into bytes, then add the four bytes together via the multiply; the total ends up in
+ * the top byte. */
+ i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
+ /* Return the full count rather than only its lowest bit. */
+ return i;
+}
+# endif
+#elif !defined(__KERNEL_METAL__)
+# define popcount(x) __popc(x)
+#endif
+
ccl_device_inline uint count_leading_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __clz(x);
+#elif defined(__KERNEL_METAL__)
+ return clz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -735,6 +774,8 @@ ccl_device_inline uint count_trailing_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return (__ffs(x) - 1);
+#elif defined(__KERNEL_METAL__)
+ return ctz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -751,6 +792,8 @@ ccl_device_inline uint find_first_set(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
return __ffs(x);
+#elif defined(__KERNEL_METAL__)
+ return (x != 0) ? ctz(x) + 1 : 0;
#else
# ifdef _MSC_VER
return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
@@ -801,7 +844,7 @@ ccl_device_inline float2 map_to_sphere(const float3 co)
* https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
*/
-ccl_device_inline float compare_floats(float a, float b, float abs_diff, int ulp_diff)
+ccl_device_inline bool compare_floats(float a, float b, float abs_diff, int ulp_diff)
{
if (fabsf(a - b) < abs_diff) {
return true;
@@ -849,6 +892,8 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
return x;
#elif defined(__KERNEL_CUDA__)
return __brev(x);
+#elif defined(__KERNEL_METAL__)
+ return reverse_bits(x);
#elif __has_builtin(__builtin_bitreverse32)
return __builtin_bitreverse32(x);
#else
diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h
index 87141d5bc37..8ff75c6c20a 100644
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
+#if !defined(__KERNEL_METAL__)
ccl_device_inline float2 operator-(const float2 &a);
ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
ccl_device_inline float2 operator*(const float2 &a, float f);
@@ -63,6 +64,7 @@ ccl_device_inline float2 fabs(const float2 &a);
ccl_device_inline float2 as_float2(const float4 &a);
ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
ccl_device_inline float2 floor(const float2 &a);
+#endif /* !__KERNEL_METAL__ */
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
@@ -80,6 +82,7 @@ ccl_device_inline float2 one_float2()
return make_float2(1.0f, 1.0f);
}
+#if !defined(__KERNEL_METAL__)
ccl_device_inline float2 operator-(const float2 &a)
{
return make_float2(-a.x, -a.y);
@@ -259,6 +262,8 @@ ccl_device_inline float2 floor(const float2 &a)
return make_float2(floorf(a.x), floorf(a.y));
}
+#endif /* !__KERNEL_METAL__ */
+
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float2();
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 81550c5d03c..1a0213f2a6d 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
+#if !defined(__KERNEL_METAL__)
ccl_device_inline float3 operator-(const float3 &a);
ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
ccl_device_inline float3 operator*(const float3 &a, const float f);
@@ -62,19 +63,20 @@ ccl_device_inline float3 rcp(const float3 &a);
ccl_device_inline float3 sqrt(const float3 &a);
ccl_device_inline float3 floor(const float3 &a);
ccl_device_inline float3 ceil(const float3 &a);
+ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
+#endif /* !defined(__KERNEL_METAL__) */
ccl_device_inline float min3(float3 a);
ccl_device_inline float max3(float3 a);
ccl_device_inline float len(const float3 a);
ccl_device_inline float len_squared(const float3 a);
-ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
ccl_device_inline float3 project(const float3 v, const float3 v_proj);
ccl_device_inline float3 saturate3(float3 a);
ccl_device_inline float3 safe_normalize(const float3 a);
-ccl_device_inline float3 normalize_len(const float3 a, float *t);
-ccl_device_inline float3 safe_normalize_len(const float3 a, float *t);
+ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
+ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b);
ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b);
ccl_device_inline float3 interp(float3 a, float3 b, float t);
@@ -103,49 +105,58 @@ ccl_device_inline float3 one_float3()
return make_float3(1.0f, 1.0f, 1.0f);
}
+#if defined(__KERNEL_METAL__)
+
+/* Component-wise reciprocal of `a` for the Metal build. Zero components are not guarded. */
+ccl_device_inline float3 rcp(float3 a)
+{
+ return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
+}
+
+#else
+
ccl_device_inline float3 operator-(const float3 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-#else
+# else
return make_float3(-a.x, -a.y, -a.z);
-#endif
+# endif
}
ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, b.m128));
-#else
+# else
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-#endif
+# endif
}
ccl_device_inline float3 operator*(const float3 &a, const float f)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
-#else
+# else
return make_float3(a.x * f, a.y * f, a.z * f);
-#endif
+# endif
}
ccl_device_inline float3 operator*(const float f, const float3 &a)
{
-#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_SSE__)
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
-#else
+# else
return make_float3(a.x * f, a.y * f, a.z * f);
-#endif
+# endif
}
ccl_device_inline float3 operator/(const float f, const float3 &a)
{
-#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
-#else
+# else
return make_float3(f / a.x, f / a.y, f / a.z);
-#endif
+# endif
}
ccl_device_inline float3 operator/(const float3 &a, const float f)
@@ -156,11 +167,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
{
-#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, b.m128));
-#else
+# else
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-#endif
+# endif
}
ccl_device_inline float3 operator+(const float3 &a, const float f)
@@ -170,11 +181,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f)
ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_add_ps(a.m128, b.m128));
-#else
+# else
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
+# endif
}
ccl_device_inline float3 operator-(const float3 &a, const float f)
@@ -184,11 +195,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f)
ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_sub_ps(a.m128, b.m128));
-#else
+# else
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
+# endif
}
ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
@@ -222,13 +233,39 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
return a = a * invf;
}
+/* Compound multiply/divide operators for packed_float3, compiled only when neither
+ * __KERNEL_METAL__ nor __KERNEL_CUDA__ is defined.
+ *
+ * Each operator converts `a` to float3, applies the corresponding float3 operator, assigns
+ * the result back to `a` and returns `a` by value. */
+#if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__))
+/* Component-wise multiply by a float3. */
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b)
+{
+ a = float3(a) * b;
+ return a;
+}
+
+/* Multiply by a scalar. */
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f)
+{
+ a = float3(a) * f;
+ return a;
+}
+
+/* Component-wise divide by a float3. */
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b)
+{
+ a = float3(a) / b;
+ return a;
+}
+
+/* Divide by a scalar. */
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)
+{
+ a = float3(a) / f;
+ return a;
+}
+#endif
+
ccl_device_inline bool operator==(const float3 &a, const float3 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
-#else
+# else
return (a.x == b.x && a.y == b.y && a.z == b.z);
-#endif
+# endif
}
ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
@@ -243,20 +280,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b)
ccl_device_inline float dot(const float3 &a, const float3 &b)
{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
-#else
+# else
return a.x * b.x + a.y * b.y + a.z * b.z;
-#endif
+# endif
}
ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-#else
+# else
return a.x * b.x + a.y * b.y;
-#endif
+# endif
}
ccl_device_inline float3 cross(const float3 &a, const float3 &b)
@@ -267,30 +304,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b)
ccl_device_inline float3 normalize(const float3 &a)
{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
return float3(_mm_div_ps(a.m128, norm));
-#else
+# else
return a / len(a);
-#endif
+# endif
}
ccl_device_inline float3 min(const float3 &a, const float3 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_min_ps(a.m128, b.m128));
-#else
+# else
return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+# endif
}
ccl_device_inline float3 max(const float3 &a, const float3 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_max_ps(a.m128, b.m128));
-#else
+# else
return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+# endif
}
ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
@@ -300,43 +337,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
ccl_device_inline float3 fabs(const float3 &a)
{
-#ifdef __KERNEL_SSE__
-# ifdef __KERNEL_NEON__
+# ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
return float3(vabsq_f32(a.m128));
-# else
+# else
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return float3(_mm_and_ps(a.m128, mask));
-# endif
-#else
+# endif
+# else
return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
-#endif
+# endif
}
ccl_device_inline float3 sqrt(const float3 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_sqrt_ps(a));
-#else
+# else
return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z));
-#endif
+# endif
}
ccl_device_inline float3 floor(const float3 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_floor_ps(a));
-#else
+# else
return make_float3(floorf(a.x), floorf(a.y), floorf(a.z));
-#endif
+# endif
}
ccl_device_inline float3 ceil(const float3 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float3(_mm_ceil_ps(a));
-#else
+# else
return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z));
-#endif
+# endif
}
ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
@@ -346,13 +383,14 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
ccl_device_inline float3 rcp(const float3 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-#else
+# else
return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
-#endif
+# endif
}
+#endif /* !__KERNEL_METAL__ */
ccl_device_inline float min3(float3 a)
{
@@ -378,6 +416,7 @@ ccl_device_inline float len_squared(const float3 a)
return dot(a, a);
}
+#if !defined(__KERNEL_METAL__)
ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
{
float3 unit_normal = normalize(normal);
@@ -399,6 +438,7 @@ ccl_device_inline float3 faceforward(const float3 vector,
{
return (dot(reference, incident) < 0.0f) ? vector : -vector;
}
+#endif
ccl_device_inline float3 project(const float3 v, const float3 v_proj)
{
@@ -479,7 +519,11 @@ ccl_device_inline float average(const float3 a)
ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
{
+#if defined(__KERNEL_METAL__)
+ return all(a == b);
+#else
return a == b;
+#endif
}
ccl_device_inline float3 pow3(float3 v, float e)
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index c76959ee7ff..1203a10cca4 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
+#if !defined(__KERNEL_METAL__)
ccl_device_inline float4 operator-(const float4 &a);
ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
ccl_device_inline float4 operator*(const float4 &a, float f);
@@ -65,6 +66,7 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a);
ccl_device_inline float4 floor(const float4 &a);
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
+#endif /* !__KERNEL_METAL__ */
ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
@@ -110,32 +112,33 @@ ccl_device_inline float4 one_float4()
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
}
+#if !defined(__KERNEL_METAL__)
ccl_device_inline float4 operator-(const float4 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return float4(_mm_xor_ps(a.m128, mask));
-#else
+# else
return make_float4(-a.x, -a.y, -a.z, -a.w);
-#endif
+# endif
}
ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float4(_mm_mul_ps(a.m128, b.m128));
-#else
+# else
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-#endif
+# endif
}
ccl_device_inline float4 operator*(const float4 &a, float f)
{
-#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_SSE__)
return a * make_float4(f);
-#else
+# else
return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
-#endif
+# endif
}
ccl_device_inline float4 operator*(float f, const float4 &a)
@@ -150,11 +153,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f)
ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float4(_mm_div_ps(a.m128, b.m128));
-#else
+# else
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-#endif
+# endif
}
ccl_device_inline float4 operator+(const float4 &a, const float f)
@@ -164,11 +167,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f)
ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float4(_mm_add_ps(a.m128, b.m128));
-#else
+# else
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-#endif
+# endif
}
ccl_device_inline float4 operator-(const float4 &a, const float f)
@@ -178,11 +181,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f)
ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float4(_mm_sub_ps(a.m128, b.m128));
-#else
+# else
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-#endif
+# endif
}
ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
@@ -212,38 +215,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
-#else
+# else
return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
-#endif
+# endif
}
ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
-#else
+# else
return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
-#endif
+# endif
}
ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
-#else
+# else
return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
-#endif
+# endif
}
ccl_device_inline bool operator==(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
-#else
+# else
return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
-#endif
+# endif
}
ccl_device_inline float distance(const float4 &a, const float4 &b)
@@ -253,16 +256,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
__m128 t = vmulq_f32(a, b);
return vaddvq_f32(t);
-# else
+# else
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-# endif
-#else
+# endif
+# else
return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
-#endif
+# endif
}
ccl_device_inline float len_squared(const float4 &a)
@@ -272,21 +275,21 @@ ccl_device_inline float len_squared(const float4 &a)
ccl_device_inline float4 rcp(const float4 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-#else
+# else
return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
-#endif
+# endif
}
ccl_device_inline float4 sqrt(const float4 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float4(_mm_sqrt_ps(a.m128));
-#else
+# else
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-#endif
+# endif
}
ccl_device_inline float4 sqr(const float4 &a)
@@ -296,39 +299,39 @@ ccl_device_inline float4 sqr(const float4 &a)
ccl_device_inline float4 cross(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
(shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
-#else
+# else
return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
-#endif
+# endif
}
ccl_device_inline bool is_zero(const float4 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return a == make_float4(0.0f);
-#else
+# else
return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-#endif
+# endif
}
ccl_device_inline float4 reduce_add(const float4 &a)
{
-#if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vaddvq_f32(a)));
-# elif defined(__KERNEL_SSE3__)
+# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return float4(_mm_hadd_ps(h.m128, h.m128));
-# else
+# else
float4 h(shuffle<1, 0, 3, 2>(a) + a);
return shuffle<2, 3, 0, 1>(h) + h;
-# endif
-#else
+# endif
+# else
float sum = (a.x + a.y) + (a.z + a.w);
return make_float4(sum, sum, sum, sum);
-#endif
+# endif
}
ccl_device_inline float average(const float4 &a)
@@ -354,20 +357,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a)
ccl_device_inline float4 min(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float4(_mm_min_ps(a.m128, b.m128));
-#else
+# else
return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-#endif
+# endif
}
ccl_device_inline float4 max(const float4 &a, const float4 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float4(_mm_max_ps(a.m128, b.m128));
-#else
+# else
return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-#endif
+# endif
}
ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
@@ -377,24 +380,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a)
{
-#if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
return float4(vabsq_f32(a));
-# else
+# else
return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-# endif
-#else
+# endif
+# else
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-#endif
+# endif
}
ccl_device_inline float4 floor(const float4 &a)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return float4(_mm_floor_ps(a));
-#else
+# else
return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
-#endif
+# endif
}
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
@@ -402,6 +405,8 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
return a + t * (b - a);
}
+#endif /* !__KERNEL_METAL__ */
+
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
diff --git a/intern/cycles/util/math_int2.h b/intern/cycles/util/math_int2.h
index 5b04be92152..39dc3b28f11 100644
--- a/intern/cycles/util/math_int2.h
+++ b/intern/cycles/util/math_int2.h
@@ -27,17 +27,20 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
+#if !defined(__KERNEL_METAL__)
ccl_device_inline bool operator==(const int2 a, const int2 b);
ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
+#endif /* !__KERNEL_METAL__ */
/*******************************************************************************
* Definition.
*/
+#if !defined(__KERNEL_METAL__)
ccl_device_inline bool operator==(const int2 a, const int2 b)
{
return (a.x == b.x && a.y == b.y);
@@ -67,6 +70,7 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
{
return make_int2(a.x / b.x, a.y / b.y);
}
+#endif /* !__KERNEL_METAL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h
index 128f2cb53b8..a09c68ef49a 100644
--- a/intern/cycles/util/math_int3.h
+++ b/intern/cycles/util/math_int3.h
@@ -27,49 +27,52 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
+#if !defined(__KERNEL_METAL__)
ccl_device_inline int3 min(int3 a, int3 b);
ccl_device_inline int3 max(int3 a, int3 b);
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
+#endif /* !defined(__KERNEL_METAL__) */
/*******************************************************************************
* Definition.
*/
+#if !defined(__KERNEL_METAL__)
ccl_device_inline int3 min(int3 a, int3 b)
{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
return int3(_mm_min_epi32(a.m128, b.m128));
-#else
+# else
return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+# endif
}
ccl_device_inline int3 max(int3 a, int3 b)
{
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
return int3(_mm_max_epi32(a.m128, b.m128));
-#else
+# else
return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+# endif
}
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return min(max(a, make_int3(mn)), make_int3(mx));
-#else
+# else
return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
-#endif
+# endif
}
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return min(max(a, mn), make_int3(mx));
-#else
+# else
return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
-#endif
+# endif
}
ccl_device_inline bool operator==(const int3 &a, const int3 &b)
@@ -89,21 +92,22 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b)
ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return int3(_mm_add_epi32(a.m128, b.m128));
-#else
+# else
return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
+# endif
}
ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
{
-#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE__
return int3(_mm_sub_epi32(a.m128, b.m128));
-#else
+# else
return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
+# endif
}
+#endif /* !__KERNEL_METAL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/math_matrix.h b/intern/cycles/util/math_matrix.h
index bff7ddb4cee..c1be71517e3 100644
--- a/intern/cycles/util/math_matrix.h
+++ b/intern/cycles/util/math_matrix.h
@@ -162,7 +162,7 @@ ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A,
{
for (int row = 0; row < n; row++) {
for (int col = 0; col <= row; col++) {
- MATHS(A, row, col, 1) += v[row] * v[col] * weight;
+ atomic_add_and_fetch_float(&MATHS(A, row, col, 1), v[row] * v[col] * weight);
}
}
}
diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp
index 5704c4ef8ef..aad790482d5 100644
--- a/intern/cycles/util/path.cpp
+++ b/intern/cycles/util/path.cpp
@@ -313,7 +313,7 @@ static char *path_specials(const string &sub)
if (env_shader_path != NULL && sub == "shader") {
return env_shader_path;
}
- else if (env_shader_path != NULL && sub == "source") {
+ else if (env_source_path != NULL && sub == "source") {
return env_source_path;
}
return NULL;
@@ -541,7 +541,7 @@ static string path_make_compatible(const string &path)
if ((path.size() >= 3) && (path[0] == DIR_SEP) && (path[1] == DIR_SEP)) {
result = path_cleanup_unc(result);
}
- /* Make sure volume-only path ends up wit ha directory separator. */
+ /* Make sure volume-only path ends up with a directory separator. */
if (result.size() == 2 && result[1] == ':') {
result += DIR_SEP;
}
diff --git a/intern/cycles/util/progress.h b/intern/cycles/util/progress.h
index f2d80e49ab8..15bd26d34bf 100644
--- a/intern/cycles/util/progress.h
+++ b/intern/cycles/util/progress.h
@@ -207,7 +207,7 @@ class Progress {
if (total_pixel_samples > 0) {
return ((double)pixel_samples) / (double)total_pixel_samples;
}
- return 0.0f;
+ return 0.0;
}
void add_samples(uint64_t pixel_samples_, int tile_sample)
diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h
index ea5e78b54d2..fc496e55a0c 100644
--- a/intern/cycles/util/ssef.h
+++ b/intern/cycles/util/ssef.h
@@ -906,7 +906,7 @@ __forceinline void store4f_nt(void *ptr, const ssef &v)
}
////////////////////////////////////////////////////////////////////////////////
-/// Euclidian Space Operators
+/// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
__forceinline float dot(const ssef &a, const ssef &b)
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 7bfe747fcfb..1d78dfd1385 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -53,6 +53,15 @@ typedef struct DecomposedTransform {
/* Functions */
+#ifdef __KERNEL_METAL__
+/* transform_point specialized for ccl_global */
+ccl_device_inline float3 transform_point(ccl_global const Transform *t, const float3 a)
+{
+ ccl_global const float3x3 &b(*(ccl_global const float3x3 *)t);
+ return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w);
+}
+#endif
+
ccl_device_inline float3 transform_point(ccl_private const Transform *t, const float3 a)
{
/* TODO(sergey): Disabled for now, causes crashes in certain cases. */
@@ -73,6 +82,9 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
tmp += w;
return float3(tmp.m128);
+#elif defined(__KERNEL_METAL__)
+ ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
+ return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w);
#else
float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z + t->x.w,
a.x * t->y.x + a.y * t->y.y + a.z * t->y.z + t->y.w,
@@ -99,6 +111,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con
tmp = madd(shuffle<2>(aa), z, tmp);
return float3(tmp.m128);
+#elif defined(__KERNEL_METAL__)
+ ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
+ return (a * b).xyz;
#else
float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z,
a.x * t->y.x + a.y * t->y.y + a.z * t->y.z,
@@ -450,8 +465,8 @@ ccl_device_inline void transform_compose(ccl_private Transform *tfm,
}
/* Interpolate from array of decomposed transforms. */
-ccl_device void transform_motion_array_interpolate(Transform *tfm,
- const DecomposedTransform *motion,
+ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm,
+ ccl_global const DecomposedTransform *motion,
uint numsteps,
float time)
{
@@ -460,8 +475,8 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
int step = min((int)(time * maxstep), maxstep - 1);
float t = time * maxstep - step;
- const DecomposedTransform *a = motion + step;
- const DecomposedTransform *b = motion + step + 1;
+ ccl_global const DecomposedTransform *a = motion + step;
+ ccl_global const DecomposedTransform *b = motion + step + 1;
/* Interpolate rotation, translation and scale. */
DecomposedTransform decomp;
diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h
index 697dc2b44ea..58a6d134819 100644
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -17,7 +17,9 @@
#ifndef __UTIL_TYPES_H__
#define __UTIL_TYPES_H__
-#include <stdlib.h>
+#if !defined(__KERNEL_METAL__)
+# include <stdlib.h>
+#endif
/* Standard Integer Types */
diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h
index f990367e7b8..cafcfebf526 100644
--- a/intern/cycles/util/types_float3.h
+++ b/intern/cycles/util/types_float3.h
@@ -55,6 +55,41 @@ ccl_device_inline float3 make_float3(float x, float y, float z);
ccl_device_inline void print_float3(const char *label, const float3 &a);
#endif /* __KERNEL_GPU__ */
+/* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
+ * CPU SIMD instructions can be used. */
+#if defined(__KERNEL_METAL__)
+/* Metal has native packed_float3. */
+#elif defined(__KERNEL_CUDA__)
+/* CUDA float3 is already packed. */
+typedef float3 packed_float3;
+#else
+/* All other targets: HIP float3 is not packed (https://github.com/ROCm-Developer-Tools/HIP/issues/706), so define an explicit three-float struct. */
+struct packed_float3 {
+ ccl_device_inline_method packed_float3(){};
+
+ ccl_device_inline_method packed_float3(const float3 &a) : x(a.x), y(a.y), z(a.z)
+ {
+ }
+
+ ccl_device_inline_method operator float3() const
+ {
+ return make_float3(x, y, z);
+ }
+
+ ccl_device_inline_method packed_float3 &operator=(const float3 &a)
+ {
+ x = a.x;
+ y = a.y;
+ z = a.z;
+ return *this;
+ }
+
+ float x, y, z;
+};
+#endif
+
+static_assert(sizeof(packed_float3) == 12, "packed_float3 expected to be exactly 12 bytes");
+
CCL_NAMESPACE_END
#endif /* __UTIL_TYPES_FLOAT3_H__ */