19 files changed, 455 insertions, 186 deletions
diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h
index faba411c769..afc3fd019df 100644
--- a/intern/cycles/util/atomic.h
+++ b/intern/cycles/util/atomic.h
@@ -63,6 +63,62 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
 
 #  endif /* __KERNEL_CUDA__ */
 
+#  ifdef __KERNEL_METAL__
+
+// global address space versions
+ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_source,
+                                                   const float operand)
+{ /* Atomic `*_source += operand` built on an integer compare-exchange; returns the sum. */
+  volatile ccl_global atomic_int *source = (ccl_global atomic_int *)_source;
+  union {
+    int int_value;
+    float float_value;
+  } new_value, prev_value; /* One 32-bit slot: int view for the CAS, float view for the add. */
+  prev_value.int_value = atomic_load_explicit(source, memory_order_relaxed);
+  do { /* Retry loop: a failed exchange writes the current contents into prev_value. */
+    new_value.float_value = prev_value.float_value + operand;
+  } while (!atomic_compare_exchange_weak_explicit(source,
+                                                  &prev_value.int_value,
+                                                  new_value.int_value,
+                                                  memory_order_relaxed,
+                                                  memory_order_relaxed));
+  /* new_value now holds the sum that was stored by the successful exchange. */
+  return new_value.float_value;
+}
+
+#    define atomic_fetch_and_add_uint32(p, x) /* Relaxed; (p) so the cast spans the argument. */ \
+      atomic_fetch_add_explicit((device atomic_uint *)(p), x, memory_order_relaxed)
+#    define atomic_fetch_and_sub_uint32(p, x) \
+      atomic_fetch_sub_explicit((device atomic_uint *)(p), x, memory_order_relaxed)
+#    define atomic_fetch_and_inc_uint32(p) \
+      atomic_fetch_add_explicit((device atomic_uint *)(p), 1, memory_order_relaxed)
+#    define atomic_fetch_and_dec_uint32(p) \
+      atomic_fetch_sub_explicit((device atomic_uint *)(p), 1, memory_order_relaxed)
+#    define atomic_fetch_and_or_uint32(p, x) \
+      atomic_fetch_or_explicit((device atomic_uint *)(p), x, memory_order_relaxed)
+
+ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
+                                                      const float old_val,
+                                                      const float new_val)
+{ /* Stores new_val in *dest if it bitwise equals old_val; returns the value found there. */
+  const int expected = __float_as_int(old_val);
+  int prev_value = expected; /* Overwritten with the contents of *dest on a failed exchange. */
+  while (!atomic_compare_exchange_weak_explicit((ccl_global atomic_int *)dest,
+                                                &prev_value, __float_as_int(new_val),
+                                                memory_order_relaxed, memory_order_relaxed) &&
+         prev_value == expected) { /* Weak CAS failed spuriously (no mismatch): retry. */
+  }
+  return __int_as_float(prev_value);
+}
+
+#    define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
+#    define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed) /* Relaxed load. */
+/* Barrier wrappers: map onto threadgroup_barrier() and the threadgroup memory flag. */
+#    define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
+#    define ccl_barrier(flags) threadgroup_barrier(flags)
+
+#  endif /* __KERNEL_METAL__ */
+
 #endif /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp
index 7d5b6d4e54e..717e55a2c9a 100644
--- a/intern/cycles/util/debug.cpp
+++ b/intern/cycles/util/debug.cpp
@@ -64,6 +64,11 @@ DebugFlags::HIP::HIP() : adaptive_compile(false)
   reset();
 }
 
+DebugFlags::Metal::Metal() : adaptive_compile(false)
+{
+  reset(); /* May switch adaptive_compile on from the environment. */
+}
+
 void DebugFlags::CUDA::reset()
 {
   if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
@@ -76,6 +81,12 @@ void DebugFlags::HIP::reset()
     adaptive_compile = true;
 }
 
+void DebugFlags::Metal::reset()
+{
+  /* Setting CYCLES_METAL_ADAPTIVE_COMPILE (to any value) turns the flag on; never off. */
+  adaptive_compile = adaptive_compile || (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL);
+}
+
 DebugFlags::OptiX::OptiX()
 {
   reset();
@@ -97,6 +108,7 @@ void DebugFlags::reset()
   cpu.reset();
   cuda.reset();
   optix.reset();
+  metal.reset();
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h
index 548c67600e5..1e431fde68a 100644
--- a/intern/cycles/util/debug.h
+++ b/intern/cycles/util/debug.h
@@ -116,6 +116,17 @@ class DebugFlags {
     bool use_debug;
   };
 
+  /* Descriptor of Metal feature-set to be used. */
+  struct Metal {
+    Metal(); /* Starts with adaptive_compile off, then calls reset(). */
+
+    /* Re-read CYCLES_METAL_ADAPTIVE_COMPILE; enables adaptive_compile when it is set. */
+    void reset();
+
+    /* Whether adaptive feature based runtime compile is enabled or not. */
+    bool adaptive_compile;
+  };
+
   /* Get instance of debug flags registry. */
   static DebugFlags &get()
   {
@@ -138,6 +149,9 @@ class DebugFlags {
   /* Requested HIP flags. */
   HIP hip;
 
+  /* Requested Metal flags. */
+  Metal metal;
+
  private:
   DebugFlags();
 
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index a778bef52b2..a2e8d83adb7 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -44,6 +44,7 @@
 #  if defined(_WIN32) && !defined(FREE_WINDOWS)
 #    define ccl_device_inline static __forceinline
 #    define ccl_device_forceinline static __forceinline
+#    define ccl_device_inline_method __forceinline
 #    define ccl_align(...) __declspec(align(__VA_ARGS__))
 #    ifdef __KERNEL_64_BIT__
 #      define ccl_try_align(...) __declspec(align(__VA_ARGS__))
@@ -58,6 +59,7 @@
 #  else /* _WIN32 && !FREE_WINDOWS */
 #    define ccl_device_inline static inline __attribute__((always_inline))
 #    define ccl_device_forceinline static inline __attribute__((always_inline))
+#    define ccl_device_inline_method __attribute__((always_inline))
 #    define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
 #    ifndef FREE_WINDOWS64
 #      define __forceinline inline __attribute__((always_inline))
@@ -70,7 +72,7 @@
 
 /* Address spaces for GPU. */
 #  define ccl_global
-#  define ccl_static_constant static const
+#  define ccl_inline_constant inline constexpr
 #  define ccl_constant const
 #  define ccl_private
 
diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h
index 016975e3c25..555f17259bd 100644
--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -28,8 +28,27 @@ CCL_NAMESPACE_BEGIN
 
 /* Half Floats */
 
+#if defined(__KERNEL_METAL__)
+
+ccl_device_inline float half_to_float(half h_in)
+{
+  float f;
+  union {
+    half h;
+    uint16_t s;
+  } val; /* Gives access to the raw 16 bits of the half. */
+  val.h = h_in;
+  /* Sign to bit 31, exponent re-biased by 112 (0x1C000 >> 10), mantissa widened by 13 bits. */
+  *((ccl_private int *)&f) = ((val.s & 0x8000) << 16) | (((val.s & 0x7c00) + 0x1C000) << 13) |
+                             ((val.s & 0x03FF) << 13);
+  /* NOTE(review): zero, subnormal, Inf and NaN inputs take this same path (no special cases). */
+  return f;
+}
+
+#else
+
 /* CUDA has its own half data type, no need to define then */
-#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
+#  if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
 /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
  * unsigned shorts. */
 class half {
@@ -53,11 +72,12 @@ class half {
  private:
   unsigned short v;
 };
-#endif
+#  endif
 
 struct half4 {
   half x, y, z, w;
 };
+#endif
 
 /* Conversion to/from half float for image textures
  *
@@ -66,7 +86,9 @@ struct half4 {
 
 ccl_device_inline half float_to_half_image(float f)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+  return half(f);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
   return __float2half(f);
 #else
   const uint u = __float_as_uint(f);
@@ -92,7 +114,9 @@ ccl_device_inline half float_to_half_image(float f)
 
 ccl_device_inline float half_to_float_image(half h)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+  return half_to_float(h);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
   return __half2float(h);
 #else
   const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
@@ -125,7 +149,9 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h)
 
 ccl_device_inline half float_to_half_display(const float f)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+  return half(f);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
   return __float2half(f);
 #else
   const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
diff --git a/intern/cycles/util/ies.cpp b/intern/cycles/util/ies.cpp
index 5e879478df5..e924d660407 100644
--- a/intern/cycles/util/ies.cpp
+++ b/intern/cycles/util/ies.cpp
@@ -23,10 +23,10 @@
 
 CCL_NAMESPACE_BEGIN
 
-// NOTE: For some reason gcc-7.2 does not instantiate this versio of allocator
-// gere (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8.
+// NOTE: For some reason gcc-7.2 does not instantiate this version of the
+// allocator here (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8.
 //
-// TODO(sergey): Get to the root of this issue, or confirm this i a compiler
+// TODO(sergey): Get to the root of this issue, or confirm this is a compiler
 // issue.
 template class GuardedAllocator<char>;
 
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index e4c7df6e44a..6cfeb1aa917 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -30,9 +30,11 @@
 #  include <hip/hip_vector_types.h>
 #endif
 
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
+#if !defined(__KERNEL_METAL__)
+#  include <float.h>
+#  include <math.h>
+#  include <stdio.h>
+#endif /* !defined(__KERNEL_METAL__) */
 
 #include "util/types.h"
 
@@ -174,6 +176,7 @@ ccl_device_inline float max4(float a, float b, float c, float d)
   return max(max(a, b), max(c, d));
 }
 
+#if !defined(__KERNEL_METAL__)
 /* Int/Float conversion */
 
 ccl_device_inline int as_int(uint i)
@@ -206,7 +209,7 @@ ccl_device_inline uint as_uint(float f)
   return u.i;
 }
 
-#ifndef __HIP__
+#  ifndef __HIP__
 ccl_device_inline int __float_as_int(float f)
 {
   union {
@@ -246,28 +249,33 @@ ccl_device_inline float __uint_as_float(uint i)
   u.i = i;
   return u.f;
 }
-#endif
+#  endif
 
 ccl_device_inline int4 __float4_as_int4(float4 f)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int4(_mm_castps_si128(f.m128));
-#else
+#  else
   return make_int4(
       __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 __int4_as_float4(int4 i)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_castsi128_ps(i.m128));
-#else
+#  else
   return make_float4(
       __int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
-#endif
+#  endif
 }
+#endif /* !defined(__KERNEL_METAL__) */
 
+#if defined(__KERNEL_METAL__)
+#  define isnan_safe(v) isnan(v)       /* Metal: forwards straight to isnan(). */
+#  define isfinite_safe(v) isfinite(v) /* Metal: forwards straight to isfinite(). */
+#else
 template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
 {
   return ((uint64_t)ptr) & 0xFFFFFFFF;
@@ -311,12 +319,14 @@ ccl_device_inline bool isfinite_safe(float f)
   unsigned int x = __float_as_uint(f);
   return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u);
 }
+#endif
 
 ccl_device_inline float ensure_finite(float v)
 {
   return isfinite_safe(v) ? v : 0.0f;
 }
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline int clamp(int a, int mn, int mx)
 {
   return min(max(a, mn), mx);
@@ -346,15 +356,17 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x)
   return result;
 }
 
-#ifndef __KERNEL_CUDA__
+#endif /* !defined(__KERNEL_METAL__) */
+
+#if defined(__KERNEL_CUDA__)
 ccl_device_inline float saturatef(float a)
 {
-  return clamp(a, 0.0f, 1.0f);
+  return __saturatef(a);
 }
-#else
+#elif !defined(__KERNEL_METAL__)
 ccl_device_inline float saturatef(float a)
 {
-  return __saturatef(a);
+  return clamp(a, 0.0f, 1.0f);
 }
 #endif /* __KERNEL_CUDA__ */
 
@@ -491,6 +503,7 @@ CCL_NAMESPACE_END
 
 CCL_NAMESPACE_BEGIN
 
+#if !defined(__KERNEL_METAL__)
 /* Interpolation */
 
 template<class A, class B> A lerp(const A &a, const A &b, const B &t)
@@ -498,6 +511,8 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t)
   return (A)(a * ((B)1 - t) + b * t);
 }
 
+#endif /* !defined(__KERNEL_METAL__) */
+
 /* Triangle */
 
 ccl_device_inline float triangle_area(ccl_private const float3 &v1,
@@ -627,7 +642,11 @@ ccl_device_inline float safe_sqrtf(float f)
 
 ccl_device_inline float inversesqrtf(float f)
 {
+#if defined(__KERNEL_METAL__)
+  return (f > 0.0f) ? rsqrt(f) : 0.0f; /* 0 for f <= 0 (and NaN), like the generic path. */
+#else
   return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
+#endif
 }
 
 ccl_device float safe_asinf(float a)
@@ -715,10 +734,30 @@ ccl_device float bits_to_01(uint bits)
   return bits * (1.0f / (float)0xFFFFFFFF);
 }
 
+#if !defined(__KERNEL_GPU__)
+#  if defined(__GNUC__)
+#    define popcount(x) __builtin_popcount(x)
+#  else
+ccl_device_inline uint popcount(uint x)
+{
+  /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
+  uint i = x; /* Portable bit count: sum bits in 2-, 4-, then 8-bit groups in parallel. */
+  i = i - ((i >> 1) & 0x55555555);
+  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+  i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
+  return i; /* Number of set bits in x (0..32), matching __builtin_popcount / __popc. */
+}
+#  endif
+#elif !defined(__KERNEL_METAL__)
+#  define popcount(x) __popc(x)
+#endif
+
 ccl_device_inline uint count_leading_zeros(uint x)
 {
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return __clz(x);
+#elif defined(__KERNEL_METAL__)
+  return clz(x);
 #else
   assert(x != 0);
 #  ifdef _MSC_VER
@@ -735,6 +774,8 @@ ccl_device_inline uint count_trailing_zeros(uint x)
 {
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return (__ffs(x) - 1);
+#elif defined(__KERNEL_METAL__)
+  return ctz(x);
 #else
   assert(x != 0);
 #  ifdef _MSC_VER
@@ -751,6 +792,8 @@ ccl_device_inline uint find_first_set(uint x)
 {
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return __ffs(x);
+#elif defined(__KERNEL_METAL__)
+  return (x != 0) ? ctz(x) + 1 : 0;
 #else
 #  ifdef _MSC_VER
   return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
@@ -801,7 +844,7 @@ ccl_device_inline float2 map_to_sphere(const float3 co)
  * https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
  */
 
-ccl_device_inline float compare_floats(float a, float b, float abs_diff, int ulp_diff)
+ccl_device_inline bool compare_floats(float a, float b, float abs_diff, int ulp_diff)
 {
   if (fabsf(a - b) < abs_diff) {
     return true;
@@ -849,6 +892,8 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
   return x;
 #elif defined(__KERNEL_CUDA__)
   return __brev(x);
+#elif defined(__KERNEL_METAL__)
+  return reverse_bits(x);
 #elif __has_builtin(__builtin_bitreverse32)
   return __builtin_bitreverse32(x);
 #else
diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h
index 87141d5bc37..8ff75c6c20a 100644
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float2 operator-(const float2 &a);
 ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
 ccl_device_inline float2 operator*(const float2 &a, float f);
@@ -63,6 +64,7 @@ ccl_device_inline float2 fabs(const float2 &a);
 ccl_device_inline float2 as_float2(const float4 &a);
 ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
 ccl_device_inline float2 floor(const float2 &a);
+#endif /* !__KERNEL_METAL__ */
 
 ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
 
@@ -80,6 +82,7 @@ ccl_device_inline float2 one_float2()
   return make_float2(1.0f, 1.0f);
 }
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float2 operator-(const float2 &a)
 {
   return make_float2(-a.x, -a.y);
@@ -259,6 +262,8 @@ ccl_device_inline float2 floor(const float2 &a)
   return make_float2(floorf(a.x), floorf(a.y));
 }
 
+#endif /* !__KERNEL_METAL__ */
+
 ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
 {
   return (b != 0.0f) ? a / b : zero_float2();
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 81550c5d03c..1a0213f2a6d 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float3 operator-(const float3 &a);
 ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
 ccl_device_inline float3 operator*(const float3 &a, const float f);
@@ -62,19 +63,20 @@ ccl_device_inline float3 rcp(const float3 &a);
 ccl_device_inline float3 sqrt(const float3 &a);
 ccl_device_inline float3 floor(const float3 &a);
 ccl_device_inline float3 ceil(const float3 &a);
+ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
+#endif /* !defined(__KERNEL_METAL__) */
 
 ccl_device_inline float min3(float3 a);
 ccl_device_inline float max3(float3 a);
 ccl_device_inline float len(const float3 a);
 ccl_device_inline float len_squared(const float3 a);
 
-ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
 ccl_device_inline float3 project(const float3 v, const float3 v_proj);
 
 ccl_device_inline float3 saturate3(float3 a);
 ccl_device_inline float3 safe_normalize(const float3 a);
-ccl_device_inline float3 normalize_len(const float3 a, float *t);
-ccl_device_inline float3 safe_normalize_len(const float3 a, float *t);
+ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
+ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
 ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b);
 ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b);
 ccl_device_inline float3 interp(float3 a, float3 b, float t);
@@ -103,49 +105,58 @@ ccl_device_inline float3 one_float3()
   return make_float3(1.0f, 1.0f, 1.0f);
 }
 
+#if defined(__KERNEL_METAL__)
+
+ccl_device_inline float3 rcp(float3 a)
+{
+  return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); /* Per-component 1/a. */
+}
+
+#else
+
 ccl_device_inline float3 operator-(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-#else
+#  else
   return make_float3(-a.x, -a.y, -a.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_mul_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator*(const float3 &a, const float f)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
-#else
+#  else
   return make_float3(a.x * f, a.y * f, a.z * f);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator*(const float f, const float3 &a)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
   return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
-#else
+#  else
   return make_float3(a.x * f, a.y * f, a.z * f);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator/(const float f, const float3 &a)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
   return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
-#else
+#  else
   return make_float3(f / a.x, f / a.y, f / a.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator/(const float3 &a, const float f)
@@ -156,11 +167,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
 
 ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
   return float3(_mm_div_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator+(const float3 &a, const float f)
@@ -170,11 +181,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f)
 
 ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_add_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator-(const float3 &a, const float f)
@@ -184,11 +195,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f)
 
 ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_sub_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
@@ -222,13 +233,39 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
   return a = a * invf;
 }
 
+#if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__))
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b)
+{
+  a = float3(a) * b; /* Convert to float3, multiply per component, store back. */
+  return a;          /* Returned by value, i.e. a copy of the updated a. */
+}
+
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f)
+{
+  a = float3(a) * f; /* Convert to float3, scale by f, store back. */
+  return a;
+}
+
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b)
+{
+  a = float3(a) / b; /* Convert to float3, divide per component, store back. */
+  return a;
+}
+
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)
+{
+  a = float3(a) / f; /* Convert to float3, divide by f, store back. */
+  return a;
+}
+#endif
+
 ccl_device_inline bool operator==(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
-#else
+#  else
   return (a.x == b.x && a.y == b.y && a.z == b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
@@ -243,20 +280,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b)
 
 ccl_device_inline float dot(const float3 &a, const float3 &b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
   return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
-#else
+#  else
   return a.x * b.x + a.y * b.y + a.z * b.z;
-#endif
+#  endif
 }
 
 ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
   return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-#else
+#  else
   return a.x * b.x + a.y * b.y;
-#endif
+#  endif
 }
 
 ccl_device_inline float3 cross(const float3 &a, const float3 &b)
@@ -267,30 +304,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b)
 
 ccl_device_inline float3 normalize(const float3 &a)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
   __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
   return float3(_mm_div_ps(a.m128, norm));
-#else
+#  else
   return a / len(a);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 min(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_min_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 max(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_max_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
@@ -300,43 +337,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
 
 ccl_device_inline float3 fabs(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
-#  ifdef __KERNEL_NEON__
+#  ifdef __KERNEL_SSE__
+#    ifdef __KERNEL_NEON__
   return float3(vabsq_f32(a.m128));
-#  else
+#    else
   __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
   return float3(_mm_and_ps(a.m128, mask));
-#  endif
-#else
+#    endif
+#  else
   return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 sqrt(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_sqrt_ps(a));
-#else
+#  else
   return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 floor(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_floor_ps(a));
-#else
+#  else
   return make_float3(floorf(a.x), floorf(a.y), floorf(a.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 ceil(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_ceil_ps(a));
-#else
+#  else
   return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
@@ -346,13 +383,14 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
 
 ccl_device_inline float3 rcp(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   /* Don't use _mm_rcp_ps due to poor precision. */
   return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-#else
+#  else
   return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
-#endif
+#  endif
 }
+#endif /* !__KERNEL_METAL__ */
 
 ccl_device_inline float min3(float3 a)
 {
@@ -378,6 +416,7 @@ ccl_device_inline float len_squared(const float3 a)
   return dot(a, a);
 }
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
 {
   float3 unit_normal = normalize(normal);
@@ -399,6 +438,7 @@ ccl_device_inline float3 faceforward(const float3 vector,
 {
   return (dot(reference, incident) < 0.0f) ? vector : -vector;
 }
+#endif
 
 ccl_device_inline float3 project(const float3 v, const float3 v_proj)
 {
@@ -479,7 +519,11 @@ ccl_device_inline float average(const float3 a)
 
 ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
 {
+#if defined(__KERNEL_METAL__)
+  return all(a == b);
+#else
   return a == b;
+#endif
 }
 
 ccl_device_inline float3 pow3(float3 v, float e)
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index c76959ee7ff..1203a10cca4 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float4 operator-(const float4 &a);
 ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
 ccl_device_inline float4 operator*(const float4 &a, float f);
@@ -65,6 +66,7 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
 ccl_device_inline float4 fabs(const float4 &a);
 ccl_device_inline float4 floor(const float4 &a);
 ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
+#endif /* !__KERNEL_METAL__ */
 
 ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
 
@@ -110,32 +112,33 @@ ccl_device_inline float4 one_float4()
   return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
 }
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float4 operator-(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
   return float4(_mm_xor_ps(a.m128, mask));
-#else
+#  else
   return make_float4(-a.x, -a.y, -a.z, -a.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_mul_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator*(const float4 &a, float f)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
   return a * make_float4(f);
-#else
+#  else
   return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator*(float f, const float4 &a)
@@ -150,11 +153,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f)
 
 ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_div_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator+(const float4 &a, const float f)
@@ -164,11 +167,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f)
 
 ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_add_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator-(const float4 &a, const float f)
@@ -178,11 +181,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f)
 
 ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_sub_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
@@ -212,38 +215,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
 
 ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
-#else
+#  else
   return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
-#else
+#  else
   return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
-#else
+#  else
   return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline bool operator==(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
-#else
+#  else
   return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float distance(const float4 &a, const float4 &b)
@@ -253,16 +256,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
 
 ccl_device_inline float dot(const float4 &a, const float4 &b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-#  if defined(__KERNEL_NEON__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
   __m128 t = vmulq_f32(a, b);
   return vaddvq_f32(t);
-#  else
+#    else
   return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-#  endif
-#else
+#    endif
+#  else
   return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float len_squared(const float4 &a)
@@ -272,21 +275,21 @@ ccl_device_inline float len_squared(const float4 &a)
 
 ccl_device_inline float4 rcp(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   /* Don't use _mm_rcp_ps due to poor precision. */
   return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-#else
+#  else
   return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 sqrt(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_sqrt_ps(a.m128));
-#else
+#  else
   return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 sqr(const float4 &a)
@@ -296,39 +299,39 @@ ccl_device_inline float4 sqr(const float4 &a)
 
 ccl_device_inline float4 cross(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
          (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
-#else
+#  else
   return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
-#endif
+#  endif
 }
 
 ccl_device_inline bool is_zero(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return a == make_float4(0.0f);
-#else
+#  else
   return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 reduce_add(const float4 &a)
 {
-#if defined(__KERNEL_SSE__)
-#  if defined(__KERNEL_NEON__)
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
   return float4(vdupq_n_f32(vaddvq_f32(a)));
-#  elif defined(__KERNEL_SSE3__)
+#    elif defined(__KERNEL_SSE3__)
   float4 h(_mm_hadd_ps(a.m128, a.m128));
   return float4(_mm_hadd_ps(h.m128, h.m128));
-#  else
+#    else
   float4 h(shuffle<1, 0, 3, 2>(a) + a);
   return shuffle<2, 3, 0, 1>(h) + h;
-#  endif
-#else
+#    endif
+#  else
   float sum = (a.x + a.y) + (a.z + a.w);
   return make_float4(sum, sum, sum, sum);
-#endif
+#  endif
 }
 
 ccl_device_inline float average(const float4 &a)
@@ -354,20 +357,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a)
 
 ccl_device_inline float4 min(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_min_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 max(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_max_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
@@ -377,24 +380,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
 
 ccl_device_inline float4 fabs(const float4 &a)
 {
-#if defined(__KERNEL_SSE__)
-#  if defined(__KERNEL_NEON__)
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
   return float4(vabsq_f32(a));
-#  else
+#    else
   return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-#  endif
-#else
+#    endif
+#  else
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 floor(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_floor_ps(a));
-#else
+#  else
   return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
@@ -402,6 +405,8 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
   return a + t * (b - a);
 }
 
+#endif /* !__KERNEL_METAL__ */
+
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &b)
diff --git a/intern/cycles/util/math_int2.h b/intern/cycles/util/math_int2.h
index 5b04be92152..39dc3b28f11 100644
--- a/intern/cycles/util/math_int2.h
+++ b/intern/cycles/util/math_int2.h
@@ -27,17 +27,20 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline bool operator==(const int2 a, const int2 b);
 ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
 ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
 ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
 ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
 ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
+#endif /* !__KERNEL_METAL__ */
 
 /*******************************************************************************
  * Definition.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline bool operator==(const int2 a, const int2 b)
 {
   return (a.x == b.x && a.y == b.y);
@@ -67,6 +70,7 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
 {
   return make_int2(a.x / b.x, a.y / b.y);
 }
+#endif /* !__KERNEL_METAL__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h
index 128f2cb53b8..a09c68ef49a 100644
--- a/intern/cycles/util/math_int3.h
+++ b/intern/cycles/util/math_int3.h
@@ -27,49 +27,52 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline int3 min(int3 a, int3 b);
 ccl_device_inline int3 max(int3 a, int3 b);
 ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
 ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
+#endif /* !defined(__KERNEL_METAL__) */
 
 /*******************************************************************************
  * Definition.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline int3 min(int3 a, int3 b)
 {
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
   return int3(_mm_min_epi32(a.m128, b.m128));
-#else
+#  else
   return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+#  endif
 }
 
 ccl_device_inline int3 max(int3 a, int3 b)
 {
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
   return int3(_mm_max_epi32(a.m128, b.m128));
-#else
+#  else
   return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+#  endif
 }
 
 ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return min(max(a, make_int3(mn)), make_int3(mx));
-#else
+#  else
   return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
-#endif
+#  endif
 }
 
 ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return min(max(a, mn), make_int3(mx));
-#else
+#  else
   return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
-#endif
+#  endif
 }
 
 ccl_device_inline bool operator==(const int3 &a, const int3 &b)
@@ -89,21 +92,22 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b)
 
 ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int3(_mm_add_epi32(a.m128, b.m128));
-#else
+#  else
   return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int3(_mm_sub_epi32(a.m128, b.m128));
-#else
+#  else
   return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
+#  endif
 }
+#endif /* !__KERNEL_METAL__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/math_matrix.h b/intern/cycles/util/math_matrix.h
index bff7ddb4cee..c1be71517e3 100644
--- a/intern/cycles/util/math_matrix.h
+++ b/intern/cycles/util/math_matrix.h
@@ -162,7 +162,7 @@ ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A,
 {
   for (int row = 0; row < n; row++) {
     for (int col = 0; col <= row; col++) {
-      MATHS(A, row, col, 1) += v[row] * v[col] * weight;
+      atomic_add_and_fetch_float(&MATHS(A, row, col, 1), v[row] * v[col] * weight);
     }
   }
 }
diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp
index 5704c4ef8ef..aad790482d5 100644
--- a/intern/cycles/util/path.cpp
+++ b/intern/cycles/util/path.cpp
@@ -313,7 +313,7 @@ static char *path_specials(const string &sub)
   if (env_shader_path != NULL && sub == "shader") {
     return env_shader_path;
   }
-  else if (env_shader_path != NULL && sub == "source") {
+  else if (env_source_path != NULL && sub == "source") {
     return env_source_path;
   }
   return NULL;
@@ -541,7 +541,7 @@ static string path_make_compatible(const string &path)
   if ((path.size() >= 3) && (path[0] == DIR_SEP) && (path[1] == DIR_SEP)) {
     result = path_cleanup_unc(result);
   }
-  /* Make sure volume-only path ends up wit ha directory separator. */
+  /* Make sure volume-only path ends up with a directory separator. */
   if (result.size() == 2 && result[1] == ':') {
     result += DIR_SEP;
   }
diff --git a/intern/cycles/util/progress.h b/intern/cycles/util/progress.h
index f2d80e49ab8..15bd26d34bf 100644
--- a/intern/cycles/util/progress.h
+++ b/intern/cycles/util/progress.h
@@ -207,7 +207,7 @@ class Progress {
     if (total_pixel_samples > 0) {
       return ((double)pixel_samples) / (double)total_pixel_samples;
     }
-    return 0.0f;
+    return 0.0;
   }
 
   void add_samples(uint64_t pixel_samples_, int tile_sample)
diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h
index ea5e78b54d2..fc496e55a0c 100644
--- a/intern/cycles/util/ssef.h
+++ b/intern/cycles/util/ssef.h
@@ -906,7 +906,7 @@ __forceinline void store4f_nt(void *ptr, const ssef &v)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-/// Euclidian Space Operators
+/// Euclidean Space Operators
 ////////////////////////////////////////////////////////////////////////////////
 
 __forceinline float dot(const ssef &a, const ssef &b)
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 7bfe747fcfb..1d78dfd1385 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -53,6 +53,15 @@ typedef struct DecomposedTransform {
 
 /* Functions */
 
+#ifdef __KERNEL_METAL__
+/* transform_point specialized for a Transform passed through a ccl_global pointer. */
+ccl_device_inline float3 transform_point(ccl_global const Transform *t, const float3 a)
+{
+  ccl_global const float3x3 &b(*(ccl_global const float3x3 *)t); /* View t as a float3x3. */
+  return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w); /* Plus t's w components. */
+}
+#endif
+
 ccl_device_inline float3 transform_point(ccl_private const Transform *t, const float3 a)
 {
   /* TODO(sergey): Disabled for now, causes crashes in certain cases. */
@@ -73,6 +82,9 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
   tmp += w;
 
   return float3(tmp.m128);
+#elif defined(__KERNEL_METAL__)
+  ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
+  return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w);
 #else
   float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z + t->x.w,
                          a.x * t->y.x + a.y * t->y.y + a.z * t->y.z + t->y.w,
@@ -99,6 +111,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con
   tmp = madd(shuffle<2>(aa), z, tmp);
 
   return float3(tmp.m128);
+#elif defined(__KERNEL_METAL__)
+  ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
+  return (a * b).xyz;
 #else
   float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z,
                          a.x * t->y.x + a.y * t->y.y + a.z * t->y.z,
@@ -450,8 +465,8 @@ ccl_device_inline void transform_compose(ccl_private Transform *tfm,
 }
 
 /* Interpolate from array of decomposed transforms. */
-ccl_device void transform_motion_array_interpolate(Transform *tfm,
-                                                   const DecomposedTransform *motion,
+ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm,
+                                                   ccl_global const DecomposedTransform *motion,
                                                    uint numsteps,
                                                    float time)
 {
@@ -460,8 +475,8 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
   int step = min((int)(time * maxstep), maxstep - 1);
   float t = time * maxstep - step;
 
-  const DecomposedTransform *a = motion + step;
-  const DecomposedTransform *b = motion + step + 1;
+  ccl_global const DecomposedTransform *a = motion + step;
+  ccl_global const DecomposedTransform *b = motion + step + 1;
 
   /* Interpolate rotation, translation and scale. */
   DecomposedTransform decomp;
diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h
index 697dc2b44ea..58a6d134819 100644
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -17,7 +17,9 @@
 #ifndef __UTIL_TYPES_H__
 #define __UTIL_TYPES_H__
 
-#include <stdlib.h>
+#if !defined(__KERNEL_METAL__)
+#  include <stdlib.h>
+#endif
 
 /* Standard Integer Types */
 
diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h
index f990367e7b8..cafcfebf526 100644
--- a/intern/cycles/util/types_float3.h
+++ b/intern/cycles/util/types_float3.h
@@ -55,6 +55,41 @@ ccl_device_inline float3 make_float3(float x, float y, float z);
 ccl_device_inline void print_float3(const char *label, const float3 &a);
 #endif /* __KERNEL_GPU__ */
 
+/* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
+ * CPU SIMD instructions can be used. */
+#if defined(__KERNEL_METAL__)
+/* Metal has native packed_float3. */
+#elif defined(__KERNEL_CUDA__)
+/* CUDA float3 is already packed. */
+typedef float3 packed_float3;
+#else
+/* HIP float3 is not packed (https://github.com/ROCm-Developer-Tools/HIP/issues/706). */
+struct packed_float3 {
+  ccl_device_inline_method packed_float3(){}; /* Leaves x, y and z uninitialized. */
+
+  ccl_device_inline_method packed_float3(const float3 &a) : x(a.x), y(a.y), z(a.z)
+  { /* Implicit conversion from float3: copies its x, y and z members. */
+  }
+
+  ccl_device_inline_method operator float3() const
+  { /* Implicit conversion back to float3, built with make_float3. */
+    return make_float3(x, y, z);
+  }
+
+  ccl_device_inline_method packed_float3 &operator=(const float3 &a)
+  { /* Member-wise assignment from a float3; returns *this. */
+    x = a.x;
+    y = a.y;
+    z = a.z;
+    return *this;
+  }
+
+  float x, y, z; /* Only data members; the static_assert below checks the 12-byte size. */
+};
+#endif
+
+static_assert(sizeof(packed_float3) == 12, "packed_float3 expected to be exactly 12 bytes");
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_TYPES_FLOAT3_H__ */