60 files changed, 1596 insertions, 725 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index fddac1dbbcf..997d574a3b0 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -3,7 +3,6 @@
 
 set(INC
   ..
-  ../../glew-mx
 )
 
 set(INC_SYS
@@ -26,6 +25,8 @@ set(SRC
   thread.cpp
   time.cpp
   transform.cpp
+  transform_avx2.cpp
+  transform_sse41.cpp
   windows.cpp
 )
 
@@ -63,6 +64,7 @@ set(SRC_HEADERS
   math_float2.h
   math_float3.h
   math_float4.h
+  math_float8.h
   math_int2.h
   math_int3.h
   math_int4.h
@@ -115,6 +117,7 @@ set(SRC_HEADERS
   types_int3_impl.h
   types_int4.h
   types_int4_impl.h
+  types_spectrum.h
   types_uchar2.h
   types_uchar2_impl.h
   types_uchar3.h
@@ -128,8 +131,6 @@ set(SRC_HEADERS
   types_uint4.h
   types_uint4_impl.h
   types_ushort4.h
-  types_vector3.h
-  types_vector3_impl.h
   unique_ptr.h
   vector.h
   version.h
@@ -137,9 +138,14 @@ set(SRC_HEADERS
   xml.h
 )
 
+if(CXX_HAS_SSE)
+  set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+endif()
+if(CXX_HAS_AVX2)
+  set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+endif()
+
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-add_definitions(${GL_DEFINITIONS})
-
 cycles_add_library(cycles_util "${LIB}" ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h
index f89eb28b0b7..1ebf085ae13 100644
--- a/intern/cycles/util/atomic.h
+++ b/intern/cycles/util/atomic.h
@@ -106,6 +106,116 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
 
 #  endif /* __KERNEL_METAL__ */
 
+#  ifdef __KERNEL_ONEAPI__
+
+ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *p, float x)
+{
+  sycl::atomic_ref<float,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::ext_intel_global_device_space>
+      atomic(*p);
+  return atomic.fetch_add(x);
+}
+
+ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *source,
+                                                      float old_val,
+                                                      float new_val)
+{
+  sycl::atomic_ref<float,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::ext_intel_global_device_space>
+      atomic(*source);
+  atomic.compare_exchange_weak(old_val, new_val);
+  return old_val;
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_add_uint32(ccl_global unsigned int *p,
+                                                           unsigned int x)
+{
+  sycl::atomic_ref<unsigned int,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::ext_intel_global_device_space>
+      atomic(*p);
+  return atomic.fetch_add(x);
+}
+
+ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x)
+{
+  sycl::atomic_ref<int,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::ext_intel_global_device_space>
+      atomic(*p);
+  return atomic.fetch_add(x);
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p,
+                                                           unsigned int x)
+{
+  sycl::atomic_ref<unsigned int,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::ext_intel_global_device_space>
+      atomic(*p);
+  return atomic.fetch_sub(x);
+}
+
+ccl_device_inline int atomic_fetch_and_sub_uint32(ccl_global int *p, int x)
+{
+  sycl::atomic_ref<int,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::ext_intel_global_device_space>
+      atomic(*p);
+  return atomic.fetch_sub(x);
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_inc_uint32(ccl_global unsigned int *p)
+{
+  return atomic_fetch_and_add_uint32(p, 1);
+}
+
+ccl_device_inline int atomic_fetch_and_inc_uint32(ccl_global int *p)
+{
+  return atomic_fetch_and_add_uint32(p, 1);
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_dec_uint32(ccl_global unsigned int *p)
+{
+  return atomic_fetch_and_sub_uint32(p, 1);
+}
+
+ccl_device_inline int atomic_fetch_and_dec_uint32(ccl_global int *p)
+{
+  return atomic_fetch_and_sub_uint32(p, 1);
+}
+
+ccl_device_inline unsigned int atomic_fetch_and_or_uint32(ccl_global unsigned int *p,
+                                                          unsigned int x)
+{
+  sycl::atomic_ref<unsigned int,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::ext_intel_global_device_space>
+      atomic(*p);
+  return atomic.fetch_or(x);
+}
+
+ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x)
+{
+  sycl::atomic_ref<int,
+                   sycl::memory_order::relaxed,
+                   sycl::memory_scope::device,
+                   sycl::access::address_space::ext_intel_global_device_space>
+      atomic(*p);
+  return atomic.fetch_or(x);
+}
+
+#  endif /* __KERNEL_ONEAPI__ */
+
 #endif /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/color.h b/intern/cycles/util/color.h
index 795c3754976..537f8ab6771 100644
--- a/intern/cycles/util/color.h
+++ b/intern/cycles/util/color.h
@@ -318,14 +318,14 @@ ccl_device float3 color_highlight_compress(float3 color, ccl_private float3 *var
 {
   color += one_float3();
   if (variance) {
-    *variance *= sqr3(one_float3() / color);
+    *variance *= sqr(one_float3() / color);
   }
-  return log3(color);
+  return log(color);
 }
 
 ccl_device float3 color_highlight_uncompress(float3 color)
 {
-  return exp3(color) - one_float3();
+  return exp(color) - one_float3();
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp
index 65d108bb9d1..8210e21f951 100644
--- a/intern/cycles/util/debug.cpp
+++ b/intern/cycles/util/debug.cpp
@@ -13,7 +13,6 @@
 CCL_NAMESPACE_BEGIN
 
 DebugFlags::CPU::CPU()
-    : avx2(true), avx(true), sse41(true), sse3(true), sse2(true), bvh_layout(BVH_LAYOUT_AUTO)
 {
   reset();
 }
@@ -25,7 +24,7 @@ void DebugFlags::CPU::reset()
   do { \
     flag = (getenv(env) == NULL); \
     if (!flag) { \
-      VLOG(1) << "Disabling " << STRINGIFY(flag) << " instruction set."; \
+      VLOG_INFO << "Disabling " << STRINGIFY(flag) << " instruction set."; \
     } \
   } while (0)
 
@@ -41,17 +40,17 @@ void DebugFlags::CPU::reset()
   bvh_layout = BVH_LAYOUT_AUTO;
 }
 
-DebugFlags::CUDA::CUDA() : adaptive_compile(false)
+DebugFlags::CUDA::CUDA()
 {
   reset();
 }
 
-DebugFlags::HIP::HIP() : adaptive_compile(false)
+DebugFlags::HIP::HIP()
 {
   reset();
 }
 
-DebugFlags::Metal::Metal() : adaptive_compile(false)
+DebugFlags::Metal::Metal()
 {
   reset();
 }
@@ -84,14 +83,13 @@ void DebugFlags::OptiX::reset()
   use_debug = false;
 }
 
-DebugFlags::DebugFlags() : viewport_static_bvh(false), running_inside_blender(false)
+DebugFlags::DebugFlags()
 {
   /* Nothing for now. */
 }
 
 void DebugFlags::reset()
 {
-  viewport_static_bvh = false;
   cpu.reset();
   cuda.reset();
   optix.reset();
diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h
index 3565fdea17f..ab200649f59 100644
--- a/intern/cycles/util/debug.h
+++ b/intern/cycles/util/debug.h
@@ -17,11 +17,6 @@ CCL_NAMESPACE_BEGIN
  */
 class DebugFlags {
  public:
-  /* Use static BVH in viewport, to match final render exactly. */
-  bool viewport_static_bvh;
-
-  bool running_inside_blender;
-
   /* Descriptor of CPU feature-set to be used. */
   struct CPU {
     CPU();
@@ -30,11 +25,11 @@ class DebugFlags {
     void reset();
 
     /* Flags describing which instructions sets are allowed for use. */
-    bool avx2;
-    bool avx;
-    bool sse41;
-    bool sse3;
-    bool sse2;
+    bool avx2 = true;
+    bool avx = true;
+    bool sse41 = true;
+    bool sse3 = true;
+    bool sse2 = true;
 
     /* Check functions to see whether instructions up to the given one
      * are allowed for use.
@@ -65,7 +60,7 @@ class DebugFlags {
      * By default the fastest will be used. For debugging the BVH used by other
      * CPUs and GPUs can be selected here instead.
      */
-    BVHLayout bvh_layout;
+    BVHLayout bvh_layout = BVH_LAYOUT_AUTO;
   };
 
   /* Descriptor of CUDA feature-set to be used. */
@@ -77,7 +72,7 @@ class DebugFlags {
 
     /* Whether adaptive feature based runtime compile is enabled or not.
      * Requires the CUDA Toolkit and only works on Linux at the moment. */
-    bool adaptive_compile;
+    bool adaptive_compile = false;
   };
 
   /* Descriptor of HIP feature-set to be used. */
@@ -88,7 +83,7 @@ class DebugFlags {
     void reset();
 
     /* Whether adaptive feature based runtime compile is enabled or not. */
-    bool adaptive_compile;
+    bool adaptive_compile = false;
   };
 
   /* Descriptor of OptiX feature-set to be used. */
@@ -100,7 +95,7 @@ class DebugFlags {
 
     /* Load OptiX module with debug capabilities. Will lower logging verbosity level, enable
      * validations, and lower optimization level. */
-    bool use_debug;
+    bool use_debug = false;
   };
 
   /* Descriptor of Metal feature-set to be used. */
@@ -111,7 +106,7 @@ class DebugFlags {
     void reset();
 
     /* Whether adaptive feature based runtime compile is enabled or not. */
-    bool adaptive_compile;
+    bool adaptive_compile = false;
   };
 
   /* Get instance of debug flags registry. */
@@ -142,15 +137,9 @@ class DebugFlags {
  private:
   DebugFlags();
 
-#if (__cplusplus > 199711L)
  public:
   explicit DebugFlags(DebugFlags const & /*other*/) = delete;
   void operator=(DebugFlags const & /*other*/) = delete;
-#else
- private:
-  explicit DebugFlags(DebugFlags const & /*other*/);
-  void operator=(DebugFlags const & /*other*/);
-#endif
 };
 
 typedef DebugFlags &DebugFlagsRef;
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index 115a747cf1c..1969529eff0 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -81,7 +81,7 @@
 /* macros */
 
 /* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
 #  define LIKELY(x) __builtin_expect(!!(x), 1)
 #  define UNLIKELY(x) __builtin_expect(!!(x), 0)
 #else
@@ -89,46 +89,6 @@
 #  define UNLIKELY(x) (x)
 #endif
 
-#if defined(__GNUC__) || defined(__clang__)
-#  if defined(__cplusplus)
-/* Some magic to be sure we don't have reference in the type. */
-template<typename T> static inline T decltype_helper(T x)
-{
-  return x;
-}
-#    define TYPEOF(x) decltype(decltype_helper(x))
-#  else
-#    define TYPEOF(x) typeof(x)
-#  endif
-#endif
-
-/* Causes warning:
- * incompatible types when assigning to type 'Foo' from type 'Bar'
- * ... the compiler optimizes away the temp var */
-#ifdef __GNUC__
-#  define CHECK_TYPE(var, type) \
-    { \
-      TYPEOF(var) * __tmp; \
-      __tmp = (type *)NULL; \
-      (void)__tmp; \
-    } \
-    (void)0
-
-#  define CHECK_TYPE_PAIR(var_a, var_b) \
-    { \
-      TYPEOF(var_a) * __tmp; \
-      __tmp = (typeof(var_b) *)NULL; \
-      (void)__tmp; \
-    } \
-    (void)0
-#else
-#  define CHECK_TYPE(var, type)
-#  define CHECK_TYPE_PAIR(var_a, var_b)
-#endif
-
-/* can be used in simple macros */
-#define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val)))
-
 #ifndef __KERNEL_GPU__
 #  include <cassert>
 #  define util_assert(statement) assert(statement)
@@ -136,4 +96,7 @@ template<typename T> static inline T decltype_helper(T x)
 #  define util_assert(statement)
 #endif
 
+#define CONCAT_HELPER(a, ...) a##__VA_ARGS__
+#define CONCAT(a, ...) CONCAT_HELPER(a, __VA_ARGS__)
+
 #endif /* __UTIL_DEFINES_H__ */
diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h
index 434bc12d670..c668638eb02 100644
--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -35,7 +35,7 @@ ccl_device_inline float half_to_float(half h_in)
 #else
 
 /* CUDA has its own half data type, no need to define then */
-#  if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
+#  if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) && !defined(__KERNEL_ONEAPI__)
 /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
  * unsigned shorts. */
 class half {
@@ -73,7 +73,7 @@ struct half4 {
 
 ccl_device_inline half float_to_half_image(float f)
 {
-#if defined(__KERNEL_METAL__)
+#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
   return half(min(f, 65504.0f));
 #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
   return __float2half(min(f, 65504.0f));
@@ -103,6 +103,8 @@ ccl_device_inline float half_to_float_image(half h)
 {
 #if defined(__KERNEL_METAL__)
   return half_to_float(h);
+#elif defined(__KERNEL_ONEAPI__)
+  return float(h);
 #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
   return __half2float(h);
 #else
@@ -136,7 +138,7 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h)
 
 ccl_device_inline half float_to_half_display(const float f)
 {
-#if defined(__KERNEL_METAL__)
+#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
   return half(min(f, 65504.0f));
 #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
   return __float2half(min(f, 65504.0f));
diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h
index 081b33025d8..4f83f331229 100644
--- a/intern/cycles/util/hash.h
+++ b/intern/cycles/util/hash.h
@@ -4,10 +4,28 @@
 #ifndef __UTIL_HASH_H__
 #define __UTIL_HASH_H__
 
+#include "util/math.h"
 #include "util/types.h"
 
 CCL_NAMESPACE_BEGIN
 
+/* [0, uint_max] -> [0.0, 1.0) */
+ccl_device_forceinline float uint_to_float_excl(uint n)
+{
+  // Note: we divide by 4294967808 instead of 2^32 because the latter
+  // leads to a [0.0, 1.0] mapping instead of [0.0, 1.0) due to floating
+  // point rounding error. 4294967808 unfortunately leaves (precisely)
+  // one unused ulp between the max number this outputs and 1.0, but
+  // that's the best you can do with this construction.
+  return (float)n * (1.0f / 4294967808.0f);
+}
+
+/* [0, uint_max] -> [0.0, 1.0] */
+ccl_device_forceinline float uint_to_float_incl(uint n)
+{
+  return (float)n * (1.0f / (float)0xFFFFFFFFu);
+}
+
 /* ***** Jenkins Lookup3 Hash Functions ***** */
 
 /* Source: http://burtleburtle.net/bob/c/lookup3.c */
@@ -116,22 +134,22 @@ ccl_device_inline uint hash_uint4(uint kx, uint ky, uint kz, uint kw)
 
 ccl_device_inline float hash_uint_to_float(uint kx)
 {
-  return (float)hash_uint(kx) / (float)0xFFFFFFFFu;
+  return uint_to_float_incl(hash_uint(kx));
 }
 
 ccl_device_inline float hash_uint2_to_float(uint kx, uint ky)
 {
-  return (float)hash_uint2(kx, ky) / (float)0xFFFFFFFFu;
+  return uint_to_float_incl(hash_uint2(kx, ky));
 }
 
 ccl_device_inline float hash_uint3_to_float(uint kx, uint ky, uint kz)
 {
-  return (float)hash_uint3(kx, ky, kz) / (float)0xFFFFFFFFu;
+  return uint_to_float_incl(hash_uint3(kx, ky, kz));
 }
 
 ccl_device_inline float hash_uint4_to_float(uint kx, uint ky, uint kz, uint kw)
 {
-  return (float)hash_uint4(kx, ky, kz, kw) / (float)0xFFFFFFFFu;
+  return uint_to_float_incl(hash_uint4(kx, ky, kz, kw));
 }
 
 /* Hashing float or float[234] into a float in the range [0, 1]. */
@@ -359,6 +377,123 @@ ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
 
 #endif
 
+/* ***** Hash Prospector Hash Functions *****
+ *
+ * These are based on the high-quality 32-bit hash/mixing functions from
+ * https://github.com/skeeto/hash-prospector
+ */
+
+ccl_device_inline uint hash_hp_uint(uint i)
+{
+  // The actual mixing function from Hash Prospector.
+  i ^= i >> 16;
+  i *= 0x21f0aaad;
+  i ^= i >> 15;
+  i *= 0xd35a2d97;
+  i ^= i >> 15;
+
+  // The xor is just to make input zero not map to output zero.
+  // The number is randomly selected and isn't special.
+  return i ^ 0xe6fe3beb;
+}
+
+/* Seedable version of hash_hp_uint() above. */
+ccl_device_inline uint hash_hp_seeded_uint(uint i, uint seed)
+{
+  // Manipulate the seed so it doesn't interact poorly with n when they
+  // are both e.g. incrementing.  This isn't fool-proof, but is good
+  // enough for practical use.
+  seed ^= seed << 19;
+
+  return hash_hp_uint(i ^ seed);
+}
+
+/* Outputs [0.0, 1.0). */
+ccl_device_inline float hash_hp_float(uint i)
+{
+  return uint_to_float_excl(hash_hp_uint(i));
+}
+
+/* Outputs [0.0, 1.0). */
+ccl_device_inline float hash_hp_seeded_float(uint i, uint seed)
+{
+  return uint_to_float_excl(hash_hp_seeded_uint(i, seed));
+}
+
+/* ***** Modified Wang Hash Functions *****
+ *
+ * These are based on a bespoke modified version of the Wang hash, and
+ * can serve as a faster hash when quality isn't critical.
+ *
+ * The original Wang hash is documented here:
+ * https://www.burtleburtle.net/bob/hash/integer.html
+ */
+
+ccl_device_inline uint hash_wang_seeded_uint(uint i, uint seed)
+{
+  i = (i ^ 61) ^ seed;
+  i += i << 3;
+  i ^= i >> 4;
+  i *= 0x27d4eb2d;
+  return i;
+}
+
+/* Outputs [0.0, 1.0). */
+ccl_device_inline float hash_wang_seeded_float(uint i, uint seed)
+{
+  return uint_to_float_excl(hash_wang_seeded_uint(i, seed));
+}
+
+/* ***** Index Shuffling Hash Function *****
+ *
+ * This function takes an index, the length of the thing the index points
+ * into, and returns a shuffled index.  For example, if you pass indices
+ * 0 through 19 to this function with a length parameter of 20, it will
+ * return the indices in a shuffled order with no repeats.  Indices
+ * larger than the length parameter will simply repeat the same shuffled
+ * pattern over and over.
+ *
+ * This is useful for iterating over an array in random shuffled order
+ * without having to shuffle the array itself.
+ *
+ * Passing different seeds results in different random shuffles.
+ *
+ * This function runs in average O(1) time.
+ *
+ * See https://andrew-helmer.github.io/permute/ for details on how this
+ * works.
+ */
+ccl_device_inline uint hash_shuffle_uint(uint i, uint length, uint seed)
+{
+  i = i % length;
+  uint mask = (1 << (32 - count_leading_zeros(length - 1))) - 1;
+
+  do {
+    i ^= seed;
+    i *= 0xe170893d;
+    i ^= seed >> 16;
+    i ^= (i & mask) >> 4;
+    i ^= seed >> 8;
+    i *= 0x0929eb3f;
+    i ^= seed >> 23;
+    i ^= (i & mask) >> 1;
+    i *= 1 | seed >> 27;
+    i *= 0x6935fa69;
+    i ^= (i & mask) >> 11;
+    i *= 0x74dcb303;
+    i ^= (i & mask) >> 2;
+    i *= 0x9e501cc3;
+    i ^= (i & mask) >> 2;
+    i *= 0xc860a3df;
+    i &= mask;
+    i ^= i >> 5;
+  } while (i >= length);
+
+  return i;
+}
+
+/* ********** */
+
 #ifndef __KERNEL_GPU__
 static inline uint hash_string(const char *str)
 {
diff --git a/intern/cycles/util/log.h b/intern/cycles/util/log.h
index b33c826d6f5..3780d03c0d1 100644
--- a/intern/cycles/util/log.h
+++ b/intern/cycles/util/log.h
@@ -69,9 +69,22 @@ class LogMessageVoidify {
 #  define LOG_ASSERT(expression) LOG_SUPPRESS()
 #endif
 
-#define VLOG_ONCE(level, flag) \
-  if (!flag) \
-  flag = true, VLOG(level)
+/* Verbose logging categories. */
+
+/* Warnings. */
+#define VLOG_WARNING VLOG(1)
+/* Info about devices, scene contents and features used. */
+#define VLOG_INFO VLOG(2)
+#define VLOG_INFO_IS_ON VLOG_IS_ON(2)
+/* Work being performed and timing/memory stats about that work. */
+#define VLOG_WORK VLOG(3)
+#define VLOG_WORK_IS_ON VLOG_IS_ON(3)
+/* Detailed device timing stats. */
+#define VLOG_DEVICE_STATS VLOG(4)
+#define VLOG_DEVICE_STATS_IS_ON VLOG_IS_ON(4)
+/* Verbose debug messages. */
+#define VLOG_DEBUG VLOG(5)
+#define VLOG_DEBUG_IS_ON VLOG_IS_ON(5)
 
 struct int2;
 struct float3;
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 555a5304764..0905b3ec5c9 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -79,7 +79,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Scalar */
 
-#ifndef __HIP__
+#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__)
 #  ifdef _WIN32
 ccl_device_inline float fmaxf(float a, float b)
 {
@@ -92,12 +92,18 @@ ccl_device_inline float fminf(float a, float b)
 }
 
 #  endif /* _WIN32 */
-#endif   /* __HIP__ */
+#endif   /* __HIP__, __KERNEL_ONEAPI__ */
 
-#ifndef __KERNEL_GPU__
+#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+#  ifndef __KERNEL_ONEAPI__
 using std::isfinite;
 using std::isnan;
 using std::sqrt;
+#  else
+using sycl::sqrt;
+#    define isfinite(x) sycl::isfinite((x))
+#    define isnan(x) sycl::isnan((x))
+#  endif
 
 ccl_device_inline int abs(int x)
 {
@@ -297,8 +303,15 @@ ccl_device_inline float4 __int4_as_float4(int4 i)
 #endif /* !defined(__KERNEL_METAL__) */
 
 #if defined(__KERNEL_METAL__)
-#  define isnan_safe(v) isnan(v)
-#  define isfinite_safe(v) isfinite(v)
+ccl_device_forceinline bool isnan_safe(float f)
+{
+  return isnan(f);
+}
+
+ccl_device_forceinline bool isfinite_safe(float f)
+{
+  return isfinite(f);
+}
 #else
 template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
 {
@@ -498,6 +511,11 @@ ccl_device_inline float4 float3_to_float4(const float3 a)
   return make_float4(a.x, a.y, a.z, 1.0f);
 }
 
+ccl_device_inline float4 float3_to_float4(const float3 a, const float w)
+{
+  return make_float4(a.x, a.y, a.z, w);
+}
+
 ccl_device_inline float inverse_lerp(float a, float b, float x)
 {
   return (x - a) / (b - a);
@@ -522,6 +540,7 @@ CCL_NAMESPACE_END
 #include "util/math_float2.h"
 #include "util/math_float3.h"
 #include "util/math_float4.h"
+#include "util/math_float8.h"
 
 #include "util/rect.h"
 
@@ -576,26 +595,26 @@ ccl_device_inline void make_orthonormals(const float3 N,
 
 /* Color division */
 
-ccl_device_inline float3 safe_invert_color(float3 a)
+ccl_device_inline Spectrum safe_invert_color(Spectrum a)
 {
-  float x, y, z;
-
-  x = (a.x != 0.0f) ? 1.0f / a.x : 0.0f;
-  y = (a.y != 0.0f) ? 1.0f / a.y : 0.0f;
-  z = (a.z != 0.0f) ? 1.0f / a.z : 0.0f;
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(a, i) != 0.0f) ?
+                                     1.0f / GET_SPECTRUM_CHANNEL(a, i) :
+                                     0.0f;
+  }
 
-  return make_float3(x, y, z);
+  return a;
 }
 
-ccl_device_inline float3 safe_divide_color(float3 a, float3 b)
+ccl_device_inline Spectrum safe_divide_color(Spectrum a, Spectrum b)
 {
-  float x, y, z;
-
-  x = (b.x != 0.0f) ? a.x / b.x : 0.0f;
-  y = (b.y != 0.0f) ? a.y / b.y : 0.0f;
-  z = (b.z != 0.0f) ? a.z / b.z : 0.0f;
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    GET_SPECTRUM_CHANNEL(a, i) = (GET_SPECTRUM_CHANNEL(b, i) != 0.0f) ?
+                                     GET_SPECTRUM_CHANNEL(a, i) / GET_SPECTRUM_CHANNEL(b, i) :
+                                     0.0f;
+  }
 
-  return make_float3(x, y, z);
+  return a;
 }
 
 ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b)
@@ -786,6 +805,11 @@ ccl_device_inline uint popcount(uint x)
   return i & 1;
 }
 #  endif
+#elif defined(__KERNEL_ONEAPI__)
+#  define popcount(x) sycl::popcount(x)
+#elif defined(__KERNEL_HIP__)
+/* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */
+#  define popcount(x) __popcll(x)
 #elif !defined(__KERNEL_METAL__)
 #  define popcount(x) __popc(x)
 #endif
@@ -796,6 +820,8 @@ ccl_device_inline uint count_leading_zeros(uint x)
   return __clz(x);
 #elif defined(__KERNEL_METAL__)
   return clz(x);
+#elif defined(__KERNEL_ONEAPI__)
+  return sycl::clz(x);
 #else
   assert(x != 0);
 #  ifdef _MSC_VER
@@ -814,6 +840,8 @@ ccl_device_inline uint count_trailing_zeros(uint x)
   return (__ffs(x) - 1);
 #elif defined(__KERNEL_METAL__)
   return ctz(x);
+#elif defined(__KERNEL_ONEAPI__)
+  return sycl::ctz(x);
 #else
   assert(x != 0);
 #  ifdef _MSC_VER
@@ -858,16 +886,16 @@ ccl_device_inline float2 map_to_tube(const float3 co)
 
 ccl_device_inline float2 map_to_sphere(const float3 co)
 {
-  float l = len(co);
+  float l = dot(co, co);
   float u, v;
   if (l > 0.0f) {
     if (UNLIKELY(co.x == 0.0f && co.y == 0.0f)) {
       u = 0.0f; /* Otherwise domain error. */
     }
     else {
-      u = (1.0f - atan2f(co.x, co.y) / M_PI_F) / 2.0f;
+      u = (0.5f - atan2f(co.x, co.y) * M_1_2PI_F);
     }
-    v = 1.0f - safe_acosf(co.z / l) / M_PI_F;
+    v = 1.0f - safe_acosf(co.z / sqrtf(l)) * M_1_PI_F;
   }
   else {
     u = v = 0.0f;
@@ -925,7 +953,11 @@ ccl_device_inline uint prev_power_of_two(uint x)
 ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
 {
   /* Use a native instruction if it exists. */
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__KERNEL_CUDA__)
+  return __brev(x);
+#elif defined(__KERNEL_METAL__)
+  return reverse_bits(x);
+#elif defined(__aarch64__) || defined(_M_ARM64)
   /* Assume the rbit is always available on 64bit ARM architecture. */
   __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
   return x;
@@ -934,10 +966,6 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
    * This 32-bit Thumb instruction is available in ARMv6T2 and above. */
   __asm__("rbit %0, %1" : "=r"(x) : "r"(x));
   return x;
-#elif defined(__KERNEL_CUDA__)
-  return __brev(x);
-#elif defined(__KERNEL_METAL__)
-  return reverse_bits(x);
 #elif __has_builtin(__builtin_bitreverse32)
   return __builtin_bitreverse32(x);
 #else
diff --git a/intern/cycles/util/math_fast.h b/intern/cycles/util/math_fast.h
index 2221e7a9835..142a664a1d2 100644
--- a/intern/cycles/util/math_fast.h
+++ b/intern/cycles/util/math_fast.h
@@ -420,7 +420,7 @@ ccl_device_inline float fast_expf(float x)
   return fast_exp2f(x / M_LN2_F);
 }
 
-#if defined(__KERNEL_CPU__) && !defined(_MSC_VER)
+#if !defined(__KERNEL_GPU__) && !defined(_MSC_VER)
 /* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see
  * T78047 and T78869 for details. Just disable for now, it only makes
  * a small difference in denoising performance. */
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 365c322dd7e..c408eadf195 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -53,26 +53,25 @@ ccl_device_inline float3 ceil(const float3 &a);
 ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
 #endif /* !defined(__KERNEL_METAL__) */
 
-ccl_device_inline float min3(float3 a);
-ccl_device_inline float max3(float3 a);
+ccl_device_inline float reduce_min(float3 a);
+ccl_device_inline float reduce_max(float3 a);
 ccl_device_inline float len(const float3 a);
 ccl_device_inline float len_squared(const float3 a);
 
 ccl_device_inline float3 project(const float3 v, const float3 v_proj);
 
-ccl_device_inline float3 saturate3(float3 a);
 ccl_device_inline float3 safe_normalize(const float3 a);
 ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
 ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b);
-ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b);
+ccl_device_inline float3 safe_divide(const float3 a, const float3 b);
+ccl_device_inline float3 safe_divide(const float3 a, const float b);
 ccl_device_inline float3 interp(float3 a, float3 b, float t);
-ccl_device_inline float3 sqr3(float3 a);
+ccl_device_inline float3 sqr(float3 a);
 
 ccl_device_inline bool is_zero(const float3 a);
 ccl_device_inline float reduce_add(const float3 a);
 ccl_device_inline float average(const float3 a);
-ccl_device_inline bool isequal_float3(const float3 a, const float3 b);
+ccl_device_inline bool isequal(const float3 a, const float3 b);
 
 /*******************************************************************************
  * Definition.
@@ -148,8 +147,11 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)
 
 ccl_device_inline float3 operator/(const float3 &a, const float f)
 {
-  float invf = 1.0f / f;
-  return a * invf;
+#  if defined(__KERNEL_SSE__)
+  return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
+#  else
+  return make_float3(a.x / f, a.y / f, a.z / f);
+#  endif
 }
 
 ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
@@ -285,8 +287,12 @@ ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
 
 ccl_device_inline float3 cross(const float3 &a, const float3 &b)
 {
-  float3 r = make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
-  return r;
+#  ifdef __KERNEL_SSE__
+  return float3(shuffle<1, 2, 0, 3>(
+      msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+#  else
+  return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#  endif
 }
 
 ccl_device_inline float3 normalize(const float3 &a)
@@ -377,14 +383,30 @@ ccl_device_inline float3 rcp(const float3 &a)
   return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
 #  endif
 }
+
+ccl_device_inline float3 saturate(float3 a)
+{
+  return make_float3(saturatef(a.x), saturatef(a.y), saturatef(a.z));
+}
+
+ccl_device_inline float3 exp(float3 v)
+{
+  return make_float3(expf(v.x), expf(v.y), expf(v.z));
+}
+
+ccl_device_inline float3 log(float3 v)
+{
+  return make_float3(logf(v.x), logf(v.y), logf(v.z));
+}
+
 #endif /* !__KERNEL_METAL__ */
 
-ccl_device_inline float min3(float3 a)
+ccl_device_inline float reduce_min(float3 a)
 {
   return min(min(a.x, a.y), a.z);
 }
 
-ccl_device_inline float max3(float3 a)
+ccl_device_inline float reduce_max(float3 a)
 {
   return max(max(a.x, a.y), a.z);
 }
@@ -433,11 +455,6 @@ ccl_device_inline float3 project(const float3 v, const float3 v_proj)
   return (len_squared != 0.0f) ? (dot(v, v_proj) / len_squared) * v_proj : zero_float3();
 }
 
-ccl_device_inline float3 saturate3(float3 a)
-{
-  return make_float3(saturatef(a.x), saturatef(a.y), saturatef(a.z));
-}
-
 ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t)
 {
   *t = len(a);
@@ -457,14 +474,14 @@ ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t
   return (*t != 0.0f) ? a / (*t) : a;
 }
 
-ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b)
+ccl_device_inline float3 safe_divide(const float3 a, const float3 b)
 {
   return make_float3((b.x != 0.0f) ? a.x / b.x : 0.0f,
                      (b.y != 0.0f) ? a.y / b.y : 0.0f,
                      (b.z != 0.0f) ? a.z / b.z : 0.0f);
 }
 
-ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b)
+ccl_device_inline float3 safe_divide(const float3 a, const float b)
 {
   return (b != 0.0f) ? a / b : zero_float3();
 }
@@ -474,7 +491,7 @@ ccl_device_inline float3 interp(float3 a, float3 b, float t)
   return a + t * (b - a);
 }
 
-ccl_device_inline float3 sqr3(float3 a)
+ccl_device_inline float3 sqr(float3 a)
 {
   return a * a;
 }
@@ -504,7 +521,7 @@ ccl_device_inline float average(const float3 a)
   return reduce_add(a) * (1.0f / 3.0f);
 }
 
-ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
+ccl_device_inline bool isequal(const float3 a, const float3 b)
 {
 #if defined(__KERNEL_METAL__)
   return all(a == b);
@@ -513,21 +530,11 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
 #endif
 }
 
-ccl_device_inline float3 pow3(float3 v, float e)
+ccl_device_inline float3 pow(float3 v, float e)
 {
   return make_float3(powf(v.x, e), powf(v.y, e), powf(v.z, e));
 }
 
-ccl_device_inline float3 exp3(float3 v)
-{
-  return make_float3(expf(v.x), expf(v.y), expf(v.z));
-}
-
-ccl_device_inline float3 log3(float3 v)
-{
-  return make_float3(logf(v.x), logf(v.y), logf(v.z));
-}
-
 ccl_device_inline int3 quick_floor_to_int3(const float3 a)
 {
 #ifdef __KERNEL_SSE__
@@ -540,12 +547,12 @@ ccl_device_inline int3 quick_floor_to_int3(const float3 a)
 #endif
 }
 
-ccl_device_inline bool isfinite3_safe(float3 v)
+ccl_device_inline bool isfinite_safe(float3 v)
 {
   return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z);
 }
 
-ccl_device_inline float3 ensure_finite3(float3 v)
+ccl_device_inline float3 ensure_finite(float3 v)
 {
   if (!isfinite_safe(v.x))
     v.x = 0.0f;
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index ae9dfe75a9c..c2721873037 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -55,7 +55,8 @@ ccl_device_inline float4 floor(const float4 &a);
 ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
 #endif /* !__KERNEL_METAL__*/
 
-ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
+ccl_device_inline float4 safe_divide(const float4 a, const float b);
 
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
@@ -74,11 +75,14 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
 #  endif
 #endif /* __KERNEL_SSE__ */
 
+ccl_device_inline float reduce_min(const float4 a);
+ccl_device_inline float reduce_max(const float4 a);
+ccl_device_inline float reduce_add(const float4 a);
+
+ccl_device_inline bool isequal(const float4 a, const float4 b);
+
 #ifndef __KERNEL_GPU__
 ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-ccl_device_inline float4 reduce_min(const float4 &a);
-ccl_device_inline float4 reduce_max(const float4 &a);
-ccl_device_inline float4 reduce_add(const float4 &a);
 #endif /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -303,27 +307,9 @@ ccl_device_inline bool is_zero(const float4 &a)
 #  endif
 }
 
-ccl_device_inline float4 reduce_add(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vdupq_n_f32(vaddvq_f32(a)));
-#    elif defined(__KERNEL_SSE3__)
-  float4 h(_mm_hadd_ps(a.m128, a.m128));
-  return float4(_mm_hadd_ps(h.m128, h.m128));
-#    else
-  float4 h(shuffle<1, 0, 3, 2>(a) + a);
-  return shuffle<2, 3, 0, 1>(h) + h;
-#    endif
-#  else
-  float sum = (a.x + a.y) + (a.z + a.w);
-  return make_float4(sum, sum, sum, sum);
-#  endif
-}
-
 ccl_device_inline float average(const float4 &a)
 {
-  return reduce_add(a).x * 0.25f;
+  return reduce_add(a) * 0.25f;
 }
 
 ccl_device_inline float len(const float4 &a)
@@ -392,8 +378,77 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
   return a + t * (b - a);
 }
 
+ccl_device_inline float4 saturate(const float4 &a)
+{
+  return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+}
+
+ccl_device_inline float4 exp(float4 v)
+{
+  return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
+}
+
+ccl_device_inline float4 log(float4 v)
+{
+  return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
+}
+
 #endif /* !__KERNEL_METAL__*/
 
+ccl_device_inline float reduce_add(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_NEON__)
+  return vaddvq_f32(a);
+#  elif defined(__KERNEL_SSE3__)
+  float4 h(_mm_hadd_ps(a.m128, a.m128));
+  return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
+#  else
+  float4 h(shuffle<1, 0, 3, 2>(a) + a);
+  return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
+#  endif
+#else
+  return a.x + a.y + a.z + a.w;
+#endif
+}
+
+ccl_device_inline float reduce_min(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_NEON__)
+  return vminvq_f32(a);
+#  else
+  float4 h = min(shuffle<1, 0, 3, 2>(a), a);
+  return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h));
+#  endif
+#else
+  return min(min(a.x, a.y), min(a.z, a.w));
+#endif
+}
+
+ccl_device_inline float reduce_max(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_NEON__)
+  return vmaxvq_f32(a);
+#  else
+  float4 h = max(shuffle<1, 0, 3, 2>(a), a);
+  return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h));
+#  endif
+#else
+  return max(max(a.x, a.y), max(a.z, a.w));
+#endif
+}
+
+ccl_device_inline bool isequal(const float4 a, const float4 b)
+{
+#if defined(__KERNEL_METAL__)
+  return all(a == b);
+#else
+  return a == b;
+#endif
+}
+
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &b)
@@ -461,34 +516,6 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
   return select(mask, a, zero_float4());
 }
 
-ccl_device_inline float4 reduce_min(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vdupq_n_f32(vminvq_f32(a)));
-#    else
-  float4 h = min(shuffle<1, 0, 3, 2>(a), a);
-  return min(shuffle<2, 3, 0, 1>(h), h);
-#    endif
-#  else
-  return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
-#  endif
-}
-
-ccl_device_inline float4 reduce_max(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vdupq_n_f32(vmaxvq_f32(a)));
-#    else
-  float4 h = max(shuffle<1, 0, 3, 2>(a), a);
-  return max(shuffle<2, 3, 0, 1>(h), h);
-#    endif
-#  else
-  return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
-#  endif
-}
-
 ccl_device_inline float4 load_float4(ccl_private const float *v)
 {
 #  ifdef __KERNEL_SSE__
@@ -500,17 +527,25 @@ ccl_device_inline float4 load_float4(ccl_private const float *v)
 
 #endif /* !__KERNEL_GPU__ */
 
-ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b)
+ccl_device_inline float4 safe_divide(const float4 a, const float b)
 {
   return (b != 0.0f) ? a / b : zero_float4();
 }
 
-ccl_device_inline bool isfinite4_safe(float4 v)
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b)
+{
+  return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f,
+                     (b.y != 0.0f) ? a.y / b.y : 0.0f,
+                     (b.z != 0.0f) ? a.z / b.z : 0.0f,
+                     (b.w != 0.0f) ? a.w / b.w : 0.0f);
+}
+
+ccl_device_inline bool isfinite_safe(float4 v)
 {
   return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w);
 }
 
-ccl_device_inline float4 ensure_finite4(float4 v)
+ccl_device_inline float4 ensure_finite(float4 v)
 {
   if (!isfinite_safe(v.x))
     v.x = 0.0f;
@@ -523,6 +558,11 @@ ccl_device_inline float4 ensure_finite4(float4 v)
   return v;
 }
 
+ccl_device_inline float4 pow(float4 v, float e)
+{
+  return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.z, e));
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_MATH_FLOAT4_H__ */
diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h
new file mode 100644
index 00000000000..b538cfbe70b
--- /dev/null
+++ b/intern/cycles/util/math_float8.h
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2022 Blender Foundation */
+
+#ifndef __UTIL_MATH_FLOAT8_H__
+#define __UTIL_MATH_FLOAT8_H__
+
+#ifndef __UTIL_MATH_H__
+#  error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+ccl_device_inline float8_t operator+(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator+(const float8_t a, const float f);
+ccl_device_inline float8_t operator+(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator-(const float8_t a);
+ccl_device_inline float8_t operator-(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator-(const float8_t a, const float f);
+ccl_device_inline float8_t operator-(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator*(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator*(const float8_t a, const float f);
+ccl_device_inline float8_t operator*(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator/(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator/(const float8_t a, float f);
+ccl_device_inline float8_t operator/(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator+=(float8_t a, const float8_t b);
+
+ccl_device_inline float8_t operator*=(float8_t a, const float8_t b);
+ccl_device_inline float8_t operator*=(float8_t a, float f);
+
+ccl_device_inline float8_t operator/=(float8_t a, float f);
+
+ccl_device_inline bool operator==(const float8_t a, const float8_t b);
+
+ccl_device_inline float8_t rcp(const float8_t a);
+ccl_device_inline float8_t sqrt(const float8_t a);
+ccl_device_inline float8_t sqr(const float8_t a);
+ccl_device_inline bool is_zero(const float8_t a);
+ccl_device_inline float average(const float8_t a);
+ccl_device_inline float8_t min(const float8_t a, const float8_t b);
+ccl_device_inline float8_t max(const float8_t a, const float8_t b);
+ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx);
+ccl_device_inline float8_t fabs(const float8_t a);
+ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t);
+ccl_device_inline float8_t saturate(const float8_t a);
+
+ccl_device_inline float8_t safe_divide(const float8_t a, const float b);
+ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b);
+
+ccl_device_inline float reduce_min(const float8_t a);
+ccl_device_inline float reduce_max(const float8_t a);
+ccl_device_inline float reduce_add(const float8_t a);
+
+ccl_device_inline bool isequal(const float8_t a, const float8_t b);
+
+/*******************************************************************************
+ * Definition.
+ */
+
+ccl_device_inline float8_t zero_float8_t()
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_setzero_ps());
+#else
+  return make_float8_t(0.0f);
+#endif
+}
+
+ccl_device_inline float8_t one_float8_t()
+{
+  return make_float8_t(1.0f);
+}
+
+ccl_device_inline float8_t operator+(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_add_ps(a.m256, b.m256));
+#else
+  return make_float8_t(
+      a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator+(const float8_t a, const float f)
+{
+  return a + make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator+(const float f, const float8_t a)
+{
+  return make_float8_t(f) + a;
+}
+
+ccl_device_inline float8_t operator-(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+  return float8_t(_mm256_xor_ps(a.m256, mask));
+#else
+  return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
+#endif
+}
+
+ccl_device_inline float8_t operator-(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_sub_ps(a.m256, b.m256));
+#else
+  return make_float8_t(
+      a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator-(const float8_t a, const float f)
+{
+  return a - make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator-(const float f, const float8_t a)
+{
+  return make_float8_t(f) - a;
+}
+
+ccl_device_inline float8_t operator*(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_mul_ps(a.m256, b.m256));
+#else
+  return make_float8_t(
+      a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator*(const float8_t a, const float f)
+{
+  return a * make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator*(const float f, const float8_t a)
+{
+  return make_float8_t(f) * a;
+}
+
+ccl_device_inline float8_t operator/(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_div_ps(a.m256, b.m256));
+#else
+  return make_float8_t(
+      a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator/(const float8_t a, const float f)
+{
+  return a / make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator/(const float f, const float8_t a)
+{
+  return make_float8_t(f) / a;
+}
+
+ccl_device_inline float8_t operator+=(float8_t a, const float8_t b)
+{
+  return a = a + b;
+}
+
+ccl_device_inline float8_t operator-=(float8_t a, const float8_t b)
+{
+  return a = a - b;
+}
+
+ccl_device_inline float8_t operator*=(float8_t a, const float8_t b)
+{
+  return a = a * b;
+}
+
+ccl_device_inline float8_t operator*=(float8_t a, float f)
+{
+  return a = a * f;
+}
+
+ccl_device_inline float8_t operator/=(float8_t a, float f)
+{
+  return a = a / f;
+}
+
+ccl_device_inline bool operator==(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return (_mm256_movemask_ps(_mm256_castsi256_ps(
+              _mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
+          0b11111111) == 0b11111111;
+#else
+  return (a.a == b.a && a.b == b.b && a.c == b.c && a.d == b.d && a.e == b.e && a.f == b.f &&
+          a.g == b.g && a.h == b.h);
+#endif
+}
+
+ccl_device_inline float8_t rcp(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_rcp_ps(a.m256));
+#else
+  return make_float8_t(1.0f / a.a,
+                       1.0f / a.b,
+                       1.0f / a.c,
+                       1.0f / a.d,
+                       1.0f / a.e,
+                       1.0f / a.f,
+                       1.0f / a.g,
+                       1.0f / a.h);
+#endif
+}
+
+ccl_device_inline float8_t sqrt(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_sqrt_ps(a.m256));
+#else
+  return make_float8_t(sqrtf(a.a),
+                       sqrtf(a.b),
+                       sqrtf(a.c),
+                       sqrtf(a.d),
+                       sqrtf(a.e),
+                       sqrtf(a.f),
+                       sqrtf(a.g),
+                       sqrtf(a.h));
+#endif
+}
+
+ccl_device_inline float8_t sqr(const float8_t a)
+{
+  return a * a;
+}
+
+ccl_device_inline bool is_zero(const float8_t a)
+{
+  return a == make_float8_t(0.0f);
+}
+
+ccl_device_inline float average(const float8_t a)
+{
+  return reduce_add(a) / 8.0f;
+}
+
+ccl_device_inline float8_t min(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_min_ps(a.m256, b.m256));
+#else
+  return make_float8_t(min(a.a, b.a),
+                       min(a.b, b.b),
+                       min(a.c, b.c),
+                       min(a.d, b.d),
+                       min(a.e, b.e),
+                       min(a.f, b.f),
+                       min(a.g, b.g),
+                       min(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8_t max(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_max_ps(a.m256, b.m256));
+#else
+  return make_float8_t(max(a.a, b.a),
+                       max(a.b, b.b),
+                       max(a.c, b.c),
+                       max(a.d, b.d),
+                       max(a.e, b.e),
+                       max(a.f, b.f),
+                       max(a.g, b.g),
+                       max(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx)
+{
+  return min(max(a, mn), mx);
+}
+
+ccl_device_inline float8_t fabs(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#else
+  return make_float8_t(fabsf(a.a),
+                       fabsf(a.b),
+                       fabsf(a.c),
+                       fabsf(a.d),
+                       fabsf(a.e),
+                       fabsf(a.f),
+                       fabsf(a.g),
+                       fabsf(a.h));
+#endif
+}
+
+ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t)
+{
+  return a + t * (b - a);
+}
+
+ccl_device_inline float8_t saturate(const float8_t a)
+{
+  return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f));
+}
+
+ccl_device_inline float8_t exp(float8_t v)
+{
+  return make_float8_t(
+      expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
+}
+
+ccl_device_inline float8_t log(float8_t v)
+{
+  return make_float8_t(
+      logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
+}
+
+ccl_device_inline float dot(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
+  return t[0] + t[4];
+#else
+  return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
+         (a.g * b.g) + (a.h * b.h);
+#endif
+}
+
+ccl_device_inline float8_t pow(float8_t v, float e)
+{
+  return make_float8_t(powf(v.a, e),
+                       powf(v.b, e),
+                       powf(v.c, e),
+                       powf(v.d, e),
+                       powf(v.e, e),
+                       powf(v.f, e),
+                       powf(v.g, e),
+                       powf(v.h, e));
+}
+
+ccl_device_inline float reduce_min(const float8_t a)
+{
+  return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_max(const float8_t a)
+{
+  return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_add(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  float8_t b(_mm256_hadd_ps(a.m256, a.m256));
+  float8_t h(_mm256_hadd_ps(b.m256, b.m256));
+  return h[0] + h[4];
+#else
+  return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
+#endif
+}
+
+ccl_device_inline bool isequal(const float8_t a, const float8_t b)
+{
+  return a == b;
+}
+
+ccl_device_inline float8_t safe_divide(const float8_t a, const float b)
+{
+  return (b != 0.0f) ? a / b : make_float8_t(0.0f);
+}
+
+ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b)
+{
+  return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f,
+                       (b.b != 0.0f) ? a.b / b.b : 0.0f,
+                       (b.c != 0.0f) ? a.c / b.c : 0.0f,
+                       (b.d != 0.0f) ? a.d / b.d : 0.0f,
+                       (b.e != 0.0f) ? a.e / b.e : 0.0f,
+                       (b.f != 0.0f) ? a.f / b.f : 0.0f,
+                       (b.g != 0.0f) ? a.g / b.g : 0.0f,
+                       (b.h != 0.0f) ? a.h / b.h : 0.0f);
+}
+
+ccl_device_inline float8_t ensure_finite(float8_t v)
+{
+  v.a = ensure_finite(v.a);
+  v.b = ensure_finite(v.b);
+  v.c = ensure_finite(v.c);
+  v.d = ensure_finite(v.d);
+  v.e = ensure_finite(v.e);
+  v.f = ensure_finite(v.f);
+  v.g = ensure_finite(v.g);
+  v.h = ensure_finite(v.h);
+
+  return v;
+}
+
+ccl_device_inline bool isfinite_safe(float8_t v)
+{
+  return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
+         isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT8_H__ */
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index b0de0b25a45..aa28682f8c1 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device bool ray_sphere_intersect(float3 ray_P,
                                      float3 ray_D,
-                                     float ray_t,
+                                     float ray_tmin,
+                                     float ray_tmax,
                                      float3 sphere_P,
                                      float sphere_radius,
                                      ccl_private float3 *isect_P,
@@ -33,7 +34,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
       return false;
     }
     const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */
-    if (t < ray_t) {
+    if (t > ray_tmin && t < ray_tmax) {
       *isect_t = t;
       *isect_P = ray_P + ray_D * t;
       return true;
@@ -44,7 +45,8 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
 
 ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
                                            float3 ray_D,
-                                           float ray_t,
+                                           float ray_tmin,
+                                           float ray_tmax,
                                            float3 disk_P,
                                            float disk_radius,
                                            ccl_private float3 *isect_P,
@@ -59,7 +61,7 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
   }
   /* Compute t to intersection point. */
   const float t = -disk_t / div;
-  if (t < 0.0f || t > ray_t) {
+  if (!(t > ray_tmin && t < ray_tmax)) {
     return false;
   }
   /* Test if within radius. */
@@ -74,7 +76,8 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
 
 ccl_device bool ray_disk_intersect(float3 ray_P,
                                    float3 ray_D,
-                                   float ray_t,
+                                   float ray_tmin,
+                                   float ray_tmax,
                                    float3 disk_P,
                                    float3 disk_N,
                                    float disk_radius,
@@ -92,7 +95,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
     }
     float3 P = ray_P + t * ray_D;
     float3 T = P - disk_P;
-    if (dot(T, T) < sqr(disk_radius) /*&& t > 0.f*/ && t <= ray_t) {
+
+    if (dot(T, T) < sqr(disk_radius) && (t > ray_tmin && t < ray_tmax)) {
       *isect_P = ray_P + t * ray_D;
       *isect_t = t;
       return true;
@@ -101,9 +105,55 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
   return false;
 }
 
-ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
-                                                   float3 ray_dir,
-                                                   float ray_t,
+/* Custom rcp, cross and dot implementations that match Embree bit for bit. */
+ccl_device_forceinline float ray_triangle_rcp(const float x)
+{
+#ifdef __KERNEL_NEON__
+  /* Move scalar to vector register and do rcp. */
+  __m128 a;
+  a[0] = x;
+  float32x4_t reciprocal = vrecpeq_f32(a);
+  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+  return reciprocal[0];
+#elif defined(__KERNEL_SSE__)
+  const __m128 a = _mm_set_ss(x);
+  const __m128 r = _mm_rcp_ss(a);
+
+#  ifdef __KERNEL_AVX2_
+  return _mm_cvtss_f32(_mm_mul_ss(r, _mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
+#  else
+  return _mm_cvtss_f32(_mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+#  endif
+#else
+  return 1.0f / x;
+#endif
+}
+
+ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+  return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
+#else
+  return a.x * b.x + a.y * b.y + a.z * b.z;
+#endif
+}
+
+ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+  return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
+                     msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
+                     msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
+#else
+  return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#endif
+}
+
+ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
+                                                   const float3 ray_D,
+                                                   const float ray_tmin,
+                                                   const float ray_tmax,
                                                    const float3 tri_a,
                                                    const float3 tri_b,
                                                    const float3 tri_c,
@@ -111,14 +161,13 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
                                                    ccl_private float *isect_v,
                                                    ccl_private float *isect_t)
 {
-#define dot3(a, b) dot(a, b)
-  const float3 P = ray_P;
-  const float3 dir = ray_dir;
+  /* This implementation matches the Plücker coordinates triangle intersection
+   * in Embree. */
 
   /* Calculate vertices relative to ray origin. */
-  const float3 v0 = tri_c - P;
-  const float3 v1 = tri_a - P;
-  const float3 v2 = tri_b - P;
+  const float3 v0 = tri_a - ray_P;
+  const float3 v1 = tri_b - ray_P;
+  const float3 v2 = tri_c - ray_P;
 
   /* Calculate triangle edges. */
   const float3 e0 = v2 - v0;
@@ -126,42 +175,73 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
   const float3 e2 = v1 - v2;
 
   /* Perform edge tests. */
-  const float U = dot(cross(v2 + v0, e0), ray_dir);
-  const float V = dot(cross(v0 + v1, e1), ray_dir);
-  const float W = dot(cross(v1 + v2, e2), ray_dir);
+  const float U = ray_triangle_dot(ray_triangle_cross(e0, v2 + v0), ray_D);
+  const float V = ray_triangle_dot(ray_triangle_cross(e1, v0 + v1), ray_D);
+  const float W = ray_triangle_dot(ray_triangle_cross(e2, v1 + v2), ray_D);
 
+  const float UVW = U + V + W;
+  const float eps = FLT_EPSILON * fabsf(UVW);
   const float minUVW = min(U, min(V, W));
   const float maxUVW = max(U, max(V, W));
 
-  if (minUVW < 0.0f && maxUVW > 0.0f) {
+  if (!(minUVW >= -eps || maxUVW <= eps)) {
     return false;
   }
 
   /* Calculate geometry normal and denominator. */
-  const float3 Ng1 = cross(e1, e0);
-  // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
+  const float3 Ng1 = ray_triangle_cross(e1, e0);
   const float3 Ng = Ng1 + Ng1;
-  const float den = dot3(Ng, dir);
+  const float den = dot(Ng, ray_D);
   /* Avoid division by 0. */
   if (UNLIKELY(den == 0.0f)) {
     return false;
   }
 
   /* Perform depth test. */
-  const float T = dot3(v0, Ng);
-  const int sign_den = (__float_as_int(den) & 0x80000000);
-  const float sign_T = xor_signmask(T, sign_den);
-  if ((sign_T < 0.0f) || (sign_T > ray_t * xor_signmask(den, sign_den))) {
+  const float T = dot(v0, Ng);
+  const float t = T / den;
+  if (!(t >= ray_tmin && t <= ray_tmax)) {
     return false;
   }
 
-  const float inv_den = 1.0f / den;
-  *isect_u = U * inv_den;
-  *isect_v = V * inv_den;
-  *isect_t = T * inv_den;
+  const float rcp_uvw = (fabsf(UVW) < 1e-18f) ? 0.0f : ray_triangle_rcp(UVW);
+  *isect_u = min(U * rcp_uvw, 1.0f);
+  *isect_v = min(V * rcp_uvw, 1.0f);
+  *isect_t = t;
   return true;
+}
+
+ccl_device_forceinline bool ray_triangle_intersect_self(const float3 ray_P,
+                                                        const float3 ray_D,
+                                                        const float3 tri_a,
+                                                        const float3 tri_b,
+                                                        const float3 tri_c)
+{
+  /* Matches logic in ray_triangle_intersect, self intersection test to validate
+   * if a ray is going to hit self or might incorrectly hit a neighboring triangle. */
 
-#undef dot3
+  /* Calculate vertices relative to ray origin. */
+  const float3 v0 = tri_a - ray_P;
+  const float3 v1 = tri_b - ray_P;
+  const float3 v2 = tri_c - ray_P;
+
+  /* Calculate triangle edges. */
+  const float3 e0 = v2 - v0;
+  const float3 e1 = v0 - v1;
+  const float3 e2 = v1 - v2;
+
+  /* Perform edge tests. */
+  const float U = ray_triangle_dot(ray_triangle_cross(v2 + v0, e0), ray_D);
+  const float V = ray_triangle_dot(ray_triangle_cross(v0 + v1, e1), ray_D);
+  const float W = ray_triangle_dot(ray_triangle_cross(v1 + v2, e2), ray_D);
+
+  const float eps = FLT_EPSILON * fabsf(U + V + W);
+  const float minUVW = min(U, min(V, W));
+  const float maxUVW = max(U, max(V, W));
+
+  /* Note the extended epsilon compared to ray_triangle_intersect, to account
+   * for intersections with neighboring triangles that have an epsilon. */
+  return (minUVW >= eps || maxUVW <= -eps);
 }
 
 /* Tests for an intersection between a ray and a quad defined by
@@ -171,8 +251,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
  */
 ccl_device bool ray_quad_intersect(float3 ray_P,
                                    float3 ray_D,
-                                   float ray_mint,
-                                   float ray_maxt,
+                                   float ray_tmin,
+                                   float ray_tmax,
                                    float3 quad_P,
                                    float3 quad_u,
                                    float3 quad_v,
@@ -185,7 +265,7 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
 {
   /* Perform intersection test. */
   float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
-  if (t < ray_mint || t > ray_maxt) {
+  if (!(t > ray_tmin && t < ray_tmax)) {
     return false;
   }
   const float3 hit = ray_P + t * ray_D;
@@ -207,10 +287,13 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
     *isect_P = hit;
   if (isect_t != NULL)
     *isect_t = t;
+
+  /* NOTE: Return barycentric coordinates in the same notation as Embree and OptiX. */
   if (isect_u != NULL)
-    *isect_u = u + 0.5f;
+    *isect_u = v + 0.5f;
   if (isect_v != NULL)
-    *isect_v = v + 0.5f;
+    *isect_v = -u - v;
+
   return true;
 }
 
diff --git a/intern/cycles/util/opengl.h b/intern/cycles/util/opengl.h
index 090deb861c4..fefee4ec022 100644
--- a/intern/cycles/util/opengl.h
+++ b/intern/cycles/util/opengl.h
@@ -7,6 +7,6 @@
 /* OpenGL header includes, used everywhere we use OpenGL, to deal with
  * platform differences in one central place. */
 
-#include <GL/glew.h>
+#include <epoxy/gl.h>
 
 #endif /* __UTIL_OPENGL_H__ */
diff --git a/intern/cycles/util/progress.h b/intern/cycles/util/progress.h
index 37eafd57491..586979d2021 100644
--- a/intern/cycles/util/progress.h
+++ b/intern/cycles/util/progress.h
@@ -28,6 +28,7 @@ class Progress {
     denoised_tiles = 0;
     start_time = time_dt();
     render_start_time = time_dt();
+    time_limit = 0.0;
     end_time = 0.0;
     status = "Initializing";
     substatus = "";
@@ -68,6 +69,7 @@ class Progress {
     denoised_tiles = 0;
     start_time = time_dt();
     render_start_time = time_dt();
+    time_limit = 0.0;
     end_time = 0.0;
     status = "Initializing";
     substatus = "";
@@ -145,6 +147,13 @@ class Progress {
     render_start_time = time_dt();
   }
 
+  void set_time_limit(double time_limit_)
+  {
+    thread_scoped_lock lock(progress_mutex);
+
+    time_limit = time_limit_;
+  }
+
   void add_skip_time(const scoped_timer &start_timer, bool only_render)
   {
     double skip_time = time_dt() - start_timer.get_start();
@@ -191,8 +200,13 @@ class Progress {
   {
     thread_scoped_lock lock(progress_mutex);
 
-    if (total_pixel_samples > 0) {
-      return ((double)pixel_samples) / (double)total_pixel_samples;
+    if (pixel_samples > 0) {
+      double progress_percent = (double)pixel_samples / (double)total_pixel_samples;
+      if (time_limit != 0.0) {
+        double time_since_render_start = time_dt() - render_start_time;
+        progress_percent = max(progress_percent, time_since_render_start / time_limit);
+      }
+      return min(1.0, progress_percent);
     }
     return 0.0;
   }
@@ -335,7 +349,7 @@ class Progress {
    * in which case the current_tile_sample is displayed. */
   int rendered_tiles, denoised_tiles;
 
-  double start_time, render_start_time;
+  double start_time, render_start_time, time_limit;
   /* End time written when render is done, so it doesn't keep increasing on redraws. */
   double end_time;
 
diff --git a/intern/cycles/util/string.cpp b/intern/cycles/util/string.cpp
index 66ff866ee10..0c318cea44a 100644
--- a/intern/cycles/util/string.cpp
+++ b/intern/cycles/util/string.cpp
@@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other)
   }
 }
 
+void string_replace_same_length(string &haystack, const string &needle, const string &other)
+{
+  assert(needle.size() == other.size());
+  size_t pos = 0;
+  while (pos != string::npos) {
+    pos = haystack.find(needle, pos);
+    if (pos != string::npos) {
+      memcpy(haystack.data() + pos, other.data(), other.size());
+      pos += other.size();
+    }
+  }
+}
+
 string string_remove_trademark(const string &s)
 {
   string result = s;
@@ -164,6 +177,11 @@ string to_string(const char *str)
   return string(str);
 }
 
+string to_string(const float4 &v)
+{
+  return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w);
+}
+
 string string_to_lower(const string &s)
 {
   string r = s;
diff --git a/intern/cycles/util/string.h b/intern/cycles/util/string.h
index a74feee1750..ecbe9e106c6 100644
--- a/intern/cycles/util/string.h
+++ b/intern/cycles/util/string.h
@@ -38,12 +38,14 @@ void string_split(vector<string> &tokens,
                   const string &separators = "\t ",
                   bool skip_empty_tokens = true);
 void string_replace(string &haystack, const string &needle, const string &other);
+void string_replace_same_length(string &haystack, const string &needle, const string &other);
 bool string_startswith(string_view s, string_view start);
 bool string_endswith(string_view s, string_view end);
 string string_strip(const string &s);
 string string_remove_trademark(const string &s);
 string string_from_bool(const bool var);
 string to_string(const char *str);
+string to_string(const float4 &v);
 string string_to_lower(const string &s);
 
 /* Wide char strings are only used on Windows to deal with non-ASCII
diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp
index a13ad95b9fe..3183ac06f26 100644
--- a/intern/cycles/util/system.cpp
+++ b/intern/cycles/util/system.cpp
@@ -128,53 +128,42 @@ int system_cpu_bits()
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
 
 struct CPUCapabilities {
-  bool x64;
-  bool mmx;
-  bool sse;
   bool sse2;
   bool sse3;
-  bool ssse3;
   bool sse41;
-  bool sse42;
-  bool sse4a;
   bool avx;
-  bool f16c;
   bool avx2;
-  bool xop;
-  bool fma3;
-  bool fma4;
-  bool bmi1;
-  bool bmi2;
 };
 
 static CPUCapabilities &system_cpu_capabilities()
 {
-  static CPUCapabilities caps;
+  static CPUCapabilities caps = {};
   static bool caps_init = false;
 
   if (!caps_init) {
     int result[4], num;
 
-    memset(&caps, 0, sizeof(caps));
-
     __cpuid(result, 0);
     num = result[0];
 
     if (num >= 1) {
       __cpuid(result, 0x00000001);
-      caps.mmx = (result[3] & ((int)1 << 23)) != 0;
-      caps.sse = (result[3] & ((int)1 << 25)) != 0;
-      caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
-      caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
+      const bool sse = (result[3] & ((int)1 << 25)) != 0;
+      const bool sse2 = (result[3] & ((int)1 << 26)) != 0;
+      const bool sse3 = (result[2] & ((int)1 << 0)) != 0;
+
+      const bool ssse3 = (result[2] & ((int)1 << 9)) != 0;
+      const bool sse41 = (result[2] & ((int)1 << 19)) != 0;
+      /* const bool sse42 = (result[2] & ((int)1 << 20)) != 0; */
 
-      caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
-      caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
-      caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
+      const bool fma3 = (result[2] & ((int)1 << 12)) != 0;
+      const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
+      const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
 
-      caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
-      caps.avx = false;
-      bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
-      bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
+      /* Simplify to combined capabilities for which we specialize kernels. */
+      caps.sse2 = sse && sse2;
+      caps.sse3 = sse && sse2 && sse3 && ssse3;
+      caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41;
 
       if (os_uses_xsave_xrestore && cpu_avx_support) {
         // Check if the OS will save the YMM registers
@@ -189,15 +178,18 @@ static CPUCapabilities &system_cpu_capabilities()
 #  else
         xcr_feature_mask = 0;
 #  endif
-        caps.avx = (xcr_feature_mask & 0x6) == 0x6;
-      }
+        const bool avx = (xcr_feature_mask & 0x6) == 0x6;
+        const bool f16c = (result[2] & ((int)1 << 29)) != 0;
 
-      caps.f16c = (result[2] & ((int)1 << 29)) != 0;
+        __cpuid(result, 0x00000007);
+        bool bmi1 = (result[1] & ((int)1 << 3)) != 0;
+        bool bmi2 = (result[1] & ((int)1 << 8)) != 0;
+        bool avx2 = (result[1] & ((int)1 << 5)) != 0;
 
-      __cpuid(result, 0x00000007);
-      caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
-      caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
-      caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
+        caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx;
+        caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 &&
+                    bmi2;
+      }
     }
 
     caps_init = true;
@@ -209,32 +201,31 @@ static CPUCapabilities &system_cpu_capabilities()
 bool system_cpu_support_sse2()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2;
+  return caps.sse2;
 }
 
 bool system_cpu_support_sse3()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3;
+  return caps.sse3;
 }
 
 bool system_cpu_support_sse41()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41;
+  return caps.sse41;
 }
 
 bool system_cpu_support_avx()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
+  return caps.avx;
 }
 
 bool system_cpu_support_avx2()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c &&
-         caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+  return caps.avx2;
 }
 #else
 
@@ -264,26 +255,6 @@ bool system_cpu_support_avx2()
 
 #endif
 
-bool system_call_self(const vector<string> &args)
-{
-  /* Escape program and arguments in case they contain spaces. */
-  string cmd = "\"" + Sysutil::this_program_path() + "\"";
-
-  for (int i = 0; i < args.size(); i++) {
-    cmd += " \"" + args[i] + "\"";
-  }
-
-#ifdef _WIN32
-  /* Use cmd /S to avoid issues with spaces in arguments. */
-  cmd = "cmd /S /C \"" + cmd + " > nul \"";
-#else
-  /* Quiet output. */
-  cmd += " > /dev/null";
-#endif
-
-  return (system(cmd.c_str()) == 0);
-}
-
 size_t system_physical_ram()
 {
 #ifdef _WIN32
diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h
index 23dcfdd303a..2152b89ed24 100644
--- a/intern/cycles/util/system.h
+++ b/intern/cycles/util/system.h
@@ -4,15 +4,17 @@
 #ifndef __UTIL_SYSTEM_H__
 #define __UTIL_SYSTEM_H__
 
-#include "util/string.h"
-#include "util/vector.h"
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <string>
 
 CCL_NAMESPACE_BEGIN
 
 /* Get width in characters of the current console output. */
 int system_console_width();
 
-string system_cpu_brand_string();
+std::string system_cpu_brand_string();
 int system_cpu_bits();
 bool system_cpu_support_sse2();
 bool system_cpu_support_sse3();
@@ -22,9 +24,6 @@ bool system_cpu_support_avx2();
 
 size_t system_physical_ram();
 
-/* Start a new process of the current application with the given arguments. */
-bool system_call_self(const vector<string> &args);
-
 /* Get identifier of the currently running process. */
 uint64_t system_self_process_id();
 
diff --git a/intern/cycles/util/task.cpp b/intern/cycles/util/task.cpp
index 2edc82eb7c3..12f661f752d 100644
--- a/intern/cycles/util/task.cpp
+++ b/intern/cycles/util/task.cpp
@@ -70,7 +70,7 @@ void TaskScheduler::init(int num_threads)
   }
   if (num_threads > 0) {
     /* Automatic number of threads. */
-    VLOG(1) << "Overriding number of TBB threads to " << num_threads << ".";
+    VLOG_INFO << "Overriding number of TBB threads to " << num_threads << ".";
     global_control = new tbb::global_control(tbb::global_control::max_allowed_parallelism,
                                              num_threads);
     active_num_threads = num_threads;
diff --git a/intern/cycles/util/time.cpp b/intern/cycles/util/time.cpp
index d27a0415106..0295a071f39 100644
--- a/intern/cycles/util/time.cpp
+++ b/intern/cycles/util/time.cpp
@@ -102,7 +102,7 @@ double time_human_readable_to_seconds(const string &time_string)
   }
   else if (fraction_tokens.size() == 2) {
     result = atof(fraction_tokens[1].c_str());
-    result *= pow(0.1, fraction_tokens[1].length());
+    result *= ::pow(0.1, fraction_tokens[1].length());
   }
   else {
     /* This is not a valid string, the result can not be reliable. */
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index fa50e1db063..cb985c65dd8 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Transform Inverse */
 
-static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
+static bool projection_matrix4_inverse(float R[][4], float M[][4])
 {
   /* SPDX-License-Identifier: BSD-3-Clause
    * Adapted from code:
@@ -98,16 +98,8 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
   memcpy(R, &tfmR, sizeof(R));
   memcpy(M, &tfm, sizeof(M));
 
-  if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
-    /* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
-     * never be in this situation, but try to invert it anyway with tweak */
-    M[0][0] += 1e-8f;
-    M[1][1] += 1e-8f;
-    M[2][2] += 1e-8f;
-
-    if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
-      return projection_identity();
-    }
+  if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
+    return projection_identity();
   }
 
   memcpy(&tfmR, R, sizeof(R));
@@ -115,16 +107,9 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
   return tfmR;
 }
 
-Transform transform_inverse(const Transform &tfm)
-{
-  ProjectionTransform projection(tfm);
-  return projection_to_transform(projection_inverse(projection));
-}
-
 Transform transform_transposed_inverse(const Transform &tfm)
 {
-  ProjectionTransform projection(tfm);
-  ProjectionTransform iprojection = projection_inverse(projection);
+  ProjectionTransform iprojection(transform_inverse(tfm));
   return projection_to_transform(projection_transpose(iprojection));
 }
 
@@ -229,17 +214,17 @@ static void transform_decompose(DecomposedTransform *decomp, const Transform *tf
   /* extract scale and shear first */
   float3 scale, shear;
   scale.x = len(colx);
-  colx = safe_divide_float3_float(colx, scale.x);
+  colx = safe_divide(colx, scale.x);
   shear.z = dot(colx, coly);
   coly -= shear.z * colx;
   scale.y = len(coly);
-  coly = safe_divide_float3_float(coly, scale.y);
+  coly = safe_divide(coly, scale.y);
   shear.y = dot(colx, colz);
   colz -= shear.y * colx;
   shear.x = dot(coly, colz);
   colz -= shear.x * coly;
   scale.z = len(colz);
-  colz = safe_divide_float3_float(colz, scale.z);
+  colz = safe_divide(colz, scale.z);
 
   transform_set_column(&M, 0, colx);
   transform_set_column(&M, 1, coly);
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 477272f0ba6..24184dc7074 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -11,6 +11,10 @@
 #include "util/math.h"
 #include "util/types.h"
 
+#ifndef __KERNEL_GPU__
+#  include "util/system.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* Affine transformation, stored as 4x3 matrix. */
@@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
   float4 x, y, z, w;
 } DecomposedTransform;
 
+CCL_NAMESPACE_END
+
+#include "util/transform_inverse.h"
+
+CCL_NAMESPACE_BEGIN
+
 /* Functions */
 
 #ifdef __KERNEL_METAL__
@@ -63,10 +73,10 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
 
   _MM_TRANSPOSE4_PS(x, y, z, w);
 
-  ssef tmp = shuffle<0>(aa) * x;
-  tmp = madd(shuffle<1>(aa), y, tmp);
+  ssef tmp = w;
   tmp = madd(shuffle<2>(aa), z, tmp);
-  tmp += w;
+  tmp = madd(shuffle<1>(aa), y, tmp);
+  tmp = madd(shuffle<0>(aa), x, tmp);
 
   return float3(tmp.m128);
 #elif defined(__KERNEL_METAL__)
@@ -93,9 +103,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con
 
   _MM_TRANSPOSE4_PS(x, y, z, w);
 
-  ssef tmp = shuffle<0>(aa) * x;
+  ssef tmp = shuffle<2>(aa) * z;
   tmp = madd(shuffle<1>(aa), y, tmp);
-  tmp = madd(shuffle<2>(aa), z, tmp);
+  tmp = madd(shuffle<0>(aa), x, tmp);
 
   return float3(tmp.m128);
 #elif defined(__KERNEL_METAL__)
@@ -312,7 +322,6 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val
   t->z[column] = value.z;
 }
 
-Transform transform_inverse(const Transform &a);
 Transform transform_transposed_inverse(const Transform &a);
 
 ccl_device_inline bool transform_uniform_scale(const Transform &tfm, float &scale)
@@ -392,39 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
 #endif /* defined(__KERNEL_GPU_RAYTRACING__) */
 }
 
-ccl_device_inline Transform transform_quick_inverse(Transform M)
-{
-  /* possible optimization: can we avoid doing this altogether and construct
-   * the inverse matrix directly from negated translation, transposed rotation,
-   * scale can be inverted but what about shearing? */
-  Transform R;
-  float det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
-              M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
-  if (det == 0.0f) {
-    M.x.x += 1e-8f;
-    M.y.y += 1e-8f;
-    M.z.z += 1e-8f;
-    det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
-          M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
-  }
-  det = (det != 0.0f) ? 1.0f / det : 0.0f;
-
-  float3 Rx = det * make_float3(M.z.z * M.y.y - M.z.y * M.y.z,
-                                M.z.y * M.x.z - M.z.z * M.x.y,
-                                M.y.z * M.x.y - M.y.y * M.x.z);
-  float3 Ry = det * make_float3(M.z.x * M.y.z - M.z.z * M.y.x,
-                                M.z.z * M.x.x - M.z.x * M.x.z,
-                                M.y.x * M.x.z - M.y.z * M.x.x);
-  float3 Rz = det * make_float3(M.z.y * M.y.x - M.z.x * M.y.y,
-                                M.z.x * M.x.y - M.z.y * M.x.x,
-                                M.y.y * M.x.x - M.y.x * M.x.y);
-  float3 T = -make_float3(M.x.w, M.y.w, M.z.w);
+#ifndef __KERNEL_GPU__
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
+#endif
 
-  R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T));
-  R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T));
-  R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T));
+ccl_device_inline Transform transform_inverse(const Transform tfm)
+{
+  /* Optimized transform implementations. */
+#ifndef __KERNEL_GPU__
+  if (system_cpu_support_avx2()) {
+    Transform itfm;
+    transform_inverse_cpu_avx2(tfm, itfm);
+    return itfm;
+  }
+  else if (system_cpu_support_sse41()) {
+    Transform itfm;
+    transform_inverse_cpu_sse41(tfm, itfm);
+    return itfm;
+  }
+#endif
 
-  return R;
+  return transform_inverse_impl(tfm);
 }
 
 ccl_device_inline void transform_compose(ccl_private Transform *tfm,
@@ -493,13 +491,13 @@ ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm,
 
 ccl_device_inline bool transform_isfinite_safe(ccl_private Transform *tfm)
 {
-  return isfinite4_safe(tfm->x) && isfinite4_safe(tfm->y) && isfinite4_safe(tfm->z);
+  return isfinite_safe(tfm->x) && isfinite_safe(tfm->y) && isfinite_safe(tfm->z);
 }
 
 ccl_device_inline bool transform_decomposed_isfinite_safe(ccl_private DecomposedTransform *decomp)
 {
-  return isfinite4_safe(decomp->x) && isfinite4_safe(decomp->y) && isfinite4_safe(decomp->z) &&
-         isfinite4_safe(decomp->w);
+  return isfinite_safe(decomp->x) && isfinite_safe(decomp->y) && isfinite_safe(decomp->z) &&
+         isfinite_safe(decomp->w);
 }
 
 #ifndef __KERNEL_GPU__
diff --git a/intern/cycles/util/transform_avx2.cpp b/intern/cycles/util/transform_avx2.cpp
new file mode 100644
index 00000000000..57c160388e2
--- /dev/null
+++ b/intern/cycles/util/transform_avx2.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
+{
+  itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
new file mode 100644
index 00000000000..07fd06c1467
--- /dev/null
+++ b/intern/cycles/util/transform_inverse.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Custom cross and dot implementations that match Embree bit for bit.
+ * Normally we don't use SSE41/AVX outside the kernel, but for this it's
+ * important to match exactly for ray tracing precision. */
+
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+{
+#ifdef __AVX2__
+  const ssef sse_a = (const __m128 &)a;
+  const ssef sse_b = (const __m128 &)b;
+  const ssef r = shuffle<1, 2, 0, 3>(
+      ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+  return (const float3 &)r;
+#endif
+
+  return cross(a, b);
+}
+
+ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+{
+#ifdef __SSE4_1__
+  return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#endif
+
+  return dot(a, b);
+}
+
+ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
+{
+  /* This implementation matches the one in Embree exactly, to ensure consistent
+   * results with the ray intersection of instances. */
+  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+  /* Compute determinant. */
+  float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
+
+  if (det == 0.0f) {
+    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+     * never be in this situation, but try to invert it anyway with tweak.
+     *
+     * This logic does not match Embree which would just give an invalid
+     * matrix. A better solution would be to remove this and ensure any object
+     * matrix is valid. */
+    x.x += 1e-8f;
+    y.y += 1e-8f;
+    z.z += 1e-8f;
+
+    det = transform_inverse_dot(x, cross(y, z));
+    if (det == 0.0f) {
+      det = FLT_MAX;
+    }
+  }
+
+  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+  const float3 inverse_x = transform_inverse_cross(y, z) / det;
+  const float3 inverse_y = transform_inverse_cross(z, x) / det;
+  const float3 inverse_z = transform_inverse_cross(x, y) / det;
+
+  /* Compute translation and fill transform. */
+  Transform itfm;
+  itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
+  itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
+  itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
+
+  return itfm;
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse41.cpp
new file mode 100644
index 00000000000..8a698807a9c
--- /dev/null
+++ b/intern/cycles/util/transform_sse41.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
+{
+  itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h
index 031c2f7c4c1..1ab6f76f9bc 100644
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -12,6 +12,7 @@
 
 #if !defined(__KERNEL_GPU__)
 #  include <stdint.h>
+#  include <stdio.h>
 #endif
 
 #include "util/defines.h"
@@ -70,6 +71,24 @@ ccl_device_inline bool is_power_of_two(size_t x)
 
 CCL_NAMESPACE_END
 
+/* Device side printf only tested on CUDA, may work on more GPU devices. */
+#if !defined(__KERNEL_GPU__) || defined(__KERNEL_CUDA__)
+#  define __KERNEL_PRINTF__
+#endif
+
+ccl_device_inline void print_float(ccl_private const char *label, const float a)
+{
+#ifdef __KERNEL_PRINTF__
+  printf("%s: %.8f\n", label, (double)a);
+#endif
+}
+
+/* Most GPU APIs matching native vector types, so we only need to implement them for
+ * CPU and oneAPI. */
+#if defined(__KERNEL_GPU__) && !defined(__KERNEL_ONEAPI__)
+#  define __KERNEL_NATIVE_VECTOR_TYPES__
+#endif
+
 /* Vectorized types declaration. */
 #include "util/types_uchar2.h"
 #include "util/types_uchar3.h"
@@ -90,7 +109,7 @@ CCL_NAMESPACE_END
 #include "util/types_float4.h"
 #include "util/types_float8.h"
 
-#include "util/types_vector3.h"
+#include "util/types_spectrum.h"
 
 /* Vectorized types implementation. */
 #include "util/types_uchar2_impl.h"
@@ -110,8 +129,6 @@ CCL_NAMESPACE_END
 #include "util/types_float4_impl.h"
 #include "util/types_float8_impl.h"
 
-#include "util/types_vector3_impl.h"
-
 /* SSE types. */
 #ifndef __KERNEL_GPU__
 #  include "util/sseb.h"
diff --git a/intern/cycles/util/types_float2.h b/intern/cycles/util/types_float2.h
index d8b2efb7b4b..ea510ef832c 100644
--- a/intern/cycles/util/types_float2.h
+++ b/intern/cycles/util/types_float2.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_FLOAT2_H__
-#define __UTIL_TYPES_FLOAT2_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,18 +9,19 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct float2 {
   float x, y;
 
+#  ifndef __KERNEL_GPU__
   __forceinline float operator[](int i) const;
   __forceinline float &operator[](int i);
+#  endif
 };
 
 ccl_device_inline float2 make_float2(float x, float y);
-ccl_device_inline void print_float2(const char *label, const float2 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
-CCL_NAMESPACE_END
+ccl_device_inline void print_float2(ccl_private const char *label, const float2 a);
 
-#endif /* __UTIL_TYPES_FLOAT2_H__ */
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_float2_impl.h b/intern/cycles/util/types_float2_impl.h
index d67ec946b79..7ba7dee2e3a 100644
--- a/intern/cycles/util/types_float2_impl.h
+++ b/intern/cycles/util/types_float2_impl.h
@@ -1,20 +1,16 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_FLOAT2_IMPL_H__
-#define __UTIL_TYPES_FLOAT2_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif
 
-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 __forceinline float float2::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -28,19 +24,20 @@ __forceinline float &float2::operator[](int i)
   util_assert(i < 2);
   return *(&x + i);
 }
+#  endif
 
 ccl_device_inline float2 make_float2(float x, float y)
 {
   float2 a = {x, y};
   return a;
 }
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
-ccl_device_inline void print_float2(const char *label, const float2 &a)
+ccl_device_inline void print_float2(ccl_private const char *label, const float2 a)
 {
+#ifdef __KERNEL_PRINTF__
   printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
+#endif
 }
-#endif /* __KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT2_IMPL_H__ */
diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h
index 060c2ac4152..87c6b1d3654 100644
--- a/intern/cycles/util/types_float3.h
+++ b/intern/cycles/util/types_float3.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_FLOAT3_H__
-#define __UTIL_TYPES_FLOAT3_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,28 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct ccl_try_align(16) float3
 {
-#  ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_GPU__
+  /* Compact structure for GPU. */
+  float x, y, z;
+#  else
+  /* SIMD aligned structure for CPU. */
+#    ifdef __KERNEL_SSE__
   union {
     __m128 m128;
     struct {
       float x, y, z, w;
     };
   };
+#    else
+  float x, y, z, w;
+#    endif
+#  endif
 
+#  ifdef __KERNEL_SSE__
+  /* Convenient constructors and operators for SIMD, otherwise default is enough. */
   __forceinline float3();
   __forceinline float3(const float3 &a);
   __forceinline explicit float3(const __m128 &a);
@@ -29,18 +39,19 @@ struct ccl_try_align(16) float3
   __forceinline operator __m128 &();
 
   __forceinline float3 &operator=(const float3 &a);
-#  else  /* __KERNEL_SSE__ */
-  float x, y, z, w;
-#  endif /* __KERNEL_SSE__ */
+#  endif
 
+#  ifndef __KERNEL_GPU__
   __forceinline float operator[](int i) const;
   __forceinline float &operator[](int i);
+#  endif
 };
 
-ccl_device_inline float3 make_float3(float f);
 ccl_device_inline float3 make_float3(float x, float y, float z);
-ccl_device_inline void print_float3(const char *label, const float3 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline float3 make_float3(float f);
+ccl_device_inline void print_float3(ccl_private const char *label, const float3 a);
 
 /* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
  * CPU SIMD instructions can be used. */
@@ -78,5 +89,3 @@ struct packed_float3 {
 static_assert(sizeof(packed_float3) == 12, "packed_float3 expected to be exactly 12 bytes");
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT3_H__ */
diff --git a/intern/cycles/util/types_float3_impl.h b/intern/cycles/util/types_float3_impl.h
index f5ffc48c1be..da76ab2ab2a 100644
--- a/intern/cycles/util/types_float3_impl.h
+++ b/intern/cycles/util/types_float3_impl.h
@@ -1,20 +1,15 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_FLOAT3_IMPL_H__
-#define __UTIL_TYPES_FLOAT3_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif
 
-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 #  ifdef __KERNEL_SSE__
 __forceinline float3::float3()
 {
@@ -45,6 +40,7 @@ __forceinline float3 &float3::operator=(const float3 &a)
 }
 #  endif /* __KERNEL_SSE__ */
 
+#  ifndef __KERNEL_GPU__
 __forceinline float float3::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -58,33 +54,37 @@ __forceinline float &float3::operator[](int i)
   util_assert(i < 3);
   return *(&x + i);
 }
+#  endif
 
-ccl_device_inline float3 make_float3(float f)
+ccl_device_inline float3 make_float3(float x, float y, float z)
 {
-#  ifdef __KERNEL_SSE__
-  float3 a(_mm_set1_ps(f));
+#  if defined(__KERNEL_GPU__)
+  return {x, y, z};
+#  elif defined(__KERNEL_SSE__)
+  return float3(_mm_set_ps(0.0f, z, y, x));
 #  else
-  float3 a = {f, f, f, f};
+  return {x, y, z, 0.0f};
 #  endif
-  return a;
 }
 
-ccl_device_inline float3 make_float3(float x, float y, float z)
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline float3 make_float3(float f)
 {
-#  ifdef __KERNEL_SSE__
-  float3 a(_mm_set_ps(0.0f, z, y, x));
-#  else
-  float3 a = {x, y, z, 0.0f};
-#  endif
-  return a;
+#if defined(__KERNEL_GPU__)
+  return make_float3(f, f, f);
+#elif defined(__KERNEL_SSE__)
+  return float3(_mm_set1_ps(f));
+#else
+  return {f, f, f, f};
+#endif
 }
 
-ccl_device_inline void print_float3(const char *label, const float3 &a)
+ccl_device_inline void print_float3(ccl_private const char *label, const float3 a)
 {
+#ifdef __KERNEL_PRINTF__
   printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
+#endif
 }
-#endif /* __KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT3_IMPL_H__ */
diff --git a/intern/cycles/util/types_float4.h b/intern/cycles/util/types_float4.h
index 68ba787dac0..a347cfce9a1 100644
--- a/intern/cycles/util/types_float4.h
+++ b/intern/cycles/util/types_float4.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_FLOAT4_H__
-#define __UTIL_TYPES_FLOAT4_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct int4;
 
 struct ccl_try_align(16) float4
@@ -35,16 +34,17 @@ struct ccl_try_align(16) float4
   float x, y, z, w;
 #  endif /* __KERNEL_SSE__ */
 
+#  ifndef __KERNEL_GPU__
   __forceinline float operator[](int i) const;
   __forceinline float &operator[](int i);
+#  endif
 };
 
-ccl_device_inline float4 make_float4(float f);
 ccl_device_inline float4 make_float4(float x, float y, float z, float w);
-ccl_device_inline float4 make_float4(const int4 &i);
-ccl_device_inline void print_float4(const char *label, const float4 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
-CCL_NAMESPACE_END
+ccl_device_inline float4 make_float4(float f);
+ccl_device_inline float4 make_float4(const int4 i);
+ccl_device_inline void print_float4(ccl_private const char *label, const float4 a);
 
-#endif /* __UTIL_TYPES_FLOAT4_H__ */
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_float4_impl.h b/intern/cycles/util/types_float4_impl.h
index de2e7cb7061..420d9316926 100644
--- a/intern/cycles/util/types_float4_impl.h
+++ b/intern/cycles/util/types_float4_impl.h
@@ -1,20 +1,15 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_FLOAT4_IMPL_H__
-#define __UTIL_TYPES_FLOAT4_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif
 
-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 #  ifdef __KERNEL_SSE__
 __forceinline float4::float4()
 {
@@ -41,6 +36,7 @@ __forceinline float4 &float4::operator=(const float4 &a)
 }
 #  endif /* __KERNEL_SSE__ */
 
+#  ifndef __KERNEL_GPU__
 __forceinline float float4::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -54,43 +50,42 @@ __forceinline float &float4::operator[](int i)
   util_assert(i < 4);
   return *(&x + i);
 }
+#  endif
 
-ccl_device_inline float4 make_float4(float f)
+ccl_device_inline float4 make_float4(float x, float y, float z, float w)
 {
 #  ifdef __KERNEL_SSE__
-  float4 a(_mm_set1_ps(f));
+  return float4(_mm_set_ps(w, z, y, x));
 #  else
-  float4 a = {f, f, f, f};
+  return {x, y, z, w};
 #  endif
-  return a;
 }
 
-ccl_device_inline float4 make_float4(float x, float y, float z, float w)
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline float4 make_float4(float f)
 {
-#  ifdef __KERNEL_SSE__
-  float4 a(_mm_set_ps(w, z, y, x));
-#  else
-  float4 a = {x, y, z, w};
-#  endif
-  return a;
+#ifdef __KERNEL_SSE__
+  return float4(_mm_set1_ps(f));
+#else
+  return make_float4(f, f, f, f);
+#endif
 }
 
-ccl_device_inline float4 make_float4(const int4 &i)
+ccl_device_inline float4 make_float4(const int4 i)
 {
-#  ifdef __KERNEL_SSE__
-  float4 a(_mm_cvtepi32_ps(i.m128));
-#  else
-  float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w};
-#  endif
-  return a;
+#ifdef __KERNEL_SSE__
+  return float4(_mm_cvtepi32_ps(i.m128));
+#else
+  return make_float4((float)i.x, (float)i.y, (float)i.z, (float)i.w);
+#endif
 }
 
-ccl_device_inline void print_float4(const char *label, const float4 &a)
+ccl_device_inline void print_float4(ccl_private const char *label, const float4 a)
 {
+#ifdef __KERNEL_PRINTF__
   printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
+#endif
 }
-#endif /* __KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT4_IMPL_H__ */
diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h
index 99f9ec9b867..29fd632f08e 100644
--- a/intern/cycles/util/types_float8.h
+++ b/intern/cycles/util/types_float8.h
@@ -2,8 +2,7 @@
  * Original code Copyright 2017, Intel Corporation
  * Modifications Copyright 2018-2022 Blender Foundation. */
 
-#ifndef __UTIL_TYPES_FLOAT8_H__
-#define __UTIL_TYPES_FLOAT8_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -11,11 +10,16 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+/* float8 is a reserved type in Metal that has not been implemented. For
+ * that reason this is named float8_t and not using native vector types. */
 
-struct ccl_try_align(32) float8
+#ifdef __KERNEL_GPU__
+struct float8_t
+#else
+struct ccl_try_align(32) float8_t
+#endif
 {
-#  ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
   union {
     __m256 m256;
     struct {
@@ -23,28 +27,27 @@ struct ccl_try_align(32) float8
     };
   };
 
-  __forceinline float8();
-  __forceinline float8(const float8 &a);
-  __forceinline explicit float8(const __m256 &a);
+  __forceinline float8_t();
+  __forceinline float8_t(const float8_t &a);
+  __forceinline explicit float8_t(const __m256 &a);
 
   __forceinline operator const __m256 &() const;
   __forceinline operator __m256 &();
 
-  __forceinline float8 &operator=(const float8 &a);
+  __forceinline float8_t &operator=(const float8_t &a);
 
-#  else  /* __KERNEL_AVX2__ */
+#else  /* __KERNEL_AVX2__ */
   float a, b, c, d, e, f, g, h;
-#  endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */
 
+#ifndef __KERNEL_GPU__
   __forceinline float operator[](int i) const;
   __forceinline float &operator[](int i);
+#endif
 };
 
-ccl_device_inline float8 make_float8(float f);
-ccl_device_inline float8
-make_float8(float a, float b, float c, float d, float e, float f, float g, float h);
-#endif /* __KERNEL_GPU__ */
+ccl_device_inline float8_t make_float8_t(float f);
+ccl_device_inline float8_t
+make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h);
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT8_H__ */
diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h
index 19818976b50..e8576cdaf70 100644
--- a/intern/cycles/util/types_float8_impl.h
+++ b/intern/cycles/util/types_float8_impl.h
@@ -2,87 +2,79 @@
  * Original code Copyright 2017, Intel Corporation
  * Modifications Copyright 2018-2022 Blender Foundation. */
 
-#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__
-#define __UTIL_TYPES_FLOAT8_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif
 
-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
-#  ifdef __KERNEL_AVX2__
-__forceinline float8::float8()
+#ifdef __KERNEL_AVX2__
+__forceinline float8_t::float8_t()
 {
 }
 
-__forceinline float8::float8(const float8 &f) : m256(f.m256)
+__forceinline float8_t::float8_t(const float8_t &f) : m256(f.m256)
 {
 }
 
-__forceinline float8::float8(const __m256 &f) : m256(f)
+__forceinline float8_t::float8_t(const __m256 &f) : m256(f)
 {
 }
 
-__forceinline float8::operator const __m256 &() const
+__forceinline float8_t::operator const __m256 &() const
 {
   return m256;
 }
 
-__forceinline float8::operator __m256 &()
+__forceinline float8_t::operator __m256 &()
 {
   return m256;
 }
 
-__forceinline float8 &float8::operator=(const float8 &f)
+__forceinline float8_t &float8_t::operator=(const float8_t &f)
 {
   m256 = f.m256;
   return *this;
 }
-#  endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */
 
-__forceinline float float8::operator[](int i) const
+#ifndef __KERNEL_GPU__
+__forceinline float float8_t::operator[](int i) const
 {
   util_assert(i >= 0);
   util_assert(i < 8);
   return *(&a + i);
 }
 
-__forceinline float &float8::operator[](int i)
+__forceinline float &float8_t::operator[](int i)
 {
   util_assert(i >= 0);
   util_assert(i < 8);
   return *(&a + i);
 }
+#endif
 
-ccl_device_inline float8 make_float8(float f)
+ccl_device_inline float8_t make_float8_t(float f)
 {
-#  ifdef __KERNEL_AVX2__
-  float8 r(_mm256_set1_ps(f));
-#  else
-  float8 r = {f, f, f, f, f, f, f, f};
-#  endif
+#ifdef __KERNEL_AVX2__
+  float8_t r(_mm256_set1_ps(f));
+#else
+  float8_t r = {f, f, f, f, f, f, f, f};
+#endif
   return r;
 }
 
-ccl_device_inline float8
-make_float8(float a, float b, float c, float d, float e, float f, float g, float h)
+ccl_device_inline float8_t
+make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h)
 {
-#  ifdef __KERNEL_AVX2__
-  float8 r(_mm256_set_ps(a, b, c, d, e, f, g, h));
-#  else
-  float8 r = {a, b, c, d, e, f, g, h};
-#  endif
+#ifdef __KERNEL_AVX2__
+  float8_t r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#else
+  float8_t r = {a, b, c, d, e, f, g, h};
+#endif
   return r;
 }
 
-#endif /* __KERNEL_GPU__ */
-
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */
diff --git a/intern/cycles/util/types_int2.h b/intern/cycles/util/types_int2.h
index 4daf387d9cf..604713dffcd 100644
--- a/intern/cycles/util/types_int2.h
+++ b/intern/cycles/util/types_int2.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_INT2_H__
-#define __UTIL_TYPES_INT2_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct int2 {
   int x, y;
 
+#  ifndef __KERNEL_GPU__
   __forceinline int operator[](int i) const;
   __forceinline int &operator[](int i);
+#  endif
 };
 
 ccl_device_inline int2 make_int2(int x, int y);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT2_H__ */
diff --git a/intern/cycles/util/types_int2_impl.h b/intern/cycles/util/types_int2_impl.h
index 7989c4d5506..f48c6f46729 100644
--- a/intern/cycles/util/types_int2_impl.h
+++ b/intern/cycles/util/types_int2_impl.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_INT2_IMPL_H__
-#define __UTIL_TYPES_INT2_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,8 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 int int2::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -24,14 +24,13 @@ int &int2::operator[](int i)
   util_assert(i < 2);
   return *(&x + i);
 }
+#  endif
 
 ccl_device_inline int2 make_int2(int x, int y)
 {
   int2 a = {x, y};
   return a;
 }
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT2_IMPL_H__ */
diff --git a/intern/cycles/util/types_int3.h b/intern/cycles/util/types_int3.h
index ad9bcb39bbe..e059ddd3660 100644
--- a/intern/cycles/util/types_int3.h
+++ b/intern/cycles/util/types_int3.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_INT3_H__
-#define __UTIL_TYPES_INT3_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,10 +9,15 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct ccl_try_align(16) int3
 {
-#  ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_GPU__
+  /* Compact structure on the GPU. */
+  int x, y, z;
+#  else
+  /* SIMD aligned structure for CPU. */
+#    ifdef __KERNEL_SSE__
   union {
     __m128i m128;
     struct {
@@ -29,19 +33,21 @@ struct ccl_try_align(16) int3
   __forceinline operator __m128i &();
 
   __forceinline int3 &operator=(const int3 &a);
-#  else  /* __KERNEL_SSE__ */
+#    else  /* __KERNEL_SSE__ */
   int x, y, z, w;
-#  endif /* __KERNEL_SSE__ */
+#    endif /* __KERNEL_SSE__ */
+#  endif
 
+#  ifndef __KERNEL_GPU__
   __forceinline int operator[](int i) const;
   __forceinline int &operator[](int i);
+#  endif
 };
 
-ccl_device_inline int3 make_int3(int i);
 ccl_device_inline int3 make_int3(int x, int y, int z);
-ccl_device_inline void print_int3(const char *label, const int3 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
-CCL_NAMESPACE_END
+ccl_device_inline int3 make_int3(int i);
+ccl_device_inline void print_int3(ccl_private const char *label, const int3 a);
 
-#endif /* __UTIL_TYPES_INT3_H__ */
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_int3_impl.h b/intern/cycles/util/types_int3_impl.h
index 4cfc1cf2987..830dfa3c658 100644
--- a/intern/cycles/util/types_int3_impl.h
+++ b/intern/cycles/util/types_int3_impl.h
@@ -1,20 +1,15 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_INT3_IMPL_H__
-#define __UTIL_TYPES_INT3_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif
 
-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 #  ifdef __KERNEL_SSE__
 __forceinline int3::int3()
 {
@@ -45,6 +40,7 @@ __forceinline int3 &int3::operator=(const int3 &a)
 }
 #  endif /* __KERNEL_SSE__ */
 
+#  ifndef __KERNEL_GPU__
 __forceinline int int3::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -58,34 +54,37 @@ __forceinline int &int3::operator[](int i)
   util_assert(i < 3);
   return *(&x + i);
 }
-
-ccl_device_inline int3 make_int3(int i)
-{
-#  ifdef __KERNEL_SSE__
-  int3 a(_mm_set1_epi32(i));
-#  else
-  int3 a = {i, i, i, i};
 #  endif
-  return a;
-}
 
 ccl_device_inline int3 make_int3(int x, int y, int z)
 {
-#  ifdef __KERNEL_SSE__
-  int3 a(_mm_set_epi32(0, z, y, x));
+#  if defined(__KERNEL_GPU__)
+  return {x, y, z};
+#  elif defined(__KERNEL_SSE__)
+  return int3(_mm_set_epi32(0, z, y, x));
 #  else
-  int3 a = {x, y, z, 0};
+  return {x, y, z, 0};
 #  endif
+}
 
-  return a;
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline int3 make_int3(int i)
+{
+#if defined(__KERNEL_GPU__)
+  return make_int3(i, i, i);
+#elif defined(__KERNEL_SSE__)
+  return int3(_mm_set1_epi32(i));
+#else
+  return {i, i, i, i};
+#endif
 }
 
-ccl_device_inline void print_int3(const char *label, const int3 &a)
+ccl_device_inline void print_int3(ccl_private const char *label, const int3 a)
 {
+#ifdef __KERNEL_PRINTF__
   printf("%s: %d %d %d\n", label, a.x, a.y, a.z);
+#endif
 }
-#endif /* __KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT3_IMPL_H__ */
diff --git a/intern/cycles/util/types_int4.h b/intern/cycles/util/types_int4.h
index f35632fb52f..1a13c03e60e 100644
--- a/intern/cycles/util/types_int4.h
+++ b/intern/cycles/util/types_int4.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_INT4_H__
-#define __UTIL_TYPES_INT4_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 
 struct float3;
 struct float4;
@@ -37,17 +36,18 @@ struct ccl_try_align(16) int4
   int x, y, z, w;
 #  endif /* __KERNEL_SSE__ */
 
+#  ifndef __KERNEL_GPU__
   __forceinline int operator[](int i) const;
   __forceinline int &operator[](int i);
+#  endif
 };
 
-ccl_device_inline int4 make_int4(int i);
 ccl_device_inline int4 make_int4(int x, int y, int z, int w);
-ccl_device_inline int4 make_int4(const float3 &f);
-ccl_device_inline int4 make_int4(const float4 &f);
-ccl_device_inline void print_int4(const char *label, const int4 &a);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
-CCL_NAMESPACE_END
+ccl_device_inline int4 make_int4(int i);
+ccl_device_inline int4 make_int4(const float3 f);
+ccl_device_inline int4 make_int4(const float4 f);
+ccl_device_inline void print_int4(ccl_private const char *label, const int4 a);
 
-#endif /* __UTIL_TYPES_INT4_H__ */
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_int4_impl.h b/intern/cycles/util/types_int4_impl.h
index adb4a4cebac..067794e67b4 100644
--- a/intern/cycles/util/types_int4_impl.h
+++ b/intern/cycles/util/types_int4_impl.h
@@ -1,20 +1,15 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_INT4_IMPL_H__
-#define __UTIL_TYPES_INT4_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif
 
-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 #  ifdef __KERNEL_SSE__
 __forceinline int4::int4()
 {
@@ -45,6 +40,7 @@ __forceinline int4 &int4::operator=(const int4 &a)
 }
 #  endif /* __KERNEL_SSE__ */
 
+#  ifndef __KERNEL_GPU__
 __forceinline int int4::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -58,53 +54,53 @@ __forceinline int &int4::operator[](int i)
   util_assert(i < 4);
   return *(&x + i);
 }
+#  endif
 
-ccl_device_inline int4 make_int4(int i)
+ccl_device_inline int4 make_int4(int x, int y, int z, int w)
 {
 #  ifdef __KERNEL_SSE__
-  int4 a(_mm_set1_epi32(i));
+  return int4(_mm_set_epi32(w, z, y, x));
 #  else
-  int4 a = {i, i, i, i};
+  return {x, y, z, w};
 #  endif
-  return a;
 }
 
-ccl_device_inline int4 make_int4(int x, int y, int z, int w)
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
+
+ccl_device_inline int4 make_int4(int i)
 {
-#  ifdef __KERNEL_SSE__
-  int4 a(_mm_set_epi32(w, z, y, x));
-#  else
-  int4 a = {x, y, z, w};
-#  endif
-  return a;
+#ifdef __KERNEL_SSE__
+  return int4(_mm_set1_epi32(i));
+#else
+  return make_int4(i, i, i, i);
+#endif
 }
 
-ccl_device_inline int4 make_int4(const float3 &f)
+ccl_device_inline int4 make_int4(const float3 f)
 {
-#  ifdef __KERNEL_SSE__
-  int4 a(_mm_cvtps_epi32(f.m128));
-#  else
-  int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
-#  endif
-  return a;
+#if defined(__KERNEL_GPU__)
+  return make_int4((int)f.x, (int)f.y, (int)f.z, 0);
+#elif defined(__KERNEL_SSE__)
+  return int4(_mm_cvtps_epi32(f.m128));
+#else
+  return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
+#endif
 }
 
-ccl_device_inline int4 make_int4(const float4 &f)
+ccl_device_inline int4 make_int4(const float4 f)
 {
-#  ifdef __KERNEL_SSE__
-  int4 a(_mm_cvtps_epi32(f.m128));
-#  else
-  int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
-#  endif
-  return a;
+#ifdef __KERNEL_SSE__
+  return int4(_mm_cvtps_epi32(f.m128));
+#else
+  return make_int4((int)f.x, (int)f.y, (int)f.z, (int)f.w);
+#endif
 }
 
-ccl_device_inline void print_int4(const char *label, const int4 &a)
+ccl_device_inline void print_int4(ccl_private const char *label, const int4 a)
 {
+#ifdef __KERNEL_PRINTF__
   printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
+#endif
 }
-#endif /* __KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT4_IMPL_H__ */
diff --git a/intern/cycles/util/types_spectrum.h b/intern/cycles/util/types_spectrum.h
new file mode 100644
index 00000000000..c59230b83ae
--- /dev/null
+++ b/intern/cycles/util/types_spectrum.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2022 Blender Foundation */
+
+#ifndef __UTIL_TYPES_SPECTRUM_H__
+#define __UTIL_TYPES_SPECTRUM_H__
+
+#ifndef __UTIL_TYPES_H__
+#  error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#define SPECTRUM_CHANNELS 3
+#define SPECTRUM_DATA_TYPE float3
+#define PACKED_SPECTRUM_DATA_TYPE packed_float3
+
+using Spectrum = SPECTRUM_DATA_TYPE;
+using PackedSpectrum = PACKED_SPECTRUM_DATA_TYPE;
+
+#define make_spectrum(f) CONCAT(make_, SPECTRUM_DATA_TYPE(f))
+#define load_spectrum(f) CONCAT(load_, SPECTRUM_DATA_TYPE(f))
+#define store_spectrum(s, f) CONCAT(store_, SPECTRUM_DATA_TYPE((s), (f)))
+
+#define zero_spectrum CONCAT(zero_, SPECTRUM_DATA_TYPE)
+#define one_spectrum CONCAT(one_, SPECTRUM_DATA_TYPE)
+
+#define FOREACH_SPECTRUM_CHANNEL(counter) \
+  for (int counter = 0; counter < SPECTRUM_CHANNELS; counter++)
+
+#define GET_SPECTRUM_CHANNEL(v, i) (((ccl_private float *)(&(v)))[i])
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_TYPES_SPECTRUM_H__ */
diff --git a/intern/cycles/util/types_uchar2.h b/intern/cycles/util/types_uchar2.h
index 445fa8dd703..ce617248e6e 100644
--- a/intern/cycles/util/types_uchar2.h
+++ b/intern/cycles/util/types_uchar2.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_UCHAR2_H__
-#define __UTIL_TYPES_UCHAR2_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct uchar2 {
   uchar x, y;
 
+#  ifndef __KERNEL_GPU__
   __forceinline uchar operator[](int i) const;
   __forceinline uchar &operator[](int i);
+#  endif
 };
 
 ccl_device_inline uchar2 make_uchar2(uchar x, uchar y);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UCHAR2_H__ */
diff --git a/intern/cycles/util/types_uchar2_impl.h b/intern/cycles/util/types_uchar2_impl.h
index cec1c679050..9f3f3a4efb9 100644
--- a/intern/cycles/util/types_uchar2_impl.h
+++ b/intern/cycles/util/types_uchar2_impl.h
@@ -10,7 +10,8 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 uchar uchar2::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -24,13 +25,14 @@ uchar &uchar2::operator[](int i)
   util_assert(i < 2);
   return *(&x + i);
 }
+#  endif
 
 ccl_device_inline uchar2 make_uchar2(uchar x, uchar y)
 {
   uchar2 a = {x, y};
   return a;
 }
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/types_uchar3.h b/intern/cycles/util/types_uchar3.h
index 1ebd86441c3..aed04c4775e 100644
--- a/intern/cycles/util/types_uchar3.h
+++ b/intern/cycles/util/types_uchar3.h
@@ -10,16 +10,18 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct uchar3 {
   uchar x, y, z;
 
+#  ifndef __KERNEL_GPU__
   __forceinline uchar operator[](int i) const;
   __forceinline uchar &operator[](int i);
+#  endif
 };
 
 ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/types_uchar3_impl.h b/intern/cycles/util/types_uchar3_impl.h
index 0656baa3da4..83eb3c99b3c 100644
--- a/intern/cycles/util/types_uchar3_impl.h
+++ b/intern/cycles/util/types_uchar3_impl.h
@@ -10,7 +10,8 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 uchar uchar3::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -24,13 +25,14 @@ uchar &uchar3::operator[](int i)
   util_assert(i < 3);
   return *(&x + i);
 }
+#  endif
 
 ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z)
 {
   uchar3 a = {x, y, z};
   return a;
 }
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/types_uchar4.h b/intern/cycles/util/types_uchar4.h
index 2ac4fb56cbb..fb13a98875e 100644
--- a/intern/cycles/util/types_uchar4.h
+++ b/intern/cycles/util/types_uchar4.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_UCHAR4_H__
-#define __UTIL_TYPES_UCHAR4_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct uchar4 {
   uchar x, y, z, w;
 
+#  ifndef __KERNEL_GPU__
   __forceinline uchar operator[](int i) const;
   __forceinline uchar &operator[](int i);
+#  endif
 };
 
 ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UCHAR4_H__ */
diff --git a/intern/cycles/util/types_uchar4_impl.h b/intern/cycles/util/types_uchar4_impl.h
index b3e8abfe873..244bb98f883 100644
--- a/intern/cycles/util/types_uchar4_impl.h
+++ b/intern/cycles/util/types_uchar4_impl.h
@@ -10,7 +10,8 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 uchar uchar4::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -24,13 +25,14 @@ uchar &uchar4::operator[](int i)
   util_assert(i < 4);
   return *(&x + i);
 }
+#  endif
 
 ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w)
 {
   uchar4 a = {x, y, z, w};
   return a;
 }
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/types_uint2.h b/intern/cycles/util/types_uint2.h
index e3254b9f0e1..4d76b628088 100644
--- a/intern/cycles/util/types_uint2.h
+++ b/intern/cycles/util/types_uint2.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_UINT2_H__
-#define __UTIL_TYPES_UINT2_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct uint2 {
   uint x, y;
 
+#  ifndef __KERNEL_GPU__
   __forceinline uint operator[](uint i) const;
   __forceinline uint &operator[](uint i);
+#  endif
 };
 
 ccl_device_inline uint2 make_uint2(uint x, uint y);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT2_H__ */
diff --git a/intern/cycles/util/types_uint2_impl.h b/intern/cycles/util/types_uint2_impl.h
index e67134a011e..b508aaf2543 100644
--- a/intern/cycles/util/types_uint2_impl.h
+++ b/intern/cycles/util/types_uint2_impl.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_UINT2_IMPL_H__
-#define __UTIL_TYPES_UINT2_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,8 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 __forceinline uint uint2::operator[](uint i) const
 {
   util_assert(i < 2);
@@ -22,14 +22,13 @@ __forceinline uint &uint2::operator[](uint i)
   util_assert(i < 2);
   return *(&x + i);
 }
+#  endif
 
 ccl_device_inline uint2 make_uint2(uint x, uint y)
 {
   uint2 a = {x, y};
   return a;
 }
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT2_IMPL_H__ */
diff --git a/intern/cycles/util/types_uint3.h b/intern/cycles/util/types_uint3.h
index 885a8fb84ce..b1571716fc7 100644
--- a/intern/cycles/util/types_uint3.h
+++ b/intern/cycles/util/types_uint3.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_UINT3_H__
-#define __UTIL_TYPES_UINT3_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct uint3 {
   uint x, y, z;
 
+#  ifndef __KERNEL_GPU__
   __forceinline uint operator[](uint i) const;
   __forceinline uint &operator[](uint i);
+#  endif
 };
 
 ccl_device_inline uint3 make_uint3(uint x, uint y, uint z);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT3_H__ */
diff --git a/intern/cycles/util/types_uint3_impl.h b/intern/cycles/util/types_uint3_impl.h
index f4d3d72469c..d36c9f52de9 100644
--- a/intern/cycles/util/types_uint3_impl.h
+++ b/intern/cycles/util/types_uint3_impl.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_UINT3_IMPL_H__
-#define __UTIL_TYPES_UINT3_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,8 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 __forceinline uint uint3::operator[](uint i) const
 {
   util_assert(i < 3);
@@ -22,14 +22,13 @@ __forceinline uint &uint3::operator[](uint i)
   util_assert(i < 3);
   return *(&x + i);
 }
+#  endif
 
 ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
 {
   uint3 a = {x, y, z};
   return a;
 }
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT3_IMPL_H__ */
diff --git a/intern/cycles/util/types_uint4.h b/intern/cycles/util/types_uint4.h
index d582b91d2a0..4982b30f577 100644
--- a/intern/cycles/util/types_uint4.h
+++ b/intern/cycles/util/types_uint4.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_UINT4_H__
-#define __UTIL_TYPES_UINT4_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct uint4 {
   uint x, y, z, w;
 
+#  ifndef __KERNEL_GPU__
   __forceinline uint operator[](uint i) const;
   __forceinline uint &operator[](uint i);
+#  endif
 };
 
 ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w);
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT4_H__ */
diff --git a/intern/cycles/util/types_uint4_impl.h b/intern/cycles/util/types_uint4_impl.h
index 98a4c5e9fe9..1cfdb9e0992 100644
--- a/intern/cycles/util/types_uint4_impl.h
+++ b/intern/cycles/util/types_uint4_impl.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __UTIL_TYPES_UINT4_IMPL_H__
-#define __UTIL_TYPES_UINT4_IMPL_H__
+#pragma once
 
 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,8 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 __forceinline uint uint4::operator[](uint i) const
 {
   util_assert(i < 3);
@@ -22,14 +22,13 @@ __forceinline uint &uint4::operator[](uint i)
   util_assert(i < 3);
   return *(&x + i);
 }
+#  endif
 
 ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w)
 {
   uint4 a = {x, y, z, w};
   return a;
 }
-#endif /* __KERNEL_GPU__ */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_UINT4_IMPL_H__ */
diff --git a/intern/cycles/util/types_ushort4.h b/intern/cycles/util/types_ushort4.h
index 1766c6bf734..aef36f63285 100644
--- a/intern/cycles/util/types_ushort4.h
+++ b/intern/cycles/util/types_ushort4.h
@@ -10,7 +10,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifndef __KERNEL_GPU__
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 
 struct ushort4 {
   uint16_t x, y, z, w;
diff --git a/intern/cycles/util/types_vector3.h b/intern/cycles/util/types_vector3.h
deleted file mode 100644
index 2e0d68e1bd0..00000000000
--- a/intern/cycles/util/types_vector3.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#ifndef __UTIL_TYPES_VECTOR3_H__
-#define __UTIL_TYPES_VECTOR3_H__
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_GPU__
-template<typename T> class vector3 {
- public:
-  T x, y, z;
-
-  __forceinline vector3();
-  __forceinline vector3(const T &a);
-  __forceinline vector3(const T &x, const T &y, const T &z);
-};
-#endif /* __KERNEL_GPU__ */
-
-CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_VECTOR3_H__ */
diff --git a/intern/cycles/util/types_vector3_impl.h b/intern/cycles/util/types_vector3_impl.h
deleted file mode 100644
index a765780e2d3..00000000000
--- a/intern/cycles/util/types_vector3_impl.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#ifndef __UTIL_TYPES_VECTOR3_IMPL_H__
-#define __UTIL_TYPES_VECTOR3_IMPL_H__
-
-#ifndef __UTIL_TYPES_H__
-#  error "Do not include this file directly, include util/types.h instead."
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __KERNEL_GPU__
-template<typename T> ccl_always_inline vector3<T>::vector3()
-{
-}
-
-template<typename T> ccl_always_inline vector3<T>::vector3(const T &a) : x(a), y(a), z(a)
-{
-}
-
-template<typename T>
-ccl_always_inline vector3<T>::vector3(const T &x, const T &y, const T &z) : x(x), y(y), z(z)
-{
-}
-#endif /* __KERNEL_GPU__ */
-
-CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_VECTOR3_IMPL_H__ */
diff --git a/intern/cycles/util/vector.h b/intern/cycles/util/vector.h
index 0056fb269ae..9e27997cf2c 100644
--- a/intern/cycles/util/vector.h
+++ b/intern/cycles/util/vector.h
@@ -10,7 +10,6 @@
 
 #include "util/aligned_malloc.h"
 #include "util/guarded_allocator.h"
-#include "util/types.h"
 
 CCL_NAMESPACE_BEGIN