14 files changed, 648 insertions, 161 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index fddac1dbbcf..9bc9f00e142 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -63,6 +63,7 @@ set(SRC_HEADERS
   math_float2.h
   math_float3.h
   math_float4.h
+  math_float8.h
   math_int2.h
   math_int3.h
   math_int4.h
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index 115a747cf1c..d0df1a221fc 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -81,7 +81,7 @@
 /* macros */
 
 /* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
 #  define LIKELY(x) __builtin_expect(!!(x), 1)
 #  define UNLIKELY(x) __builtin_expect(!!(x), 0)
 #else
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index af2f1ea092d..2631304c84b 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -511,6 +511,11 @@ ccl_device_inline float4 float3_to_float4(const float3 a)
   return make_float4(a.x, a.y, a.z, 1.0f);
 }
 
+ccl_device_inline float4 float3_to_float4(const float3 a, const float w)
+{
+  return make_float4(a.x, a.y, a.z, w);
+}
+
 ccl_device_inline float inverse_lerp(float a, float b, float x)
 {
   return (x - a) / (b - a);
@@ -535,6 +540,7 @@ CCL_NAMESPACE_END
 #include "util/math_float2.h"
 #include "util/math_float3.h"
 #include "util/math_float4.h"
+#include "util/math_float8.h"
 
 #include "util/rect.h"
 
diff --git a/intern/cycles/util/math_fast.h b/intern/cycles/util/math_fast.h
index 2221e7a9835..142a664a1d2 100644
--- a/intern/cycles/util/math_fast.h
+++ b/intern/cycles/util/math_fast.h
@@ -420,7 +420,7 @@ ccl_device_inline float fast_expf(float x)
   return fast_exp2f(x / M_LN2_F);
 }
 
-#if defined(__KERNEL_CPU__) && !defined(_MSC_VER)
+#if !defined(__KERNEL_GPU__) && !defined(_MSC_VER)
 /* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see
  * T78047 and T78869 for details. Just disable for now, it only makes
  * a small difference in denoising performance. */
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index c02b4cdbf0d..c408eadf195 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -147,8 +147,11 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)
 
 ccl_device_inline float3 operator/(const float3 &a, const float f)
 {
-  float invf = 1.0f / f;
-  return a * invf;
+#  if defined(__KERNEL_SSE__)
+  return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
+#  else
+  return make_float3(a.x / f, a.y / f, a.z / f);
+#  endif
 }
 
 ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
@@ -284,8 +287,12 @@ ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
 
 ccl_device_inline float3 cross(const float3 &a, const float3 &b)
 {
-  float3 r = make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
-  return r;
+#  ifdef __KERNEL_SSE__
+  return float3(shuffle<1, 2, 0, 3>(
+      msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+#  else
+  return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#  endif
 }
 
 ccl_device_inline float3 normalize(const float3 &a)
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index 073c65c2d6a..c2721873037 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -55,6 +55,7 @@ ccl_device_inline float4 floor(const float4 &a);
 ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
 #endif /* !__KERNEL_METAL__*/
 
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
 ccl_device_inline float4 safe_divide(const float4 a, const float b);
 
 #ifdef __KERNEL_SSE__
@@ -74,11 +75,14 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
 #  endif
 #endif /* __KERNEL_SSE__ */
 
+ccl_device_inline float reduce_min(const float4 a);
+ccl_device_inline float reduce_max(const float4 a);
+ccl_device_inline float reduce_add(const float4 a);
+
+ccl_device_inline bool isequal(const float4 a, const float4 b);
+
 #ifndef __KERNEL_GPU__
 ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-ccl_device_inline float4 reduce_min(const float4 &a);
-ccl_device_inline float4 reduce_max(const float4 &a);
-ccl_device_inline float4 reduce_add(const float4 &a);
 #endif /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -303,27 +307,9 @@ ccl_device_inline bool is_zero(const float4 &a)
 #  endif
 }
 
-ccl_device_inline float4 reduce_add(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vdupq_n_f32(vaddvq_f32(a)));
-#    elif defined(__KERNEL_SSE3__)
-  float4 h(_mm_hadd_ps(a.m128, a.m128));
-  return float4(_mm_hadd_ps(h.m128, h.m128));
-#    else
-  float4 h(shuffle<1, 0, 3, 2>(a) + a);
-  return shuffle<2, 3, 0, 1>(h) + h;
-#    endif
-#  else
-  float sum = (a.x + a.y) + (a.z + a.w);
-  return make_float4(sum, sum, sum, sum);
-#  endif
-}
-
 ccl_device_inline float average(const float4 &a)
 {
-  return reduce_add(a).x * 0.25f;
+  return reduce_add(a) * 0.25f;
 }
 
 ccl_device_inline float len(const float4 &a)
@@ -392,8 +378,77 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
   return a + t * (b - a);
 }
 
+ccl_device_inline float4 saturate(const float4 &a)
+{
+  return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+}
+
+ccl_device_inline float4 exp(float4 v)
+{
+  return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
+}
+
+ccl_device_inline float4 log(float4 v)
+{
+  return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
+}
+
 #endif /* !__KERNEL_METAL__*/
 
+ccl_device_inline float reduce_add(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_NEON__)
+  return vaddvq_f32(a);
+#  elif defined(__KERNEL_SSE3__)
+  float4 h(_mm_hadd_ps(a.m128, a.m128));
+  return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
+#  else
+  float4 h(shuffle<1, 0, 3, 2>(a) + a);
+  return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
+#  endif
+#else
+  return a.x + a.y + a.z + a.w;
+#endif
+}
+
+ccl_device_inline float reduce_min(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_NEON__)
+  return vminvq_f32(a);
+#  else
+  float4 h = min(shuffle<1, 0, 3, 2>(a), a);
+  return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h));
+#  endif
+#else
+  return min(min(a.x, a.y), min(a.z, a.w));
+#endif
+}
+
+ccl_device_inline float reduce_max(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_NEON__)
+  return vmaxvq_f32(a);
+#  else
+  float4 h = max(shuffle<1, 0, 3, 2>(a), a);
+  return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h));
+#  endif
+#else
+  return max(max(a.x, a.y), max(a.z, a.w));
+#endif
+}
+
+ccl_device_inline bool isequal(const float4 a, const float4 b)
+{
+#if defined(__KERNEL_METAL__)
+  return all(a == b);
+#else
+  return a == b;
+#endif
+}
+
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &b)
@@ -461,34 +516,6 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
   return select(mask, a, zero_float4());
 }
 
-ccl_device_inline float4 reduce_min(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vdupq_n_f32(vminvq_f32(a)));
-#    else
-  float4 h = min(shuffle<1, 0, 3, 2>(a), a);
-  return min(shuffle<2, 3, 0, 1>(h), h);
-#    endif
-#  else
-  return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
-#  endif
-}
-
-ccl_device_inline float4 reduce_max(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vdupq_n_f32(vmaxvq_f32(a)));
-#    else
-  float4 h = max(shuffle<1, 0, 3, 2>(a), a);
-  return max(shuffle<2, 3, 0, 1>(h), h);
-#    endif
-#  else
-  return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
-#  endif
-}
-
 ccl_device_inline float4 load_float4(ccl_private const float *v)
 {
 #  ifdef __KERNEL_SSE__
@@ -505,6 +532,14 @@ ccl_device_inline float4 safe_divide(const float4 a, const float b)
   return (b != 0.0f) ? a / b : zero_float4();
 }
 
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b)
+{
+  return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f,
+                     (b.y != 0.0f) ? a.y / b.y : 0.0f,
+                     (b.z != 0.0f) ? a.z / b.z : 0.0f,
+                     (b.w != 0.0f) ? a.w / b.w : 0.0f);
+}
+
 ccl_device_inline bool isfinite_safe(float4 v)
 {
   return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w);
@@ -523,6 +558,11 @@ ccl_device_inline float4 ensure_finite(float4 v)
   return v;
 }
 
+ccl_device_inline float4 pow(float4 v, float e)
+{
+  return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.z, e));
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_MATH_FLOAT4_H__ */
diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h
new file mode 100644
index 00000000000..8ed8d56a034
--- /dev/null
+++ b/intern/cycles/util/math_float8.h
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2022 Blender Foundation */
+
+#ifndef __UTIL_MATH_FLOAT8_H__
+#define __UTIL_MATH_FLOAT8_H__
+
+#ifndef __UTIL_MATH_H__
+#  error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+ccl_device_inline float8 operator+(const float8 &a, const float8 &b);
+ccl_device_inline float8 operator+(const float8 &a, const float f);
+ccl_device_inline float8 operator+(const float f, const float8 &a);
+
+ccl_device_inline float8 operator-(const float8 &a);
+ccl_device_inline float8 operator-(const float8 &a, const float8 &b);
+ccl_device_inline float8 operator-(const float8 &a, const float f);
+ccl_device_inline float8 operator-(const float f, const float8 &a);
+
+ccl_device_inline float8 operator*(const float8 &a, const float8 &b);
+ccl_device_inline float8 operator*(const float8 &a, const float f);
+ccl_device_inline float8 operator*(const float f, const float8 &a);
+
+ccl_device_inline float8 operator/(const float8 &a, const float8 &b);
+ccl_device_inline float8 operator/(const float8 &a, float f);
+ccl_device_inline float8 operator/(const float f, const float8 &a);
+
+ccl_device_inline float8 operator+=(float8 &a, const float8 &b);
+
+ccl_device_inline float8 operator*=(float8 &a, const float8 &b);
+ccl_device_inline float8 operator*=(float8 &a, float f);
+
+ccl_device_inline float8 operator/=(float8 &a, float f);
+
+ccl_device_inline bool operator==(const float8 &a, const float8 &b);
+
+ccl_device_inline float8 rcp(const float8 &a);
+ccl_device_inline float8 sqrt(const float8 &a);
+ccl_device_inline float8 sqr(const float8 &a);
+ccl_device_inline bool is_zero(const float8 &a);
+ccl_device_inline float average(const float8 &a);
+ccl_device_inline float8 min(const float8 &a, const float8 &b);
+ccl_device_inline float8 max(const float8 &a, const float8 &b);
+ccl_device_inline float8 clamp(const float8 &a, const float8 &mn, const float8 &mx);
+ccl_device_inline float8 fabs(const float8 &a);
+ccl_device_inline float8 mix(const float8 &a, const float8 &b, float t);
+
+ccl_device_inline float8 safe_divide(const float8 a, const float b);
+ccl_device_inline float8 safe_divide(const float8 a, const float8 b);
+
+ccl_device_inline float reduce_min(const float8 &a);
+ccl_device_inline float reduce_max(const float8 &a);
+ccl_device_inline float reduce_add(const float8 &a);
+
+ccl_device_inline float8 saturate(const float8 &a);
+ccl_device_inline bool isequal(const float8 a, const float8 b);
+
+/*******************************************************************************
+ * Definition.
+ */
+
+ccl_device_inline float8 zero_float8()
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_setzero_ps());
+#else
+  return make_float8(0.0f);
+#endif
+}
+
+ccl_device_inline float8 one_float8()
+{
+  return make_float8(1.0f);
+}
+
+ccl_device_inline float8 operator+(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_add_ps(a.m256, b.m256));
+#else
+  return make_float8(
+      a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
+#endif
+}
+
+ccl_device_inline float8 operator+(const float8 &a, const float f)
+{
+  return a + make_float8(f);
+}
+
+ccl_device_inline float8 operator+(const float f, const float8 &a)
+{
+  return make_float8(f) + a;
+}
+
+ccl_device_inline float8 operator-(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+  __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+  return float8(_mm256_xor_ps(a.m256, mask));
+#else
+  return make_float8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
+#endif
+}
+
+ccl_device_inline float8 operator-(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_sub_ps(a.m256, b.m256));
+#else
+  return make_float8(
+      a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
+#endif
+}
+
+ccl_device_inline float8 operator-(const float8 &a, const float f)
+{
+  return a - make_float8(f);
+}
+
+ccl_device_inline float8 operator-(const float f, const float8 &a)
+{
+  return make_float8(f) - a;
+}
+
+ccl_device_inline float8 operator*(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_mul_ps(a.m256, b.m256));
+#else
+  return make_float8(
+      a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
+#endif
+}
+
+ccl_device_inline float8 operator*(const float8 &a, const float f)
+{
+  return a * make_float8(f);
+}
+
+ccl_device_inline float8 operator*(const float f, const float8 &a)
+{
+  return make_float8(f) * a;
+}
+
+ccl_device_inline float8 operator/(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_div_ps(a.m256, b.m256));
+#else
+  return make_float8(
+      a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
+#endif
+}
+
+ccl_device_inline float8 operator/(const float8 &a, const float f)
+{
+  return a / make_float8(f);
+}
+
+ccl_device_inline float8 operator/(const float f, const float8 &a)
+{
+  return make_float8(f) / a;
+}
+
+ccl_device_inline float8 operator+=(float8 &a, const float8 &b)
+{
+  return a = a + b;
+}
+
+ccl_device_inline float8 operator-=(float8 &a, const float8 &b)
+{
+  return a = a - b;
+}
+
+ccl_device_inline float8 operator*=(float8 &a, const float8 &b)
+{
+  return a = a * b;
+}
+
+ccl_device_inline float8 operator*=(float8 &a, float f)
+{
+  return a = a * f;
+}
+
+ccl_device_inline float8 operator/=(float8 &a, float f)
+{
+  return a = a / f;
+}
+
+ccl_device_inline bool operator==(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+  return (_mm256_movemask_ps(_mm256_castsi256_ps(
+              _mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
+          0b11111111) == 0b11111111;
+#else
+  return (a.a == b.a && a.b == b.b && a.c == b.c && a.d == b.d && a.e == b.e && a.f == b.f &&
+          a.g == b.g && a.h == b.h);
+#endif
+}
+
+ccl_device_inline float8 rcp(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_rcp_ps(a.m256));
+#else
+  return make_float8(1.0f / a.a,
+                     1.0f / a.b,
+                     1.0f / a.c,
+                     1.0f / a.d,
+                     1.0f / a.e,
+                     1.0f / a.f,
+                     1.0f / a.g,
+                     1.0f / a.h);
+#endif
+}
+
+ccl_device_inline float8 sqrt(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_sqrt_ps(a.m256));
+#else
+  return make_float8(sqrtf(a.a),
+                     sqrtf(a.b),
+                     sqrtf(a.c),
+                     sqrtf(a.d),
+                     sqrtf(a.e),
+                     sqrtf(a.f),
+                     sqrtf(a.g),
+                     sqrtf(a.h));
+#endif
+}
+
+ccl_device_inline float8 sqr(const float8 &a)
+{
+  return a * a;
+}
+
+ccl_device_inline bool is_zero(const float8 &a)
+{
+  return a == make_float8(0.0f);
+}
+
+ccl_device_inline float average(const float8 &a)
+{
+  return reduce_add(a) / 8.0f;
+}
+
+ccl_device_inline float8 min(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_min_ps(a.m256, b.m256));
+#else
+  return make_float8(min(a.a, b.a),
+                     min(a.b, b.b),
+                     min(a.c, b.c),
+                     min(a.d, b.d),
+                     min(a.e, b.e),
+                     min(a.f, b.f),
+                     min(a.g, b.g),
+                     min(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8 max(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_max_ps(a.m256, b.m256));
+#else
+  return make_float8(max(a.a, b.a),
+                     max(a.b, b.b),
+                     max(a.c, b.c),
+                     max(a.d, b.d),
+                     max(a.e, b.e),
+                     max(a.f, b.f),
+                     max(a.g, b.g),
+                     max(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8 clamp(const float8 &a, const float8 &mn, const float8 &mx)
+{
+  return min(max(a, mn), mx);
+}
+
+ccl_device_inline float8 fabs(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+  return float8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#else
+  return make_float8(fabsf(a.a),
+                     fabsf(a.b),
+                     fabsf(a.c),
+                     fabsf(a.d),
+                     fabsf(a.e),
+                     fabsf(a.f),
+                     fabsf(a.g),
+                     fabsf(a.h));
+#endif
+}
+
+ccl_device_inline float8 mix(const float8 &a, const float8 &b, float t)
+{
+  return a + t * (b - a);
+}
+
+ccl_device_inline float reduce_min(const float8 &a)
+{
+  return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_max(const float8 &a)
+{
+  return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_add(const float8 &a)
+{
+#ifdef __KERNEL_AVX2__
+  float8 b(_mm256_hadd_ps(a.m256, a.m256));
+  float8 h(_mm256_hadd_ps(b.m256, b.m256));
+  return h[0] + h[4];
+#else
+  return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
+#endif
+}
+
+ccl_device_inline float8 saturate(const float8 &a)
+{
+  return clamp(a, make_float8(0.0f), make_float8(1.0f));
+}
+
+ccl_device_inline bool isequal(const float8 a, const float8 b)
+{
+  return a == b;
+}
+
+ccl_device_inline float8 safe_divide(const float8 a, const float b)
+{
+  return (b != 0.0f) ? a / b : make_float8(0.0f);
+}
+
+ccl_device_inline float8 safe_divide(const float8 a, const float8 b)
+{
+  return make_float8((b.a != 0.0f) ? a.a / b.a : 0.0f,
+                     (b.b != 0.0f) ? a.b / b.b : 0.0f,
+                     (b.c != 0.0f) ? a.c / b.c : 0.0f,
+                     (b.d != 0.0f) ? a.d / b.d : 0.0f,
+                     (b.e != 0.0f) ? a.e / b.e : 0.0f,
+                     (b.f != 0.0f) ? a.f / b.f : 0.0f,
+                     (b.g != 0.0f) ? a.g / b.g : 0.0f,
+                     (b.h != 0.0f) ? a.h / b.h : 0.0f);
+}
+
+ccl_device_inline float8 ensure_finite(float8 v)
+{
+  v.a = ensure_finite(v.a);
+  v.b = ensure_finite(v.b);
+  v.c = ensure_finite(v.c);
+  v.d = ensure_finite(v.d);
+  v.e = ensure_finite(v.e);
+  v.f = ensure_finite(v.f);
+  v.g = ensure_finite(v.g);
+  v.h = ensure_finite(v.h);
+
+  return v;
+}
+
+ccl_device_inline bool isfinite_safe(float8 v)
+{
+  return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
+         isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
+}
+
+ccl_device_inline float8 pow(float8 v, float e)
+{
+  return make_float8(powf(v.a, e),
+                     powf(v.b, e),
+                     powf(v.c, e),
+                     powf(v.d, e),
+                     powf(v.e, e),
+                     powf(v.f, e),
+                     powf(v.g, e),
+                     powf(v.h, e));
+}
+
+ccl_device_inline float8 exp(float8 v)
+{
+  return make_float8(
+      expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
+}
+
+ccl_device_inline float8 log(float8 v)
+{
+  return make_float8(
+      logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
+}
+
+ccl_device_inline float dot(const float8 &a, const float8 &b)
+{
+#ifdef __KERNEL_AVX2__
+  float8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
+  return t[0] + t[4];
+#else
+  return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
+         (a.g * b.g) + (a.h * b.h);
+#endif
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT8_H__ */
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index b0de0b25a45..3e5891b2507 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device bool ray_sphere_intersect(float3 ray_P,
                                      float3 ray_D,
-                                     float ray_t,
+                                     float ray_tmin,
+                                     float ray_tmax,
                                      float3 sphere_P,
                                      float sphere_radius,
                                      ccl_private float3 *isect_P,
@@ -33,7 +34,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
       return false;
     }
     const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */
-    if (t < ray_t) {
+    if (t > ray_tmin && t < ray_tmax) {
       *isect_t = t;
       *isect_P = ray_P + ray_D * t;
       return true;
@@ -44,7 +45,8 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
 
 ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
                                            float3 ray_D,
-                                           float ray_t,
+                                           float ray_tmin,
+                                           float ray_tmax,
                                            float3 disk_P,
                                            float disk_radius,
                                            ccl_private float3 *isect_P,
@@ -59,7 +61,7 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
   }
   /* Compute t to intersection point. */
   const float t = -disk_t / div;
-  if (t < 0.0f || t > ray_t) {
+  if (!(t > ray_tmin && t < ray_tmax)) {
     return false;
   }
   /* Test if within radius. */
@@ -74,7 +76,8 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
 
 ccl_device bool ray_disk_intersect(float3 ray_P,
                                    float3 ray_D,
-                                   float ray_t,
+                                   float ray_tmin,
+                                   float ray_tmax,
                                    float3 disk_P,
                                    float3 disk_N,
                                    float disk_radius,
@@ -92,7 +95,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
     }
     float3 P = ray_P + t * ray_D;
     float3 T = P - disk_P;
-    if (dot(T, T) < sqr(disk_radius) /*&& t > 0.f*/ && t <= ray_t) {
+
+    if (dot(T, T) < sqr(disk_radius) && (t > ray_tmin && t < ray_tmax)) {
       *isect_P = ray_P + t * ray_D;
       *isect_t = t;
       return true;
@@ -101,9 +105,10 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
   return false;
 }
 
-ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
-                                                   float3 ray_dir,
-                                                   float ray_t,
+ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
+                                                   const float3 ray_D,
+                                                   const float ray_tmin,
+                                                   const float ray_tmax,
                                                    const float3 tri_a,
                                                    const float3 tri_b,
                                                    const float3 tri_c,
@@ -111,14 +116,13 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
                                                    ccl_private float *isect_v,
                                                    ccl_private float *isect_t)
 {
-#define dot3(a, b) dot(a, b)
-  const float3 P = ray_P;
-  const float3 dir = ray_dir;
+  /* This implementation matches the Plücker coordinates triangle intersection
+   * in Embree. */
 
   /* Calculate vertices relative to ray origin. */
-  const float3 v0 = tri_c - P;
-  const float3 v1 = tri_a - P;
-  const float3 v2 = tri_b - P;
+  const float3 v0 = tri_c - ray_P;
+  const float3 v1 = tri_a - ray_P;
+  const float3 v2 = tri_b - ray_P;
 
   /* Calculate triangle edges. */
   const float3 e0 = v2 - v0;
@@ -126,42 +130,38 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
   const float3 e2 = v1 - v2;
 
   /* Perform edge tests. */
-  const float U = dot(cross(v2 + v0, e0), ray_dir);
-  const float V = dot(cross(v0 + v1, e1), ray_dir);
-  const float W = dot(cross(v1 + v2, e2), ray_dir);
+  const float U = dot(cross(v2 + v0, e0), ray_D);
+  const float V = dot(cross(v0 + v1, e1), ray_D);
+  const float W = dot(cross(v1 + v2, e2), ray_D);
 
+  const float eps = FLT_EPSILON * fabsf(U + V + W);
   const float minUVW = min(U, min(V, W));
   const float maxUVW = max(U, max(V, W));
 
-  if (minUVW < 0.0f && maxUVW > 0.0f) {
+  if (!(minUVW >= -eps || maxUVW <= eps)) {
     return false;
   }
 
   /* Calculate geometry normal and denominator. */
   const float3 Ng1 = cross(e1, e0);
-  // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
   const float3 Ng = Ng1 + Ng1;
-  const float den = dot3(Ng, dir);
+  const float den = dot(Ng, ray_D);
   /* Avoid division by 0. */
   if (UNLIKELY(den == 0.0f)) {
     return false;
   }
 
   /* Perform depth test. */
-  const float T = dot3(v0, Ng);
-  const int sign_den = (__float_as_int(den) & 0x80000000);
-  const float sign_T = xor_signmask(T, sign_den);
-  if ((sign_T < 0.0f) || (sign_T > ray_t * xor_signmask(den, sign_den))) {
+  const float T = dot(v0, Ng);
+  const float t = T / den;
+  if (!(t >= ray_tmin && t <= ray_tmax)) {
     return false;
   }
 
-  const float inv_den = 1.0f / den;
-  *isect_u = U * inv_den;
-  *isect_v = V * inv_den;
-  *isect_t = T * inv_den;
+  *isect_u = U / den;
+  *isect_v = V / den;
+  *isect_t = t;
   return true;
-
-#undef dot3
 }
 
 /* Tests for an intersection between a ray and a quad defined by
@@ -171,8 +171,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
  */
 ccl_device bool ray_quad_intersect(float3 ray_P,
                                    float3 ray_D,
-                                   float ray_mint,
-                                   float ray_maxt,
+                                   float ray_tmin,
+                                   float ray_tmax,
                                    float3 quad_P,
                                    float3 quad_u,
                                    float3 quad_v,
@@ -185,7 +185,7 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
 {
   /* Perform intersection test. */
   float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
-  if (t < ray_mint || t > ray_maxt) {
+  if (!(t > ray_tmin && t < ray_tmax)) {
     return false;
   }
   const float3 hit = ray_P + t * ray_D;
diff --git a/intern/cycles/util/string.cpp b/intern/cycles/util/string.cpp
index 66ff866ee10..0c318cea44a 100644
--- a/intern/cycles/util/string.cpp
+++ b/intern/cycles/util/string.cpp
@@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other)
   }
 }
 
+void string_replace_same_length(string &haystack, const string &needle, const string &other)
+{
+  assert(needle.size() == other.size());
+  size_t pos = 0;
+  while (pos != string::npos) {
+    pos = haystack.find(needle, pos);
+    if (pos != string::npos) {
+      memcpy(haystack.data() + pos, other.data(), other.size());
+      pos += other.size();
+    }
+  }
+}
+
 string string_remove_trademark(const string &s)
 {
   string result = s;
@@ -164,6 +177,11 @@ string to_string(const char *str)
   return string(str);
 }
 
+string to_string(const float4 &v)
+{
+  return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w);
+}
+
 string string_to_lower(const string &s)
 {
   string r = s;
diff --git a/intern/cycles/util/string.h b/intern/cycles/util/string.h
index a74feee1750..ecbe9e106c6 100644
--- a/intern/cycles/util/string.h
+++ b/intern/cycles/util/string.h
@@ -38,12 +38,14 @@ void string_split(vector<string> &tokens,
                   const string &separators = "\t ",
                   bool skip_empty_tokens = true);
 void string_replace(string &haystack, const string &needle, const string &other);
+void string_replace_same_length(string &haystack, const string &needle, const string &other);
 bool string_startswith(string_view s, string_view start);
 bool string_endswith(string_view s, string_view end);
 string string_strip(const string &s);
 string string_remove_trademark(const string &s);
 string string_from_bool(const bool var);
 string to_string(const char *str);
+string to_string(const float4 &v);
 string string_to_lower(const string &s);
 
 /* Wide char strings are only used on Windows to deal with non-ASCII
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index 0bf5de57a20..0b87e88871d 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -99,15 +99,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
   memcpy(M, &tfm, sizeof(M));
 
   if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
-    /* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
-     * never be in this situation, but try to invert it anyway with tweak */
-    M[0][0] += 1e-8f;
-    M[1][1] += 1e-8f;
-    M[2][2] += 1e-8f;
-
-    if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
-      return projection_identity();
-    }
+    return projection_identity();
   }
 
   memcpy(&tfmR, R, sizeof(R));
@@ -115,16 +107,9 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
   return tfmR;
 }
 
-Transform transform_inverse(const Transform &tfm)
-{
-  ProjectionTransform projection(tfm);
-  return projection_to_transform(projection_inverse(projection));
-}
-
 Transform transform_transposed_inverse(const Transform &tfm)
 {
-  ProjectionTransform projection(tfm);
-  ProjectionTransform iprojection = projection_inverse(projection);
+  ProjectionTransform iprojection(transform_inverse(tfm));
   return projection_to_transform(projection_transpose(iprojection));
 }
 
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index a460581d1f3..71164efbac1 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -63,10 +63,10 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
 
   _MM_TRANSPOSE4_PS(x, y, z, w);
 
-  ssef tmp = shuffle<0>(aa) * x;
-  tmp = madd(shuffle<1>(aa), y, tmp);
+  ssef tmp = w;
   tmp = madd(shuffle<2>(aa), z, tmp);
-  tmp += w;
+  tmp = madd(shuffle<1>(aa), y, tmp);
+  tmp = madd(shuffle<0>(aa), x, tmp);
 
   return float3(tmp.m128);
 #elif defined(__KERNEL_METAL__)
@@ -93,9 +93,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con
 
   _MM_TRANSPOSE4_PS(x, y, z, w);
 
-  ssef tmp = shuffle<0>(aa) * x;
+  ssef tmp = shuffle<2>(aa) * z;
   tmp = madd(shuffle<1>(aa), y, tmp);
-  tmp = madd(shuffle<2>(aa), z, tmp);
+  tmp = madd(shuffle<0>(aa), x, tmp);
 
   return float3(tmp.m128);
 #elif defined(__KERNEL_METAL__)
@@ -312,7 +312,6 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val
   t->z[column] = value.z;
 }
 
-Transform transform_inverse(const Transform &a);
 Transform transform_transposed_inverse(const Transform &a);
 
 ccl_device_inline bool transform_uniform_scale(const Transform &tfm, float &scale)
@@ -392,39 +391,47 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
 #endif /* defined(__KERNEL_GPU_RAYTRACING__) */
 }
 
-ccl_device_inline Transform transform_quick_inverse(Transform M)
+ccl_device_inline Transform transform_inverse(const Transform tfm)
 {
-  /* possible optimization: can we avoid doing this altogether and construct
-   * the inverse matrix directly from negated translation, transposed rotation,
-   * scale can be inverted but what about shearing? */
-  Transform R;
-  float det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
-              M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
+  /* This implementation matches the one in Embree exactly, to ensure consistent
+   * results with the ray intersection of instances. */
+  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+  /* Compute determinant. */
+  float det = dot(x, cross(y, z));
+
   if (det == 0.0f) {
-    M.x.x += 1e-8f;
-    M.y.y += 1e-8f;
-    M.z.z += 1e-8f;
-    det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
-          M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
+    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+     * never be in this situation, but try to invert it anyway with tweak.
+     *
+     * This logic does not match Embree which would just give an invalid
+     * matrix. A better solution would be to remove this and ensure any object
+     * matrix is valid. */
+    x.x += 1e-8f;
+    y.y += 1e-8f;
+    z.z += 1e-8f;
+
+    det = dot(x, cross(y, z));
+    if (det == 0.0f) {
+      det = FLT_MAX;
+    }
   }
-  det = (det != 0.0f) ? 1.0f / det : 0.0f;
-
-  float3 Rx = det * make_float3(M.z.z * M.y.y - M.z.y * M.y.z,
-                                M.z.y * M.x.z - M.z.z * M.x.y,
-                                M.y.z * M.x.y - M.y.y * M.x.z);
-  float3 Ry = det * make_float3(M.z.x * M.y.z - M.z.z * M.y.x,
-                                M.z.z * M.x.x - M.z.x * M.x.z,
-                                M.y.x * M.x.z - M.y.z * M.x.x);
-  float3 Rz = det * make_float3(M.z.y * M.y.x - M.z.x * M.y.y,
-                                M.z.x * M.x.y - M.z.y * M.x.x,
-                                M.y.y * M.x.x - M.y.x * M.x.y);
-  float3 T = -make_float3(M.x.w, M.y.w, M.z.w);
-
-  R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T));
-  R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T));
-  R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T));
-
-  return R;
+
+  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+  const float3 inverse_x = cross(y, z) / det;
+  const float3 inverse_y = cross(z, x) / det;
+  const float3 inverse_z = cross(x, y) / det;
+
+  /* Compute translation and fill transform. */
+  Transform itfm;
+  itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
+  itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
+  itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
+
+  return itfm;
 }
 
 ccl_device_inline void transform_compose(ccl_private Transform *tfm,
diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h
index d71149946f7..f04dc675c84 100644
--- a/intern/cycles/util/types_float8.h
+++ b/intern/cycles/util/types_float8.h
@@ -11,11 +11,13 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
-
+#ifdef __KERNEL_GPU__
+struct float8
+#else
 struct ccl_try_align(32) float8
+#endif
 {
-#  ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
   union {
     __m256 m256;
     struct {
@@ -32,18 +34,19 @@ struct ccl_try_align(32) float8
 
   __forceinline float8 &operator=(const float8 &a);
 
-#  else  /* __KERNEL_AVX2__ */
+#else  /* __KERNEL_AVX2__ */
   float a, b, c, d, e, f, g, h;
-#  endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */
 
+#ifndef __KERNEL_GPU__
   __forceinline float operator[](int i) const;
   __forceinline float &operator[](int i);
+#endif
 };
 
 ccl_device_inline float8 make_float8(float f);
 ccl_device_inline float8
 make_float8(float a, float b, float c, float d, float e, float f, float g, float h);
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h
index 0694f5205a5..21931c55071 100644
--- a/intern/cycles/util/types_float8_impl.h
+++ b/intern/cycles/util/types_float8_impl.h
@@ -15,8 +15,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
-#  ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
 __forceinline float8::float8()
 {
 }
@@ -44,8 +43,9 @@ __forceinline float8 &float8::operator=(const float8 &f)
   m256 = f.m256;
   return *this;
 }
-#  endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */
 
+#ifndef __KERNEL_GPU__
 __forceinline float float8::operator[](int i) const
 {
   util_assert(i >= 0);
@@ -59,30 +59,29 @@ __forceinline float &float8::operator[](int i)
   util_assert(i < 8);
   return *(&a + i);
 }
+#endif
 
 ccl_device_inline float8 make_float8(float f)
 {
-#  ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
   float8 r(_mm256_set1_ps(f));
-#  else
+#else
   float8 r = {f, f, f, f, f, f, f, f};
-#  endif
+#endif
   return r;
 }
 
 ccl_device_inline float8
 make_float8(float a, float b, float c, float d, float e, float f, float g, float h)
 {
-#  ifdef __KERNEL_AVX2__
-  float8 r(_mm256_set_ps(a, b, c, d, e, f, g, h));
-#  else
+#ifdef __KERNEL_AVX2__
+  float8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#else
   float8 r = {a, b, c, d, e, f, g, h};
-#  endif
+#endif
   return r;
 }
 
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
-
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */