git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/util')
-rw-r--r--  intern/cycles/util/CMakeLists.txt          9
-rw-r--r--  intern/cycles/util/avxb.h                230
-rw-r--r--  intern/cycles/util/avxf.h                379
-rw-r--r--  intern/cycles/util/avxi.h                732
-rw-r--r--  intern/cycles/util/color.h                48
-rw-r--r--  intern/cycles/util/defines.h               1
-rw-r--r--  intern/cycles/util/half.h                 16
-rw-r--r--  intern/cycles/util/hash.h                 52
-rw-r--r--  intern/cycles/util/math.h                  4
-rw-r--r--  intern/cycles/util/math_float2.h         133
-rw-r--r--  intern/cycles/util/math_float3.h         218
-rw-r--r--  intern/cycles/util/math_float4.h         475
-rw-r--r--  intern/cycles/util/math_float8.h         483
-rw-r--r--  intern/cycles/util/math_int2.h            17
-rw-r--r--  intern/cycles/util/math_int3.h            29
-rw-r--r--  intern/cycles/util/math_int4.h           216
-rw-r--r--  intern/cycles/util/math_int8.h           355
-rw-r--r--  intern/cycles/util/math_intersect.h       11
-rw-r--r--  intern/cycles/util/path.cpp               56
-rw-r--r--  intern/cycles/util/path.h                 11
-rw-r--r--  intern/cycles/util/sseb.h                345
-rw-r--r--  intern/cycles/util/ssef.h               1090
-rw-r--r--  intern/cycles/util/ssei.h                633
-rw-r--r--  intern/cycles/util/transform.cpp           2
-rw-r--r--  intern/cycles/util/transform.h            49
-rw-r--r--  intern/cycles/util/transform_inverse.h    27
-rw-r--r--  intern/cycles/util/types.h                14
-rw-r--r--  intern/cycles/util/types_float8.h         29
-rw-r--r--  intern/cycles/util/types_float8_impl.h    63
-rw-r--r--  intern/cycles/util/types_int8.h           51
-rw-r--r--  intern/cycles/util/types_int8_impl.h      95
31 files changed, 1574 insertions, 4299 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 57628f99e35..7f8f4a5ce76 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -69,6 +69,7 @@ set(SRC_HEADERS
math_int2.h
math_int3.h
math_int4.h
+ math_int8.h
math_matrix.h
md5.h
murmurhash.h
@@ -85,13 +86,7 @@ set(SRC_HEADERS
rect.h
set.h
simd.h
- avxf.h
- avxb.h
- avxi.h
semaphore.h
- sseb.h
- ssef.h
- ssei.h
stack_allocator.h
static_assert.h
stats.h
@@ -118,6 +113,8 @@ set(SRC_HEADERS
types_int3_impl.h
types_int4.h
types_int4_impl.h
+ types_int8.h
+ types_int8_impl.h
types_spectrum.h
types_uchar2.h
types_uchar2_impl.h
diff --git a/intern/cycles/util/avxb.h b/intern/cycles/util/avxb.h
deleted file mode 100644
index fa3cb565309..00000000000
--- a/intern/cycles/util/avxb.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_AVXB_H__
-#define __UTIL_AVXB_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxf;
-
-/*! 4-wide SSE bool type. */
-struct avxb {
- typedef avxb Mask; // mask type
- typedef avxf Float; // float type
-
- enum { size = 8 }; // number of SIMD elements
- union {
- __m256 m256;
- int32_t v[8];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxb()
- {
- }
- __forceinline avxb(const avxb &other)
- {
- m256 = other.m256;
- }
- __forceinline avxb &operator=(const avxb &other)
- {
- m256 = other.m256;
- return *this;
- }
-
- __forceinline avxb(const __m256 input) : m256(input)
- {
- }
- __forceinline avxb(const __m128 &a, const __m128 &b)
- : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1))
- {
- }
- __forceinline operator const __m256 &(void) const
- {
- return m256;
- }
- __forceinline operator const __m256i(void) const
- {
- return _mm256_castps_si256(m256);
- }
- __forceinline operator const __m256d(void) const
- {
- return _mm256_castps_pd(m256);
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constants
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxb(FalseTy) : m256(_mm256_setzero_ps())
- {
- }
- __forceinline avxb(TrueTy) : m256(_mm256_castsi256_ps(_mm256_set1_epi32(-1)))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline bool operator[](const size_t i) const
- {
- assert(i < 8);
- return (_mm256_movemask_ps(m256) >> i) & 1;
- }
- __forceinline int32_t &operator[](const size_t i)
- {
- assert(i < 8);
- return v[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator!(const avxb &a)
-{
- return _mm256_xor_ps(a, avxb(True));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator&(const avxb &a, const avxb &b)
-{
- return _mm256_and_ps(a, b);
-}
-__forceinline const avxb operator|(const avxb &a, const avxb &b)
-{
- return _mm256_or_ps(a, b);
-}
-__forceinline const avxb operator^(const avxb &a, const avxb &b)
-{
- return _mm256_xor_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator&=(avxb &a, const avxb &b)
-{
- return a = a & b;
-}
-__forceinline const avxb operator|=(avxb &a, const avxb &b)
-{
- return a = a | b;
-}
-__forceinline const avxb operator^=(avxb &a, const avxb &b)
-{
- return a = a ^ b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator!=(const avxb &a, const avxb &b)
-{
- return _mm256_xor_ps(a, b);
-}
-__forceinline const avxb operator==(const avxb &a, const avxb &b)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
-#else
- __m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
- __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
- __m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
- __m128i b_hi = _mm_castps_si128(_mm256_extractf128_ps(b, 1));
- __m128i c_lo = _mm_cmpeq_epi32(a_lo, b_lo);
- __m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
- __m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);
- return _mm256_castsi256_ps(result);
-#endif
-}
-
-__forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f)
-{
-#if defined(__KERNEL_SSE41__)
- return _mm256_blendv_ps(f, t, m);
-#else
- return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb unpacklo(const avxb &a, const avxb &b)
-{
- return _mm256_unpacklo_ps(a, b);
-}
-__forceinline const avxb unpackhi(const avxb &a, const avxb &b)
-{
- return _mm256_unpackhi_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reduction Operations
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_SSE41__)
-__forceinline uint32_t popcnt(const avxb &a)
-{
- return _mm_popcnt_u32(_mm256_movemask_ps(a));
-}
-#else
-__forceinline uint32_t popcnt(const avxb &a)
-{
- return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
- bool(a[7]);
-}
-#endif
-
-__forceinline bool reduce_and(const avxb &a)
-{
- return _mm256_movemask_ps(a) == 0xf;
-}
-__forceinline bool reduce_or(const avxb &a)
-{
- return _mm256_movemask_ps(a) != 0x0;
-}
-__forceinline bool all(const avxb &b)
-{
- return _mm256_movemask_ps(b) == 0xf;
-}
-__forceinline bool any(const avxb &b)
-{
- return _mm256_movemask_ps(b) != 0x0;
-}
-__forceinline bool none(const avxb &b)
-{
- return _mm256_movemask_ps(b) == 0x0;
-}
-
-__forceinline uint32_t movemask(const avxb &a)
-{
- return _mm256_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_avxb(const char *label, const avxb &a)
-{
- printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
-}
-
-CCL_NAMESPACE_END
-
-#endif
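For reference, a minimal standalone sketch (not part of the patch) of the movemask-based reductions the deleted avxb type provided: each lane that compares all-ones contributes one bit to an 8-bit mask, so "all lanes true" corresponds to 0xff. Note that the deleted reduce_and()/all() above compare against 0xf, which only covers four lanes and looks inherited from the 4-wide SSE variant.

#include <immintrin.h>
#include <cstdio>

int main()
{
  __m256 a = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
  __m256 mask = _mm256_cmp_ps(a, _mm256_set1_ps(3.5f), _CMP_LT_OS); /* lanes 0..3 true */
  int bits = _mm256_movemask_ps(mask); /* one bit per lane -> 0x0f here */
  printf("any=%d all=%d none=%d\n", bits != 0, bits == 0xff, bits == 0);
  return 0;
}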
diff --git a/intern/cycles/util/avxf.h b/intern/cycles/util/avxf.h
deleted file mode 100644
index 03a13f30490..00000000000
--- a/intern/cycles/util/avxf.h
+++ /dev/null
@@ -1,379 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2016 Intel Corporation */
-
-#ifndef __UTIL_AVXF_H__
-#define __UTIL_AVXF_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxb;
-
-struct avxf {
- typedef avxf Float;
-
- enum { size = 8 }; /* Number of SIMD elements. */
-
- union {
- __m256 m256;
- float f[8];
- int i[8];
- };
-
- __forceinline avxf()
- {
- }
- __forceinline avxf(const avxf &other)
- {
- m256 = other.m256;
- }
- __forceinline avxf &operator=(const avxf &other)
- {
- m256 = other.m256;
- return *this;
- }
-
- __forceinline avxf(const __m256 a) : m256(a)
- {
- }
- __forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps(a))
- {
- }
-
- __forceinline operator const __m256 &() const
- {
- return m256;
- }
- __forceinline operator __m256 &()
- {
- return m256;
- }
-
- __forceinline avxf(float a) : m256(_mm256_set1_ps(a))
- {
- }
-
- __forceinline avxf(float high32x4, float low32x4)
- : m256(_mm256_set_ps(
- high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4))
- {
- }
-
- __forceinline avxf(float a3, float a2, float a1, float a0)
- : m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0))
- {
- }
-
- __forceinline avxf(
- float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0)
- : m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0))
- {
- }
-
- __forceinline avxf(float3 a) : m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x))
- {
- }
-
- __forceinline avxf(int a3, int a2, int a1, int a0)
- {
- const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
- m256 = _mm256_castsi256_ps(foo);
- }
-
- __forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
- {
- const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
- m256 = _mm256_castsi256_ps(foo);
- }
-
- __forceinline avxf(__m128 a, __m128 b)
- {
- const __m256 foo = _mm256_castps128_ps256(a);
- m256 = _mm256_insertf128_ps(foo, b, 1);
- }
-
- __forceinline const float &operator[](const size_t i) const
- {
- assert(i < 8);
- return f[i];
- }
- __forceinline float &operator[](const size_t i)
- {
- assert(i < 8);
- return f[i];
- }
-};
-
-__forceinline avxf cross(const avxf &a, const avxf &b)
-{
- avxf r(0.0,
- a[4] * b[5] - a[5] * b[4],
- a[6] * b[4] - a[4] * b[6],
- a[5] * b[6] - a[6] * b[5],
- 0.0,
- a[0] * b[1] - a[1] * b[0],
- a[2] * b[0] - a[0] * b[2],
- a[1] * b[2] - a[2] * b[1]);
- return r;
-}
-
-__forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2)
-{
- const avxf t = _mm256_mul_ps(a.m256, b.m256);
- den = ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
- den2 = ((float *)&t)[4] + ((float *)&t)[5] + ((float *)&t)[6];
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf cast(const __m256i &a)
-{
- return _mm256_castsi256_ps(a);
-}
-
-__forceinline const avxf mm256_sqrt(const avxf &a)
-{
- return _mm256_sqrt_ps(a.m256);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf operator+(const avxf &a, const avxf &b)
-{
- return _mm256_add_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator+(const avxf &a, const float &b)
-{
- return a + avxf(b);
-}
-__forceinline const avxf operator+(const float &a, const avxf &b)
-{
- return avxf(a) + b;
-}
-
-__forceinline const avxf operator-(const avxf &a, const avxf &b)
-{
- return _mm256_sub_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator-(const avxf &a, const float &b)
-{
- return a - avxf(b);
-}
-__forceinline const avxf operator-(const float &a, const avxf &b)
-{
- return avxf(a) - b;
-}
-
-__forceinline const avxf operator*(const avxf &a, const avxf &b)
-{
- return _mm256_mul_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator*(const avxf &a, const float &b)
-{
- return a * avxf(b);
-}
-__forceinline const avxf operator*(const float &a, const avxf &b)
-{
- return avxf(a) * b;
-}
-
-__forceinline const avxf operator/(const avxf &a, const avxf &b)
-{
- return _mm256_div_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator/(const avxf &a, const float &b)
-{
- return a / avxf(b);
-}
-__forceinline const avxf operator/(const float &a, const avxf &b)
-{
- return avxf(a) / b;
-}
-
-__forceinline const avxf operator|(const avxf &a, const avxf &b)
-{
- return _mm256_or_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf operator^(const avxf &a, const avxf &b)
-{
- return _mm256_xor_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf operator&(const avxf &a, const avxf &b)
-{
- return _mm256_and_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf max(const avxf &a, const avxf &b)
-{
- return _mm256_max_ps(a.m256, b.m256);
-}
-__forceinline const avxf min(const avxf &a, const avxf &b)
-{
- return _mm256_min_ps(a.m256, b.m256);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf shuffle(const avxf &a, const __m256i &shuf)
-{
- return _mm256_permutevar_ps(a, shuf);
-}
-
-template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-__forceinline const avxf shuffle(const avxf &a)
-{
- return _mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxf shuffle(const avxf &a, const avxf &b)
-{
- return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-}
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxf shuffle(const avxf &a)
-{
- return shuffle<i0, i1, i2, i3>(a, a);
-}
-template<size_t i0> __forceinline const avxf shuffle(const avxf &a, const avxf &b)
-{
- return shuffle<i0, i0, i0, i0>(a, b);
-}
-template<size_t i0> __forceinline const avxf shuffle(const avxf &a)
-{
- return shuffle<i0>(a, a);
-}
-
-template<size_t i> __forceinline float extract(const avxf &a)
-{
- __m256 b = shuffle<i, i, i, i>(a).m256;
- return _mm256_cvtss_f32(b);
-}
-template<> __forceinline float extract<0>(const avxf &a)
-{
- return _mm256_cvtss_f32(a.m256);
-}
-
-__forceinline ssef low(const avxf &a)
-{
- return _mm256_extractf128_ps(a.m256, 0);
-}
-__forceinline ssef high(const avxf &a)
-{
- return _mm256_extractf128_ps(a.m256, 1);
-}
-
-template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-__forceinline const avxf permute(const avxf &a)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-#else
- float temp[8];
- _mm256_storeu_ps((float *)&temp, a);
- return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
-#endif
-}
-
-template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
-ccl_device_inline const avxf set_sign_bit(const avxf &a)
-{
- return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31, S2 << 31, S1 << 31, S0 << 31);
-}
-
-template<size_t S0, size_t S1, size_t S2, size_t S3, size_t S4, size_t S5, size_t S6, size_t S7>
-ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
-{
- return _mm256_blend_ps(
- a, b, S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
-}
-
-template<size_t S0, size_t S1, size_t S2, size_t S3>
-ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
-{
- return blend<S0, S1, S2, S3, S0, S1, S2, S3>(a, b);
-}
-
-//#if defined(__KERNEL_SSE41__)
-__forceinline avxf maxi(const avxf &a, const avxf &b)
-{
- const avxf ci = _mm256_max_ps(a, b);
- return ci;
-}
-
-__forceinline avxf mini(const avxf &a, const avxf &b)
-{
- const avxf ci = _mm256_min_ps(a, b);
- return ci;
-}
-//#endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Ternary Operators
-////////////////////////////////////////////////////////////////////////////////
-__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_fmadd_ps(a, b, c);
-#else
- return c + (a * b);
-#endif
-}
-
-__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_fnmadd_ps(a, b, c);
-#else
- return c - (a * b);
-#endif
-}
-__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_fmsub_ps(a, b, c);
-#else
- return (a * b) - c;
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-__forceinline const avxb operator<=(const avxf &a, const avxf &b)
-{
- return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
-}
-
-__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f)
-{
- return _mm256_blendv_ps(f, t, m);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Common Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t)
-{
- return madd(t, b, (avxf(1.0f) - t) * a);
-}
-
-#ifndef _mm256_set_m128
-# define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
- _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
-#endif
-
-#define _mm256_loadu2_m128(/* float const* */ hiaddr, /* float const* */ loaddr) \
- _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
-
-CCL_NAMESPACE_END
-
-#endif
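As a side note, a minimal sketch (not part of the patch) of what the madd()/msub() helpers above express: with AVX2/FMA the multiply and add fuse into a single instruction with one rounding step, otherwise they fall back to a separate multiply followed by an add.

#include <immintrin.h>

/* a * b + c with a single rounding step (requires FMA). */
static inline __m256 madd_fma(__m256 a, __m256 b, __m256 c)
{
  return _mm256_fmadd_ps(a, b, c);
}

/* Fallback when FMA is unavailable: two roundings, same result up to precision. */
static inline __m256 madd_fallback(__m256 a, __m256 b, __m256 c)
{
  return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}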
diff --git a/intern/cycles/util/avxi.h b/intern/cycles/util/avxi.h
deleted file mode 100644
index 966a04a6b97..00000000000
--- a/intern/cycles/util/avxi.h
+++ /dev/null
@@ -1,732 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2009-2013 Intel Corporation */
-
-#ifndef __UTIL_AVXI_H__
-#define __UTIL_AVXI_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxb;
-
-struct avxi {
- typedef avxb Mask; // mask type for us
- enum { size = 8 }; // number of SIMD elements
- union { // data
- __m256i m256;
-#if !defined(__KERNEL_AVX2__)
- struct {
- __m128i l, h;
- };
-#endif
- int32_t v[8];
- };
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxi()
- {
- }
- __forceinline avxi(const avxi &a)
- {
- m256 = a.m256;
- }
- __forceinline avxi &operator=(const avxi &a)
- {
- m256 = a.m256;
- return *this;
- }
-
- __forceinline avxi(const __m256i a) : m256(a)
- {
- }
- __forceinline operator const __m256i &(void) const
- {
- return m256;
- }
- __forceinline operator __m256i &(void)
- {
- return m256;
- }
-
- __forceinline explicit avxi(const ssei &a)
- : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1))
- {
- }
- __forceinline avxi(const ssei &a, const ssei &b)
- : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
- {
- }
-#if defined(__KERNEL_AVX2__)
- __forceinline avxi(const __m128i &a, const __m128i &b)
- : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
- {
- }
-#else
- __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b)
- {
- }
-#endif
- __forceinline explicit avxi(const int32_t *const a)
- : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a)))
- {
- }
- __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a))
- {
- }
- __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a))
- {
- }
- __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d)
- : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a))
- {
- }
- __forceinline avxi(
- int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h)
- : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a))
- {
- }
-
- __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constants
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256())
- {
- }
-#if defined(__KERNEL_AVX2__)
- __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1))
- {
- }
- __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf))
- {
- }
- __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf))
- {
- }
-#else
- __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1))
- {
- }
- __forceinline avxi(PosInfTy)
- : m256(_mm256_set_epi32(
- pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf))
- {
- }
- __forceinline avxi(NegInfTy)
- : m256(_mm256_set_epi32(
- neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf))
- {
- }
-#endif
- __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline const int32_t &operator[](const size_t i) const
- {
- assert(i < 8);
- return v[i];
- }
- __forceinline int32_t &operator[](const size_t i)
- {
- assert(i < 8);
- return v[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxi cast(const __m256 &a)
-{
- return _mm256_castps_si256(a);
-}
-__forceinline const avxi operator+(const avxi &a)
-{
- return a;
-}
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator-(const avxi &a)
-{
- return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256);
-}
-__forceinline const avxi abs(const avxi &a)
-{
- return _mm256_abs_epi32(a.m256);
-}
-#else
-__forceinline const avxi operator-(const avxi &a)
-{
- return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h));
-}
-__forceinline const avxi abs(const avxi &a)
-{
- return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h));
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator+(const avxi &a, const avxi &b)
-{
- return _mm256_add_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator+(const avxi &a, const avxi &b)
-{
- return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator+(const avxi &a, const int32_t b)
-{
- return a + avxi(b);
-}
-__forceinline const avxi operator+(const int32_t a, const avxi &b)
-{
- return avxi(a) + b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator-(const avxi &a, const avxi &b)
-{
- return _mm256_sub_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator-(const avxi &a, const avxi &b)
-{
- return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator-(const avxi &a, const int32_t b)
-{
- return a - avxi(b);
-}
-__forceinline const avxi operator-(const int32_t a, const avxi &b)
-{
- return avxi(a) - b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator*(const avxi &a, const avxi &b)
-{
- return _mm256_mullo_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator*(const avxi &a, const avxi &b)
-{
- return avxi(_mm_mullo_epi32(a.l, b.l), _mm_mullo_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator*(const avxi &a, const int32_t b)
-{
- return a * avxi(b);
-}
-__forceinline const avxi operator*(const int32_t a, const avxi &b)
-{
- return avxi(a) * b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator&(const avxi &a, const avxi &b)
-{
- return _mm256_and_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator&(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator&(const avxi &a, const int32_t b)
-{
- return a & avxi(b);
-}
-__forceinline const avxi operator&(const int32_t a, const avxi &b)
-{
- return avxi(a) & b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator|(const avxi &a, const avxi &b)
-{
- return _mm256_or_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator|(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator|(const avxi &a, const int32_t b)
-{
- return a | avxi(b);
-}
-__forceinline const avxi operator|(const int32_t a, const avxi &b)
-{
- return avxi(a) | b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator^(const avxi &a, const avxi &b)
-{
- return _mm256_xor_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator^(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator^(const avxi &a, const int32_t b)
-{
- return a ^ avxi(b);
-}
-__forceinline const avxi operator^(const int32_t a, const avxi &b)
-{
- return avxi(a) ^ b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator<<(const avxi &a, const int32_t n)
-{
- return _mm256_slli_epi32(a.m256, n);
-}
-__forceinline const avxi operator>>(const avxi &a, const int32_t n)
-{
- return _mm256_srai_epi32(a.m256, n);
-}
-
-__forceinline const avxi sra(const avxi &a, const int32_t b)
-{
- return _mm256_srai_epi32(a.m256, b);
-}
-__forceinline const avxi srl(const avxi &a, const int32_t b)
-{
- return _mm256_srli_epi32(a.m256, b);
-}
-#else
-__forceinline const avxi operator<<(const avxi &a, const int32_t n)
-{
- return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n));
-}
-__forceinline const avxi operator>>(const avxi &a, const int32_t n)
-{
- return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n));
-}
-
-__forceinline const avxi sra(const avxi &a, const int32_t b)
-{
- return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b));
-}
-__forceinline const avxi srl(const avxi &a, const int32_t b)
-{
- return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b));
-}
-#endif
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi min(const avxi &a, const avxi &b)
-{
- return _mm256_min_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi min(const avxi &a, const avxi &b)
-{
- return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi min(const avxi &a, const int32_t b)
-{
- return min(a, avxi(b));
-}
-__forceinline const avxi min(const int32_t a, const avxi &b)
-{
- return min(avxi(a), b);
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi max(const avxi &a, const avxi &b)
-{
- return _mm256_max_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi max(const avxi &a, const avxi &b)
-{
- return avxi(_mm_max_epi32(a.l, b.l), _mm_max_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi max(const avxi &a, const int32_t b)
-{
- return max(a, avxi(b));
-}
-__forceinline const avxi max(const int32_t a, const avxi &b)
-{
- return max(avxi(a), b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline avxi &operator+=(avxi &a, const avxi &b)
-{
- return a = a + b;
-}
-__forceinline avxi &operator+=(avxi &a, const int32_t b)
-{
- return a = a + b;
-}
-
-__forceinline avxi &operator-=(avxi &a, const avxi &b)
-{
- return a = a - b;
-}
-__forceinline avxi &operator-=(avxi &a, const int32_t b)
-{
- return a = a - b;
-}
-
-__forceinline avxi &operator*=(avxi &a, const avxi &b)
-{
- return a = a * b;
-}
-__forceinline avxi &operator*=(avxi &a, const int32_t b)
-{
- return a = a * b;
-}
-
-__forceinline avxi &operator&=(avxi &a, const avxi &b)
-{
- return a = a & b;
-}
-__forceinline avxi &operator&=(avxi &a, const int32_t b)
-{
- return a = a & b;
-}
-
-__forceinline avxi &operator|=(avxi &a, const avxi &b)
-{
- return a = a | b;
-}
-__forceinline avxi &operator|=(avxi &a, const int32_t b)
-{
- return a = a | b;
-}
-
-__forceinline avxi &operator^=(avxi &a, const avxi &b)
-{
- return a = a ^ b;
-}
-__forceinline avxi &operator^=(avxi &a, const int32_t b)
-{
- return a = a ^ b;
-}
-
-__forceinline avxi &operator<<=(avxi &a, const int32_t b)
-{
- return a = a << b;
-}
-__forceinline avxi &operator>>=(avxi &a, const int32_t b)
-{
- return a = a >> b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator==(const avxi &a, const avxi &b)
-{
- return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256));
-}
-#else
-__forceinline const avxb operator==(const avxi &a, const avxi &b)
-{
- return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)),
- _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator==(const avxi &a, const int32_t b)
-{
- return a == avxi(b);
-}
-__forceinline const avxb operator==(const int32_t a, const avxi &b)
-{
- return avxi(a) == b;
-}
-
-__forceinline const avxb operator!=(const avxi &a, const avxi &b)
-{
- return !(a == b);
-}
-__forceinline const avxb operator!=(const avxi &a, const int32_t b)
-{
- return a != avxi(b);
-}
-__forceinline const avxb operator!=(const int32_t a, const avxi &b)
-{
- return avxi(a) != b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator<(const avxi &a, const avxi &b)
-{
- return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256));
-}
-#else
-__forceinline const avxb operator<(const avxi &a, const avxi &b)
-{
- return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)),
- _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator<(const avxi &a, const int32_t b)
-{
- return a < avxi(b);
-}
-__forceinline const avxb operator<(const int32_t a, const avxi &b)
-{
- return avxi(a) < b;
-}
-
-__forceinline const avxb operator>=(const avxi &a, const avxi &b)
-{
- return !(a < b);
-}
-__forceinline const avxb operator>=(const avxi &a, const int32_t b)
-{
- return a >= avxi(b);
-}
-__forceinline const avxb operator>=(const int32_t a, const avxi &b)
-{
- return avxi(a) >= b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator>(const avxi &a, const avxi &b)
-{
- return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256));
-}
-#else
-__forceinline const avxb operator>(const avxi &a, const avxi &b)
-{
- return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)),
- _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator>(const avxi &a, const int32_t b)
-{
- return a > avxi(b);
-}
-__forceinline const avxb operator>(const int32_t a, const avxi &b)
-{
- return avxi(a) > b;
-}
-
-__forceinline const avxb operator<=(const avxi &a, const avxi &b)
-{
- return !(a > b);
-}
-__forceinline const avxb operator<=(const avxi &a, const int32_t b)
-{
- return a <= avxi(b);
-}
-__forceinline const avxb operator<=(const int32_t a, const avxi &b)
-{
- return avxi(a) <= b;
-}
-
-__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f)
-{
- return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline avxi unpacklo(const avxi &a, const avxi &b)
-{
- return _mm256_unpacklo_epi32(a.m256, b.m256);
-}
-__forceinline avxi unpackhi(const avxi &a, const avxi &b)
-{
- return _mm256_unpackhi_epi32(a.m256, b.m256);
-}
-#else
-__forceinline avxi unpacklo(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-__forceinline avxi unpackhi(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-
-template<size_t i> __forceinline const avxi shuffle(const avxi &a)
-{
- return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i)));
-}
-
-template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a)
-{
- return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0));
-}
-
-template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b)
-{
- return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxi shuffle(const avxi &a)
-{
- return _mm256_castps_si256(
- _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0)));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxi shuffle(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_shuffle_ps(
- _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-}
-
-template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b)
-{
- return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b)));
-}
-template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b)
-{
- return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b)));
-}
-template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b)
-{
- return _mm256_castps_si256(
- _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b)))));
-}
-
-__forceinline const avxi broadcast(const int *ptr)
-{
- return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr));
-}
-template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b)
-{
- return _mm256_insertf128_si256(a, b, i);
-}
-template<size_t i> __forceinline const ssei extract(const avxi &a)
-{
- return _mm256_extractf128_si256(a, i);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxi vreduce_min2(const avxi &v)
-{
- return min(v, shuffle<1, 0, 3, 2>(v));
-}
-__forceinline const avxi vreduce_min4(const avxi &v)
-{
- avxi v1 = vreduce_min2(v);
- return min(v1, shuffle<2, 3, 0, 1>(v1));
-}
-__forceinline const avxi vreduce_min(const avxi &v)
-{
- avxi v1 = vreduce_min4(v);
- return min(v1, shuffle<1, 0>(v1));
-}
-
-__forceinline const avxi vreduce_max2(const avxi &v)
-{
- return max(v, shuffle<1, 0, 3, 2>(v));
-}
-__forceinline const avxi vreduce_max4(const avxi &v)
-{
- avxi v1 = vreduce_max2(v);
- return max(v1, shuffle<2, 3, 0, 1>(v1));
-}
-__forceinline const avxi vreduce_max(const avxi &v)
-{
- avxi v1 = vreduce_max4(v);
- return max(v1, shuffle<1, 0>(v1));
-}
-
-__forceinline const avxi vreduce_add2(const avxi &v)
-{
- return v + shuffle<1, 0, 3, 2>(v);
-}
-__forceinline const avxi vreduce_add4(const avxi &v)
-{
- avxi v1 = vreduce_add2(v);
- return v1 + shuffle<2, 3, 0, 1>(v1);
-}
-__forceinline const avxi vreduce_add(const avxi &v)
-{
- avxi v1 = vreduce_add4(v);
- return v1 + shuffle<1, 0>(v1);
-}
-
-__forceinline int reduce_min(const avxi &v)
-{
- return extract<0>(extract<0>(vreduce_min(v)));
-}
-__forceinline int reduce_max(const avxi &v)
-{
- return extract<0>(extract<0>(vreduce_max(v)));
-}
-__forceinline int reduce_add(const avxi &v)
-{
- return extract<0>(extract<0>(vreduce_add(v)));
-}
-
-__forceinline uint32_t select_min(const avxi &v)
-{
- return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const avxi &v)
-{
- return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const avxb &valid, const avxi &v)
-{
- const avxi a = select(valid, v, avxi(pos_inf));
- return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const avxb &valid, const avxi &v)
-{
- const avxi a = select(valid, v, avxi(neg_inf));
- return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Output Operators
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_avxi(const char *label, const avxi &a)
-{
- printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
-}
-
-CCL_NAMESPACE_END
-
-#endif
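For context, a minimal sketch (not part of the patch) of the shuffle-and-fold idea behind the vreduce_min()/reduce_min() helpers above: each step folds lanes onto each other, so after log2(8) = 3 steps every lane holds the minimum and lane 0 can be extracted. This sketch assumes AVX2; the deleted code also carried a non-AVX2 fallback built from 128-bit halves.

#include <immintrin.h>

static inline int reduce_min_avx2(__m256i v)
{
  /* Fold neighbouring lanes within each 128-bit half. */
  __m256i t = _mm256_min_epi32(v, _mm256_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
  t = _mm256_min_epi32(t, _mm256_shuffle_epi32(t, _MM_SHUFFLE(1, 0, 3, 2)));
  /* Fold the two 128-bit halves onto each other. */
  t = _mm256_min_epi32(t, _mm256_permute2x128_si256(t, t, 0x01));
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(t));
}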
diff --git a/intern/cycles/util/color.h b/intern/cycles/util/color.h
index 537f8ab6771..93e984120f2 100644
--- a/intern/cycles/util/color.h
+++ b/intern/cycles/util/color.h
@@ -228,28 +228,27 @@ ccl_device float3 xyY_to_xyz(float x, float y, float Y)
* exp = exponent, encoded as uint32_t
* e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
*/
-template<unsigned exp, unsigned e2coeff> ccl_device_inline ssef fastpow(const ssef &arg)
+template<unsigned exp, unsigned e2coeff> ccl_device_inline float4 fastpow(const float4 &arg)
{
- ssef ret;
- ret = arg * cast(ssei(e2coeff));
- ret = ssef(cast(ret));
- ret = ret * cast(ssei(exp));
- ret = cast(ssei(ret));
+ float4 ret = arg * cast(make_int4(e2coeff));
+ ret = make_float4(cast(ret));
+ ret = ret * cast(make_int4(exp));
+ ret = cast(make_int4(ret));
return ret;
}
/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
-ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x)
+ccl_device_inline float4 improve_5throot_solution(const float4 &old_result, const float4 &x)
{
- ssef approx2 = old_result * old_result;
- ssef approx4 = approx2 * approx2;
- ssef t = x / approx4;
- ssef summ = madd(ssef(4.0f), old_result, t);
- return summ * ssef(1.0f / 5.0f);
+ float4 approx2 = old_result * old_result;
+ float4 approx4 = approx2 * approx2;
+ float4 t = x / approx4;
+ float4 summ = madd(make_float4(4.0f), old_result, t);
+ return summ * make_float4(1.0f / 5.0f);
}
/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
-ccl_device_inline ssef fastpow24(const ssef &arg)
+ccl_device_inline float4 fastpow24(const float4 &arg)
{
/* max, avg and |avg| errors were calculated in gcc without FMA instructions
* The final precision should be better than powf in glibc */
@@ -257,9 +256,10 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
/* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */
/* 0x3F4CCCCD = 4/5 */
/* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
- ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
- ssef arg2 = arg * arg;
- ssef arg4 = arg2 * arg2;
+ float4 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(
+ arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
+ float4 arg2 = arg * arg;
+ float4 arg4 = arg2 * arg2;
/* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */
x = improve_5throot_solution(x, arg4);
@@ -271,12 +271,12 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
return x * (x * x);
}
-ccl_device ssef color_srgb_to_linear(const ssef &c)
+ccl_device float4 color_srgb_to_linear(const float4 &c)
{
- sseb cmp = c < ssef(0.04045f);
- ssef lt = max(c * ssef(1.0f / 12.92f), ssef(0.0f));
- ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f / 1.055f); /* fma */
- ssef gte = fastpow24(gtebase);
+ int4 cmp = c < make_float4(0.04045f);
+ float4 lt = max(c * make_float4(1.0f / 12.92f), make_float4(0.0f));
+ float4 gtebase = (c + make_float4(0.055f)) * make_float4(1.0f / 1.055f); /* fma */
+ float4 gte = fastpow24(gtebase);
return select(cmp, lt, gte);
}
#endif /* __KERNEL_SSE2__ */
@@ -302,10 +302,8 @@ ccl_device float4 color_linear_to_srgb_v4(float4 c)
ccl_device float4 color_srgb_to_linear_v4(float4 c)
{
#ifdef __KERNEL_SSE2__
- ssef r_ssef;
- float4 &r = (float4 &)r_ssef;
- r = c;
- r_ssef = color_srgb_to_linear(r_ssef);
+ float4 r = c;
+ r = color_srgb_to_linear(r);
r.w = c.w;
return r;
#else
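For reference, a minimal scalar sketch (not part of the patch) of the conversion the SIMD path above approximates with fastpow24(): values below 0.04045 use the linear segment of the sRGB curve, everything else the 2.4 power curve.

#include <cmath>

static inline float srgb_to_linear_scalar(float c)
{
  return (c < 0.04045f) ? c * (1.0f / 12.92f)
                        : powf((c + 0.055f) * (1.0f / 1.055f), 2.4f);
}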
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index 1969529eff0..d5be14c8eba 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -23,6 +23,7 @@
/* Leave inlining decisions to compiler for these, the inline keyword here
* is not about performance but including function definitions in headers. */
# define ccl_device static inline
+# define ccl_device_extern extern "C"
# define ccl_device_noinline static inline
# define ccl_device_noinline_cpu ccl_device_noinline
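A small sketch (not part of the patch) of why the comment above stresses that inline is about header inclusion rather than performance: a function defined in a header that is included from several translation units needs static inline to avoid multiple-definition errors, which is what ccl_device expands to on the CPU; ccl_device_extern presumably marks declarations that additionally need C linkage. The header name and functions below are hypothetical.

/* util_example.h -- hypothetical header, safe to include from many .cpp files. */
#define ccl_device static inline
#define ccl_device_extern extern "C"

ccl_device float sqr(float x)
{
  return x * x; /* definition lives in the header, no multiple-definition clash */
}

ccl_device_extern void external_callback(int value); /* C-linkage declaration */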
diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h
index c668638eb02..5665dd4c075 100644
--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -154,17 +154,17 @@ ccl_device_inline half float_to_half_display(const float f)
ccl_device_inline half4 float4_to_half4_display(const float4 f)
{
-#ifdef __KERNEL_SSE2__
+#ifdef __KERNEL_SSE__
/* CPU: SSE and AVX. */
- ssef x = min(max(load4f(f), 0.0f), 65504.0f);
+ float4 x = min(max(f, make_float4(0.0f)), make_float4(65504.0f));
# ifdef __KERNEL_AVX2__
- ssei rpack = _mm_cvtps_ph(x, 0);
+ int4 rpack = int4(_mm_cvtps_ph(x, 0));
# else
- ssei absolute = cast(x) & 0x7FFFFFFF;
- ssei Z = absolute + 0xC8000000;
- ssei result = andnot(absolute < 0x38800000, Z);
- ssei rshift = (result >> 13) & 0x7FFF;
- ssei rpack = _mm_packs_epi32(rshift, rshift);
+ int4 absolute = cast(x) & make_int4(0x7FFFFFFF);
+ int4 Z = absolute + make_int4(0xC8000000);
+ int4 result = andnot(absolute < make_int4(0x38800000), Z);
+ int4 rshift = (result >> 13) & make_int4(0x7FFF);
+ int4 rpack = int4(_mm_packs_epi32(rshift, rshift));
# endif
half4 h;
_mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack));
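For reference, a minimal sketch (not part of the patch) of the F16C path the AVX2 branch above relies on: _mm_cvtps_ph converts four floats to four halfs packed into the low 64 bits of the result, and 65504.0f is the largest finite half value, which is why the display conversion clamps to it first.

#include <immintrin.h>
#include <cstdint>

static inline void float4_to_half4_f16c(const float f[4], uint16_t h[4])
{
  __m128 x = _mm_min_ps(_mm_max_ps(_mm_loadu_ps(f), _mm_setzero_ps()),
                        _mm_set1_ps(65504.0f));
  __m128i packed = _mm_cvtps_ph(x, 0); /* round to nearest, 4 halfs in low 64 bits */
  _mm_storel_epi64((__m128i *)h, packed);
}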
diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h
index 4f83f331229..74210ff020e 100644
--- a/intern/cycles/util/hash.h
+++ b/intern/cycles/util/hash.h
@@ -222,7 +222,7 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)
/* SSE Versions Of Jenkins Lookup3 Hash Functions */
-#ifdef __KERNEL_SSE2__
+#ifdef __KERNEL_SSE__
# define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k))))
# define mix(a, b, c) \
@@ -265,10 +265,10 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)
c -= rot(b, 24); \
}
-ccl_device_inline ssei hash_ssei(ssei kx)
+ccl_device_inline int4 hash_int4(int4 kx)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (1 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (1 << 2) + 13);
a += kx;
final(a, b, c);
@@ -276,10 +276,10 @@ ccl_device_inline ssei hash_ssei(ssei kx)
return c;
}
-ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
+ccl_device_inline int4 hash_int4_2(int4 kx, int4 ky)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (2 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (2 << 2) + 13);
b += ky;
a += kx;
@@ -288,10 +288,10 @@ ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
return c;
}
-ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
+ccl_device_inline int4 hash_int4_3(int4 kx, int4 ky, int4 kz)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (3 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (3 << 2) + 13);
c += kz;
b += ky;
@@ -301,10 +301,10 @@ ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
return c;
}
-ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
+ccl_device_inline int4 hash_int4_4(int4 kx, int4 ky, int4 kz, int4 kw)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (4 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (4 << 2) + 13);
a += kx;
b += ky;
@@ -317,11 +317,11 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
return c;
}
-# if defined(__KERNEL_AVX__)
-ccl_device_inline avxi hash_avxi(avxi kx)
+# if defined(__KERNEL_AVX2__)
+ccl_device_inline vint8 hash_int8(vint8 kx)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (1 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (1 << 2) + 13);
a += kx;
final(a, b, c);
@@ -329,10 +329,10 @@ ccl_device_inline avxi hash_avxi(avxi kx)
return c;
}
-ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
+ccl_device_inline vint8 hash_int8_2(vint8 kx, vint8 ky)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (2 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (2 << 2) + 13);
b += ky;
a += kx;
@@ -341,10 +341,10 @@ ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
return c;
}
-ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
+ccl_device_inline vint8 hash_int8_3(vint8 kx, vint8 ky, vint8 kz)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (3 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (3 << 2) + 13);
c += kz;
b += ky;
@@ -354,10 +354,10 @@ ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
return c;
}
-ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
+ccl_device_inline vint8 hash_int8_4(vint8 kx, vint8 ky, vint8 kz, vint8 kw)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (4 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (4 << 2) + 13);
a += kx;
b += ky;
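For context, a minimal scalar sketch (not part of the patch) of the Jenkins lookup3 mix that the hash_int4_* and hash_int8_* variants above apply once per SIMD lane; the per-arity seed 0xdeadbeef + (N << 2) + 13 matches the pattern visible in the diff.

#include <cstdint>

static inline uint32_t rot32(uint32_t x, uint32_t k)
{
  return (x << k) | (x >> (32 - k));
}

static inline uint32_t hash_uint2_scalar(uint32_t kx, uint32_t ky)
{
  uint32_t a, b, c;
  a = b = c = 0xdeadbeef + (2 << 2) + 13; /* seed for two keys */
  b += ky;
  a += kx;
  /* lookup3 final mix */
  c ^= b; c -= rot32(b, 14);
  a ^= c; a -= rot32(c, 11);
  b ^= a; b -= rot32(a, 25);
  c ^= b; c -= rot32(b, 16);
  a ^= c; a -= rot32(c, 4);
  b ^= a; b -= rot32(a, 14);
  c ^= b; c -= rot32(b, 24);
  return c;
}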
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 3a2e0e074a2..0fbe7a67a4f 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -532,12 +532,14 @@ CCL_NAMESPACE_END
#include "util/math_int2.h"
#include "util/math_int3.h"
#include "util/math_int4.h"
+#include "util/math_int8.h"
#include "util/math_float2.h"
-#include "util/math_float3.h"
#include "util/math_float4.h"
#include "util/math_float8.h"
+#include "util/math_float3.h"
+
#include "util/rect.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h
index 542dad93467..ad806d0f08a 100644
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -10,55 +10,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float2 operator-(const float2 &a);
-ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator*(const float2 &a, float f);
-ccl_device_inline float2 operator*(float f, const float2 &a);
-ccl_device_inline float2 operator/(float f, const float2 &a);
-ccl_device_inline float2 operator/(const float2 &a, float f);
-ccl_device_inline float2 operator/(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator+(const float2 &a, const float f);
-ccl_device_inline float2 operator+(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator-(const float2 &a, const float f);
-ccl_device_inline float2 operator-(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator+=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator*=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator*=(float2 &a, float f);
-ccl_device_inline float2 operator/=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator/=(float2 &a, float f);
-
-ccl_device_inline bool operator==(const float2 &a, const float2 &b);
-ccl_device_inline bool operator!=(const float2 &a, const float2 &b);
-
-ccl_device_inline bool is_zero(const float2 &a);
-ccl_device_inline float average(const float2 &a);
-ccl_device_inline float distance(const float2 &a, const float2 &b);
-ccl_device_inline float dot(const float2 &a, const float2 &b);
-ccl_device_inline float cross(const float2 &a, const float2 &b);
-ccl_device_inline float len(const float2 a);
-ccl_device_inline float2 normalize(const float2 &a);
-ccl_device_inline float2 normalize_len(const float2 &a, float *t);
-ccl_device_inline float2 safe_normalize(const float2 &a);
-ccl_device_inline float2 min(const float2 &a, const float2 &b);
-ccl_device_inline float2 max(const float2 &a, const float2 &b);
-ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx);
-ccl_device_inline float2 fabs(const float2 &a);
-ccl_device_inline float2 as_float2(const float4 &a);
-ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
-ccl_device_inline float2 floor(const float2 &a);
-#endif /* !__KERNEL_METAL__ */
-
-ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float2 zero_float2()
{
return make_float2(0.0f, 0.0f);
@@ -75,63 +26,63 @@ ccl_device_inline float2 operator-(const float2 &a)
return make_float2(-a.x, -a.y);
}
-ccl_device_inline float2 operator*(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator*(const float2 a, const float2 b)
{
return make_float2(a.x * b.x, a.y * b.y);
}
-ccl_device_inline float2 operator*(const float2 &a, float f)
+ccl_device_inline float2 operator*(const float2 a, float f)
{
return make_float2(a.x * f, a.y * f);
}
-ccl_device_inline float2 operator*(float f, const float2 &a)
+ccl_device_inline float2 operator*(float f, const float2 a)
{
return make_float2(a.x * f, a.y * f);
}
-ccl_device_inline float2 operator/(float f, const float2 &a)
+ccl_device_inline float2 operator/(float f, const float2 a)
{
return make_float2(f / a.x, f / a.y);
}
-ccl_device_inline float2 operator/(const float2 &a, float f)
+ccl_device_inline float2 operator/(const float2 a, float f)
{
float invf = 1.0f / f;
return make_float2(a.x * invf, a.y * invf);
}
-ccl_device_inline float2 operator/(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator/(const float2 a, const float2 b)
{
return make_float2(a.x / b.x, a.y / b.y);
}
-ccl_device_inline float2 operator+(const float2 &a, const float f)
+ccl_device_inline float2 operator+(const float2 a, const float2 b)
{
- return a + make_float2(f, f);
+ return make_float2(a.x + b.x, a.y + b.y);
}
-ccl_device_inline float2 operator+(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator+(const float2 a, const float f)
{
- return make_float2(a.x + b.x, a.y + b.y);
+ return a + make_float2(f, f);
}
-ccl_device_inline float2 operator-(const float2 &a, const float f)
+ccl_device_inline float2 operator-(const float2 a, const float2 b)
{
- return a - make_float2(f, f);
+ return make_float2(a.x - b.x, a.y - b.y);
}
-ccl_device_inline float2 operator-(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator-(const float2 a, const float f)
{
- return make_float2(a.x - b.x, a.y - b.y);
+ return a - make_float2(f, f);
}
-ccl_device_inline float2 operator+=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator+=(float2 &a, const float2 b)
{
return a = a + b;
}
-ccl_device_inline float2 operator*=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator*=(float2 &a, const float2 b)
{
return a = a * b;
}
@@ -141,7 +92,7 @@ ccl_device_inline float2 operator*=(float2 &a, float f)
return a = a * f;
}
-ccl_device_inline float2 operator/=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator/=(float2 &a, const float2 b)
{
return a = a / b;
}
@@ -152,74 +103,81 @@ ccl_device_inline float2 operator/=(float2 &a, float f)
return a = a * invf;
}
-ccl_device_inline bool operator==(const float2 &a, const float2 &b)
+ccl_device_inline bool operator==(const float2 a, const float2 b)
{
return (a.x == b.x && a.y == b.y);
}
-ccl_device_inline bool operator!=(const float2 &a, const float2 &b)
+ccl_device_inline bool operator!=(const float2 a, const float2 b)
{
return !(a == b);
}
-ccl_device_inline bool is_zero(const float2 &a)
+ccl_device_inline bool is_zero(const float2 a)
{
return (a.x == 0.0f && a.y == 0.0f);
}
-ccl_device_inline float average(const float2 &a)
+ccl_device_inline float average(const float2 a)
{
return (a.x + a.y) * (1.0f / 2.0f);
}
-ccl_device_inline float distance(const float2 &a, const float2 &b)
+ccl_device_inline float dot(const float2 a, const float2 b)
{
- return len(a - b);
+ return a.x * b.x + a.y * b.y;
}
+#endif
-ccl_device_inline float dot(const float2 &a, const float2 &b)
+ccl_device_inline float len(const float2 a)
{
- return a.x * b.x + a.y * b.y;
+ return sqrtf(dot(a, a));
}
-ccl_device_inline float cross(const float2 &a, const float2 &b)
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float distance(const float2 a, const float2 b)
+{
+ return len(a - b);
+}
+
+ccl_device_inline float cross(const float2 a, const float2 b)
{
return (a.x * b.y - a.y * b.x);
}
-ccl_device_inline float2 normalize(const float2 &a)
+ccl_device_inline float2 normalize(const float2 a)
{
return a / len(a);
}
-ccl_device_inline float2 normalize_len(const float2 &a, ccl_private float *t)
+ccl_device_inline float2 normalize_len(const float2 a, ccl_private float *t)
{
*t = len(a);
return a / (*t);
}
-ccl_device_inline float2 safe_normalize(const float2 &a)
+ccl_device_inline float2 safe_normalize(const float2 a)
{
float t = len(a);
return (t != 0.0f) ? a / t : a;
}
-ccl_device_inline float2 min(const float2 &a, const float2 &b)
+ccl_device_inline float2 min(const float2 a, const float2 b)
{
return make_float2(min(a.x, b.x), min(a.y, b.y));
}
-ccl_device_inline float2 max(const float2 &a, const float2 &b)
+ccl_device_inline float2 max(const float2 a, const float2 b)
{
return make_float2(max(a.x, b.x), max(a.y, b.y));
}
-ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx)
+ccl_device_inline float2 clamp(const float2 a, const float2 mn, const float2 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float2 fabs(const float2 &a)
+ccl_device_inline float2 fabs(const float2 a)
{
return make_float2(fabsf(a.x), fabsf(a.y));
}
@@ -229,28 +187,23 @@ ccl_device_inline float2 as_float2(const float4 &a)
return make_float2(a.x, a.y);
}
-ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t)
+ccl_device_inline float2 interp(const float2 a, const float2 b, float t)
{
return a + t * (b - a);
}
-ccl_device_inline float2 mix(const float2 &a, const float2 &b, float t)
+ccl_device_inline float2 mix(const float2 a, const float2 b, float t)
{
return a + t * (b - a);
}
-ccl_device_inline float2 floor(const float2 &a)
+ccl_device_inline float2 floor(const float2 a)
{
return make_float2(floorf(a.x), floorf(a.y));
}
#endif /* !__KERNEL_METAL__ */
-ccl_device_inline float len(const float2 a)
-{
- return sqrtf(dot(a, a));
-}
-
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float2();
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index eec7122b9dc..79ee86d9c82 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT3_H__
@@ -10,73 +11,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float3 operator-(const float3 &a);
-ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator*(const float3 &a, const float f);
-ccl_device_inline float3 operator*(const float f, const float3 &a);
-ccl_device_inline float3 operator/(const float f, const float3 &a);
-ccl_device_inline float3 operator/(const float3 &a, const float f);
-ccl_device_inline float3 operator/(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator+(const float3 &a, const float f);
-ccl_device_inline float3 operator+(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator-(const float3 &a, const float f);
-ccl_device_inline float3 operator-(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator+=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator-=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator*=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator*=(float3 &a, float f);
-ccl_device_inline float3 operator/=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator/=(float3 &a, float f);
-
-ccl_device_inline bool operator==(const float3 &a, const float3 &b);
-ccl_device_inline bool operator!=(const float3 &a, const float3 &b);
-
-ccl_device_inline float distance(const float3 &a, const float3 &b);
-ccl_device_inline float dot(const float3 &a, const float3 &b);
-ccl_device_inline float dot_xy(const float3 &a, const float3 &b);
-ccl_device_inline float3 cross(const float3 &a, const float3 &b);
-ccl_device_inline float3 normalize(const float3 &a);
-ccl_device_inline float3 min(const float3 &a, const float3 &b);
-ccl_device_inline float3 max(const float3 &a, const float3 &b);
-ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx);
-ccl_device_inline float3 fabs(const float3 &a);
-ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t);
-ccl_device_inline float3 rcp(const float3 &a);
-ccl_device_inline float3 sqrt(const float3 &a);
-ccl_device_inline float3 floor(const float3 &a);
-ccl_device_inline float3 ceil(const float3 &a);
-ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
-#endif /* !defined(__KERNEL_METAL__) */
-
-ccl_device_inline float reduce_min(float3 a);
-ccl_device_inline float reduce_max(float3 a);
-ccl_device_inline float len(const float3 a);
-ccl_device_inline float len_squared(const float3 a);
-
-ccl_device_inline float3 project(const float3 v, const float3 v_proj);
-
-ccl_device_inline float3 safe_normalize(const float3 a);
-ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_divide(const float3 a, const float3 b);
-ccl_device_inline float3 safe_divide(const float3 a, const float b);
-ccl_device_inline float3 interp(float3 a, float3 b, float t);
-ccl_device_inline float3 sqr(float3 a);
-
-ccl_device_inline bool is_zero(const float3 a);
-ccl_device_inline float reduce_add(const float3 a);
-ccl_device_inline float average(const float3 a);
-ccl_device_inline bool isequal(const float3 a, const float3 b);
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float3 zero_float3()
{
#ifdef __KERNEL_SSE__
@@ -109,7 +43,7 @@ ccl_device_inline float3 operator-(const float3 &a)
# endif
}
-ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator*(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, b.m128));
@@ -118,7 +52,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator*(const float3 &a, const float f)
+ccl_device_inline float3 operator*(const float3 a, const float f)
{
# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
@@ -127,7 +61,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float f)
# endif
}
-ccl_device_inline float3 operator*(const float f, const float3 &a)
+ccl_device_inline float3 operator*(const float f, const float3 a)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
@@ -136,7 +70,7 @@ ccl_device_inline float3 operator*(const float f, const float3 &a)
# endif
}
-ccl_device_inline float3 operator/(const float f, const float3 &a)
+ccl_device_inline float3 operator/(const float f, const float3 a)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
@@ -145,7 +79,7 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)
# endif
}
-ccl_device_inline float3 operator/(const float3 &a, const float f)
+ccl_device_inline float3 operator/(const float3 a, const float f)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
@@ -154,7 +88,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
# endif
}
-ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator/(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, b.m128));
@@ -163,12 +97,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator+(const float3 &a, const float f)
-{
- return a + make_float3(f, f, f);
-}
-
-ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator+(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_add_ps(a.m128, b.m128));
@@ -177,12 +106,12 @@ ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator-(const float3 &a, const float f)
+ccl_device_inline float3 operator+(const float3 a, const float f)
{
- return a - make_float3(f, f, f);
+ return a + make_float3(f, f, f);
}
-ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator-(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_sub_ps(a.m128, b.m128));
@@ -191,17 +120,22 @@ ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator-(const float3 a, const float f)
+{
+ return a - make_float3(f, f, f);
+}
+
+ccl_device_inline float3 operator+=(float3 &a, const float3 b)
{
return a = a + b;
}
-ccl_device_inline float3 operator-=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator-=(float3 &a, const float3 b)
{
return a = a - b;
}
-ccl_device_inline float3 operator*=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator*=(float3 &a, const float3 b)
{
return a = a * b;
}
@@ -211,7 +145,7 @@ ccl_device_inline float3 operator*=(float3 &a, float f)
return a = a * f;
}
-ccl_device_inline float3 operator/=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator/=(float3 &a, const float3 b)
{
return a = a / b;
}
@@ -223,7 +157,7 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
}
# if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__))
-ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b)
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 b)
{
a = float3(a) * b;
return a;
@@ -235,7 +169,7 @@ ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f)
return a;
}
-ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b)
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 b)
{
a = float3(a) / b;
return a;
@@ -248,7 +182,7 @@ ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)
}
# endif
-ccl_device_inline bool operator==(const float3 &a, const float3 &b)
+ccl_device_inline bool operator==(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
@@ -257,17 +191,12 @@ ccl_device_inline bool operator==(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
+ccl_device_inline bool operator!=(const float3 a, const float3 b)
{
return !(a == b);
}
-ccl_device_inline float distance(const float3 &a, const float3 &b)
-{
- return len(a - b);
-}
-
-ccl_device_inline float dot(const float3 &a, const float3 &b)
+ccl_device_inline float dot(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
@@ -276,26 +205,62 @@ ccl_device_inline float dot(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
+#endif
+
+ccl_device_inline float dot_xy(const float3 a, const float3 b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-# else
+#else
return a.x * b.x + a.y * b.y;
-# endif
+#endif
+}
+
+ccl_device_inline float len(const float3 a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
+#else
+ return sqrtf(dot(a, a));
+#endif
+}
+
+ccl_device_inline float reduce_min(float3 a)
+{
+ return min(min(a.x, a.y), a.z);
+}
+
+ccl_device_inline float reduce_max(float3 a)
+{
+ return max(max(a.x, a.y), a.z);
+}
+
+ccl_device_inline float len_squared(const float3 a)
+{
+ return dot(a, a);
+}
+
+#ifndef __KERNEL_METAL__
+
+ccl_device_inline float distance(const float3 a, const float3 b)
+{
+ return len(a - b);
}
-ccl_device_inline float3 cross(const float3 &a, const float3 &b)
+ccl_device_inline float3 cross(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
- return float3(shuffle<1, 2, 0, 3>(
- msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+ const float4 x = float4(a.m128);
+ const float4 y = shuffle<1, 2, 0, 3>(float4(b.m128));
+ const float4 z = float4(_mm_mul_ps(shuffle<1, 2, 0, 3>(float4(a.m128)), float4(b.m128)));
+
+ return float3(shuffle<1, 2, 0, 3>(msub(x, y, z)).m128);
# else
return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
# endif
}
-ccl_device_inline float3 normalize(const float3 &a)
+ccl_device_inline float3 normalize(const float3 a)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
@@ -305,7 +270,7 @@ ccl_device_inline float3 normalize(const float3 &a)
# endif
}
-ccl_device_inline float3 min(const float3 &a, const float3 &b)
+ccl_device_inline float3 min(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_min_ps(a.m128, b.m128));
@@ -314,7 +279,7 @@ ccl_device_inline float3 min(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 max(const float3 &a, const float3 &b)
+ccl_device_inline float3 max(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_max_ps(a.m128, b.m128));
@@ -323,12 +288,12 @@ ccl_device_inline float3 max(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
+ccl_device_inline float3 clamp(const float3 a, const float3 mn, const float3 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float3 fabs(const float3 &a)
+ccl_device_inline float3 fabs(const float3 a)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_NEON__
@@ -342,7 +307,7 @@ ccl_device_inline float3 fabs(const float3 &a)
# endif
}
-ccl_device_inline float3 sqrt(const float3 &a)
+ccl_device_inline float3 sqrt(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_sqrt_ps(a));
@@ -351,7 +316,7 @@ ccl_device_inline float3 sqrt(const float3 &a)
# endif
}
-ccl_device_inline float3 floor(const float3 &a)
+ccl_device_inline float3 floor(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_floor_ps(a));
@@ -360,7 +325,7 @@ ccl_device_inline float3 floor(const float3 &a)
# endif
}
-ccl_device_inline float3 ceil(const float3 &a)
+ccl_device_inline float3 ceil(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_ceil_ps(a));
@@ -369,12 +334,12 @@ ccl_device_inline float3 ceil(const float3 &a)
# endif
}
-ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
+ccl_device_inline float3 mix(const float3 a, const float3 b, float t)
{
return a + t * (b - a);
}
-ccl_device_inline float3 rcp(const float3 &a)
+ccl_device_inline float3 rcp(const float3 a)
{
# ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
@@ -399,33 +364,6 @@ ccl_device_inline float3 log(float3 v)
return make_float3(logf(v.x), logf(v.y), logf(v.z));
}
-#endif /* !__KERNEL_METAL__ */
-
-ccl_device_inline float reduce_min(float3 a)
-{
- return min(min(a.x, a.y), a.z);
-}
-
-ccl_device_inline float reduce_max(float3 a)
-{
- return max(max(a.x, a.y), a.z);
-}
-
-ccl_device_inline float len(const float3 a)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
-#else
- return sqrtf(dot(a, a));
-#endif
-}
-
-ccl_device_inline float len_squared(const float3 a)
-{
- return dot(a, a);
-}
-
-#if !defined(__KERNEL_METAL__)
ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
{
float3 unit_normal = normalize(normal);
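For reference, the reworked SSE cross() above uses the permutation identity cross(a, b) = yzx(a * yzx(b) - yzx(a) * b), with yzx implemented by shuffle<1, 2, 0, 3> and the subtraction by msub. A scalar sketch of the same identity (illustration only, not part of the patch):

ccl_device_inline float3 cross_identity_sketch(const float3 a, const float3 b)
{
  /* t = a * yzx(b) - yzx(a) * b, computed per component. */
  const float3 t = make_float3(a.x * b.y - a.y * b.x,  /* becomes result.z */
                               a.y * b.z - a.z * b.y,  /* becomes result.x */
                               a.z * b.x - a.x * b.z); /* becomes result.y */
  /* The final yzx rotation of t yields the usual cross product. */
  return make_float3(t.y, t.z, t.x);
}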
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index c2721873037..301d2d789c0 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT4_H__
@@ -10,85 +11,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float4 operator-(const float4 &a);
-ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator*(const float4 &a, float f);
-ccl_device_inline float4 operator*(float f, const float4 &a);
-ccl_device_inline float4 operator/(const float4 &a, float f);
-ccl_device_inline float4 operator/(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+(const float4 &a, const float f);
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator-(const float4 &a, const float f);
-ccl_device_inline float4 operator-(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, float f);
-ccl_device_inline float4 operator/=(float4 &a, float f);
-
-ccl_device_inline int4 operator<(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b);
-ccl_device_inline bool operator==(const float4 &a, const float4 &b);
-
-ccl_device_inline float distance(const float4 &a, const float4 &b);
-ccl_device_inline float dot(const float4 &a, const float4 &b);
-ccl_device_inline float len_squared(const float4 &a);
-ccl_device_inline float4 rcp(const float4 &a);
-ccl_device_inline float4 sqrt(const float4 &a);
-ccl_device_inline float4 sqr(const float4 &a);
-ccl_device_inline float4 cross(const float4 &a, const float4 &b);
-ccl_device_inline bool is_zero(const float4 &a);
-ccl_device_inline float average(const float4 &a);
-ccl_device_inline float len(const float4 &a);
-ccl_device_inline float4 normalize(const float4 &a);
-ccl_device_inline float4 safe_normalize(const float4 &a);
-ccl_device_inline float4 min(const float4 &a, const float4 &b);
-ccl_device_inline float4 max(const float4 &a, const float4 &b);
-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx);
-ccl_device_inline float4 fabs(const float4 &a);
-ccl_device_inline float4 floor(const float4 &a);
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
-#endif /* !__KERNEL_METAL__*/
-
-ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
-ccl_device_inline float4 safe_divide(const float4 a, const float b);
-
-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b);
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b);
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b);
-
-# ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b);
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
-# endif
-#endif /* __KERNEL_SSE__ */
-
-ccl_device_inline float reduce_min(const float4 a);
-ccl_device_inline float reduce_max(const float4 a);
-ccl_device_inline float reduce_add(const float4 a);
-
-ccl_device_inline bool isequal(const float4 a, const float4 b);
-
-#ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-#endif /* !__KERNEL_GPU__ */
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float4 zero_float4()
{
#ifdef __KERNEL_SSE__
@@ -103,6 +25,16 @@ ccl_device_inline float4 one_float4()
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
}
+ccl_device_inline int4 cast(const float4 a)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_castps_si128(a));
+#else
+ return make_int4(
+ __float_as_int(a.x), __float_as_int(a.y), __float_as_int(a.z), __float_as_int(a.w));
+#endif
+}
+
#if !defined(__KERNEL_METAL__)
ccl_device_inline float4 operator-(const float4 &a)
{
@@ -114,7 +46,7 @@ ccl_device_inline float4 operator-(const float4 &a)
# endif
}
-ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator*(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_mul_ps(a.m128, b.m128));
@@ -123,7 +55,7 @@ ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator*(const float4 &a, float f)
+ccl_device_inline float4 operator*(const float4 a, float f)
{
# if defined(__KERNEL_SSE__)
return a * make_float4(f);
@@ -132,17 +64,17 @@ ccl_device_inline float4 operator*(const float4 &a, float f)
# endif
}
-ccl_device_inline float4 operator*(float f, const float4 &a)
+ccl_device_inline float4 operator*(float f, const float4 a)
{
return a * f;
}
-ccl_device_inline float4 operator/(const float4 &a, float f)
+ccl_device_inline float4 operator/(const float4 a, float f)
{
return a * (1.0f / f);
}
-ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator/(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_div_ps(a.m128, b.m128));
@@ -151,12 +83,7 @@ ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator+(const float4 &a, const float f)
-{
- return a + make_float4(f, f, f, f);
-}
-
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator+(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_add_ps(a.m128, b.m128));
@@ -165,12 +92,12 @@ ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator-(const float4 &a, const float f)
+ccl_device_inline float4 operator+(const float4 a, const float f)
{
- return a - make_float4(f, f, f, f);
+ return a + make_float4(f);
}
-ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator-(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_sub_ps(a.m128, b.m128));
@@ -179,17 +106,22 @@ ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator-(const float4 a, const float f)
+{
+ return a - make_float4(f);
+}
+
+ccl_device_inline float4 operator+=(float4 &a, const float4 b)
{
return a = a + b;
}
-ccl_device_inline float4 operator-=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator-=(float4 &a, const float4 b)
{
return a = a - b;
}
-ccl_device_inline float4 operator*=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator*=(float4 &a, const float4 b)
{
return a = a * b;
}
@@ -204,7 +136,7 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
return a = a / f;
}
-ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
@@ -213,7 +145,7 @@ ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator>=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
@@ -222,7 +154,7 @@ ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
@@ -231,7 +163,7 @@ ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline bool operator==(const float4 &a, const float4 &b)
+ccl_device_inline bool operator==(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@@ -240,160 +172,148 @@ ccl_device_inline bool operator==(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float distance(const float4 &a, const float4 &b)
-{
- return len(a - b);
-}
-
-ccl_device_inline float dot(const float4 &a, const float4 &b)
+ccl_device_inline const float4 operator^(const float4 a, const float4 b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- __m128 t = vmulq_f32(a, b);
- return vaddvq_f32(t);
-# else
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-# endif
+# ifdef __KERNEL_SSE__
+ return float4(_mm_xor_ps(a.m128, b.m128));
# else
- return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+ return make_float4(__uint_as_float(__float_as_uint(a.x) ^ __float_as_uint(b.x)),
+ __uint_as_float(__float_as_uint(a.y) ^ __float_as_uint(b.y)),
+ __uint_as_float(__float_as_uint(a.z) ^ __float_as_uint(b.z)),
+ __uint_as_float(__float_as_uint(a.w) ^ __float_as_uint(b.w)));
# endif
}
-ccl_device_inline float len_squared(const float4 &a)
-{
- return dot(a, a);
-}
-
-ccl_device_inline float4 rcp(const float4 &a)
+ccl_device_inline float4 min(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
- /* Don't use _mm_rcp_ps due to poor precision. */
- return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
+ return float4(_mm_min_ps(a.m128, b.m128));
# else
- return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
+ return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
# endif
}
-ccl_device_inline float4 sqrt(const float4 &a)
+ccl_device_inline float4 max(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
- return float4(_mm_sqrt_ps(a.m128));
+ return float4(_mm_max_ps(a.m128, b.m128));
# else
- return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+ return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
# endif
}
-ccl_device_inline float4 sqr(const float4 &a)
+ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
{
- return a * a;
+ return min(max(a, mn), mx);
}
+#endif /* !__KERNEL_METAL__*/
-ccl_device_inline float4 cross(const float4 &a, const float4 &b)
+ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c)
{
-# ifdef __KERNEL_SSE__
- return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
- (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float4(vfmaq_f32(c, a, b));
+# elif defined(__KERNEL_AVX2__)
+ return float4(_mm_fmadd_ps(a, b, c));
# else
- return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
+ return a * b + c;
# endif
+#else
+ return a * b + c;
+#endif
}
-ccl_device_inline bool is_zero(const float4 &a)
+ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
{
-# ifdef __KERNEL_SSE__
- return a == zero_float4();
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float4(vfmaq_f32(vnegq_f32(c), a, b));
+# elif defined(__KERNEL_AVX2__)
+ return float4(_mm_fmsub_ps(a, b, c));
# else
- return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+ return a * b - c;
# endif
+#else
+ return a * b - c;
+#endif
}
-ccl_device_inline float average(const float4 &a)
+#ifdef __KERNEL_SSE__
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 b)
{
- return reduce_add(a) * 0.25f;
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128));
+# else
+ return float4(
+ _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+# endif
}
-ccl_device_inline float len(const float4 &a)
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a)
{
- return sqrtf(dot(a, a));
+ return float4(_mm_movelh_ps(a, a));
}
-ccl_device_inline float4 normalize(const float4 &a)
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a)
{
- return a / len(a);
+ return float4(_mm_movehl_ps(a, a));
}
-ccl_device_inline float4 safe_normalize(const float4 &a)
+# ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b)
{
- float t = len(a);
- return (t != 0.0f) ? a / t : a;
+ return float4(_mm_moveldup_ps(b));
}
-ccl_device_inline float4 min(const float4 &a, const float4 &b)
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b)
{
-# ifdef __KERNEL_SSE__
- return float4(_mm_min_ps(a.m128, b.m128));
-# else
- return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-# endif
+ return float4(_mm_movehdup_ps(b));
}
+# endif /* __KERNEL_SSE3__ */
-ccl_device_inline float4 max(const float4 &a, const float4 &b)
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 a, const float4 b)
{
-# ifdef __KERNEL_SSE__
- return float4(_mm_max_ps(a.m128, b.m128));
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
# else
- return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+ return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
+template<size_t i0> __forceinline const float4 shuffle(const float4 b)
{
- return min(max(a, mn), mx);
+ return shuffle<i0, i0, i0, i0>(b);
}
-
-ccl_device_inline float4 fabs(const float4 &a)
+template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vabsq_f32(a));
-# else
- return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-# endif
-# else
- return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-# endif
-}
-
-ccl_device_inline float4 floor(const float4 &a)
-{
-# ifdef __KERNEL_SSE__
- return float4(_mm_floor_ps(a));
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b));
# else
- return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+ return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)));
# endif
}
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
{
- return a + t * (b - a);
+ return float4(_mm_movelh_ps(a, b));
}
-ccl_device_inline float4 saturate(const float4 &a)
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
{
- return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+ return float4(_mm_movehl_ps(b, a));
}
-ccl_device_inline float4 exp(float4 v)
+template<size_t i> __forceinline float extract(const float4 a)
{
- return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
+ return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
}
-
-ccl_device_inline float4 log(float4 v)
+template<> __forceinline float extract<0>(const float4 a)
{
- return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
+ return _mm_cvtss_f32(a);
}
-
-#endif /* !__KERNEL_METAL__*/
+#endif
ccl_device_inline float reduce_add(const float4 a)
{
@@ -440,77 +360,192 @@ ccl_device_inline float reduce_max(const float4 a)
#endif
}
-ccl_device_inline bool isequal(const float4 a, const float4 b)
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float dot(const float4 a, const float4 b)
{
-#if defined(__KERNEL_METAL__)
- return all(a == b);
-#else
- return a == b;
-#endif
+# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ __m128 t = vmulq_f32(a, b);
+ return vaddvq_f32(t);
+# else
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+# endif
+# else
+ return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+# endif
}
+#endif /* !defined(__KERNEL_METAL__) */
-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b)
+ccl_device_inline float len(const float4 a)
{
-# if defined(__KERNEL_NEON__)
- return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
+ return sqrtf(dot(a, a));
+}
+
+ccl_device_inline float len_squared(const float4 a)
+{
+ return dot(a, a);
+}
+
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float distance(const float4 a, const float4 b)
+{
+ return len(a - b);
+}
+
+ccl_device_inline float4 rcp(const float4 a)
+{
+# ifdef __KERNEL_SSE__
+ /* Don't use _mm_rcp_ps due to poor precision. */
+ return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
# else
- return float4(_mm_castsi128_ps(
- _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+ return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
# endif
}
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b)
+ccl_device_inline float4 sqrt(const float4 a)
{
-# if defined(__KERNEL_NEON__)
- return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
+# ifdef __KERNEL_SSE__
+ return float4(_mm_sqrt_ps(a.m128));
# else
- return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+ return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
# endif
}
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
+ccl_device_inline float4 sqr(const float4 a)
{
- return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
+ return a * a;
}
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b)
+ccl_device_inline float4 cross(const float4 a, const float4 b)
{
- return float4(_mm_movelh_ps(a.m128, b.m128));
+# ifdef __KERNEL_SSE__
+ return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
+ (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
+# else
+ return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
+# endif
}
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b)
+ccl_device_inline bool is_zero(const float4 a)
{
- return float4(_mm_movehl_ps(b.m128, a.m128));
+# ifdef __KERNEL_SSE__
+ return a == zero_float4();
+# else
+ return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+# endif
}
-# ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b)
+ccl_device_inline float average(const float4 a)
{
- return float4(_mm_moveldup_ps(b));
+ return reduce_add(a) * 0.25f;
}
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b)
+ccl_device_inline float4 normalize(const float4 a)
{
- return float4(_mm_movehdup_ps(b));
+ return a / len(a);
+}
+
+ccl_device_inline float4 safe_normalize(const float4 a)
+{
+ float t = len(a);
+ return (t != 0.0f) ? a / t : a;
+}
+
+ccl_device_inline float4 fabs(const float4 a)
+{
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vabsq_f32(a));
+# else
+ return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+# endif
+# else
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+# endif
+}
+
+ccl_device_inline float4 floor(const float4 a)
+{
+# ifdef __KERNEL_SSE__
+# if defined(__KERNEL_NEON__)
+ return float4(vrndmq_f32(a));
+# else
+ return float4(_mm_floor_ps(a));
+# endif
+# else
+ return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+# endif
+}
+
+ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
+{
+# ifdef __KERNEL_SSE__
+ const float4 f = floor(x);
+ *i = int4(_mm_cvttps_epi32(f.m128));
+ return x - f;
+# else
+ float4 r;
+ r.x = floorfrac(x.x, &i->x);
+ r.y = floorfrac(x.y, &i->y);
+ r.z = floorfrac(x.z, &i->z);
+ r.w = floorfrac(x.w, &i->w);
+ return r;
+# endif
+}
+
+ccl_device_inline float4 mix(const float4 a, const float4 b, float t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline float4 mix(const float4 a, const float4 b, const float4 t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline float4 saturate(const float4 a)
+{
+ return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+}
+
+ccl_device_inline float4 exp(float4 v)
+{
+  return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.w));
+}
+
+ccl_device_inline float4 log(float4 v)
+{
+  return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.w));
+}
+
+#endif /* !__KERNEL_METAL__*/
+
+ccl_device_inline bool isequal(const float4 a, const float4 b)
+{
+#if defined(__KERNEL_METAL__)
+ return all(a == b);
+#else
+ return a == b;
+#endif
}
-# endif /* __KERNEL_SSE3__ */
-#endif /* __KERNEL_SSE__ */
#ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b)
+ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE41__
return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
+# else
+ return float4(
+ _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask), b)));
+# endif
# else
return make_float4(
(mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w);
# endif
}
-ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
+ccl_device_inline float4 mask(const int4 mask, const float4 a)
{
/* Replace elements of x with zero where mask isn't set. */
return select(mask, a, zero_float4());
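As an aside, the shuffle<>/extract<> templates relocated above compose in the usual way. A hypothetical sketch (not part of the patch, SSE build assumed) of a horizontal sum written with them, equivalent to reduce_add():

ccl_device_inline float reduce_add_sketch(const float4 a)
{
  /* Pairwise sums: lanes become {x+y, y+x, z+w, w+z}. */
  const float4 s1 = a + shuffle<1, 0, 3, 2>(a);
  /* Cross the pairs: every lane now holds x+y+z+w. */
  const float4 s2 = s1 + shuffle<2, 3, 0, 1>(s1);
  return extract<0>(s2);
}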
diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h
index b538cfbe70b..755a720a10b 100644
--- a/intern/cycles/util/math_float8.h
+++ b/intern/cycles/util/math_float8.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT8_H__
@@ -10,193 +11,138 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-ccl_device_inline float8_t operator+(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator+(const float8_t a, const float f);
-ccl_device_inline float8_t operator+(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator-(const float8_t a);
-ccl_device_inline float8_t operator-(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator-(const float8_t a, const float f);
-ccl_device_inline float8_t operator-(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator*(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator*(const float8_t a, const float f);
-ccl_device_inline float8_t operator*(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator/(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator/(const float8_t a, float f);
-ccl_device_inline float8_t operator/(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator+=(float8_t a, const float8_t b);
-
-ccl_device_inline float8_t operator*=(float8_t a, const float8_t b);
-ccl_device_inline float8_t operator*=(float8_t a, float f);
-
-ccl_device_inline float8_t operator/=(float8_t a, float f);
-
-ccl_device_inline bool operator==(const float8_t a, const float8_t b);
-
-ccl_device_inline float8_t rcp(const float8_t a);
-ccl_device_inline float8_t sqrt(const float8_t a);
-ccl_device_inline float8_t sqr(const float8_t a);
-ccl_device_inline bool is_zero(const float8_t a);
-ccl_device_inline float average(const float8_t a);
-ccl_device_inline float8_t min(const float8_t a, const float8_t b);
-ccl_device_inline float8_t max(const float8_t a, const float8_t b);
-ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx);
-ccl_device_inline float8_t fabs(const float8_t a);
-ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t);
-ccl_device_inline float8_t saturate(const float8_t a);
-
-ccl_device_inline float8_t safe_divide(const float8_t a, const float b);
-ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b);
-
-ccl_device_inline float reduce_min(const float8_t a);
-ccl_device_inline float reduce_max(const float8_t a);
-ccl_device_inline float reduce_add(const float8_t a);
-
-ccl_device_inline bool isequal(const float8_t a, const float8_t b);
-
-/*******************************************************************************
- * Definition.
- */
-
-ccl_device_inline float8_t zero_float8_t()
+ccl_device_inline vfloat8 zero_vfloat8()
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_setzero_ps());
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_setzero_ps());
#else
- return make_float8_t(0.0f);
+ return make_vfloat8(0.0f);
#endif
}
-ccl_device_inline float8_t one_float8_t()
+ccl_device_inline vfloat8 one_vfloat8()
{
- return make_float8_t(1.0f);
+ return make_vfloat8(1.0f);
}
-ccl_device_inline float8_t operator+(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator+(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_add_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_add_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
#endif
}
-ccl_device_inline float8_t operator+(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator+(const vfloat8 a, const float f)
{
- return a + make_float8_t(f);
+ return a + make_vfloat8(f);
}
-ccl_device_inline float8_t operator+(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator+(const float f, const vfloat8 a)
{
- return make_float8_t(f) + a;
+ return make_vfloat8(f) + a;
}
-ccl_device_inline float8_t operator-(const float8_t a)
+ccl_device_inline vfloat8 operator-(const vfloat8 a)
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
__m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
- return float8_t(_mm256_xor_ps(a.m256, mask));
+ return vfloat8(_mm256_xor_ps(a.m256, mask));
#else
- return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
+ return make_vfloat8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
#endif
}
-ccl_device_inline float8_t operator-(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator-(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_sub_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_sub_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
#endif
}
-ccl_device_inline float8_t operator-(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator-(const vfloat8 a, const float f)
{
- return a - make_float8_t(f);
+ return a - make_vfloat8(f);
}
-ccl_device_inline float8_t operator-(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator-(const float f, const vfloat8 a)
{
- return make_float8_t(f) - a;
+ return make_vfloat8(f) - a;
}
-ccl_device_inline float8_t operator*(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator*(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_mul_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_mul_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
#endif
}
-ccl_device_inline float8_t operator*(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator*(const vfloat8 a, const float f)
{
- return a * make_float8_t(f);
+ return a * make_vfloat8(f);
}
-ccl_device_inline float8_t operator*(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator*(const float f, const vfloat8 a)
{
- return make_float8_t(f) * a;
+ return make_vfloat8(f) * a;
}
-ccl_device_inline float8_t operator/(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator/(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_div_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_div_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
#endif
}
-ccl_device_inline float8_t operator/(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator/(const vfloat8 a, const float f)
{
- return a / make_float8_t(f);
+ return a / make_vfloat8(f);
}
-ccl_device_inline float8_t operator/(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator/(const float f, const vfloat8 a)
{
- return make_float8_t(f) / a;
+ return make_vfloat8(f) / a;
}
-ccl_device_inline float8_t operator+=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator+=(vfloat8 a, const vfloat8 b)
{
return a = a + b;
}
-ccl_device_inline float8_t operator-=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator-=(vfloat8 a, const vfloat8 b)
{
return a = a - b;
}
-ccl_device_inline float8_t operator*=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator*=(vfloat8 a, const vfloat8 b)
{
return a = a * b;
}
-ccl_device_inline float8_t operator*=(float8_t a, float f)
+ccl_device_inline vfloat8 operator*=(vfloat8 a, float f)
{
return a = a * f;
}
-ccl_device_inline float8_t operator/=(float8_t a, float f)
+ccl_device_inline vfloat8 operator/=(vfloat8 a, float f)
{
return a = a / f;
}
-ccl_device_inline bool operator==(const float8_t a, const float8_t b)
+ccl_device_inline bool operator==(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
return (_mm256_movemask_ps(_mm256_castsi256_ps(
_mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
0b11111111) == 0b11111111;
@@ -206,132 +152,180 @@ ccl_device_inline bool operator==(const float8_t a, const float8_t b)
#endif
}
-ccl_device_inline float8_t rcp(const float8_t a)
+ccl_device_inline const vfloat8 operator^(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_rcp_ps(a.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_xor_ps(a.m256, b.m256));
#else
- return make_float8_t(1.0f / a.a,
- 1.0f / a.b,
- 1.0f / a.c,
- 1.0f / a.d,
- 1.0f / a.e,
- 1.0f / a.f,
- 1.0f / a.g,
- 1.0f / a.h);
+ return make_vfloat8(__uint_as_float(__float_as_uint(a.a) ^ __float_as_uint(b.a)),
+ __uint_as_float(__float_as_uint(a.b) ^ __float_as_uint(b.b)),
+ __uint_as_float(__float_as_uint(a.c) ^ __float_as_uint(b.c)),
+ __uint_as_float(__float_as_uint(a.d) ^ __float_as_uint(b.d)),
+ __uint_as_float(__float_as_uint(a.e) ^ __float_as_uint(b.e)),
+ __uint_as_float(__float_as_uint(a.f) ^ __float_as_uint(b.f)),
+ __uint_as_float(__float_as_uint(a.g) ^ __float_as_uint(b.g)),
+ __uint_as_float(__float_as_uint(a.h) ^ __float_as_uint(b.h)));
#endif
}
-ccl_device_inline float8_t sqrt(const float8_t a)
+ccl_device_inline vfloat8 rcp(const vfloat8 a)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_sqrt_ps(a.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_rcp_ps(a.m256));
#else
- return make_float8_t(sqrtf(a.a),
- sqrtf(a.b),
- sqrtf(a.c),
- sqrtf(a.d),
- sqrtf(a.e),
- sqrtf(a.f),
- sqrtf(a.g),
- sqrtf(a.h));
+ return make_vfloat8(1.0f / a.a,
+ 1.0f / a.b,
+ 1.0f / a.c,
+ 1.0f / a.d,
+ 1.0f / a.e,
+ 1.0f / a.f,
+ 1.0f / a.g,
+ 1.0f / a.h);
#endif
}
-ccl_device_inline float8_t sqr(const float8_t a)
+ccl_device_inline vfloat8 sqrt(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_sqrt_ps(a.m256));
+#else
+ return make_vfloat8(sqrtf(a.a),
+ sqrtf(a.b),
+ sqrtf(a.c),
+ sqrtf(a.d),
+ sqrtf(a.e),
+ sqrtf(a.f),
+ sqrtf(a.g),
+ sqrtf(a.h));
+#endif
+}
+
+ccl_device_inline vfloat8 sqr(const vfloat8 a)
{
return a * a;
}
-ccl_device_inline bool is_zero(const float8_t a)
+ccl_device_inline bool is_zero(const vfloat8 a)
{
- return a == make_float8_t(0.0f);
+ return a == make_vfloat8(0.0f);
}
-ccl_device_inline float average(const float8_t a)
+ccl_device_inline float reduce_add(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ vfloat8 b(_mm256_hadd_ps(a.m256, a.m256));
+ vfloat8 h(_mm256_hadd_ps(b.m256, b.m256));
+ return h[0] + h[4];
+#else
+ return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
+#endif
+}
+
+ccl_device_inline float average(const vfloat8 a)
{
return reduce_add(a) / 8.0f;
}
-ccl_device_inline float8_t min(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 min(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_min_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_min_ps(a.m256, b.m256));
#else
- return make_float8_t(min(a.a, b.a),
- min(a.b, b.b),
- min(a.c, b.c),
- min(a.d, b.d),
- min(a.e, b.e),
- min(a.f, b.f),
- min(a.g, b.g),
- min(a.h, b.h));
+ return make_vfloat8(min(a.a, b.a),
+ min(a.b, b.b),
+ min(a.c, b.c),
+ min(a.d, b.d),
+ min(a.e, b.e),
+ min(a.f, b.f),
+ min(a.g, b.g),
+ min(a.h, b.h));
#endif
}
-ccl_device_inline float8_t max(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 max(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_max_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_max_ps(a.m256, b.m256));
#else
- return make_float8_t(max(a.a, b.a),
- max(a.b, b.b),
- max(a.c, b.c),
- max(a.d, b.d),
- max(a.e, b.e),
- max(a.f, b.f),
- max(a.g, b.g),
- max(a.h, b.h));
+ return make_vfloat8(max(a.a, b.a),
+ max(a.b, b.b),
+ max(a.c, b.c),
+ max(a.d, b.d),
+ max(a.e, b.e),
+ max(a.f, b.f),
+ max(a.g, b.g),
+ max(a.h, b.h));
#endif
}
-ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx)
+ccl_device_inline vfloat8 clamp(const vfloat8 a, const vfloat8 mn, const vfloat8 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float8_t fabs(const float8_t a)
+ccl_device_inline vfloat8 select(const vint8 mask, const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_blendv_ps(b, a, _mm256_castsi256_ps(mask)));
#else
- return make_float8_t(fabsf(a.a),
- fabsf(a.b),
- fabsf(a.c),
- fabsf(a.d),
- fabsf(a.e),
- fabsf(a.f),
- fabsf(a.g),
- fabsf(a.h));
+ return make_vfloat8((mask.a) ? a.a : b.a,
+ (mask.b) ? a.b : b.b,
+ (mask.c) ? a.c : b.c,
+ (mask.d) ? a.d : b.d,
+ (mask.e) ? a.e : b.e,
+ (mask.f) ? a.f : b.f,
+ (mask.g) ? a.g : b.g,
+ (mask.h) ? a.h : b.h);
#endif
}
-ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t)
+ccl_device_inline vfloat8 fabs(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#else
+ return make_vfloat8(fabsf(a.a),
+ fabsf(a.b),
+ fabsf(a.c),
+ fabsf(a.d),
+ fabsf(a.e),
+ fabsf(a.f),
+ fabsf(a.g),
+ fabsf(a.h));
+#endif
+}
+
+ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, float t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, vfloat8 t)
{
return a + t * (b - a);
}
-ccl_device_inline float8_t saturate(const float8_t a)
+ccl_device_inline vfloat8 saturate(const vfloat8 a)
{
- return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f));
+ return clamp(a, make_vfloat8(0.0f), make_vfloat8(1.0f));
}
-ccl_device_inline float8_t exp(float8_t v)
+ccl_device_inline vfloat8 exp(vfloat8 v)
{
- return make_float8_t(
+ return make_vfloat8(
expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
}
-ccl_device_inline float8_t log(float8_t v)
+ccl_device_inline vfloat8 log(vfloat8 v)
{
- return make_float8_t(
+ return make_vfloat8(
logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
}
-ccl_device_inline float dot(const float8_t a, const float8_t b)
+ccl_device_inline float dot(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
+#ifdef __KERNEL_AVX__
+ vfloat8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
return t[0] + t[4];
#else
return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
@@ -339,62 +333,51 @@ ccl_device_inline float dot(const float8_t a, const float8_t b)
#endif
}
-ccl_device_inline float8_t pow(float8_t v, float e)
+ccl_device_inline vfloat8 pow(vfloat8 v, float e)
{
- return make_float8_t(powf(v.a, e),
- powf(v.b, e),
- powf(v.c, e),
- powf(v.d, e),
- powf(v.e, e),
- powf(v.f, e),
- powf(v.g, e),
- powf(v.h, e));
+ return make_vfloat8(powf(v.a, e),
+ powf(v.b, e),
+ powf(v.c, e),
+ powf(v.d, e),
+ powf(v.e, e),
+ powf(v.f, e),
+ powf(v.g, e),
+ powf(v.h, e));
}
-ccl_device_inline float reduce_min(const float8_t a)
+ccl_device_inline float reduce_min(const vfloat8 a)
{
return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
}
-ccl_device_inline float reduce_max(const float8_t a)
+ccl_device_inline float reduce_max(const vfloat8 a)
{
return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
}
-ccl_device_inline float reduce_add(const float8_t a)
-{
-#ifdef __KERNEL_AVX2__
- float8_t b(_mm256_hadd_ps(a.m256, a.m256));
- float8_t h(_mm256_hadd_ps(b.m256, b.m256));
- return h[0] + h[4];
-#else
- return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
-#endif
-}
-
-ccl_device_inline bool isequal(const float8_t a, const float8_t b)
+ccl_device_inline bool isequal(const vfloat8 a, const vfloat8 b)
{
return a == b;
}
-ccl_device_inline float8_t safe_divide(const float8_t a, const float b)
+ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const float b)
{
- return (b != 0.0f) ? a / b : make_float8_t(0.0f);
+ return (b != 0.0f) ? a / b : make_vfloat8(0.0f);
}
-ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const vfloat8 b)
{
- return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f,
- (b.b != 0.0f) ? a.b / b.b : 0.0f,
- (b.c != 0.0f) ? a.c / b.c : 0.0f,
- (b.d != 0.0f) ? a.d / b.d : 0.0f,
- (b.e != 0.0f) ? a.e / b.e : 0.0f,
- (b.f != 0.0f) ? a.f / b.f : 0.0f,
- (b.g != 0.0f) ? a.g / b.g : 0.0f,
- (b.h != 0.0f) ? a.h / b.h : 0.0f);
+ return make_vfloat8((b.a != 0.0f) ? a.a / b.a : 0.0f,
+ (b.b != 0.0f) ? a.b / b.b : 0.0f,
+ (b.c != 0.0f) ? a.c / b.c : 0.0f,
+ (b.d != 0.0f) ? a.d / b.d : 0.0f,
+ (b.e != 0.0f) ? a.e / b.e : 0.0f,
+ (b.f != 0.0f) ? a.f / b.f : 0.0f,
+ (b.g != 0.0f) ? a.g / b.g : 0.0f,
+ (b.h != 0.0f) ? a.h / b.h : 0.0f);
}
-ccl_device_inline float8_t ensure_finite(float8_t v)
+ccl_device_inline vfloat8 ensure_finite(vfloat8 v)
{
v.a = ensure_finite(v.a);
v.b = ensure_finite(v.b);
@@ -408,12 +391,92 @@ ccl_device_inline float8_t ensure_finite(float8_t v)
return v;
}
-ccl_device_inline bool isfinite_safe(float8_t v)
+ccl_device_inline bool isfinite_safe(vfloat8 v)
{
return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
}
+ccl_device_inline vint8 cast(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_castps_si256(a));
+#else
+ return make_vint8(__float_as_int(a.a),
+ __float_as_int(a.b),
+ __float_as_int(a.c),
+ __float_as_int(a.d),
+ __float_as_int(a.e),
+ __float_as_int(a.f),
+ __float_as_int(a.g),
+ __float_as_int(a.h));
+#endif
+}
+
+#ifdef __KERNEL_SSE__
+ccl_device_forceinline float4 low(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return float4(_mm256_extractf128_ps(a.m256, 0));
+# else
+ return make_float4(a.e, a.f, a.g, a.h);
+# endif
+}
+ccl_device_forceinline float4 high(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return float4(_mm256_extractf128_ps(a.m256, 1));
+# else
+ return make_float4(a.a, a.b, a.c, a.d);
+# endif
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)));
+# else
+ return make_vfloat8(a[i0], a[i1], a[i2], a[i3], a[i4 + 4], a[i5 + 4], a[i6 + 4], a[i7 + 4]);
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
+# else
+ return make_vfloat8(shuffle<i0, i1, i2, i3>(high(a), high(b)),
+ shuffle<i0, i1, i2, i3>(low(a), low(b)));
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+ return shuffle<i0, i1, i2, i3>(a, a);
+}
+template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
+{
+ return shuffle<i0, i0, i0, i0>(a, b);
+}
+template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+ return shuffle<i0>(a, a);
+}
+
+template<size_t i> ccl_device_forceinline float extract(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ __m256 b = shuffle<i, i, i, i>(a).m256;
+ return _mm256_cvtss_f32(b);
+# else
+ return a[i];
+# endif
+}
+#endif
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT8_H__ */
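For illustration (not part of the patch, SSE build assumed), the new low()/high() helpers let vfloat8 reductions reuse the float4 ones; a minimal sketch:

ccl_device_inline float reduce_add_halves_sketch(const vfloat8 a)
{
  /* Sum the two 128-bit halves, then reduce the resulting float4. */
  return reduce_add(low(a) + high(a));
}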
diff --git a/intern/cycles/util/math_int2.h b/intern/cycles/util/math_int2.h
index f4d8a71221a..2df2ec5505b 100644
--- a/intern/cycles/util/math_int2.h
+++ b/intern/cycles/util/math_int2.h
@@ -10,23 +10,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline bool operator==(const int2 a, const int2 b);
-ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
-ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
-ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
-ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
-ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
-#endif /* !__KERNEL_METAL__ */
-
-/*******************************************************************************
- * Definition.
- */
-
#if !defined(__KERNEL_METAL__)
ccl_device_inline bool operator==(const int2 a, const int2 b)
{
diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h
index 48bffeaf553..b5b972ddfb5 100644
--- a/intern/cycles/util/math_int3.h
+++ b/intern/cycles/util/math_int3.h
@@ -10,21 +10,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline int3 min(int3 a, int3 b);
-ccl_device_inline int3 max(int3 a, int3 b);
-ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
-ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
-#endif /* !defined(__KERNEL_METAL__) */
-
-/*******************************************************************************
- * Definition.
- */
-
#if !defined(__KERNEL_METAL__)
ccl_device_inline int3 min(int3 a, int3 b)
{
@@ -44,7 +29,7 @@ ccl_device_inline int3 max(int3 a, int3 b)
# endif
}
-ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
+ccl_device_inline int3 clamp(const int3 a, int mn, int mx)
{
# ifdef __KERNEL_SSE__
return min(max(a, make_int3(mn)), make_int3(mx));
@@ -53,7 +38,7 @@ ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
# endif
}
-ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
+ccl_device_inline int3 clamp(const int3 a, int3 &mn, int mx)
{
# ifdef __KERNEL_SSE__
return min(max(a, mn), make_int3(mx));
@@ -62,22 +47,22 @@ ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
# endif
}
-ccl_device_inline bool operator==(const int3 &a, const int3 &b)
+ccl_device_inline bool operator==(const int3 a, const int3 b)
{
return a.x == b.x && a.y == b.y && a.z == b.z;
}
-ccl_device_inline bool operator!=(const int3 &a, const int3 &b)
+ccl_device_inline bool operator!=(const int3 a, const int3 b)
{
return !(a == b);
}
-ccl_device_inline bool operator<(const int3 &a, const int3 &b)
+ccl_device_inline bool operator<(const int3 a, const int3 b)
{
return a.x < b.x && a.y < b.y && a.z < b.z;
}
-ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
+ccl_device_inline int3 operator+(const int3 a, const int3 b)
{
# ifdef __KERNEL_SSE__
return int3(_mm_add_epi32(a.m128, b.m128));
@@ -86,7 +71,7 @@ ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
# endif
}
-ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
+ccl_device_inline int3 operator-(const int3 a, const int3 b)
{
# ifdef __KERNEL_SSE__
return int3(_mm_sub_epi32(a.m128, b.m128));
diff --git a/intern/cycles/util/math_int4.h b/intern/cycles/util/math_int4.h
index fbdada223cb..c6d767d7587 100644
--- a/intern/cycles/util/math_int4.h
+++ b/intern/cycles/util/math_int4.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_INT4_H__
@@ -10,30 +11,8 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
#ifndef __KERNEL_GPU__
-ccl_device_inline int4 operator+(const int4 &a, const int4 &b);
-ccl_device_inline int4 operator+=(int4 &a, const int4 &b);
-ccl_device_inline int4 operator>>(const int4 &a, int i);
-ccl_device_inline int4 operator<<(const int4 &a, int i);
-ccl_device_inline int4 operator<(const int4 &a, const int4 &b);
-ccl_device_inline int4 operator>=(const int4 &a, const int4 &b);
-ccl_device_inline int4 operator&(const int4 &a, const int4 &b);
-ccl_device_inline int4 min(int4 a, int4 b);
-ccl_device_inline int4 max(int4 a, int4 b);
-ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx);
-ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b);
-#endif /* __KERNEL_GPU__ */
-
-/*******************************************************************************
- * Definition.
- */
-
-#ifndef __KERNEL_GPU__
-ccl_device_inline int4 operator+(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator+(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_add_epi32(a.m128, b.m128));
@@ -42,12 +21,26 @@ ccl_device_inline int4 operator+(const int4 &a, const int4 &b)
# endif
}
-ccl_device_inline int4 operator+=(int4 &a, const int4 &b)
+ccl_device_inline int4 operator+=(int4 &a, const int4 b)
{
return a = a + b;
}
-ccl_device_inline int4 operator>>(const int4 &a, int i)
+ccl_device_inline int4 operator-(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_sub_epi32(a.m128, b.m128));
+# else
+ return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+# endif
+}
+
+ccl_device_inline int4 operator-=(int4 &a, const int4 b)
+{
+ return a = a - b;
+}
+
+ccl_device_inline int4 operator>>(const int4 a, int i)
{
# ifdef __KERNEL_SSE__
return int4(_mm_srai_epi32(a.m128, i));
@@ -56,7 +49,7 @@ ccl_device_inline int4 operator>>(const int4 &a, int i)
# endif
}
-ccl_device_inline int4 operator<<(const int4 &a, int i)
+ccl_device_inline int4 operator<<(const int4 a, int i)
{
# ifdef __KERNEL_SSE__
return int4(_mm_slli_epi32(a.m128, i));
@@ -65,7 +58,7 @@ ccl_device_inline int4 operator<<(const int4 &a, int i)
# endif
}
-ccl_device_inline int4 operator<(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator<(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_cmplt_epi32(a.m128, b.m128));
@@ -74,7 +67,26 @@ ccl_device_inline int4 operator<(const int4 &a, const int4 &b)
# endif
}
-ccl_device_inline int4 operator>=(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator<(const int4 a, const int b)
+{
+ return a < make_int4(b);
+}
+
+ccl_device_inline int4 operator==(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_cmpeq_epi32(a.m128, b.m128));
+# else
+ return make_int4(a.x == b.x, a.y == b.y, a.z == b.z, a.w == b.w);
+# endif
+}
+
+ccl_device_inline int4 operator==(const int4 a, const int b)
+{
+ return a == make_int4(b);
+}
+
+ccl_device_inline int4 operator>=(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
@@ -83,7 +95,12 @@ ccl_device_inline int4 operator>=(const int4 &a, const int4 &b)
# endif
}
-ccl_device_inline int4 operator&(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator>=(const int4 a, const int b)
+{
+ return a >= make_int4(b);
+}
+
+ccl_device_inline int4 operator&(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_and_si128(a.m128, b.m128));
@@ -92,6 +109,97 @@ ccl_device_inline int4 operator&(const int4 &a, const int4 &b)
# endif
}
+ccl_device_inline int4 operator|(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_or_si128(a.m128, b.m128));
+# else
+ return make_int4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
+# endif
+}
+
+ccl_device_inline int4 operator^(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_xor_si128(a.m128, b.m128));
+# else
+ return make_int4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
+# endif
+}
+
+ccl_device_inline int4 operator&(const int32_t a, const int4 b)
+{
+ return make_int4(a) & b;
+}
+
+ccl_device_inline int4 operator&(const int4 a, const int32_t b)
+{
+ return a & make_int4(b);
+}
+
+ccl_device_inline int4 operator|(const int32_t a, const int4 b)
+{
+ return make_int4(a) | b;
+}
+
+ccl_device_inline int4 operator|(const int4 a, const int32_t b)
+{
+ return a | make_int4(b);
+}
+
+ccl_device_inline int4 operator^(const int32_t a, const int4 b)
+{
+ return make_int4(a) ^ b;
+}
+
+ccl_device_inline int4 operator^(const int4 a, const int32_t b)
+{
+ return a ^ make_int4(b);
+}
+
+ccl_device_inline int4 &operator&=(int4 &a, const int4 b)
+{
+ return a = a & b;
+}
+ccl_device_inline int4 &operator&=(int4 &a, const int32_t b)
+{
+ return a = a & b;
+}
+
+ccl_device_inline int4 &operator|=(int4 &a, const int4 b)
+{
+ return a = a | b;
+}
+ccl_device_inline int4 &operator|=(int4 &a, const int32_t b)
+{
+ return a = a | b;
+}
+
+ccl_device_inline int4 &operator^=(int4 &a, const int4 b)
+{
+ return a = a ^ b;
+}
+ccl_device_inline int4 &operator^=(int4 &a, const int32_t b)
+{
+ return a = a ^ b;
+}
+
+ccl_device_inline int4 &operator<<=(int4 &a, const int32_t b)
+{
+ return a = a << b;
+}
+ccl_device_inline int4 &operator>>=(int4 &a, const int32_t b)
+{
+ return a = a >> b;
+}
+
+# ifdef __KERNEL_SSE__
+ccl_device_forceinline const int4 srl(const int4 a, const int32_t b)
+{
+ return int4(_mm_srli_epi32(a.m128, b));
+}
+# endif
+
ccl_device_inline int4 min(int4 a, int4 b)
{
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
@@ -110,12 +218,12 @@ ccl_device_inline int4 max(int4 a, int4 b)
# endif
}
-ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx)
+ccl_device_inline int4 clamp(const int4 a, const int4 mn, const int4 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b)
+ccl_device_inline int4 select(const int4 mask, const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)));
@@ -135,6 +243,52 @@ ccl_device_inline int4 load_int4(const int *v)
}
#endif /* __KERNEL_GPU__ */
+ccl_device_inline float4 cast(const int4 a)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_castsi128_ps(a));
+#else
+ return make_float4(
+ __int_as_float(a.x), __int_as_float(a.y), __int_as_float(a.z), __int_as_float(a.w));
+#endif
+}
+
+#ifdef __KERNEL_SSE__
+ccl_device_forceinline int4 andnot(const int4 a, const int4 b)
+{
+ return int4(_mm_andnot_si128(a.m128, b.m128));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline int4 shuffle(const int4 a)
+{
+# ifdef __KERNEL_NEON__
+ int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
+ return int4(vreinterpretq_m128i_s32(result));
+# else
+ return int4(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline int4 shuffle(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_NEON__
+ int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
+ vreinterpretq_s32_m128i(b));
+ return int4(vreinterpretq_m128i_s32(result));
+# else
+ return int4(_mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+# endif
+}
+
+template<size_t i0> ccl_device_forceinline int4 shuffle(const int4 b)
+{
+ return shuffle<i0, i0, i0, i0>(b);
+}
+#endif
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_INT4_H__ */
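The operators added to math_int4.h return per-component masks (all-ones lanes under SSE, 0/1 in the scalar fallback), which combine with the bitwise operators and select(). A minimal sketch using only helpers visible in this header (values are illustrative only):

  /* Hedged sketch: range test and masked blend with the new int4 operators. */
  int4 v = make_int4(-2, 5, 9, 1);

  int4 inside = (v >= 0) & (v < 8);         /* mask set in the lanes holding 5 and 1 */
  int4 r = select(inside, v, make_int4(0)); /* r is (0, 5, 0, 1) */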
diff --git a/intern/cycles/util/math_int8.h b/intern/cycles/util/math_int8.h
new file mode 100644
index 00000000000..d150b0b74ec
--- /dev/null
+++ b/intern/cycles/util/math_int8.h
@@ -0,0 +1,355 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef __UTIL_MATH_INT8_H__
+#define __UTIL_MATH_INT8_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline vint8 operator+(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_add_epi32(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator+=(vint8 &a, const vint8 b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline vint8 operator-(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_sub_epi32(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator-=(vint8 &a, const vint8 b)
+{
+ return a = a - b;
+}
+
+ccl_device_inline vint8 operator>>(const vint8 a, int i)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_srai_epi32(a.m256, i));
+# else
+ return make_vint8(
+ a.a >> i, a.b >> i, a.c >> i, a.d >> i, a.e >> i, a.f >> i, a.g >> i, a.h >> i);
+# endif
+}
+
+ccl_device_inline vint8 operator<<(const vint8 a, int i)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_slli_epi32(a.m256, i));
+# else
+ return make_vint8(
+ a.a << i, a.b << i, a.c << i, a.d << i, a.e << i, a.f << i, a.g << i, a.h << i);
+# endif
+}
+
+ccl_device_inline vint8 operator<(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_cmpgt_epi32(b.m256, a.m256));
+# else
+ return make_vint8(
+ a.a < b.a, a.b < b.b, a.c < b.c, a.d < b.d, a.e < b.e, a.f < b.f, a.g < b.g, a.h < b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator<(const vint8 a, const int b)
+{
+ return a < make_vint8(b);
+}
+
+ccl_device_inline vint8 operator==(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_cmpeq_epi32(a.m256, b.m256));
+# else
+ return make_vint8(a.a == b.a,
+ a.b == b.b,
+ a.c == b.c,
+ a.d == b.d,
+ a.e == b.e,
+ a.f == b.f,
+ a.g == b.g,
+ a.h == b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator==(const vint8 a, const int b)
+{
+ return a == make_vint8(b);
+}
+
+ccl_device_inline vint8 operator>=(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(
+ _mm256_xor_si256(_mm256_set1_epi32(0xffffffff), _mm256_cmpgt_epi32(b.m256, a.m256)));
+# else
+ return make_vint8(a.a >= b.a,
+ a.b >= b.b,
+ a.c >= b.c,
+ a.d >= b.d,
+ a.e >= b.e,
+ a.f >= b.f,
+ a.g >= b.g,
+ a.h >= b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator>=(const vint8 a, const int b)
+{
+ return a >= make_vint8(b);
+}
+
+ccl_device_inline vint8 operator&(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_and_si256(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a & b.a, a.b & b.b, a.c & b.c, a.d & b.d, a.e & b.e, a.f & b.f, a.g & b.g, a.h & b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator|(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_or_si256(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a | b.a, a.b | b.b, a.c | b.c, a.d | b.d, a.e | b.e, a.f | b.f, a.g | b.g, a.h | b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator^(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_xor_si256(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a ^ b.a, a.b ^ b.b, a.c ^ b.c, a.d ^ b.d, a.e ^ b.e, a.f ^ b.f, a.g ^ b.g, a.h ^ b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator&(const int32_t a, const vint8 b)
+{
+ return make_vint8(a) & b;
+}
+
+ccl_device_inline vint8 operator&(const vint8 a, const int32_t b)
+{
+ return a & make_vint8(b);
+}
+
+ccl_device_inline vint8 operator|(const int32_t a, const vint8 b)
+{
+ return make_vint8(a) | b;
+}
+
+ccl_device_inline vint8 operator|(const vint8 a, const int32_t b)
+{
+ return a | make_vint8(b);
+}
+
+ccl_device_inline vint8 operator^(const int32_t a, const vint8 b)
+{
+ return make_vint8(a) ^ b;
+}
+
+ccl_device_inline vint8 operator^(const vint8 a, const int32_t b)
+{
+ return a ^ make_vint8(b);
+}
+
+ccl_device_inline vint8 &operator&=(vint8 &a, const vint8 b)
+{
+ return a = a & b;
+}
+ccl_device_inline vint8 &operator&=(vint8 &a, const int32_t b)
+{
+ return a = a & b;
+}
+
+ccl_device_inline vint8 &operator|=(vint8 &a, const vint8 b)
+{
+ return a = a | b;
+}
+ccl_device_inline vint8 &operator|=(vint8 &a, const int32_t b)
+{
+ return a = a | b;
+}
+
+ccl_device_inline vint8 &operator^=(vint8 &a, const vint8 b)
+{
+ return a = a ^ b;
+}
+ccl_device_inline vint8 &operator^=(vint8 &a, const int32_t b)
+{
+ return a = a ^ b;
+}
+
+ccl_device_inline vint8 &operator<<=(vint8 &a, const int32_t b)
+{
+ return a = a << b;
+}
+ccl_device_inline vint8 &operator>>=(vint8 &a, const int32_t b)
+{
+ return a = a >> b;
+}
+
+# ifdef __KERNEL_AVX__
+ccl_device_forceinline const vint8 srl(const vint8 a, const int32_t b)
+{
+ return vint8(_mm256_srli_epi32(a.m256, b));
+}
+# endif
+
+ccl_device_inline vint8 min(vint8 a, vint8 b)
+{
+# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX2__)
+ return vint8(_mm256_min_epi32(a.m256, b.m256));
+# else
+ return make_vint8(min(a.a, b.a),
+ min(a.b, b.b),
+ min(a.c, b.c),
+ min(a.d, b.d),
+ min(a.e, b.e),
+ min(a.f, b.f),
+ min(a.g, b.g),
+ min(a.h, b.h));
+# endif
+}
+
+ccl_device_inline vint8 max(vint8 a, vint8 b)
+{
+# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX2__)
+ return vint8(_mm256_max_epi32(a.m256, b.m256));
+# else
+ return make_vint8(max(a.a, b.a),
+ max(a.b, b.b),
+ max(a.c, b.c),
+ max(a.d, b.d),
+ max(a.e, b.e),
+ max(a.f, b.f),
+ max(a.g, b.g),
+ max(a.h, b.h));
+# endif
+}
+
+ccl_device_inline vint8 clamp(const vint8 a, const vint8 mn, const vint8 mx)
+{
+ return min(max(a, mn), mx);
+}
+
+ccl_device_inline vint8 select(const vint8 mask, const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_castps_si256(_mm256_blendv_ps(
+ _mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))));
+# else
+ return make_vint8((mask.a) ? a.a : b.a,
+ (mask.b) ? a.b : b.b,
+ (mask.c) ? a.c : b.c,
+ (mask.d) ? a.d : b.d,
+ (mask.e) ? a.e : b.e,
+ (mask.f) ? a.f : b.f,
+ (mask.g) ? a.g : b.g,
+ (mask.h) ? a.h : b.h);
+# endif
+}
+
+ccl_device_inline vint8 load_vint8(const int *v)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_loadu_si256((__m256i *)v));
+# else
+ return make_vint8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+# endif
+}
+#endif /* __KERNEL_GPU__ */
+
+ccl_device_inline vfloat8 cast(const vint8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_castsi256_ps(a));
+#else
+ return make_vfloat8(__int_as_float(a.a),
+ __int_as_float(a.b),
+ __int_as_float(a.c),
+ __int_as_float(a.d),
+ __int_as_float(a.e),
+ __int_as_float(a.f),
+ __int_as_float(a.g),
+ __int_as_float(a.h));
+#endif
+}
+
+#ifdef __KERNEL_AVX__
+template<size_t i> ccl_device_forceinline const vint8 shuffle(const vint8 a)
+{
+ return vint8(
+ _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i))));
+}
+
+template<size_t i0, size_t i1> ccl_device_forceinline const vint8 shuffle(const vint8 a)
+{
+ return vint8(_mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0)));
+}
+
+template<size_t i0, size_t i1>
+ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b)
+{
+ return vint8(_mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vint8 shuffle(const vint8 a)
+{
+ return vint8(
+ _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0))));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b)
+{
+ return vint8(_mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+}
+
+template<> __forceinline const vint8 shuffle<0, 0, 2, 2>(const vint8 b)
+{
+ return vint8(_mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b))));
+}
+template<> __forceinline const vint8 shuffle<1, 1, 3, 3>(const vint8 b)
+{
+ return vint8(_mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b))));
+}
+template<> __forceinline const vint8 shuffle<0, 1, 0, 1>(const vint8 b)
+{
+ return vint8(_mm256_castps_si256(
+ _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b))))));
+}
+#endif
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT8_H__ */
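math_int8.h mirrors the int4 API for the 8-wide type, so the same masking pattern scales to AVX2 kernels. A minimal sketch, assuming make_vint8() from util/types_int8.h (the input array is illustrative only):

  /* Hedged sketch: load, range-test and blend eight ints at once. */
  const int data[8] = {3, -1, 7, 12, 0, 5, 9, 2};
  vint8 v = load_vint8(data);

  vint8 inside = (v >= 0) & (v < 8);          /* per-lane mask */
  vint8 r = select(inside, v, make_vint8(0)); /* r is (3, 0, 7, 0, 0, 5, 0, 2) */

The scalar fallback produces 0/1 truth values rather than all-ones masks, but select() treats any non-zero lane as true on that path, so the pattern behaves the same with or without AVX.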
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index aa28682f8c1..0727debf775 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -133,7 +133,9 @@ ccl_device_forceinline float ray_triangle_rcp(const float x)
ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
+ return madd(make_float4(a.x),
+ make_float4(b.x),
+ madd(make_float4(a.y), make_float4(b.y), make_float4(a.z) * make_float4(b.z)))[0];
#else
return a.x * b.x + a.y * b.y + a.z * b.z;
#endif
@@ -142,9 +144,10 @@ ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
- msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
- msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
+ return make_float3(
+ msub(make_float4(a.y), make_float4(b.z), make_float4(a.z) * make_float4(b.y))[0],
+ msub(make_float4(a.z), make_float4(b.x), make_float4(a.x) * make_float4(b.z))[0],
+ msub(make_float4(a.x), make_float4(b.y), make_float4(a.y) * make_float4(b.x))[0]);
#else
return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
#endif
diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp
index 17cff2f2977..cb6b8d7a740 100644
--- a/intern/cycles/util/path.cpp
+++ b/intern/cycles/util/path.cpp
@@ -2,8 +2,11 @@
* Copyright 2011-2022 Blender Foundation */
#include "util/path.h"
+#include "util/algorithm.h"
+#include "util/map.h"
#include "util/md5.h"
#include "util/string.h"
+#include "util/vector.h"
#include <OpenImageIO/filesystem.h>
#include <OpenImageIO/strutil.h>
@@ -898,19 +901,54 @@ FILE *path_fopen(const string &path, const string &mode)
#endif
}
-void path_cache_clear_except(const string &name, const set<string> &except)
+/* LRU Cache for Kernels */
+
+static void path_cache_kernel_mark_used(const string &path)
{
- string dir = path_user_get("cache");
+ std::time_t current_time = std::time(nullptr);
+ OIIO::Filesystem::last_write_time(path, current_time);
+}
- if (path_exists(dir)) {
- directory_iterator it(dir), it_end;
+bool path_cache_kernel_exists_and_mark_used(const string &path)
+{
+ if (path_exists(path)) {
+ path_cache_kernel_mark_used(path);
+ return true;
+ }
+ else {
+ return false;
+ }
+}
- for (; it != it_end; ++it) {
- string filename = path_filename(it->path());
+void path_cache_kernel_mark_added_and_clear_old(const string &new_path,
+ const size_t max_old_kernel_of_same_type)
+{
+ path_cache_kernel_mark_used(new_path);
+
+ string dir = path_dirname(new_path);
+ if (!path_exists(dir)) {
+ return;
+ }
+
+ /* Remove older kernels within the same directory. */
+ directory_iterator it(dir), it_end;
+ vector<pair<std::time_t, string>> same_kernel_types;
+
+ for (; it != it_end; ++it) {
+ const string &path = it->path();
+ if (path == new_path) {
+ continue;
+ }
+
+ std::time_t last_time = OIIO::Filesystem::last_write_time(path);
+ same_kernel_types.emplace_back(last_time, path);
+ }
+
+ if (same_kernel_types.size() > max_old_kernel_of_same_type) {
+ sort(same_kernel_types.begin(), same_kernel_types.end());
- if (string_startswith(filename, name.c_str()))
- if (except.find(filename) == except.end())
- path_remove(it->path());
+ for (int i = 0; i < same_kernel_types.size() - max_old_kernel_of_same_type; i++) {
+ path_remove(same_kernel_types[i].second);
}
}
}
diff --git a/intern/cycles/util/path.h b/intern/cycles/util/path.h
index 48b1fb65919..6d02267e182 100644
--- a/intern/cycles/util/path.h
+++ b/intern/cycles/util/path.h
@@ -55,8 +55,15 @@ bool path_remove(const string &path);
/* source code utility */
string path_source_replace_includes(const string &source, const string &path);
-/* cache utility */
-void path_cache_clear_except(const string &name, const set<string> &except);
+/* Simple least-recently-used cache for kernels.
+ *
+ * Kernels of the same type are cached in the same directory.
+ * Whenever a kernel is used, its last modified time is updated.
+ * When a new kernel is added to the cache, old entries of the same type (i.e. in the same
+ * directory) are cleared. */
+bool path_cache_kernel_exists_and_mark_used(const string &path);
+void path_cache_kernel_mark_added_and_clear_old(const string &path,
+ const size_t max_old_kernel_of_same_type = 5);
CCL_NAMESPACE_END
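A minimal sketch of how a caller might drive the new kernel cache; the kernel file name and the compile step are hypothetical placeholders, while path_cache_get() and path_join() are existing path utilities:

  /* Hedged sketch: reuse a cached kernel binary if present, otherwise build it
   * and let the cache evict older kernels of the same type (same directory). */
  const string cubin = path_cache_get(path_join("kernels", "example_kernel.cubin"));

  if (!path_cache_kernel_exists_and_mark_used(cubin)) {
    compile_kernel_to(cubin); /* hypothetical build step */
    /* Default keeps at most 5 older kernels next to the newly added one. */
    path_cache_kernel_mark_added_and_clear_old(cubin);
  }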
diff --git a/intern/cycles/util/sseb.h b/intern/cycles/util/sseb.h
deleted file mode 100644
index 6f78299711e..00000000000
--- a/intern/cycles/util/sseb.h
+++ /dev/null
@@ -1,345 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_SSEB_H__
-#define __UTIL_SSEB_H__
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_SSE2__
-
-struct ssei;
-struct ssef;
-
-/*! 4-wide SSE bool type. */
-struct sseb {
- typedef sseb Mask; // mask type
- typedef ssei Int; // int type
- typedef ssef Float; // float type
-
- enum { size = 4 }; // number of SIMD elements
- union {
- __m128 m128;
- int32_t v[4];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline sseb()
- {
- }
- __forceinline sseb(const sseb &other)
- {
- m128 = other.m128;
- }
- __forceinline sseb &operator=(const sseb &other)
- {
- m128 = other.m128;
- return *this;
- }
-
- __forceinline sseb(const __m128 input) : m128(input)
- {
- }
- __forceinline operator const __m128 &(void) const
- {
- return m128;
- }
- __forceinline operator const __m128i(void) const
- {
- return _mm_castps_si128(m128);
- }
- __forceinline operator const __m128d(void) const
- {
- return _mm_castps_pd(m128);
- }
-
- __forceinline sseb(bool a)
- : m128(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)])
- {
- }
- __forceinline sseb(bool a, bool b)
- : m128(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)])
- {
- }
- __forceinline sseb(bool a, bool b, bool c, bool d)
- : m128(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)])
- {
- }
- __forceinline sseb(int mask)
- {
- assert(mask >= 0 && mask < 16);
- m128 = _mm_lookupmask_ps[mask];
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constants
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline sseb(FalseTy) : m128(_mm_setzero_ps())
- {
- }
- __forceinline sseb(TrueTy)
- : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline bool operator[](const size_t i) const
- {
- assert(i < 4);
- return (_mm_movemask_ps(m128) >> i) & 1;
- }
- __forceinline int32_t &operator[](const size_t i)
- {
- assert(i < 4);
- return v[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator!(const sseb &a)
-{
- return _mm_xor_ps(a, sseb(True));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator&(const sseb &a, const sseb &b)
-{
- return _mm_and_ps(a, b);
-}
-__forceinline const sseb operator|(const sseb &a, const sseb &b)
-{
- return _mm_or_ps(a, b);
-}
-__forceinline const sseb operator^(const sseb &a, const sseb &b)
-{
- return _mm_xor_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator&=(sseb &a, const sseb &b)
-{
- return a = a & b;
-}
-__forceinline const sseb operator|=(sseb &a, const sseb &b)
-{
- return a = a | b;
-}
-__forceinline const sseb operator^=(sseb &a, const sseb &b)
-{
- return a = a ^ b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator!=(const sseb &a, const sseb &b)
-{
- return _mm_xor_ps(a, b);
-}
-__forceinline const sseb operator==(const sseb &a, const sseb &b)
-{
- return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b));
-}
-
-__forceinline const sseb select(const sseb &m, const sseb &t, const sseb &f)
-{
-# if defined(__KERNEL_SSE41__)
- return _mm_blendv_ps(f, t, m);
-# else
- return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb unpacklo(const sseb &a, const sseb &b)
-{
- return _mm_unpacklo_ps(a, b);
-}
-__forceinline const sseb unpackhi(const sseb &a, const sseb &b)
-{
- return _mm_unpackhi_ps(a, b);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const sseb shuffle(const sseb &a)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a);
-# else
- return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
-# endif
-}
-
-# ifndef __KERNEL_NEON__
-template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a)
-{
- return _mm_movelh_ps(a, a);
-}
-
-template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a)
-{
- return _mm_movehl_ps(a, a);
-}
-# endif
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const sseb shuffle(const sseb &a, const sseb &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b);
-# else
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-# endif
-}
-
-# ifndef __KERNEL_NEON__
-template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b)
-{
- return _mm_movelh_ps(a, b);
-}
-
-template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sseb &b)
-{
- return _mm_movehl_ps(b, a);
-}
-# endif
-
-# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__)
-template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a)
-{
- return _mm_moveldup_ps(a);
-}
-template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a)
-{
- return _mm_movehdup_ps(a);
-}
-# endif
-
-# if defined(__KERNEL_SSE41__)
-template<size_t dst, size_t src, size_t clr>
-__forceinline const sseb insert(const sseb &a, const sseb &b)
-{
-# ifdef __KERNEL_NEON__
- sseb res = a;
- if (clr)
- res[dst] = 0;
- else
- res[dst] = b[src];
- return res;
-# else
- return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
-# endif
-}
-template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b)
-{
- return insert<dst, src, 0>(a, b);
-}
-template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b)
-{
- return insert<dst, 0>(a, sseb(b));
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reduction Operations
-////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_SSE41__)
-__forceinline uint32_t popcnt(const sseb &a)
-{
-# if defined(__KERNEL_NEON__)
- const int32x4_t mask = {1, 1, 1, 1};
- int32x4_t t = vandq_s32(vreinterpretq_s32_m128(a.m128), mask);
- return vaddvq_s32(t);
-# else
- return _mm_popcnt_u32(_mm_movemask_ps(a));
-# endif
-}
-# else
-__forceinline uint32_t popcnt(const sseb &a)
-{
- return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]);
-}
-# endif
-
-__forceinline bool reduce_and(const sseb &a)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) == -4;
-# else
- return _mm_movemask_ps(a) == 0xf;
-# endif
-}
-__forceinline bool reduce_or(const sseb &a)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) != 0x0;
-# else
- return _mm_movemask_ps(a) != 0x0;
-# endif
-}
-__forceinline bool all(const sseb &b)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == -4;
-# else
- return _mm_movemask_ps(b) == 0xf;
-# endif
-}
-__forceinline bool any(const sseb &b)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) != 0x0;
-# else
- return _mm_movemask_ps(b) != 0x0;
-# endif
-}
-__forceinline bool none(const sseb &b)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == 0x0;
-# else
- return _mm_movemask_ps(b) == 0x0;
-# endif
-}
-
-__forceinline uint32_t movemask(const sseb &a)
-{
- return _mm_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_sseb(const char *label, const sseb &a)
-{
- printf("%s: %d %d %d %d\n", label, a[0], a[1], a[2], a[3]);
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h
deleted file mode 100644
index 1e2bfa90354..00000000000
--- a/intern/cycles/util/ssef.h
+++ /dev/null
@@ -1,1090 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_SSEF_H__
-#define __UTIL_SSEF_H__
-
-#include <math.h>
-
-#include "util/ssei.h"
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_SSE2__
-
-struct sseb;
-struct ssef;
-
-/*! 4-wide SSE float type. */
-struct ssef {
- typedef sseb Mask; // mask type
- typedef ssei Int; // int type
- typedef ssef Float; // float type
-
- enum { size = 4 }; // number of SIMD elements
- union {
- __m128 m128;
- float f[4];
- int i[4];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline ssef()
- {
- }
- __forceinline ssef(const ssef &other)
- {
- m128 = other.m128;
- }
- __forceinline ssef &operator=(const ssef &other)
- {
- m128 = other.m128;
- return *this;
- }
-
- __forceinline ssef(const __m128 a) : m128(a)
- {
- }
- __forceinline operator const __m128 &() const
- {
- return m128;
- }
- __forceinline operator __m128 &()
- {
- return m128;
- }
-
- __forceinline ssef(float a) : m128(_mm_set1_ps(a))
- {
- }
- __forceinline ssef(float a, float b, float c, float d) : m128(_mm_setr_ps(a, b, c, d))
- {
- }
-
- __forceinline explicit ssef(const __m128i a) : m128(_mm_cvtepi32_ps(a))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Loads and Stores
- ////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_AVX__)
- static __forceinline ssef broadcast(const void *const a)
- {
- return _mm_broadcast_ss((float *)a);
- }
-# else
- static __forceinline ssef broadcast(const void *const a)
- {
- return _mm_set1_ps(*(float *)a);
- }
-# endif
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline const float &operator[](const size_t i) const
- {
- assert(i < 4);
- return f[i];
- }
- __forceinline float &operator[](const size_t i)
- {
- assert(i < 4);
- return f[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef cast(const __m128i &a)
-{
- return _mm_castsi128_ps(a);
-}
-__forceinline const ssef operator+(const ssef &a)
-{
- return a;
-}
-__forceinline const ssef operator-(const ssef &a)
-{
- return _mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-}
-__forceinline const ssef abs(const ssef &a)
-{
- return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
-}
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssef sign(const ssef &a)
-{
- return _mm_blendv_ps(ssef(1.0f), -ssef(1.0f), _mm_cmplt_ps(a, ssef(0.0f)));
-}
-# endif
-__forceinline const ssef signmsk(const ssef &a)
-{
- return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-}
-
-__forceinline const ssef rcp(const ssef &a)
-{
- const ssef r = _mm_rcp_ps(a.m128);
- return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-}
-__forceinline const ssef sqr(const ssef &a)
-{
- return _mm_mul_ps(a, a);
-}
-__forceinline const ssef mm_sqrt(const ssef &a)
-{
- return _mm_sqrt_ps(a.m128);
-}
-__forceinline const ssef rsqrt(const ssef &a)
-{
- const ssef r = _mm_rsqrt_ps(a.m128);
- return _mm_add_ps(
- _mm_mul_ps(_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f), r),
- _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set_ps(-0.5f, -0.5f, -0.5f, -0.5f)), r),
- _mm_mul_ps(r, r)));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef operator+(const ssef &a, const ssef &b)
-{
- return _mm_add_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator+(const ssef &a, const float &b)
-{
- return a + ssef(b);
-}
-__forceinline const ssef operator+(const float &a, const ssef &b)
-{
- return ssef(a) + b;
-}
-
-__forceinline const ssef operator-(const ssef &a, const ssef &b)
-{
- return _mm_sub_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator-(const ssef &a, const float &b)
-{
- return a - ssef(b);
-}
-__forceinline const ssef operator-(const float &a, const ssef &b)
-{
- return ssef(a) - b;
-}
-
-__forceinline const ssef operator*(const ssef &a, const ssef &b)
-{
- return _mm_mul_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator*(const ssef &a, const float &b)
-{
- return a * ssef(b);
-}
-__forceinline const ssef operator*(const float &a, const ssef &b)
-{
- return ssef(a) * b;
-}
-
-__forceinline const ssef operator/(const ssef &a, const ssef &b)
-{
- return _mm_div_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator/(const ssef &a, const float &b)
-{
- return a / ssef(b);
-}
-__forceinline const ssef operator/(const float &a, const ssef &b)
-{
- return ssef(a) / b;
-}
-
-__forceinline const ssef operator^(const ssef &a, const ssef &b)
-{
- return _mm_xor_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator^(const ssef &a, const ssei &b)
-{
- return _mm_xor_ps(a.m128, _mm_castsi128_ps(b.m128));
-}
-
-__forceinline const ssef operator&(const ssef &a, const ssef &b)
-{
- return _mm_and_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator&(const ssef &a, const ssei &b)
-{
- return _mm_and_ps(a.m128, _mm_castsi128_ps(b.m128));
-}
-
-__forceinline const ssef operator|(const ssef &a, const ssef &b)
-{
- return _mm_or_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator|(const ssef &a, const ssei &b)
-{
- return _mm_or_ps(a.m128, _mm_castsi128_ps(b.m128));
-}
-
-__forceinline const ssef andnot(const ssef &a, const ssef &b)
-{
- return _mm_andnot_ps(a.m128, b.m128);
-}
-
-__forceinline const ssef min(const ssef &a, const ssef &b)
-{
- return _mm_min_ps(a.m128, b.m128);
-}
-__forceinline const ssef min(const ssef &a, const float &b)
-{
- return _mm_min_ps(a.m128, ssef(b));
-}
-__forceinline const ssef min(const float &a, const ssef &b)
-{
- return _mm_min_ps(ssef(a), b.m128);
-}
-
-__forceinline const ssef max(const ssef &a, const ssef &b)
-{
- return _mm_max_ps(a.m128, b.m128);
-}
-__forceinline const ssef max(const ssef &a, const float &b)
-{
- return _mm_max_ps(a.m128, ssef(b));
-}
-__forceinline const ssef max(const float &a, const ssef &b)
-{
- return _mm_max_ps(ssef(a), b.m128);
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline ssef mini(const ssef &a, const ssef &b)
-{
- const ssei ai = _mm_castps_si128(a);
- const ssei bi = _mm_castps_si128(b);
- const ssei ci = _mm_min_epi32(ai, bi);
- return _mm_castsi128_ps(ci);
-}
-# endif
-
-# if defined(__KERNEL_SSE41__)
-__forceinline ssef maxi(const ssef &a, const ssef &b)
-{
- const ssei ai = _mm_castps_si128(a);
- const ssei bi = _mm_castps_si128(b);
- const ssei ci = _mm_max_epi32(ai, bi);
- return _mm_castsi128_ps(ci);
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Ternary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmaq_f32(c, a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fmadd_ps(a, b, c);
-# else
- return a * b + c;
-# endif
-}
-__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmaq_f32(vnegq_f32(c), a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fmsub_ps(a, b, c);
-# else
- return a * b - c;
-# endif
-}
-__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmsq_f32(c, a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fnmadd_ps(a, b, c);
-# else
- return c - a * b;
-# endif
-}
-__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmsq_f32(vnegq_f32(c), a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fnmsub_ps(a, b, c);
-# else
- return -a * b - c;
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef &operator+=(ssef &a, const ssef &b)
-{
- return a = a + b;
-}
-__forceinline ssef &operator+=(ssef &a, const float &b)
-{
- return a = a + b;
-}
-
-__forceinline ssef &operator-=(ssef &a, const ssef &b)
-{
- return a = a - b;
-}
-__forceinline ssef &operator-=(ssef &a, const float &b)
-{
- return a = a - b;
-}
-
-__forceinline ssef &operator*=(ssef &a, const ssef &b)
-{
- return a = a * b;
-}
-__forceinline ssef &operator*=(ssef &a, const float &b)
-{
- return a = a * b;
-}
-
-__forceinline ssef &operator/=(ssef &a, const ssef &b)
-{
- return a = a / b;
-}
-__forceinline ssef &operator/=(ssef &a, const float &b)
-{
- return a = a / b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator==(const ssef &a, const ssef &b)
-{
- return _mm_cmpeq_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator==(const ssef &a, const float &b)
-{
- return a == ssef(b);
-}
-__forceinline const sseb operator==(const float &a, const ssef &b)
-{
- return ssef(a) == b;
-}
-
-__forceinline const sseb operator!=(const ssef &a, const ssef &b)
-{
- return _mm_cmpneq_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator!=(const ssef &a, const float &b)
-{
- return a != ssef(b);
-}
-__forceinline const sseb operator!=(const float &a, const ssef &b)
-{
- return ssef(a) != b;
-}
-
-__forceinline const sseb operator<(const ssef &a, const ssef &b)
-{
- return _mm_cmplt_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator<(const ssef &a, const float &b)
-{
- return a < ssef(b);
-}
-__forceinline const sseb operator<(const float &a, const ssef &b)
-{
- return ssef(a) < b;
-}
-
-__forceinline const sseb operator>=(const ssef &a, const ssef &b)
-{
- return _mm_cmpnlt_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator>=(const ssef &a, const float &b)
-{
- return a >= ssef(b);
-}
-__forceinline const sseb operator>=(const float &a, const ssef &b)
-{
- return ssef(a) >= b;
-}
-
-__forceinline const sseb operator>(const ssef &a, const ssef &b)
-{
- return _mm_cmpnle_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator>(const ssef &a, const float &b)
-{
- return a > ssef(b);
-}
-__forceinline const sseb operator>(const float &a, const ssef &b)
-{
- return ssef(a) > b;
-}
-
-__forceinline const sseb operator<=(const ssef &a, const ssef &b)
-{
- return _mm_cmple_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator<=(const ssef &a, const float &b)
-{
- return a <= ssef(b);
-}
-__forceinline const sseb operator<=(const float &a, const ssef &b)
-{
- return ssef(a) <= b;
-}
-
-__forceinline const ssef select(const sseb &m, const ssef &t, const ssef &f)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_blendv_ps(f, t, m);
-# else
- return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
-# endif
-}
-
-__forceinline const ssef select(const ssef &m, const ssef &t, const ssef &f)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_blendv_ps(f, t, m);
-# else
- return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
-# endif
-}
-
-__forceinline const ssef select(const int mask, const ssef &t, const ssef &f)
-{
-# if defined(__KERNEL_SSE41__) && \
- ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
- return _mm_blend_ps(f, t, mask);
-# else
- return select(sseb(mask), t, f);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Rounding Functions
-////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssef round_even(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndnq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
-# endif
-}
-__forceinline const ssef round_down(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndmq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
-# endif
-}
-__forceinline const ssef round_up(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndpq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
-# endif
-}
-__forceinline const ssef round_zero(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
-# endif
-}
-__forceinline const ssef floor(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndmq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
-# endif
-}
-__forceinline const ssef ceil(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndpq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
-# endif
-}
-# else
-/* Non-SSE4.1 fallback, needed for floorfrac. */
-__forceinline const ssef floor(const ssef &a)
-{
- return _mm_set_ps(floorf(a.f[3]), floorf(a.f[2]), floorf(a.f[1]), floorf(a.f[0]));
-}
-# endif
-
-__forceinline ssei truncatei(const ssef &a)
-{
- return _mm_cvttps_epi32(a.m128);
-}
-
-__forceinline ssef floorfrac(const ssef &x, ssei *i)
-{
- ssef f = floor(x);
- *i = truncatei(f);
- return x - f;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Common Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef mix(const ssef &a, const ssef &b, const ssef &t)
-{
- return madd(t, b, (ssef(1.0f) - t) * a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef unpacklo(const ssef &a, const ssef &b)
-{
- return _mm_unpacklo_ps(a.m128, b.m128);
-}
-__forceinline ssef unpackhi(const ssef &a, const ssef &b)
-{
- return _mm_unpackhi_ps(a.m128, b.m128);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssef shuffle(const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128);
-# else
- return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-# endif
-}
-
-template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a)
-{
- return _mm_movelh_ps(a, a);
-}
-
-template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a)
-{
- return _mm_movehl_ps(a, a);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssef shuffle(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b);
-# else
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-# endif
-}
-
-template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b);
-# else
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
-# endif
-}
-
-# ifndef __KERNEL_NEON__
-template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b)
-{
- return _mm_movelh_ps(a, b);
-}
-
-template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const ssef &b)
-{
- return _mm_movehl_ps(b, a);
-}
-# endif
-
-# if defined(__KERNEL_SSSE3__)
-__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf)
-{
- return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf));
-}
-# endif
-
-# if defined(__KERNEL_SSE3__)
-template<> __forceinline const ssef shuffle<0, 0, 2, 2>(const ssef &b)
-{
- return _mm_moveldup_ps(b);
-}
-template<> __forceinline const ssef shuffle<1, 1, 3, 3>(const ssef &b)
-{
- return _mm_movehdup_ps(b);
-}
-# endif
-
-template<size_t i0> __forceinline const ssef shuffle(const ssef &b)
-{
- return shuffle<i0, i0, i0, i0>(b);
-}
-
-# if defined(__KERNEL_AVX__)
-__forceinline const ssef shuffle(const ssef &a, const ssei &shuf)
-{
- return _mm_permutevar_ps(a, shuf);
-}
-# endif
-
-template<size_t i> __forceinline float extract(const ssef &a)
-{
- return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
-}
-template<> __forceinline float extract<0>(const ssef &a)
-{
- return _mm_cvtss_f32(a);
-}
-
-# if defined(__KERNEL_SSE41__)
-template<size_t dst, size_t src, size_t clr>
-__forceinline const ssef insert(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- ssef res = a;
- if (clr)
- res[dst] = 0;
- else
- res[dst] = b[src];
- return res;
-# else
- return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
-# endif
-}
-template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b)
-{
- return insert<dst, src, 0>(a, b);
-}
-template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b)
-{
- return insert<dst, 0>(a, _mm_set_ss(b));
-}
-# else
-template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b)
-{
- ssef c = a;
- c[dst] = b;
- return c;
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Transpose
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline void transpose(const ssef &r0,
- const ssef &r1,
- const ssef &r2,
- const ssef &r3,
- ssef &c0,
- ssef &c1,
- ssef &c2,
- ssef &c3)
-{
- ssef l02 = unpacklo(r0, r2);
- ssef h02 = unpackhi(r0, r2);
- ssef l13 = unpacklo(r1, r3);
- ssef h13 = unpackhi(r1, r3);
- c0 = unpacklo(l02, l13);
- c1 = unpackhi(l02, l13);
- c2 = unpacklo(h02, h13);
- c3 = unpackhi(h02, h13);
-}
-
-__forceinline void transpose(
- const ssef &r0, const ssef &r1, const ssef &r2, const ssef &r3, ssef &c0, ssef &c1, ssef &c2)
-{
- ssef l02 = unpacklo(r0, r2);
- ssef h02 = unpackhi(r0, r2);
- ssef l13 = unpacklo(r1, r3);
- ssef h13 = unpackhi(r1, r3);
- c0 = unpacklo(l02, l13);
- c1 = unpackhi(l02, l13);
- c2 = unpacklo(h02, h13);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef vreduce_min(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vdupq_n_f32(vminvq_f32(v));
-# else
- ssef h = min(shuffle<1, 0, 3, 2>(v), v);
- return min(shuffle<2, 3, 0, 1>(h), h);
-# endif
-}
-__forceinline const ssef vreduce_max(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vdupq_n_f32(vmaxvq_f32(v));
-# else
- ssef h = max(shuffle<1, 0, 3, 2>(v), v);
- return max(shuffle<2, 3, 0, 1>(h), h);
-# endif
-}
-__forceinline const ssef vreduce_add(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vdupq_n_f32(vaddvq_f32(v));
-# else
- ssef h = shuffle<1, 0, 3, 2>(v) + v;
- return shuffle<2, 3, 0, 1>(h) + h;
-# endif
-}
-
-__forceinline float reduce_min(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vminvq_f32(v);
-# else
- return _mm_cvtss_f32(vreduce_min(v));
-# endif
-}
-__forceinline float reduce_max(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vmaxvq_f32(v);
-# else
- return _mm_cvtss_f32(vreduce_max(v));
-# endif
-}
-__forceinline float reduce_add(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vaddvq_f32(v);
-# else
- return _mm_cvtss_f32(vreduce_add(v));
-# endif
-}
-
-__forceinline uint32_t select_min(const ssef &v)
-{
- return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const ssef &v)
-{
- return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const sseb &valid, const ssef &v)
-{
- const ssef a = select(valid, v, ssef(pos_inf));
- return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const sseb &valid, const ssef &v)
-{
- const ssef a = select(valid, v, ssef(neg_inf));
- return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-__forceinline uint32_t movemask(const ssef &a)
-{
- return _mm_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Memory load and store operations
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef load4f(const float4 &a)
-{
-# ifdef __KERNEL_WITH_SSE_ALIGN__
- return _mm_load_ps(&a.x);
-# else
- return _mm_loadu_ps(&a.x);
-# endif
-}
-
-__forceinline ssef load4f(const float3 &a)
-{
-# ifdef __KERNEL_WITH_SSE_ALIGN__
- return _mm_load_ps(&a.x);
-# else
- return _mm_loadu_ps(&a.x);
-# endif
-}
-
-__forceinline ssef load4f(const void *const a)
-{
- return _mm_load_ps((float *)a);
-}
-
-__forceinline ssef load1f_first(const float a)
-{
- return _mm_set_ss(a);
-}
-
-__forceinline void store4f(void *ptr, const ssef &v)
-{
- _mm_store_ps((float *)ptr, v);
-}
-
-__forceinline ssef loadu4f(const void *const a)
-{
- return _mm_loadu_ps((float *)a);
-}
-
-__forceinline void storeu4f(void *ptr, const ssef &v)
-{
- _mm_storeu_ps((float *)ptr, v);
-}
-
-__forceinline void store4f(const sseb &mask, void *ptr, const ssef &f)
-{
-# if defined(__KERNEL_AVX__)
- _mm_maskstore_ps((float *)ptr, (__m128i)mask, f);
-# else
- *(ssef *)ptr = select(mask, f, *(ssef *)ptr);
-# endif
-}
-
-__forceinline ssef load4f_nt(void *ptr)
-{
-# if defined(__KERNEL_SSE41__)
- return _mm_castsi128_ps(_mm_stream_load_si128((__m128i *)ptr));
-# else
- return _mm_load_ps((float *)ptr);
-# endif
-}
-
-__forceinline void store4f_nt(void *ptr, const ssef &v)
-{
-# if defined(__KERNEL_SSE41__)
- _mm_stream_ps((float *)ptr, v);
-# else
- _mm_store_ps((float *)ptr, v);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Euclidean Space Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline float dot(const ssef &a, const ssef &b)
-{
- return reduce_add(a * b);
-}
-
-/* calculate shuffled cross product, useful when order of components does not matter */
-__forceinline ssef cross_zxy(const ssef &a, const ssef &b)
-{
- const ssef a0 = a;
- const ssef b0 = shuffle<1, 2, 0, 3>(b);
- const ssef a1 = shuffle<1, 2, 0, 3>(a);
- const ssef b1 = b;
- return msub(a0, b0, a1 * b1);
-}
-
-__forceinline ssef cross(const ssef &a, const ssef &b)
-{
- return shuffle<1, 2, 0, 3>(cross_zxy(a, b));
-}
-
-ccl_device_inline const ssef dot3_splat(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_dp_ps(a.m128, b.m128, 0x7f);
-# else
- ssef t = a * b;
- return ssef(((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2]);
-# endif
-}
-
-/* squared length taking only specified axes into account */
-template<size_t X, size_t Y, size_t Z, size_t W> ccl_device_inline float len_squared(const ssef &a)
-{
-# ifndef __KERNEL_SSE41__
- float4 &t = (float4 &)a;
- return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) +
- (W ? t.w * t.w : 0.0f);
-# else
- return extract<0>(
- ssef(_mm_dp_ps(a.m128, a.m128, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf)));
-# endif
-}
-
-ccl_device_inline float dot3(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_SSE41__
- return extract<0>(ssef(_mm_dp_ps(a.m128, b.m128, 0x7f)));
-# else
- ssef t = a * b;
- return ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
-# endif
-}
-
-ccl_device_inline const ssef len3_squared_splat(const ssef &a)
-{
- return dot3_splat(a, a);
-}
-
-ccl_device_inline float len3_squared(const ssef &a)
-{
- return dot3(a, a);
-}
-
-ccl_device_inline float len3(const ssef &a)
-{
- return extract<0>(mm_sqrt(dot3_splat(a, a)));
-}
-
-/* SSE shuffle utility functions */
-
-# ifdef __KERNEL_SSSE3__
-
-/* faster version for SSSE3 */
-typedef ssei shuffle_swap_t;
-
-ccl_device_inline shuffle_swap_t shuffle_swap_identity()
-{
- return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-}
-
-ccl_device_inline shuffle_swap_t shuffle_swap_swap()
-{
- return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
-}
-
-ccl_device_inline const ssef shuffle_swap(const ssef &a, const shuffle_swap_t &shuf)
-{
- return cast(_mm_shuffle_epi8(cast(a), shuf));
-}
-
-# else
-
-/* somewhat slower version for SSE2 */
-typedef int shuffle_swap_t;
-
-ccl_device_inline shuffle_swap_t shuffle_swap_identity()
-{
- return 0;
-}
-
-ccl_device_inline shuffle_swap_t shuffle_swap_swap()
-{
- return 1;
-}
-
-ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf)
-{
- /* shuffle value must be a constant, so we need to branch */
- if (shuf)
- return shuffle<1, 0, 3, 2>(a);
- else
- return shuffle<3, 2, 1, 0>(a);
-}
-
-# endif
-
-# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__)
-
-ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
- const shuffle_swap_t &shuf_identity,
- const shuffle_swap_t &shuf_swap,
- const float3 &idir,
- ssef idirsplat[3],
- shuffle_swap_t shufflexyz[3])
-{
- const __m128 idirsplat_raw[] = {_mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z)};
- idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn);
- idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn);
- idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn);
-
- const ssef signmask = cast(ssei(0x80000000));
- const ssef shuf_identity_f = cast(shuf_identity);
- const ssef shuf_swap_f = cast(shuf_swap);
-
- shufflexyz[0] = _mm_castps_si128(
- _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask)));
- shufflexyz[1] = _mm_castps_si128(
- _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask)));
- shufflexyz[2] = _mm_castps_si128(
- _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask)));
-}
-
-# else
-
-ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
- const shuffle_swap_t &shuf_identity,
- const shuffle_swap_t &shuf_swap,
- const float3 &idir,
- ssef idirsplat[3],
- shuffle_swap_t shufflexyz[3])
-{
- idirsplat[0] = ssef(idir.x) ^ pn;
- idirsplat[1] = ssef(idir.y) ^ pn;
- idirsplat[2] = ssef(idir.z) ^ pn;
-
- shufflexyz[0] = (idir.x >= 0) ? shuf_identity : shuf_swap;
- shufflexyz[1] = (idir.y >= 0) ? shuf_identity : shuf_swap;
- shufflexyz[2] = (idir.z >= 0) ? shuf_identity : shuf_swap;
-}
-
-# endif
-
-ccl_device_inline const ssef uint32_to_float(const ssei &in)
-{
- ssei a = _mm_srli_epi32(in, 16);
- ssei b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff));
- ssei c = _mm_or_si128(a, _mm_set1_epi32(0x53000000));
- ssef d = _mm_cvtepi32_ps(b);
- ssef e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000)));
- return _mm_add_ps(e, d);
-}
-
-template<size_t S1, size_t S2, size_t S3, size_t S4>
-ccl_device_inline const ssef set_sign_bit(const ssef &a)
-{
- return cast(cast(a) ^ ssei(S1 << 31, S2 << 31, S3 << 31, S4 << 31));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_ssef(const char *label, const ssef &a)
-{
- printf(
- "%s: %.8f %.8f %.8f %.8f\n", label, (double)a[0], (double)a[1], (double)a[2], (double)a[3]);
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
-#endif
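
A note on the removed ssef helpers above: uint32_to_float() worked around the lack of an unsigned 32-bit to float conversion in SSE2 by splitting each lane into 16-bit halves and recombining them. A scalar sketch of the same bit trick, illustrative only and not part of the patch:

    #include <cstdint>
    #include <cstring>

    /* The high half is OR-ed into the mantissa of 2^39 (bit pattern 0x53000000),
     * so subtracting 2^39 afterwards yields hi * 65536 exactly; the low half is
     * small enough for an ordinary signed conversion. */
    static float uint32_to_float_scalar(const uint32_t in)
    {
      const uint32_t hi = in >> 16;
      const uint32_t lo = in & 0x0000ffffu;
      uint32_t bits = hi | 0x53000000u;
      float hi_scaled;
      std::memcpy(&hi_scaled, &bits, sizeof(hi_scaled));
      return (hi_scaled - 549755813888.0f /* 2^39 */) + (float)lo;
    }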
diff --git a/intern/cycles/util/ssei.h b/intern/cycles/util/ssei.h
deleted file mode 100644
index 5caf44c967f..00000000000
--- a/intern/cycles/util/ssei.h
+++ /dev/null
@@ -1,633 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_SSEI_H__
-#define __UTIL_SSEI_H__
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_SSE2__
-
-struct sseb;
-struct ssef;
-
-/*! 4-wide SSE integer type. */
-struct ssei {
- typedef sseb Mask; // mask type
- typedef ssei Int; // int type
- typedef ssef Float; // float type
-
- enum { size = 4 }; // number of SIMD elements
- union {
- __m128i m128;
- int32_t i[4];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline ssei()
- {
- }
- __forceinline ssei(const ssei &a)
- {
- m128 = a.m128;
- }
- __forceinline ssei &operator=(const ssei &a)
- {
- m128 = a.m128;
- return *this;
- }
-
- __forceinline ssei(const __m128i a) : m128(a)
- {
- }
- __forceinline operator const __m128i &(void) const
- {
- return m128;
- }
- __forceinline operator __m128i &(void)
- {
- return m128;
- }
-
- __forceinline ssei(const int a) : m128(_mm_set1_epi32(a))
- {
- }
- __forceinline ssei(int a, int b, int c, int d) : m128(_mm_setr_epi32(a, b, c, d))
- {
- }
-
- __forceinline explicit ssei(const __m128 a) : m128(_mm_cvtps_epi32(a))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline const int32_t &operator[](const size_t index) const
- {
- assert(index < 4);
- return i[index];
- }
- __forceinline int32_t &operator[](const size_t index)
- {
- assert(index < 4);
- return i[index];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssei cast(const __m128 &a)
-{
- return _mm_castps_si128(a);
-}
-__forceinline const ssei operator+(const ssei &a)
-{
- return a;
-}
-__forceinline const ssei operator-(const ssei &a)
-{
- return _mm_sub_epi32(_mm_setzero_si128(), a.m128);
-}
-# if defined(__KERNEL_SSSE3__)
-__forceinline const ssei abs(const ssei &a)
-{
- return _mm_abs_epi32(a.m128);
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssei operator+(const ssei &a, const ssei &b)
-{
- return _mm_add_epi32(a.m128, b.m128);
-}
-__forceinline const ssei operator+(const ssei &a, const int32_t &b)
-{
- return a + ssei(b);
-}
-__forceinline const ssei operator+(const int32_t &a, const ssei &b)
-{
- return ssei(a) + b;
-}
-
-__forceinline const ssei operator-(const ssei &a, const ssei &b)
-{
- return _mm_sub_epi32(a.m128, b.m128);
-}
-__forceinline const ssei operator-(const ssei &a, const int32_t &b)
-{
- return a - ssei(b);
-}
-__forceinline const ssei operator-(const int32_t &a, const ssei &b)
-{
- return ssei(a) - b;
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssei operator*(const ssei &a, const ssei &b)
-{
- return _mm_mullo_epi32(a.m128, b.m128);
-}
-__forceinline const ssei operator*(const ssei &a, const int32_t &b)
-{
- return a * ssei(b);
-}
-__forceinline const ssei operator*(const int32_t &a, const ssei &b)
-{
- return ssei(a) * b;
-}
-# endif
-
-__forceinline const ssei operator&(const ssei &a, const ssei &b)
-{
- return _mm_and_si128(a.m128, b.m128);
-}
-__forceinline const ssei operator&(const ssei &a, const int32_t &b)
-{
- return a & ssei(b);
-}
-__forceinline const ssei operator&(const int32_t &a, const ssei &b)
-{
- return ssei(a) & b;
-}
-
-__forceinline const ssei operator|(const ssei &a, const ssei &b)
-{
- return _mm_or_si128(a.m128, b.m128);
-}
-__forceinline const ssei operator|(const ssei &a, const int32_t &b)
-{
- return a | ssei(b);
-}
-__forceinline const ssei operator|(const int32_t &a, const ssei &b)
-{
- return ssei(a) | b;
-}
-
-__forceinline const ssei operator^(const ssei &a, const ssei &b)
-{
- return _mm_xor_si128(a.m128, b.m128);
-}
-__forceinline const ssei operator^(const ssei &a, const int32_t &b)
-{
- return a ^ ssei(b);
-}
-__forceinline const ssei operator^(const int32_t &a, const ssei &b)
-{
- return ssei(a) ^ b;
-}
-
-__forceinline const ssei operator<<(const ssei &a, const int32_t &n)
-{
- return _mm_slli_epi32(a.m128, n);
-}
-__forceinline const ssei operator>>(const ssei &a, const int32_t &n)
-{
- return _mm_srai_epi32(a.m128, n);
-}
-
-__forceinline const ssei andnot(const ssei &a, const ssei &b)
-{
- return _mm_andnot_si128(a.m128, b.m128);
-}
-__forceinline const ssei andnot(const sseb &a, const ssei &b)
-{
- return _mm_andnot_si128(cast(a.m128), b.m128);
-}
-__forceinline const ssei andnot(const ssei &a, const sseb &b)
-{
- return _mm_andnot_si128(a.m128, cast(b.m128));
-}
-
-__forceinline const ssei sra(const ssei &a, const int32_t &b)
-{
- return _mm_srai_epi32(a.m128, b);
-}
-__forceinline const ssei srl(const ssei &a, const int32_t &b)
-{
- return _mm_srli_epi32(a.m128, b);
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssei min(const ssei &a, const ssei &b)
-{
- return _mm_min_epi32(a.m128, b.m128);
-}
-__forceinline const ssei min(const ssei &a, const int32_t &b)
-{
- return min(a, ssei(b));
-}
-__forceinline const ssei min(const int32_t &a, const ssei &b)
-{
- return min(ssei(a), b);
-}
-
-__forceinline const ssei max(const ssei &a, const ssei &b)
-{
- return _mm_max_epi32(a.m128, b.m128);
-}
-__forceinline const ssei max(const ssei &a, const int32_t &b)
-{
- return max(a, ssei(b));
-}
-__forceinline const ssei max(const int32_t &a, const ssei &b)
-{
- return max(ssei(a), b);
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssei &operator+=(ssei &a, const ssei &b)
-{
- return a = a + b;
-}
-__forceinline ssei &operator+=(ssei &a, const int32_t &b)
-{
- return a = a + b;
-}
-
-__forceinline ssei &operator-=(ssei &a, const ssei &b)
-{
- return a = a - b;
-}
-__forceinline ssei &operator-=(ssei &a, const int32_t &b)
-{
- return a = a - b;
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline ssei &operator*=(ssei &a, const ssei &b)
-{
- return a = a * b;
-}
-__forceinline ssei &operator*=(ssei &a, const int32_t &b)
-{
- return a = a * b;
-}
-# endif
-
-__forceinline ssei &operator&=(ssei &a, const ssei &b)
-{
- return a = a & b;
-}
-__forceinline ssei &operator&=(ssei &a, const int32_t &b)
-{
- return a = a & b;
-}
-
-__forceinline ssei &operator|=(ssei &a, const ssei &b)
-{
- return a = a | b;
-}
-__forceinline ssei &operator|=(ssei &a, const int32_t &b)
-{
- return a = a | b;
-}
-
-__forceinline ssei &operator^=(ssei &a, const ssei &b)
-{
- return a = a ^ b;
-}
-__forceinline ssei &operator^=(ssei &a, const int32_t &b)
-{
- return a = a ^ b;
-}
-
-__forceinline ssei &operator<<=(ssei &a, const int32_t &b)
-{
- return a = a << b;
-}
-__forceinline ssei &operator>>=(ssei &a, const int32_t &b)
-{
- return a = a >> b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator==(const ssei &a, const ssei &b)
-{
- return _mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128));
-}
-__forceinline const sseb operator==(const ssei &a, const int32_t &b)
-{
- return a == ssei(b);
-}
-__forceinline const sseb operator==(const int32_t &a, const ssei &b)
-{
- return ssei(a) == b;
-}
-
-__forceinline const sseb operator!=(const ssei &a, const ssei &b)
-{
- return !(a == b);
-}
-__forceinline const sseb operator!=(const ssei &a, const int32_t &b)
-{
- return a != ssei(b);
-}
-__forceinline const sseb operator!=(const int32_t &a, const ssei &b)
-{
- return ssei(a) != b;
-}
-
-__forceinline const sseb operator<(const ssei &a, const ssei &b)
-{
- return _mm_castsi128_ps(_mm_cmplt_epi32(a.m128, b.m128));
-}
-__forceinline const sseb operator<(const ssei &a, const int32_t &b)
-{
- return a < ssei(b);
-}
-__forceinline const sseb operator<(const int32_t &a, const ssei &b)
-{
- return ssei(a) < b;
-}
-
-__forceinline const sseb operator>=(const ssei &a, const ssei &b)
-{
- return !(a < b);
-}
-__forceinline const sseb operator>=(const ssei &a, const int32_t &b)
-{
- return a >= ssei(b);
-}
-__forceinline const sseb operator>=(const int32_t &a, const ssei &b)
-{
- return ssei(a) >= b;
-}
-
-__forceinline const sseb operator>(const ssei &a, const ssei &b)
-{
- return _mm_castsi128_ps(_mm_cmpgt_epi32(a.m128, b.m128));
-}
-__forceinline const sseb operator>(const ssei &a, const int32_t &b)
-{
- return a > ssei(b);
-}
-__forceinline const sseb operator>(const int32_t &a, const ssei &b)
-{
- return ssei(a) > b;
-}
-
-__forceinline const sseb operator<=(const ssei &a, const ssei &b)
-{
- return !(a > b);
-}
-__forceinline const sseb operator<=(const ssei &a, const int32_t &b)
-{
- return a <= ssei(b);
-}
-__forceinline const sseb operator<=(const int32_t &a, const ssei &b)
-{
- return ssei(a) <= b;
-}
-
-__forceinline const ssei select(const sseb &m, const ssei &t, const ssei &f)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
-# else
- return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
-# endif
-}
-
-__forceinline const ssei select(const int mask, const ssei &t, const ssei &f)
-{
-# if defined(__KERNEL_SSE41__) && \
- ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
- return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
-# else
- return select(sseb(mask), t, f);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssei unpacklo(const ssei &a, const ssei &b)
-{
- return _mm_unpacklo_epi32(a, b);
-}
-__forceinline ssei unpackhi(const ssei &a, const ssei &b)
-{
- return _mm_unpackhi_epi32(a, b);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssei shuffle(const ssei &a)
-{
-# ifdef __KERNEL_NEON__
- int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
- return vreinterpretq_m128i_s32(result);
-# else
- return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
-# endif
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssei shuffle(const ssei &a, const ssei &b)
-{
-# ifdef __KERNEL_NEON__
- int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
- vreinterpretq_s32_m128i(b));
- return vreinterpretq_m128i_s32(result);
-# else
- return _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-# endif
-}
-
-template<size_t i0> __forceinline const ssei shuffle(const ssei &b)
-{
- return shuffle<i0, i0, i0, i0>(b);
-}
-
-# if defined(__KERNEL_SSE41__)
-template<size_t src> __forceinline int extract(const ssei &b)
-{
- return _mm_extract_epi32(b, src);
-}
-template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b)
-{
- return _mm_insert_epi32(a, b, dst);
-}
-# else
-template<size_t src> __forceinline int extract(const ssei &b)
-{
- return b[src];
-}
-template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b)
-{
- ssei c = a;
- c[dst] = b;
- return c;
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssei vreduce_min(const ssei &v)
-{
- ssei h = min(shuffle<1, 0, 3, 2>(v), v);
- return min(shuffle<2, 3, 0, 1>(h), h);
-}
-__forceinline const ssei vreduce_max(const ssei &v)
-{
- ssei h = max(shuffle<1, 0, 3, 2>(v), v);
- return max(shuffle<2, 3, 0, 1>(h), h);
-}
-__forceinline const ssei vreduce_add(const ssei &v)
-{
- ssei h = shuffle<1, 0, 3, 2>(v) + v;
- return shuffle<2, 3, 0, 1>(h) + h;
-}
-
-__forceinline int reduce_min(const ssei &v)
-{
-# ifdef __KERNEL_NEON__
- return vminvq_s32(vreinterpretq_s32_m128i(v));
-# else
- return extract<0>(vreduce_min(v));
-# endif
-}
-__forceinline int reduce_max(const ssei &v)
-{
-# ifdef __KERNEL_NEON__
- return vmaxvq_s32(vreinterpretq_s32_m128i(v));
-# else
- return extract<0>(vreduce_max(v));
-# endif
-}
-__forceinline int reduce_add(const ssei &v)
-{
-# ifdef __KERNEL_NEON__
- return vaddvq_s32(vreinterpretq_s32_m128i(v));
-# else
- return extract<0>(vreduce_add(v));
-# endif
-}
-
-__forceinline uint32_t select_min(const ssei &v)
-{
- return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const ssei &v)
-{
- return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const sseb &valid, const ssei &v)
-{
- const ssei a = select(valid, v, ssei((int)pos_inf));
- return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const sseb &valid, const ssei &v)
-{
- const ssei a = select(valid, v, ssei((int)neg_inf));
- return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-# else
-
-__forceinline int ssei_min(int a, int b)
-{
- return (a < b) ? a : b;
-}
-__forceinline int ssei_max(int a, int b)
-{
- return (a > b) ? a : b;
-}
-__forceinline int reduce_min(const ssei &v)
-{
- return ssei_min(ssei_min(v[0], v[1]), ssei_min(v[2], v[3]));
-}
-__forceinline int reduce_max(const ssei &v)
-{
- return ssei_max(ssei_max(v[0], v[1]), ssei_max(v[2], v[3]));
-}
-__forceinline int reduce_add(const ssei &v)
-{
- return v[0] + v[1] + v[2] + v[3];
-}
-
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Memory load and store operations
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssei load4i(const void *const a)
-{
- return _mm_load_si128((__m128i *)a);
-}
-
-__forceinline void store4i(void *ptr, const ssei &v)
-{
- _mm_store_si128((__m128i *)ptr, v);
-}
-
-__forceinline void storeu4i(void *ptr, const ssei &v)
-{
- _mm_storeu_si128((__m128i *)ptr, v);
-}
-
-__forceinline void store4i(const sseb &mask, void *ptr, const ssei &i)
-{
-# if defined(__KERNEL_AVX__)
- _mm_maskstore_ps((float *)ptr, (__m128i)mask, _mm_castsi128_ps(i));
-# else
- *(ssei *)ptr = select(mask, i, *(ssei *)ptr);
-# endif
-}
-
-__forceinline ssei load4i_nt(void *ptr)
-{
-# if defined(__KERNEL_SSE41__)
- return _mm_stream_load_si128((__m128i *)ptr);
-# else
- return _mm_load_si128((__m128i *)ptr);
-# endif
-}
-
-__forceinline void store4i_nt(void *ptr, const ssei &v)
-{
-# if defined(__KERNEL_SSE41__)
- _mm_stream_ps((float *)ptr, _mm_castsi128_ps(v));
-# else
- _mm_store_si128((__m128i *)ptr, v);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_ssei(const char *label, const ssei &a)
-{
- printf("%s: %df %df %df %d\n", label, a[0], a[1], a[2], a[3]);
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
-#endif
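
A note on the removed ssei helpers above: without SSE4.1 there is no _mm_blendv_ps, so select(mask, t, f) fell back to an and/andnot/or blend. A minimal sketch of that fallback, assuming plain SSE2 and not taken from the patch:

    #include <emmintrin.h>

    /* Per-lane blend: lanes where the mask is all ones take t,
     * lanes where it is all zeros take f. */
    static inline __m128i select_sse2(const __m128i mask, const __m128i t, const __m128i f)
    {
      return _mm_or_si128(_mm_and_si128(mask, t), _mm_andnot_si128(mask, f));
    }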
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index cb985c65dd8..84116262437 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -102,7 +102,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
return projection_identity();
}
- memcpy(&tfmR, R, sizeof(R));
+ memcpy(&tfmR.x[0], R, sizeof(R));
return tfmR;
}
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 24184dc7074..0c39901a63c 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -63,17 +63,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
{
/* TODO(sergey): Disabled for now, causes crashes in certain cases. */
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
- ssef x, y, z, w, aa;
- aa = a.m128;
+ const float4 aa(a.m128);
- x = _mm_loadu_ps(&t->x.x);
- y = _mm_loadu_ps(&t->y.x);
- z = _mm_loadu_ps(&t->z.x);
- w = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
+ float4 x(_mm_loadu_ps(&t->x.x));
+ float4 y(_mm_loadu_ps(&t->y.x));
+ float4 z(_mm_loadu_ps(&t->z.x));
+ float4 w(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f));
- _MM_TRANSPOSE4_PS(x, y, z, w);
+ _MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128);
- ssef tmp = w;
+ float4 tmp = w;
tmp = madd(shuffle<2>(aa), z, tmp);
tmp = madd(shuffle<1>(aa), y, tmp);
tmp = madd(shuffle<0>(aa), x, tmp);
@@ -94,16 +93,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
ccl_device_inline float3 transform_direction(ccl_private const Transform *t, const float3 a)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
- ssef x, y, z, w, aa;
- aa = a.m128;
- x = _mm_loadu_ps(&t->x.x);
- y = _mm_loadu_ps(&t->y.x);
- z = _mm_loadu_ps(&t->z.x);
- w = _mm_setzero_ps();
+ const float4 aa(a.m128);
- _MM_TRANSPOSE4_PS(x, y, z, w);
+ float4 x(_mm_loadu_ps(&t->x.x));
+ float4 y(_mm_loadu_ps(&t->y.x));
+ float4 z(_mm_loadu_ps(&t->z.x));
+ float4 w(_mm_setzero_ps());
- ssef tmp = shuffle<2>(aa) * z;
+ _MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128);
+
+ float4 tmp = shuffle<2>(aa) * z;
tmp = madd(shuffle<1>(aa), y, tmp);
tmp = madd(shuffle<0>(aa), x, tmp);
@@ -197,14 +196,7 @@ ccl_device_inline Transform make_transform_frame(float3 N)
return make_transform(dx.x, dx.y, dx.z, 0.0f, dy.x, dy.y, dy.z, 0.0f, N.x, N.y, N.z, 0.0f);
}
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline Transform transform_zero()
-{
- Transform zero = {zero_float4(), zero_float4(), zero_float4()};
- return zero;
-}
-
+#if !defined(__KERNEL_METAL__)
ccl_device_inline Transform operator*(const Transform a, const Transform b)
{
float4 c_x = make_float4(b.x.x, b.y.x, b.z.x, 0.0f);
@@ -219,6 +211,15 @@ ccl_device_inline Transform operator*(const Transform a, const Transform b)
return t;
}
+#endif
+
+#ifndef __KERNEL_GPU__
+
+ccl_device_inline Transform transform_zero()
+{
+ Transform zero = {zero_float4(), zero_float4(), zero_float4()};
+ return zero;
+}
ccl_device_inline void print_transform(const char *label, const Transform &t)
{
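
For reference, the transform_point() SSE path rewritten above from ssef to float4 still loads the three affine rows, transposes them and accumulates one madd per component, so the result is the ordinary affine point transform. A scalar sketch of the same computation, using hypothetical local types rather than code from the patch:

    struct V3 { float x, y, z; };
    struct V4 { float x, y, z, w; };
    struct Xform { V4 x, y, z; }; /* three affine rows, translation in .w */

    static V3 transform_point_scalar(const Xform *t, const V3 a)
    {
      V3 c;
      c.x = a.x * t->x.x + a.y * t->x.y + a.z * t->x.z + t->x.w;
      c.y = a.x * t->y.x + a.y * t->y.y + a.z * t->y.z + t->y.w;
      c.z = a.x * t->z.x + a.y * t->z.y + a.z * t->z.z + t->z.w;
      return c;
    }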
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
index bb410a6daef..2faac576d82 100644
--- a/intern/cycles/util/transform_inverse.h
+++ b/intern/cycles/util/transform_inverse.h
@@ -9,26 +9,33 @@ CCL_NAMESPACE_BEGIN
* Normally we don't use SSE41/AVX outside the kernel, but for this it's
* important to match exactly for ray tracing precision. */
-ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const float3 b_)
{
#if defined(__AVX2__) && defined(__KERNEL_SSE2__)
- const ssef sse_a = (const __m128 &)a;
- const ssef sse_b = (const __m128 &)b;
- const ssef r = shuffle<1, 2, 0, 3>(
- ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+ const __m128 a = (const __m128 &)a_;
+ const __m128 b = (const __m128 &)b_;
+ const __m128 a_shuffle = _mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(3, 0, 2, 1)));
+ const __m128 b_shuffle = _mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(3, 0, 2, 1)));
+ const __m128 r = _mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(_mm_fmsub_ps(a, b_shuffle, _mm_mul_ps(a_shuffle, b))),
+ _MM_SHUFFLE(3, 0, 2, 1)));
return (const float3 &)r;
#endif
- return cross(a, b);
+ return cross(a_, b_);
}
-ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_)
{
-#ifdef __SSE4_1__
- return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+ const __m128 a = (const __m128 &)a_;
+ const __m128 b = (const __m128 &)b_;
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
#endif
- return dot(a, b);
+ return dot(a_, b_);
}
ccl_device_forceinline Transform transform_inverse_impl(const Transform tfm)
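
The AVX2 branch of transform_inverse_cross() above relies on the identity cross(a, b) = s(a * s(b) - s(a) * b), where s(v) = (v.y, v.z, v.x) is the lane rotation done with _mm_shuffle_epi32. A scalar sketch of that identity, illustrative only:

    struct Vec3 { float x, y, z; };

    static Vec3 cross_shuffled(const Vec3 a, const Vec3 b)
    {
      /* t = a * s(b) - s(a) * b holds (cross.z, cross.x, cross.y). */
      const Vec3 t = {a.x * b.y - a.y * b.x,   /* z component */
                      a.y * b.z - a.z * b.y,   /* x component */
                      a.z * b.x - a.x * b.z};  /* y component */
      /* The final s() rotation puts the components back in xyz order. */
      return {t.y, t.z, t.x};
    }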
diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h
index 1ab6f76f9bc..cf7f35c4116 100644
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -97,6 +97,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_int2.h"
#include "util/types_int3.h"
#include "util/types_int4.h"
+#include "util/types_int8.h"
#include "util/types_uint2.h"
#include "util/types_uint3.h"
@@ -119,6 +120,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_int2_impl.h"
#include "util/types_int3_impl.h"
#include "util/types_int4_impl.h"
+#include "util/types_int8_impl.h"
#include "util/types_uint2_impl.h"
#include "util/types_uint3_impl.h"
@@ -129,16 +131,4 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_float4_impl.h"
#include "util/types_float8_impl.h"
-/* SSE types. */
-#ifndef __KERNEL_GPU__
-# include "util/sseb.h"
-# include "util/ssef.h"
-# include "util/ssei.h"
-# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
-# include "util/avxb.h"
-# include "util/avxf.h"
-# include "util/avxi.h"
-# endif
-#endif
-
#endif /* __UTIL_TYPES_H__ */
diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h
index 29fd632f08e..121141ddfd9 100644
--- a/intern/cycles/util/types_float8.h
+++ b/intern/cycles/util/types_float8.h
@@ -11,15 +11,15 @@
CCL_NAMESPACE_BEGIN
/* float8 is a reserved type in Metal that has not been implemented. For
- * that reason this is named float8_t and not using native vector types. */
+ * that reason this is named vfloat8 and not using native vector types. */
#ifdef __KERNEL_GPU__
-struct float8_t
+struct vfloat8
#else
-struct ccl_try_align(32) float8_t
+struct ccl_try_align(32) vfloat8
#endif
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
union {
__m256 m256;
struct {
@@ -27,18 +27,18 @@ struct ccl_try_align(32) float8_t
};
};
- __forceinline float8_t();
- __forceinline float8_t(const float8_t &a);
- __forceinline explicit float8_t(const __m256 &a);
+ __forceinline vfloat8();
+ __forceinline vfloat8(const vfloat8 &a);
+ __forceinline explicit vfloat8(const __m256 &a);
__forceinline operator const __m256 &() const;
__forceinline operator __m256 &();
- __forceinline float8_t &operator=(const float8_t &a);
+ __forceinline vfloat8 &operator=(const vfloat8 &a);
-#else /* __KERNEL_AVX2__ */
+#else /* __KERNEL_AVX__ */
float a, b, c, d, e, f, g, h;
-#endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
@@ -46,8 +46,11 @@ struct ccl_try_align(32) float8_t
#endif
};
-ccl_device_inline float8_t make_float8_t(float f);
-ccl_device_inline float8_t
-make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h);
+ccl_device_inline vfloat8 make_vfloat8(float f);
+ccl_device_inline vfloat8
+make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h);
+ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b);
+
+ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a);
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h
index e8576cdaf70..9f42e0f663c 100644
--- a/intern/cycles/util/types_float8_impl.h
+++ b/intern/cycles/util/types_float8_impl.h
@@ -10,45 +10,45 @@
CCL_NAMESPACE_BEGIN
-#ifdef __KERNEL_AVX2__
-__forceinline float8_t::float8_t()
+#ifdef __KERNEL_AVX__
+__forceinline vfloat8::vfloat8()
{
}
-__forceinline float8_t::float8_t(const float8_t &f) : m256(f.m256)
+__forceinline vfloat8::vfloat8(const vfloat8 &f) : m256(f.m256)
{
}
-__forceinline float8_t::float8_t(const __m256 &f) : m256(f)
+__forceinline vfloat8::vfloat8(const __m256 &f) : m256(f)
{
}
-__forceinline float8_t::operator const __m256 &() const
+__forceinline vfloat8::operator const __m256 &() const
{
return m256;
}
-__forceinline float8_t::operator __m256 &()
+__forceinline vfloat8::operator __m256 &()
{
return m256;
}
-__forceinline float8_t &float8_t::operator=(const float8_t &f)
+__forceinline vfloat8 &vfloat8::operator=(const vfloat8 &f)
{
m256 = f.m256;
return *this;
}
-#endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
-__forceinline float float8_t::operator[](int i) const
+__forceinline float vfloat8::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
-__forceinline float &float8_t::operator[](int i)
+__forceinline float &vfloat8::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
@@ -56,25 +56,50 @@ __forceinline float &float8_t::operator[](int i)
}
#endif
-ccl_device_inline float8_t make_float8_t(float f)
+ccl_device_inline vfloat8 make_vfloat8(float f)
{
-#ifdef __KERNEL_AVX2__
- float8_t r(_mm256_set1_ps(f));
+#ifdef __KERNEL_AVX__
+ vfloat8 r(_mm256_set1_ps(f));
#else
- float8_t r = {f, f, f, f, f, f, f, f};
+ vfloat8 r = {f, f, f, f, f, f, f, f};
#endif
return r;
}
-ccl_device_inline float8_t
-make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h)
+ccl_device_inline vfloat8
+make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
{
-#ifdef __KERNEL_AVX2__
- float8_t r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#ifdef __KERNEL_AVX__
+ vfloat8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
#else
- float8_t r = {a, b, c, d, e, f, g, h};
+ vfloat8 r = {a, b, c, d, e, f, g, h};
#endif
return r;
}
+ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1));
+#else
+ return make_vfloat8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
+#endif
+}
+
+ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a)
+{
+#ifdef __KERNEL_PRINTF__
+ printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n",
+ label,
+ (double)a.a,
+ (double)a.b,
+ (double)a.c,
+ (double)a.d,
+ (double)a.e,
+ (double)a.f,
+ (double)a.g,
+ (double)a.h);
+#endif
+}
+
CCL_NAMESPACE_END
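
Usage sketch for the renamed vfloat8 type and the constructors defined above; it assumes util/types.h from this patch is included, and print_vfloat8() only produces output where __KERNEL_PRINTF__ is defined:

    static void vfloat8_example()
    {
      const vfloat8 splat = make_vfloat8(1.0f);
      const vfloat8 lanes = make_vfloat8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
      const vfloat8 packed = make_vfloat8(make_float4(0.0f, 1.0f, 2.0f, 3.0f),
                                          make_float4(4.0f, 5.0f, 6.0f, 7.0f));
      print_vfloat8("lanes", lanes);
      (void)splat;
      (void)packed;
    }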
diff --git a/intern/cycles/util/types_int8.h b/intern/cycles/util/types_int8.h
new file mode 100644
index 00000000000..8643ebe96ad
--- /dev/null
+++ b/intern/cycles/util/types_int8.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+struct vfloat8;
+
+#ifdef __KERNEL_GPU__
+struct vint8
+#else
+struct ccl_try_align(32) vint8
+#endif
+{
+#ifdef __KERNEL_AVX__
+ union {
+ __m256i m256;
+ struct {
+ int a, b, c, d, e, f, g, h;
+ };
+ };
+
+ __forceinline vint8();
+ __forceinline vint8(const vint8 &a);
+ __forceinline explicit vint8(const __m256i &a);
+
+ __forceinline operator const __m256i &() const;
+ __forceinline operator __m256i &();
+
+ __forceinline vint8 &operator=(const vint8 &a);
+#else /* __KERNEL_AVX__ */
+ int a, b, c, d, e, f, g, h;
+#endif /* __KERNEL_AVX__ */
+
+#ifndef __KERNEL_GPU__
+ __forceinline int operator[](int i) const;
+ __forceinline int &operator[](int i);
+#endif
+};
+
+ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h);
+ccl_device_inline vint8 make_vint8(int i);
+ccl_device_inline vint8 make_vint8(const vfloat8 f);
+ccl_device_inline vint8 make_vint8(const int4 a, const int4 b);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_int8_impl.h b/intern/cycles/util/types_int8_impl.h
new file mode 100644
index 00000000000..080bcaa6a2b
--- /dev/null
+++ b/intern/cycles/util/types_int8_impl.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+__forceinline vint8::vint8()
+{
+}
+
+__forceinline vint8::vint8(const vint8 &a) : m256(a.m256)
+{
+}
+
+__forceinline vint8::vint8(const __m256i &a) : m256(a)
+{
+}
+
+__forceinline vint8::operator const __m256i &() const
+{
+ return m256;
+}
+
+__forceinline vint8::operator __m256i &()
+{
+ return m256;
+}
+
+__forceinline vint8 &vint8::operator=(const vint8 &a)
+{
+ m256 = a.m256;
+ return *this;
+}
+#endif /* __KERNEL_AVX__ */
+
+#ifndef __KERNEL_GPU__
+__forceinline int vint8::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+
+__forceinline int &vint8::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+#endif
+
+ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_set_epi32(h, g, f, e, d, c, b, a));
+#else
+ return {a, b, c, d, e, f, g, h};
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(int i)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_set1_epi32(i));
+#else
+ return make_vint8(i, i, i, i, i, i, i, i);
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(const vfloat8 f)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_cvtps_epi32(f.m256));
+#else
+ return make_vint8(
+ (int)f.a, (int)f.b, (int)f.c, (int)f.d, (int)f.e, (int)f.f, (int)f.g, (int)f.h);
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(const int4 a, const int4 b)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128), b.m128, 1));
+#else
+ return make_vint8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
+#endif
+}
+
+CCL_NAMESPACE_END
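
Usage sketch for the new vint8 constructors; it assumes util/types.h from this patch is included. Note that the AVX path of make_vint8(const vfloat8) converts with _mm256_cvtps_epi32, which rounds to nearest under the default rounding mode, while the scalar fallback truncates with (int) casts:

    static void vint8_example()
    {
      const vint8 lanes = make_vint8(0, 1, 2, 3, 4, 5, 6, 7);
      const vint8 splat = make_vint8(42);
      const vint8 packed = make_vint8(make_int4(0, 1, 2, 3), make_int4(4, 5, 6, 7));
      /* 2 per lane with AVX (round to nearest), 1 per lane in the scalar fallback. */
      const vint8 rounded = make_vint8(make_vfloat8(1.5f));
      (void)lanes;
      (void)splat;
      (void)packed;
      (void)rounded;
    }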