git.blender.org/blender.git
author     Brecht Van Lommel <brecht>               2022-11-01 17:16:55 +0300
committer  Brecht Van Lommel <brecht@blender.org>   2022-11-08 14:28:40 +0300
commit     e1b3d9112730bc3b569ffff732a1558752ded146 (patch)
tree       27caad945dcc9ce7313b4c84cd0efbab6f70503c
parent     32ec0521c542bb78a0080f8091856ec085030f09 (diff)
Refactor: replace Cycles sse/avx types by vectorized float4/int4/float8/int8
The distinction existed for legacy reasons, to make it easy to port the Embree intersection code without affecting the main vector types. However, the main vector types now use SIMD as well, so there is no good reason to keep the distinction.

Also pass these vector types by value in inline functions more consistently. Previously this was only done for functions used by Metal, to avoid having to add address space qualifiers; it is simpler to do it everywhere.

Also remove the forward function declarations from the vector math headers, as they serve no real purpose.

Differential Revision: https://developer.blender.org/D16146
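As a before/after illustration of the convention change (taken from the noise.h hunk below; shown here only for readability, not as additional changes):

    /* Before: dedicated SSE wrapper type, passed by const reference. */
    ccl_device_inline ssef fade(const ssef &t)
    {
      ssef a = madd(t, 6.0f, -15.0f);
      ssef b = madd(t, a, 10.0f);
      return (t * t) * (t * b);
    }

    /* After: the generic float4 type is itself SIMD-backed and is passed by value,
     * which also avoids address space qualifiers for the Metal backend. */
    ccl_device_inline float4 fade(const float4 t)
    {
      float4 a = madd(t, make_float4(6.0f), make_float4(-15.0f));
      float4 b = madd(t, a, make_float4(10.0f));
      return (t * t) * (t * b);
    }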
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt | 3
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.cpp | 9
-rw-r--r--  intern/cycles/kernel/svm/noise.h | 268
-rw-r--r--  intern/cycles/test/CMakeLists.txt | 15
-rw-r--r--  intern/cycles/test/util_avxf_test.h | 211
-rw-r--r--  intern/cycles/test/util_float8_avx2_test.cpp (renamed from intern/cycles/test/util_avxf_avx2_test.cpp) | 4
-rw-r--r--  intern/cycles/test/util_float8_avx_test.cpp (renamed from intern/cycles/test/util_avxf_avx_test.cpp) | 3
-rw-r--r--  intern/cycles/test/util_float8_sse2_test.cpp | 12
-rw-r--r--  intern/cycles/test/util_float8_test.h | 103
-rw-r--r--  intern/cycles/util/CMakeLists.txt | 9
-rw-r--r--  intern/cycles/util/avxb.h | 230
-rw-r--r--  intern/cycles/util/avxf.h | 379
-rw-r--r--  intern/cycles/util/avxi.h | 732
-rw-r--r--  intern/cycles/util/color.h | 48
-rw-r--r--  intern/cycles/util/half.h | 16
-rw-r--r--  intern/cycles/util/hash.h | 52
-rw-r--r--  intern/cycles/util/math.h | 4
-rw-r--r--  intern/cycles/util/math_float2.h | 133
-rw-r--r--  intern/cycles/util/math_float3.h | 218
-rw-r--r--  intern/cycles/util/math_float4.h | 475
-rw-r--r--  intern/cycles/util/math_float8.h | 483
-rw-r--r--  intern/cycles/util/math_int2.h | 17
-rw-r--r--  intern/cycles/util/math_int3.h | 29
-rw-r--r--  intern/cycles/util/math_int4.h | 216
-rw-r--r--  intern/cycles/util/math_int8.h | 355
-rw-r--r--  intern/cycles/util/math_intersect.h | 11
-rw-r--r--  intern/cycles/util/sseb.h | 345
-rw-r--r--  intern/cycles/util/ssef.h | 1090
-rw-r--r--  intern/cycles/util/ssei.h | 633
-rw-r--r--  intern/cycles/util/transform.cpp | 2
-rw-r--r--  intern/cycles/util/transform.h | 31
-rw-r--r--  intern/cycles/util/transform_inverse.h | 27
-rw-r--r--  intern/cycles/util/types.h | 14
-rw-r--r--  intern/cycles/util/types_float8.h | 29
-rw-r--r--  intern/cycles/util/types_float8_impl.h | 63
-rw-r--r--  intern/cycles/util/types_int8.h | 51
-rw-r--r--  intern/cycles/util/types_int8_impl.h | 95
37 files changed, 1785 insertions(+), 4630 deletions(-)
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3779fdc697a..3fbb346e94f 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -328,6 +328,7 @@ set(SRC_UTIL_HEADERS
../util/math_int2.h
../util/math_int3.h
../util/math_int4.h
+ ../util/math_int8.h
../util/math_matrix.h
../util/projection.h
../util/rect.h
@@ -350,6 +351,8 @@ set(SRC_UTIL_HEADERS
../util/types_int3_impl.h
../util/types_int4.h
../util/types_int4_impl.h
+ ../util/types_int8.h
+ ../util/types_int8_impl.h
../util/types_spectrum.h
../util/types_uchar2.h
../util/types_uchar2_impl.h
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index 01087c96dd6..558431961ab 100644
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -7,6 +7,7 @@
* one with SSE2 intrinsics.
*/
#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE__
# define __KERNEL_SSE2__
#endif
@@ -29,11 +30,15 @@
# define __KERNEL_SSE41__
# endif
# ifdef __AVX__
-# define __KERNEL_SSE__
+# ifndef __KERNEL_SSE__
+# define __KERNEL_SSE__
+# endif
# define __KERNEL_AVX__
# endif
# ifdef __AVX2__
-# define __KERNEL_SSE__
+# ifndef __KERNEL_SSE__
+# define __KERNEL_SSE__
+# endif
# define __KERNEL_AVX2__
# endif
#endif
diff --git a/intern/cycles/kernel/svm/noise.h b/intern/cycles/kernel/svm/noise.h
index 31e77d87413..209195a03f1 100644
--- a/intern/cycles/kernel/svm/noise.h
+++ b/intern/cycles/kernel/svm/noise.h
@@ -39,11 +39,11 @@ ccl_device_noinline_cpu float perlin_1d(float x)
}
/* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check if
- * SSE is supported, that is, if __KERNEL_SSE2__ is defined. If it is not
+ * SSE is supported, that is, if __KERNEL_SSE__ is defined. If it is not
* supported, we do a standard implementation, but if it is supported, we
* do an implementation using SSE intrinsics.
*/
-#if !defined(__KERNEL_SSE2__)
+#if !defined(__KERNEL_SSE__)
/* ** Standard Implementation ** */
@@ -250,18 +250,18 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
/* SSE Bilinear Interpolation:
*
- * The function takes two ssef inputs:
+ * The function takes two float4 inputs:
* - p : Contains the values at the points (v0, v1, v2, v3).
* - f : Contains the values (x, y, _, _). The third and fourth values are unused.
*
* The interpolation is done in two steps:
* 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1).
* (v2, v3) is generated by moving v2 and v3 to the first and second
- * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ * places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
* fourth values are unused.
* 2. Interpolate g0 and g1 along the y axis to get the final value.
- * g1 is generated by populating an ssef with the second value of g.
- * Only the first value is important in the final ssef.
+ * g1 is generated by populating a float4 with the second value of g.
+ * Only the first value is important in the final float4.
*
* v1 v3 g1
* @ + + + + @ @ y
@@ -272,27 +272,27 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
* v0 v2 g0
*
*/
-ccl_device_inline ssef bi_mix(ssef p, ssef f)
+ccl_device_inline float4 bi_mix(float4 p, float4 f)
{
- ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
+ float4 g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
return mix(g, shuffle<1>(g), shuffle<1>(f));
}
-ccl_device_inline ssef fade(const ssef &t)
+ccl_device_inline float4 fade(const float4 t)
{
- ssef a = madd(t, 6.0f, -15.0f);
- ssef b = madd(t, a, 10.0f);
+ float4 a = madd(t, make_float4(6.0f), make_float4(-15.0f));
+ float4 b = madd(t, a, make_float4(10.0f));
return (t * t) * (t * b);
}
/* Negate val if the nth bit of h is 1. */
# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))
-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y)
{
- ssei h = hash & 7;
- ssef u = select(h < 4, x, y);
- ssef v = 2.0f * select(h < 4, y, x);
+ int4 h = hash & 7;
+ float4 u = select(h < 4, x, y);
+ float4 v = 2.0f * select(h < 4, y, x);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
@@ -310,28 +310,28 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
*/
ccl_device_noinline_cpu float perlin_2d(float x, float y)
{
- ssei XY;
- ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
- ssef uv = fade(fxy);
+ int4 XY;
+ float4 fxy = floorfrac(make_float4(x, y, 0.0f, 0.0f), &XY);
+ float4 uv = fade(fxy);
- ssei XY1 = XY + 1;
- ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
- ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
+ int4 XY1 = XY + make_int4(1);
+ int4 X = shuffle<0, 0, 0, 0>(XY, XY1);
+ int4 Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
- ssei h = hash_ssei2(X, Y);
+ int4 h = hash_int4_2(X, Y);
- ssef fxy1 = fxy - 1.0f;
- ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
- ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
+ float4 fxy1 = fxy - make_float4(1.0f);
+ float4 fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
+ float4 fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
- ssef g = grad(h, fx, fy);
+ float4 g = grad(h, fx, fy);
return extract<0>(bi_mix(g, uv));
}
/* SSE Trilinear Interpolation:
*
- * The function takes three ssef inputs:
+ * The function takes three float4 inputs:
* - p : Contains the values at the points (v0, v1, v2, v3).
* - q : Contains the values at the points (v4, v5, v6, v7).
* - f : Contains the values (x, y, z, _). The fourth value is unused.
@@ -340,11 +340,11 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
* 1. Interpolate p and q along the x axis to get s (s0, s1, s2, s3).
* 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1).
* (s2, s3) is generated by moving v2 and v3 to the first and second
- * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ * places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
* fourth values are unused.
* 3. Interpolate g0 and g1 along the z axis to get the final value.
- * g1 is generated by populating an ssef with the second value of g.
- * Only the first value is important in the final ssef.
+ * g1 is generated by populating a float4 with the second value of g.
+ * Only the first value is important in the final float4.
*
* v3 v7
* @ + + + + + + @ s3 @
@@ -362,10 +362,10 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
* @ + + + + + + @ @
* v0 v4 s0
*/
-ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
+ccl_device_inline float4 tri_mix(float4 p, float4 q, float4 f)
{
- ssef s = mix(p, q, shuffle<0>(f));
- ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
+ float4 s = mix(p, q, shuffle<0>(f));
+ float4 g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
return mix(g, shuffle<1>(g), shuffle<2>(f));
}
@@ -374,24 +374,24 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
* supported, we do an SSE implementation, but if it is supported,
* we do an implementation using AVX intrinsics.
*/
-# if !defined(__KERNEL_AVX__)
+# if !defined(__KERNEL_AVX2__)
-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y, const float4 z)
{
- ssei h = hash & 15;
- ssef u = select(h < 8, x, y);
- ssef vt = select((h == 12) | (h == 14), x, z);
- ssef v = select(h < 4, y, vt);
+ int4 h = hash & 15;
+ float4 u = select(h < 8, x, y);
+ float4 vt = select((h == 12) | (h == 14), x, z);
+ float4 v = select(h < 4, y, vt);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
-ccl_device_inline ssef
-grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w)
+ccl_device_inline float4
+grad(const int4 hash, const float4 x, const float4 y, const float4 z, const float4 w)
{
- ssei h = hash & 31;
- ssef u = select(h < 24, x, y);
- ssef v = select(h < 16, y, z);
- ssef s = select(h < 8, z, w);
+ int4 h = hash & 31;
+ float4 u = select(h < 24, x, y);
+ float4 v = select(h < 16, y, z);
+ float4 s = select(h < 8, z, w);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
}
@@ -401,7 +401,7 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &
* between two trilinear interpolations.
*
*/
-ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
+ccl_device_inline float4 quad_mix(float4 p, float4 q, float4 r, float4 s, float4 f)
{
return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
}
@@ -427,23 +427,23 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
*/
ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
- ssei XYZ;
- ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
- ssef uvw = fade(fxyz);
+ int4 XYZ;
+ float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
+ float4 uvw = fade(fxyz);
- ssei XYZ1 = XYZ + 1;
- ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
- ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+ int4 XYZ1 = XYZ + make_int4(1);
+ int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+ int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
- ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z);
- ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z);
+ int4 h1 = hash_int4_3(shuffle<0>(XYZ), Y, Z);
+ int4 h2 = hash_int4_3(shuffle<0>(XYZ1), Y, Z);
- ssef fxyz1 = fxyz - 1.0f;
- ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
- ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+ float4 fxyz1 = fxyz - make_float4(1.0f);
+ float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+ float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
- ssef g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
- ssef g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
+ float4 g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
+ float4 g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
return extract<0>(tri_mix(g1, g2, uvw));
}
@@ -481,29 +481,29 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
*/
ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
- ssei XYZW;
- ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
- ssef uvws = fade(fxyzw);
+ int4 XYZW;
+ float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
+ float4 uvws = fade(fxyzw);
- ssei XYZW1 = XYZW + 1;
- ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
- ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+ int4 XYZW1 = XYZW + make_int4(1);
+ int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+ int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
- ssei h1 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
- ssei h2 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
+ int4 h1 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
+ int4 h2 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
- ssei h3 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
- ssei h4 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
+ int4 h3 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
+ int4 h4 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
- ssef fxyzw1 = fxyzw - 1.0f;
- ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
- ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+ float4 fxyzw1 = fxyzw - make_float4(1.0f);
+ float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+ float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
- ssef g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
- ssef g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
+ float4 g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
+ float4 g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
- ssef g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
- ssef g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
+ float4 g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
+ float4 g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
return extract<0>(quad_mix(g1, g2, g3, g4, uvws));
}
@@ -512,22 +512,22 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
/* AVX Implementation */
-ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z)
+ccl_device_inline vfloat8 grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z)
{
- avxi h = hash & 15;
- avxf u = select(h < 8, x, y);
- avxf vt = select((h == 12) | (h == 14), x, z);
- avxf v = select(h < 4, y, vt);
+ vint8 h = hash & 15;
+ vfloat8 u = select(h < 8, x, y);
+ vfloat8 vt = select((h == 12) | (h == 14), x, z);
+ vfloat8 v = select(h < 4, y, vt);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
}
-ccl_device_inline avxf
-grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w)
+ccl_device_inline vfloat8
+grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z, const vfloat8 w)
{
- avxi h = hash & 31;
- avxf u = select(h < 24, x, y);
- avxf v = select(h < 16, y, z);
- avxf s = select(h < 8, z, w);
+ vint8 h = hash & 31;
+ vfloat8 u = select(h < 24, x, y);
+ vfloat8 v = select(h < 16, y, z);
+ vfloat8 s = select(h < 8, z, w);
return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
}
@@ -537,13 +537,13 @@ grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &
* 1. Interpolate p and q along the w axis to get s.
* 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final
* value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the
- * low and high ssef from s.
+ * low and high float4 from s.
*
*/
-ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
+ccl_device_inline float4 quad_mix(vfloat8 p, vfloat8 q, float4 f)
{
- ssef fv = shuffle<3>(f);
- avxf s = mix(p, q, avxf(fv, fv));
+ float4 fv = shuffle<3>(f);
+ vfloat8 s = mix(p, q, make_vfloat8(fv, fv));
return tri_mix(low(s), high(s), f);
}
@@ -565,25 +565,25 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
*/
ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
- ssei XYZ;
- ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
- ssef uvw = fade(fxyz);
+ int4 XYZ;
+ float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
+ float4 uvw = fade(fxyz);
- ssei XYZ1 = XYZ + 1;
- ssei X = shuffle<0>(XYZ);
- ssei X1 = shuffle<0>(XYZ1);
- ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
- ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+ int4 XYZ1 = XYZ + make_int4(1);
+ int4 X = shuffle<0>(XYZ);
+ int4 X1 = shuffle<0>(XYZ1);
+ int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+ int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
- avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z));
+ vint8 h = hash_int8_3(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z));
- ssef fxyz1 = fxyz - 1.0f;
- ssef fx = shuffle<0>(fxyz);
- ssef fx1 = shuffle<0>(fxyz1);
- ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
- ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+ float4 fxyz1 = fxyz - make_float4(1.0f);
+ float4 fx = shuffle<0>(fxyz);
+ float4 fx1 = shuffle<0>(fxyz1);
+ float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+ float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
- avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz));
+ vfloat8 g = grad(h, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz));
return extract<0>(tri_mix(low(g), high(g), uvw));
}
@@ -617,31 +617,37 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
*/
ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
- ssei XYZW;
- ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
- ssef uvws = fade(fxyzw);
-
- ssei XYZW1 = XYZW + 1;
- ssei X = shuffle<0>(XYZW);
- ssei X1 = shuffle<0>(XYZW1);
- ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
- ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
- ssei W = shuffle<3>(XYZW);
- ssei W1 = shuffle<3>(XYZW1);
-
- avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W));
- avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1));
-
- ssef fxyzw1 = fxyzw - 1.0f;
- ssef fx = shuffle<0>(fxyzw);
- ssef fx1 = shuffle<0>(fxyzw1);
- ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
- ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
- ssef fw = shuffle<3>(fxyzw);
- ssef fw1 = shuffle<3>(fxyzw1);
-
- avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw));
- avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1));
+ int4 XYZW;
+ float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
+ float4 uvws = fade(fxyzw);
+
+ int4 XYZW1 = XYZW + make_int4(1);
+ int4 X = shuffle<0>(XYZW);
+ int4 X1 = shuffle<0>(XYZW1);
+ int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+ int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+ int4 W = shuffle<3>(XYZW);
+ int4 W1 = shuffle<3>(XYZW1);
+
+ vint8 h1 = hash_int8_4(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W, W));
+ vint8 h2 = hash_int8_4(
+ make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W1, W1));
+
+ float4 fxyzw1 = fxyzw - make_float4(1.0f);
+ float4 fx = shuffle<0>(fxyzw);
+ float4 fx1 = shuffle<0>(fxyzw1);
+ float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+ float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+ float4 fw = shuffle<3>(fxyzw);
+ float4 fw1 = shuffle<3>(fxyzw1);
+
+ vfloat8 g1 = grad(
+ h1, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz), make_vfloat8(fw, fw));
+ vfloat8 g2 = grad(h2,
+ make_vfloat8(fx, fx1),
+ make_vfloat8(fy, fy),
+ make_vfloat8(fz, fz),
+ make_vfloat8(fw1, fw1));
return extract<0>(quad_mix(g1, g2, uvws));
}
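For readers following the interpolation comments in the hunk above, here is a scalar sketch of what bi_mix() computes (a hypothetical reference for clarity only, not part of the patch; it assumes mix(a, b, t) = a + t * (b - a)):

    /* p = (v0, v1, v2, v3) holds the corner values, f = (x, y, _, _). */
    float bi_mix_scalar(const float p[4], float x, float y)
    {
      float g0 = p[0] + x * (p[2] - p[0]); /* mix(v0, v2, x) */
      float g1 = p[1] + x * (p[3] - p[1]); /* mix(v1, v3, x) */
      return g0 + y * (g1 - g0);           /* mix(g0, g1, y): lane 0 of the SIMD version */
    }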
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index c3ae81ed1db..34e5a4770ea 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -45,17 +45,24 @@ set(SRC
# Disable AVX tests on macOS. Rosetta has problems running them, and other
# platforms should be enough to verify AVX operations are implemented correctly.
if(NOT APPLE)
+ if(CXX_HAS_SSE)
+ list(APPEND SRC
+ util_float8_sse2_test.cpp
+ )
+ set_source_files_properties(util_float8_sse2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ endif()
+
if(CXX_HAS_AVX)
list(APPEND SRC
- util_avxf_avx_test.cpp
+ util_float8_avx_test.cpp
)
- set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
- util_avxf_avx2_test.cpp
+ util_float8_avx2_test.cpp
)
- set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(util_float8_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
endif()
diff --git a/intern/cycles/test/util_avxf_test.h b/intern/cycles/test/util_avxf_test.h
deleted file mode 100644
index 34d966cc1a4..00000000000
--- a/intern/cycles/test/util_avxf_test.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#include "testing/testing.h"
-#include "util/system.h"
-#include "util/types.h"
-
-CCL_NAMESPACE_BEGIN
-
-static bool validate_cpu_capabilities()
-{
-
-#ifdef __KERNEL_AVX2__
- return system_cpu_support_avx2();
-#else
-# ifdef __KERNEL_AVX__
- return system_cpu_support_avx();
-# endif
-#endif
-}
-
-#define INIT_AVX_TEST \
- if (!validate_cpu_capabilities()) \
- return; \
-\
- const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
- const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
- const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
-
-#define compare_vector_scalar(a, b) \
- for (size_t index = 0; index < a.size; index++) \
- EXPECT_FLOAT_EQ(a[index], b);
-
-#define compare_vector_vector(a, b) \
- for (size_t index = 0; index < a.size; index++) \
- EXPECT_FLOAT_EQ(a[index], b[index]);
-
-#define compare_vector_vector_near(a, b, abserror) \
- for (size_t index = 0; index < a.size; index++) \
- EXPECT_NEAR(a[index], b[index], abserror);
-
-#define basic_test_vv(a, b, op) \
- INIT_AVX_TEST \
- avxf c = a op b; \
- for (size_t i = 0; i < a.size; i++) \
- EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
-
-/* vector op float tests */
-#define basic_test_vf(a, b, op) \
- INIT_AVX_TEST \
- avxf c = a op b; \
- for (size_t i = 0; i < a.size; i++) \
- EXPECT_FLOAT_EQ(c[i], a[i] op b);
-
-static const float float_b = 1.5f;
-
-TEST(TEST_CATEGORY_NAME, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(TEST_CATEGORY_NAME,
- avxf_sub_vv){
- basic_test_vv(avxf_a, avxf_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vv){
- basic_test_vv(avxf_a, avxf_b, *)} TEST(TEST_CATEGORY_NAME, avxf_div_vv){
- basic_test_vv(avxf_a, avxf_b, /)} TEST(TEST_CATEGORY_NAME, avxf_add_vf){
- basic_test_vf(avxf_a, float_b, +)} TEST(TEST_CATEGORY_NAME, avxf_sub_vf){
- basic_test_vf(avxf_a, float_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vf){
- basic_test_vf(avxf_a, float_b, *)} TEST(TEST_CATEGORY_NAME,
- avxf_div_vf){basic_test_vf(avxf_a, float_b, /)}
-
-TEST(TEST_CATEGORY_NAME, avxf_ctor)
-{
- INIT_AVX_TEST
- compare_vector_scalar(avxf(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f),
- static_cast<float>(index));
- compare_vector_scalar(avxf(1.0f), 1.0f);
- compare_vector_vector(avxf(1.0f, 2.0f), avxf(1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f));
- compare_vector_vector(avxf(1.0f, 2.0f, 3.0f, 4.0f),
- avxf(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f));
- compare_vector_vector(avxf(make_float3(1.0f, 2.0f, 3.0f)),
- avxf(0.0f, 3.0f, 2.0f, 1.0f, 0.0f, 3.0f, 2.0f, 1.0f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_sqrt)
-{
- INIT_AVX_TEST
- compare_vector_vector(mm256_sqrt(avxf(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
- avxf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_min_max)
-{
- INIT_AVX_TEST
- compare_vector_vector(min(avxf_a, avxf_b), avxf_a);
- compare_vector_vector(max(avxf_a, avxf_b), avxf_b);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_set_sign)
-{
- INIT_AVX_TEST
- avxf res = set_sign_bit<1, 0, 0, 0, 0, 0, 0, 0>(avxf_a);
- compare_vector_vector(res, avxf(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, -0.8f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_msub)
-{
- INIT_AVX_TEST
- avxf res = msub(avxf_a, avxf_b, avxf_c);
- avxf exp = avxf((avxf_a[7] * avxf_b[7]) - avxf_c[7],
- (avxf_a[6] * avxf_b[6]) - avxf_c[6],
- (avxf_a[5] * avxf_b[5]) - avxf_c[5],
- (avxf_a[4] * avxf_b[4]) - avxf_c[4],
- (avxf_a[3] * avxf_b[3]) - avxf_c[3],
- (avxf_a[2] * avxf_b[2]) - avxf_c[2],
- (avxf_a[1] * avxf_b[1]) - avxf_c[1],
- (avxf_a[0] * avxf_b[0]) - avxf_c[0]);
- compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_madd)
-{
- INIT_AVX_TEST
- avxf res = madd(avxf_a, avxf_b, avxf_c);
- avxf exp = avxf((avxf_a[7] * avxf_b[7]) + avxf_c[7],
- (avxf_a[6] * avxf_b[6]) + avxf_c[6],
- (avxf_a[5] * avxf_b[5]) + avxf_c[5],
- (avxf_a[4] * avxf_b[4]) + avxf_c[4],
- (avxf_a[3] * avxf_b[3]) + avxf_c[3],
- (avxf_a[2] * avxf_b[2]) + avxf_c[2],
- (avxf_a[1] * avxf_b[1]) + avxf_c[1],
- (avxf_a[0] * avxf_b[0]) + avxf_c[0]);
- compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_nmadd)
-{
- INIT_AVX_TEST
- avxf res = nmadd(avxf_a, avxf_b, avxf_c);
- avxf exp = avxf(avxf_c[7] - (avxf_a[7] * avxf_b[7]),
- avxf_c[6] - (avxf_a[6] * avxf_b[6]),
- avxf_c[5] - (avxf_a[5] * avxf_b[5]),
- avxf_c[4] - (avxf_a[4] * avxf_b[4]),
- avxf_c[3] - (avxf_a[3] * avxf_b[3]),
- avxf_c[2] - (avxf_a[2] * avxf_b[2]),
- avxf_c[1] - (avxf_a[1] * avxf_b[1]),
- avxf_c[0] - (avxf_a[0] * avxf_b[0]));
- compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_compare)
-{
- INIT_AVX_TEST
- avxf a(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
- avxf b(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
- avxb res = a <= b;
- int exp[8] = {
- a[0] <= b[0] ? -1 : 0,
- a[1] <= b[1] ? -1 : 0,
- a[2] <= b[2] ? -1 : 0,
- a[3] <= b[3] ? -1 : 0,
- a[4] <= b[4] ? -1 : 0,
- a[5] <= b[5] ? -1 : 0,
- a[6] <= b[6] ? -1 : 0,
- a[7] <= b[7] ? -1 : 0,
- };
- compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_permute)
-{
- INIT_AVX_TEST
- avxf res = permute<3, 0, 1, 7, 6, 5, 2, 4>(avxf_b);
- compare_vector_vector(res, avxf(4.0f, 6.0f, 3.0f, 2.0f, 1.0f, 7.0f, 8.0f, 5.0f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_blend)
-{
- INIT_AVX_TEST
- avxf res = blend<0, 0, 1, 0, 1, 0, 1, 0>(avxf_a, avxf_b);
- compare_vector_vector(res, avxf(0.1f, 0.2f, 3.0f, 0.4f, 5.0f, 0.6f, 7.0f, 0.8f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_shuffle)
-{
- INIT_AVX_TEST
- avxf res = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(avxf_a);
- compare_vector_vector(res, avxf(0.4f, 0.2f, 0.1f, 0.3f, 0.5f, 0.6f, 0.7f, 0.8f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_cross)
-{
- INIT_AVX_TEST
- avxf res = cross(avxf_b, avxf_c);
- compare_vector_vector_near(res,
- avxf(0.0f,
- -9.5367432e-07f,
- 0.0f,
- 4.7683716e-07f,
- 0.0f,
- -3.8146973e-06f,
- 3.8146973e-06f,
- 3.8146973e-06f),
- 0.000002000f);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_dot3)
-{
- INIT_AVX_TEST
- float den, den2;
- dot3(avxf_a, avxf_b, den, den2);
- EXPECT_FLOAT_EQ(den, 14.9f);
- EXPECT_FLOAT_EQ(den2, 2.9f);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/test/util_avxf_avx2_test.cpp b/intern/cycles/test/util_float8_avx2_test.cpp
index 992c4d9a913..4682dce5b23 100644
--- a/intern/cycles/test/util_avxf_avx2_test.cpp
+++ b/intern/cycles/test/util_float8_avx2_test.cpp
@@ -1,11 +1,13 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
+#define __KERNEL_SSE__
+#define __KERNEL_AVX__
#define __KERNEL_AVX2__
#define TEST_CATEGORY_NAME util_avx2
#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
defined(__AVX2__)
-# include "util_avxf_test.h"
+# include "util_float8_test.h"
#endif
diff --git a/intern/cycles/test/util_avxf_avx_test.cpp b/intern/cycles/test/util_float8_avx_test.cpp
index abb98cdfb38..34fe750e766 100644
--- a/intern/cycles/test/util_avxf_avx_test.cpp
+++ b/intern/cycles/test/util_float8_avx_test.cpp
@@ -1,11 +1,12 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
+#define __KERNEL_SSE__
#define __KERNEL_AVX__
#define TEST_CATEGORY_NAME util_avx
#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
defined(__AVX__)
-# include "util_avxf_test.h"
+# include "util_float8_test.h"
#endif
diff --git a/intern/cycles/test/util_float8_sse2_test.cpp b/intern/cycles/test/util_float8_sse2_test.cpp
new file mode 100644
index 00000000000..ba8952a2b08
--- /dev/null
+++ b/intern/cycles/test/util_float8_sse2_test.cpp
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define __KERNEL_SSE__
+#define __KERNEL_SSE2__
+
+#define TEST_CATEGORY_NAME util_sse2
+
+#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
+ defined(__SSE2__)
+# include "util_float8_test.h"
+#endif
diff --git a/intern/cycles/test/util_float8_test.h b/intern/cycles/test/util_float8_test.h
new file mode 100644
index 00000000000..54701afaf8b
--- /dev/null
+++ b/intern/cycles/test/util_float8_test.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+#include "util/types.h"
+
+CCL_NAMESPACE_BEGIN
+
+static bool validate_cpu_capabilities()
+{
+
+#if defined(__KERNEL_AVX2__)
+ return system_cpu_support_avx2();
+#elif defined(__KERNEL_AVX__)
+ return system_cpu_support_avx();
+#elif defined(__KERNEL_SSE2__)
+ return system_cpu_support_sse2();
+#else
+ return false;
+#endif
+}
+
+#define INIT_FLOAT8_TEST \
+ if (!validate_cpu_capabilities()) \
+ return; \
+\
+ const vfloat8 float8_a = make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
+ const vfloat8 float8_b = make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
+ const vfloat8 float8_c = make_vfloat8(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
+
+#define compare_vector_scalar(a, b) \
+ for (size_t index = 0; index < 8; index++) \
+ EXPECT_FLOAT_EQ(a[index], b);
+
+#define compare_vector_vector(a, b) \
+ for (size_t index = 0; index < 8; index++) \
+ EXPECT_FLOAT_EQ(a[index], b[index]);
+
+#define compare_vector_vector_near(a, b, abserror) \
+ for (size_t index = 0; index < 8; index++) \
+ EXPECT_NEAR(a[index], b[index], abserror);
+
+#define basic_test_vv(a, b, op) \
+ INIT_FLOAT8_TEST \
+ vfloat8 c = a op b; \
+ for (size_t i = 0; i < 8; i++) \
+ EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
+
+/* vector op float tests */
+#define basic_test_vf(a, b, op) \
+ INIT_FLOAT8_TEST \
+ vfloat8 c = a op b; \
+ for (size_t i = 0; i < 8; i++) \
+ EXPECT_FLOAT_EQ(c[i], a[i] op b);
+
+static const float float_b = 1.5f;
+
+TEST(TEST_CATEGORY_NAME,
+ float8_add_vv){basic_test_vv(float8_a, float8_b, +)} TEST(TEST_CATEGORY_NAME, float8_sub_vv){
+ basic_test_vv(float8_a, float8_b, -)} TEST(TEST_CATEGORY_NAME, float8_mul_vv){
+ basic_test_vv(float8_a, float8_b, *)} TEST(TEST_CATEGORY_NAME, float8_div_vv){
+ basic_test_vv(float8_a, float8_b, /)} TEST(TEST_CATEGORY_NAME, float8_add_vf){
+ basic_test_vf(float8_a, float_b, +)} TEST(TEST_CATEGORY_NAME, float8_sub_vf){
+ basic_test_vf(float8_a, float_b, -)} TEST(TEST_CATEGORY_NAME, float8_mul_vf){
+ basic_test_vf(float8_a, float_b, *)} TEST(TEST_CATEGORY_NAME,
+ float8_div_vf){basic_test_vf(float8_a, float_b, /)}
+
+TEST(TEST_CATEGORY_NAME, float8_ctor)
+{
+ INIT_FLOAT8_TEST
+ compare_vector_scalar(make_vfloat8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f),
+ static_cast<float>(index));
+ compare_vector_scalar(make_vfloat8(1.0f), 1.0f);
+}
+
+TEST(TEST_CATEGORY_NAME, float8_sqrt)
+{
+ INIT_FLOAT8_TEST
+ compare_vector_vector(sqrt(make_vfloat8(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
+ make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
+}
+
+TEST(TEST_CATEGORY_NAME, float8_min_max)
+{
+ INIT_FLOAT8_TEST
+ compare_vector_vector(min(float8_a, float8_b), float8_a);
+ compare_vector_vector(max(float8_a, float8_b), float8_b);
+}
+
+TEST(TEST_CATEGORY_NAME, float8_shuffle)
+{
+ INIT_FLOAT8_TEST
+ vfloat8 res0 = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(float8_a);
+ compare_vector_vector(res0, make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.6f, 0.8f, 0.7f, 0.5f));
+ vfloat8 res1 = shuffle<3>(float8_a);
+ compare_vector_vector(res1, make_vfloat8(0.4f, 0.4f, 0.4f, 0.4f, 0.8f, 0.8f, 0.8f, 0.8f));
+ vfloat8 res2 = shuffle<3, 2, 1, 0>(float8_a, float8_b);
+ compare_vector_vector(res2, make_vfloat8(0.4f, 0.3f, 2.0f, 1.0f, 0.8f, 0.7f, 6.0f, 5.0f));
+}
+
+CCL_NAMESPACE_END
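As a rough sketch of what one of the macro-generated tests above expands to (a hypothetical expansion for TEST_CATEGORY_NAME = util_sse2, not code that appears in the patch):

    TEST(util_sse2, float8_add_vv)
    {
      if (!validate_cpu_capabilities())
        return;
      /* INIT_FLOAT8_TEST declares all three vectors; float8_c is unused in this test. */
      const vfloat8 float8_a = make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
      const vfloat8 float8_b = make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
      const vfloat8 float8_c = make_vfloat8(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
      vfloat8 c = float8_a + float8_b;
      for (size_t i = 0; i < 8; i++)
        EXPECT_FLOAT_EQ(c[i], float8_a[i] + float8_b[i]);
    }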
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 57628f99e35..7f8f4a5ce76 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -69,6 +69,7 @@ set(SRC_HEADERS
math_int2.h
math_int3.h
math_int4.h
+ math_int8.h
math_matrix.h
md5.h
murmurhash.h
@@ -85,13 +86,7 @@ set(SRC_HEADERS
rect.h
set.h
simd.h
- avxf.h
- avxb.h
- avxi.h
semaphore.h
- sseb.h
- ssef.h
- ssei.h
stack_allocator.h
static_assert.h
stats.h
@@ -118,6 +113,8 @@ set(SRC_HEADERS
types_int3_impl.h
types_int4.h
types_int4_impl.h
+ types_int8.h
+ types_int8_impl.h
types_spectrum.h
types_uchar2.h
types_uchar2_impl.h
diff --git a/intern/cycles/util/avxb.h b/intern/cycles/util/avxb.h
deleted file mode 100644
index fa3cb565309..00000000000
--- a/intern/cycles/util/avxb.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_AVXB_H__
-#define __UTIL_AVXB_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxf;
-
-/*! 4-wide SSE bool type. */
-struct avxb {
- typedef avxb Mask; // mask type
- typedef avxf Float; // float type
-
- enum { size = 8 }; // number of SIMD elements
- union {
- __m256 m256;
- int32_t v[8];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxb()
- {
- }
- __forceinline avxb(const avxb &other)
- {
- m256 = other.m256;
- }
- __forceinline avxb &operator=(const avxb &other)
- {
- m256 = other.m256;
- return *this;
- }
-
- __forceinline avxb(const __m256 input) : m256(input)
- {
- }
- __forceinline avxb(const __m128 &a, const __m128 &b)
- : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1))
- {
- }
- __forceinline operator const __m256 &(void) const
- {
- return m256;
- }
- __forceinline operator const __m256i(void) const
- {
- return _mm256_castps_si256(m256);
- }
- __forceinline operator const __m256d(void) const
- {
- return _mm256_castps_pd(m256);
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constants
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxb(FalseTy) : m256(_mm256_setzero_ps())
- {
- }
- __forceinline avxb(TrueTy) : m256(_mm256_castsi256_ps(_mm256_set1_epi32(-1)))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline bool operator[](const size_t i) const
- {
- assert(i < 8);
- return (_mm256_movemask_ps(m256) >> i) & 1;
- }
- __forceinline int32_t &operator[](const size_t i)
- {
- assert(i < 8);
- return v[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator!(const avxb &a)
-{
- return _mm256_xor_ps(a, avxb(True));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator&(const avxb &a, const avxb &b)
-{
- return _mm256_and_ps(a, b);
-}
-__forceinline const avxb operator|(const avxb &a, const avxb &b)
-{
- return _mm256_or_ps(a, b);
-}
-__forceinline const avxb operator^(const avxb &a, const avxb &b)
-{
- return _mm256_xor_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator&=(avxb &a, const avxb &b)
-{
- return a = a & b;
-}
-__forceinline const avxb operator|=(avxb &a, const avxb &b)
-{
- return a = a | b;
-}
-__forceinline const avxb operator^=(avxb &a, const avxb &b)
-{
- return a = a ^ b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator!=(const avxb &a, const avxb &b)
-{
- return _mm256_xor_ps(a, b);
-}
-__forceinline const avxb operator==(const avxb &a, const avxb &b)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
-#else
- __m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
- __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
- __m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
- __m128i b_hi = _mm_castps_si128(_mm256_extractf128_ps(b, 1));
- __m128i c_lo = _mm_cmpeq_epi32(a_lo, b_lo);
- __m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
- __m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);
- return _mm256_castsi256_ps(result);
-#endif
-}
-
-__forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f)
-{
-#if defined(__KERNEL_SSE41__)
- return _mm256_blendv_ps(f, t, m);
-#else
- return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb unpacklo(const avxb &a, const avxb &b)
-{
- return _mm256_unpacklo_ps(a, b);
-}
-__forceinline const avxb unpackhi(const avxb &a, const avxb &b)
-{
- return _mm256_unpackhi_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reduction Operations
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_SSE41__)
-__forceinline uint32_t popcnt(const avxb &a)
-{
- return _mm_popcnt_u32(_mm256_movemask_ps(a));
-}
-#else
-__forceinline uint32_t popcnt(const avxb &a)
-{
- return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
- bool(a[7]);
-}
-#endif
-
-__forceinline bool reduce_and(const avxb &a)
-{
- return _mm256_movemask_ps(a) == 0xf;
-}
-__forceinline bool reduce_or(const avxb &a)
-{
- return _mm256_movemask_ps(a) != 0x0;
-}
-__forceinline bool all(const avxb &b)
-{
- return _mm256_movemask_ps(b) == 0xf;
-}
-__forceinline bool any(const avxb &b)
-{
- return _mm256_movemask_ps(b) != 0x0;
-}
-__forceinline bool none(const avxb &b)
-{
- return _mm256_movemask_ps(b) == 0x0;
-}
-
-__forceinline uint32_t movemask(const avxb &a)
-{
- return _mm256_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_avxb(const char *label, const avxb &a)
-{
- printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/avxf.h b/intern/cycles/util/avxf.h
deleted file mode 100644
index 03a13f30490..00000000000
--- a/intern/cycles/util/avxf.h
+++ /dev/null
@@ -1,379 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2016 Intel Corporation */
-
-#ifndef __UTIL_AVXF_H__
-#define __UTIL_AVXF_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxb;
-
-struct avxf {
- typedef avxf Float;
-
- enum { size = 8 }; /* Number of SIMD elements. */
-
- union {
- __m256 m256;
- float f[8];
- int i[8];
- };
-
- __forceinline avxf()
- {
- }
- __forceinline avxf(const avxf &other)
- {
- m256 = other.m256;
- }
- __forceinline avxf &operator=(const avxf &other)
- {
- m256 = other.m256;
- return *this;
- }
-
- __forceinline avxf(const __m256 a) : m256(a)
- {
- }
- __forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps(a))
- {
- }
-
- __forceinline operator const __m256 &() const
- {
- return m256;
- }
- __forceinline operator __m256 &()
- {
- return m256;
- }
-
- __forceinline avxf(float a) : m256(_mm256_set1_ps(a))
- {
- }
-
- __forceinline avxf(float high32x4, float low32x4)
- : m256(_mm256_set_ps(
- high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4))
- {
- }
-
- __forceinline avxf(float a3, float a2, float a1, float a0)
- : m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0))
- {
- }
-
- __forceinline avxf(
- float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0)
- : m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0))
- {
- }
-
- __forceinline avxf(float3 a) : m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x))
- {
- }
-
- __forceinline avxf(int a3, int a2, int a1, int a0)
- {
- const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
- m256 = _mm256_castsi256_ps(foo);
- }
-
- __forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
- {
- const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
- m256 = _mm256_castsi256_ps(foo);
- }
-
- __forceinline avxf(__m128 a, __m128 b)
- {
- const __m256 foo = _mm256_castps128_ps256(a);
- m256 = _mm256_insertf128_ps(foo, b, 1);
- }
-
- __forceinline const float &operator[](const size_t i) const
- {
- assert(i < 8);
- return f[i];
- }
- __forceinline float &operator[](const size_t i)
- {
- assert(i < 8);
- return f[i];
- }
-};
-
-__forceinline avxf cross(const avxf &a, const avxf &b)
-{
- avxf r(0.0,
- a[4] * b[5] - a[5] * b[4],
- a[6] * b[4] - a[4] * b[6],
- a[5] * b[6] - a[6] * b[5],
- 0.0,
- a[0] * b[1] - a[1] * b[0],
- a[2] * b[0] - a[0] * b[2],
- a[1] * b[2] - a[2] * b[1]);
- return r;
-}
-
-__forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2)
-{
- const avxf t = _mm256_mul_ps(a.m256, b.m256);
- den = ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
- den2 = ((float *)&t)[4] + ((float *)&t)[5] + ((float *)&t)[6];
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf cast(const __m256i &a)
-{
- return _mm256_castsi256_ps(a);
-}
-
-__forceinline const avxf mm256_sqrt(const avxf &a)
-{
- return _mm256_sqrt_ps(a.m256);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf operator+(const avxf &a, const avxf &b)
-{
- return _mm256_add_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator+(const avxf &a, const float &b)
-{
- return a + avxf(b);
-}
-__forceinline const avxf operator+(const float &a, const avxf &b)
-{
- return avxf(a) + b;
-}
-
-__forceinline const avxf operator-(const avxf &a, const avxf &b)
-{
- return _mm256_sub_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator-(const avxf &a, const float &b)
-{
- return a - avxf(b);
-}
-__forceinline const avxf operator-(const float &a, const avxf &b)
-{
- return avxf(a) - b;
-}
-
-__forceinline const avxf operator*(const avxf &a, const avxf &b)
-{
- return _mm256_mul_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator*(const avxf &a, const float &b)
-{
- return a * avxf(b);
-}
-__forceinline const avxf operator*(const float &a, const avxf &b)
-{
- return avxf(a) * b;
-}
-
-__forceinline const avxf operator/(const avxf &a, const avxf &b)
-{
- return _mm256_div_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator/(const avxf &a, const float &b)
-{
- return a / avxf(b);
-}
-__forceinline const avxf operator/(const float &a, const avxf &b)
-{
- return avxf(a) / b;
-}
-
-__forceinline const avxf operator|(const avxf &a, const avxf &b)
-{
- return _mm256_or_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf operator^(const avxf &a, const avxf &b)
-{
- return _mm256_xor_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf operator&(const avxf &a, const avxf &b)
-{
- return _mm256_and_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf max(const avxf &a, const avxf &b)
-{
- return _mm256_max_ps(a.m256, b.m256);
-}
-__forceinline const avxf min(const avxf &a, const avxf &b)
-{
- return _mm256_min_ps(a.m256, b.m256);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf shuffle(const avxf &a, const __m256i &shuf)
-{
- return _mm256_permutevar_ps(a, shuf);
-}
-
-template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-__forceinline const avxf shuffle(const avxf &a)
-{
- return _mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxf shuffle(const avxf &a, const avxf &b)
-{
- return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-}
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxf shuffle(const avxf &a)
-{
- return shuffle<i0, i1, i2, i3>(a, a);
-}
-template<size_t i0> __forceinline const avxf shuffle(const avxf &a, const avxf &b)
-{
- return shuffle<i0, i0, i0, i0>(a, b);
-}
-template<size_t i0> __forceinline const avxf shuffle(const avxf &a)
-{
- return shuffle<i0>(a, a);
-}
-
-template<size_t i> __forceinline float extract(const avxf &a)
-{
- __m256 b = shuffle<i, i, i, i>(a).m256;
- return _mm256_cvtss_f32(b);
-}
-template<> __forceinline float extract<0>(const avxf &a)
-{
- return _mm256_cvtss_f32(a.m256);
-}
-
-__forceinline ssef low(const avxf &a)
-{
- return _mm256_extractf128_ps(a.m256, 0);
-}
-__forceinline ssef high(const avxf &a)
-{
- return _mm256_extractf128_ps(a.m256, 1);
-}
-
-template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-__forceinline const avxf permute(const avxf &a)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-#else
- float temp[8];
- _mm256_storeu_ps((float *)&temp, a);
- return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
-#endif
-}
-
-template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
-ccl_device_inline const avxf set_sign_bit(const avxf &a)
-{
- return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31, S2 << 31, S1 << 31, S0 << 31);
-}
-
-template<size_t S0, size_t S1, size_t S2, size_t S3, size_t S4, size_t S5, size_t S6, size_t S7>
-ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
-{
- return _mm256_blend_ps(
- a, b, S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
-}
-
-template<size_t S0, size_t S1, size_t S2, size_t S3>
-ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
-{
- return blend<S0, S1, S2, S3, S0, S1, S2, S3>(a, b);
-}
-
-//#if defined(__KERNEL_SSE41__)
-__forceinline avxf maxi(const avxf &a, const avxf &b)
-{
- const avxf ci = _mm256_max_ps(a, b);
- return ci;
-}
-
-__forceinline avxf mini(const avxf &a, const avxf &b)
-{
- const avxf ci = _mm256_min_ps(a, b);
- return ci;
-}
-//#endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Ternary Operators
-////////////////////////////////////////////////////////////////////////////////
-__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_fmadd_ps(a, b, c);
-#else
- return c + (a * b);
-#endif
-}
-
-__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_fnmadd_ps(a, b, c);
-#else
- return c - (a * b);
-#endif
-}
-__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
- return _mm256_fmsub_ps(a, b, c);
-#else
- return (a * b) - c;
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-__forceinline const avxb operator<=(const avxf &a, const avxf &b)
-{
- return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
-}
-
-__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f)
-{
- return _mm256_blendv_ps(f, t, m);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Common Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t)
-{
- return madd(t, b, (avxf(1.0f) - t) * a);
-}
-
-#ifndef _mm256_set_m128
-# define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
- _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
-#endif
-
-#define _mm256_loadu2_m128(/* float const* */ hiaddr, /* float const* */ loaddr) \
- _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/avxi.h b/intern/cycles/util/avxi.h
deleted file mode 100644
index 966a04a6b97..00000000000
--- a/intern/cycles/util/avxi.h
+++ /dev/null
@@ -1,732 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2009-2013 Intel Corporation */
-
-#ifndef __UTIL_AVXI_H__
-#define __UTIL_AVXI_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxb;
-
-struct avxi {
- typedef avxb Mask; // mask type for us
- enum { size = 8 }; // number of SIMD elements
- union { // data
- __m256i m256;
-#if !defined(__KERNEL_AVX2__)
- struct {
- __m128i l, h;
- };
-#endif
- int32_t v[8];
- };
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxi()
- {
- }
- __forceinline avxi(const avxi &a)
- {
- m256 = a.m256;
- }
- __forceinline avxi &operator=(const avxi &a)
- {
- m256 = a.m256;
- return *this;
- }
-
- __forceinline avxi(const __m256i a) : m256(a)
- {
- }
- __forceinline operator const __m256i &(void) const
- {
- return m256;
- }
- __forceinline operator __m256i &(void)
- {
- return m256;
- }
-
- __forceinline explicit avxi(const ssei &a)
- : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1))
- {
- }
- __forceinline avxi(const ssei &a, const ssei &b)
- : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
- {
- }
-#if defined(__KERNEL_AVX2__)
- __forceinline avxi(const __m128i &a, const __m128i &b)
- : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
- {
- }
-#else
- __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b)
- {
- }
-#endif
- __forceinline explicit avxi(const int32_t *const a)
- : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a)))
- {
- }
- __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a))
- {
- }
- __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a))
- {
- }
- __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d)
- : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a))
- {
- }
- __forceinline avxi(
- int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h)
- : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a))
- {
- }
-
- __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constants
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256())
- {
- }
-#if defined(__KERNEL_AVX2__)
- __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1))
- {
- }
- __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf))
- {
- }
- __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf))
- {
- }
-#else
- __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1))
- {
- }
- __forceinline avxi(PosInfTy)
- : m256(_mm256_set_epi32(
- pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf))
- {
- }
- __forceinline avxi(NegInfTy)
- : m256(_mm256_set_epi32(
- neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf))
- {
- }
-#endif
- __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline const int32_t &operator[](const size_t i) const
- {
- assert(i < 8);
- return v[i];
- }
- __forceinline int32_t &operator[](const size_t i)
- {
- assert(i < 8);
- return v[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxi cast(const __m256 &a)
-{
- return _mm256_castps_si256(a);
-}
-__forceinline const avxi operator+(const avxi &a)
-{
- return a;
-}
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator-(const avxi &a)
-{
- return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256);
-}
-__forceinline const avxi abs(const avxi &a)
-{
- return _mm256_abs_epi32(a.m256);
-}
-#else
-__forceinline const avxi operator-(const avxi &a)
-{
- return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h));
-}
-__forceinline const avxi abs(const avxi &a)
-{
- return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h));
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator+(const avxi &a, const avxi &b)
-{
- return _mm256_add_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator+(const avxi &a, const avxi &b)
-{
- return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator+(const avxi &a, const int32_t b)
-{
- return a + avxi(b);
-}
-__forceinline const avxi operator+(const int32_t a, const avxi &b)
-{
- return avxi(a) + b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator-(const avxi &a, const avxi &b)
-{
- return _mm256_sub_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator-(const avxi &a, const avxi &b)
-{
- return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator-(const avxi &a, const int32_t b)
-{
- return a - avxi(b);
-}
-__forceinline const avxi operator-(const int32_t a, const avxi &b)
-{
- return avxi(a) - b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator*(const avxi &a, const avxi &b)
-{
- return _mm256_mullo_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator*(const avxi &a, const avxi &b)
-{
- return avxi(_mm_mullo_epi32(a.l, b.l), _mm_mullo_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator*(const avxi &a, const int32_t b)
-{
- return a * avxi(b);
-}
-__forceinline const avxi operator*(const int32_t a, const avxi &b)
-{
- return avxi(a) * b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator&(const avxi &a, const avxi &b)
-{
- return _mm256_and_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator&(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator&(const avxi &a, const int32_t b)
-{
- return a & avxi(b);
-}
-__forceinline const avxi operator&(const int32_t a, const avxi &b)
-{
- return avxi(a) & b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator|(const avxi &a, const avxi &b)
-{
- return _mm256_or_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator|(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator|(const avxi &a, const int32_t b)
-{
- return a | avxi(b);
-}
-__forceinline const avxi operator|(const int32_t a, const avxi &b)
-{
- return avxi(a) | b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator^(const avxi &a, const avxi &b)
-{
- return _mm256_xor_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator^(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator^(const avxi &a, const int32_t b)
-{
- return a ^ avxi(b);
-}
-__forceinline const avxi operator^(const int32_t a, const avxi &b)
-{
- return avxi(a) ^ b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator<<(const avxi &a, const int32_t n)
-{
- return _mm256_slli_epi32(a.m256, n);
-}
-__forceinline const avxi operator>>(const avxi &a, const int32_t n)
-{
- return _mm256_srai_epi32(a.m256, n);
-}
-
-__forceinline const avxi sra(const avxi &a, const int32_t b)
-{
- return _mm256_srai_epi32(a.m256, b);
-}
-__forceinline const avxi srl(const avxi &a, const int32_t b)
-{
- return _mm256_srli_epi32(a.m256, b);
-}
-#else
-__forceinline const avxi operator<<(const avxi &a, const int32_t n)
-{
- return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n));
-}
-__forceinline const avxi operator>>(const avxi &a, const int32_t n)
-{
- return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n));
-}
-
-__forceinline const avxi sra(const avxi &a, const int32_t b)
-{
- return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b));
-}
-__forceinline const avxi srl(const avxi &a, const int32_t b)
-{
- return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b));
-}
-#endif
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi min(const avxi &a, const avxi &b)
-{
- return _mm256_min_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi min(const avxi &a, const avxi &b)
-{
- return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi min(const avxi &a, const int32_t b)
-{
- return min(a, avxi(b));
-}
-__forceinline const avxi min(const int32_t a, const avxi &b)
-{
- return min(avxi(a), b);
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi max(const avxi &a, const avxi &b)
-{
- return _mm256_max_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi max(const avxi &a, const avxi &b)
-{
- return avxi(_mm_max_epi32(a.l, b.l), _mm_max_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi max(const avxi &a, const int32_t b)
-{
- return max(a, avxi(b));
-}
-__forceinline const avxi max(const int32_t a, const avxi &b)
-{
- return max(avxi(a), b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline avxi &operator+=(avxi &a, const avxi &b)
-{
- return a = a + b;
-}
-__forceinline avxi &operator+=(avxi &a, const int32_t b)
-{
- return a = a + b;
-}
-
-__forceinline avxi &operator-=(avxi &a, const avxi &b)
-{
- return a = a - b;
-}
-__forceinline avxi &operator-=(avxi &a, const int32_t b)
-{
- return a = a - b;
-}
-
-__forceinline avxi &operator*=(avxi &a, const avxi &b)
-{
- return a = a * b;
-}
-__forceinline avxi &operator*=(avxi &a, const int32_t b)
-{
- return a = a * b;
-}
-
-__forceinline avxi &operator&=(avxi &a, const avxi &b)
-{
- return a = a & b;
-}
-__forceinline avxi &operator&=(avxi &a, const int32_t b)
-{
- return a = a & b;
-}
-
-__forceinline avxi &operator|=(avxi &a, const avxi &b)
-{
- return a = a | b;
-}
-__forceinline avxi &operator|=(avxi &a, const int32_t b)
-{
- return a = a | b;
-}
-
-__forceinline avxi &operator^=(avxi &a, const avxi &b)
-{
- return a = a ^ b;
-}
-__forceinline avxi &operator^=(avxi &a, const int32_t b)
-{
- return a = a ^ b;
-}
-
-__forceinline avxi &operator<<=(avxi &a, const int32_t b)
-{
- return a = a << b;
-}
-__forceinline avxi &operator>>=(avxi &a, const int32_t b)
-{
- return a = a >> b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator==(const avxi &a, const avxi &b)
-{
- return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256));
-}
-#else
-__forceinline const avxb operator==(const avxi &a, const avxi &b)
-{
- return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)),
- _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator==(const avxi &a, const int32_t b)
-{
- return a == avxi(b);
-}
-__forceinline const avxb operator==(const int32_t a, const avxi &b)
-{
- return avxi(a) == b;
-}
-
-__forceinline const avxb operator!=(const avxi &a, const avxi &b)
-{
- return !(a == b);
-}
-__forceinline const avxb operator!=(const avxi &a, const int32_t b)
-{
- return a != avxi(b);
-}
-__forceinline const avxb operator!=(const int32_t a, const avxi &b)
-{
- return avxi(a) != b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator<(const avxi &a, const avxi &b)
-{
- return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256));
-}
-#else
-__forceinline const avxb operator<(const avxi &a, const avxi &b)
-{
- return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)),
- _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator<(const avxi &a, const int32_t b)
-{
- return a < avxi(b);
-}
-__forceinline const avxb operator<(const int32_t a, const avxi &b)
-{
- return avxi(a) < b;
-}
-
-__forceinline const avxb operator>=(const avxi &a, const avxi &b)
-{
- return !(a < b);
-}
-__forceinline const avxb operator>=(const avxi &a, const int32_t b)
-{
- return a >= avxi(b);
-}
-__forceinline const avxb operator>=(const int32_t a, const avxi &b)
-{
- return avxi(a) >= b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator>(const avxi &a, const avxi &b)
-{
- return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256));
-}
-#else
-__forceinline const avxb operator>(const avxi &a, const avxi &b)
-{
- return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)),
- _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator>(const avxi &a, const int32_t b)
-{
- return a > avxi(b);
-}
-__forceinline const avxb operator>(const int32_t a, const avxi &b)
-{
- return avxi(a) > b;
-}
-
-__forceinline const avxb operator<=(const avxi &a, const avxi &b)
-{
- return !(a > b);
-}
-__forceinline const avxb operator<=(const avxi &a, const int32_t b)
-{
- return a <= avxi(b);
-}
-__forceinline const avxb operator<=(const int32_t a, const avxi &b)
-{
- return avxi(a) <= b;
-}
-
-__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f)
-{
- return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline avxi unpacklo(const avxi &a, const avxi &b)
-{
- return _mm256_unpacklo_epi32(a.m256, b.m256);
-}
-__forceinline avxi unpackhi(const avxi &a, const avxi &b)
-{
- return _mm256_unpackhi_epi32(a.m256, b.m256);
-}
-#else
-__forceinline avxi unpacklo(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-__forceinline avxi unpackhi(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-
-template<size_t i> __forceinline const avxi shuffle(const avxi &a)
-{
- return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i)));
-}
-
-template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a)
-{
- return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0));
-}
-
-template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b)
-{
- return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxi shuffle(const avxi &a)
-{
- return _mm256_castps_si256(
- _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0)));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxi shuffle(const avxi &a, const avxi &b)
-{
- return _mm256_castps_si256(_mm256_shuffle_ps(
- _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-}
-
-template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b)
-{
- return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b)));
-}
-template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b)
-{
- return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b)));
-}
-template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b)
-{
- return _mm256_castps_si256(
- _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b)))));
-}
-
-__forceinline const avxi broadcast(const int *ptr)
-{
- return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr));
-}
-template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b)
-{
- return _mm256_insertf128_si256(a, b, i);
-}
-template<size_t i> __forceinline const ssei extract(const avxi &a)
-{
- return _mm256_extractf128_si256(a, i);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxi vreduce_min2(const avxi &v)
-{
- return min(v, shuffle<1, 0, 3, 2>(v));
-}
-__forceinline const avxi vreduce_min4(const avxi &v)
-{
- avxi v1 = vreduce_min2(v);
- return min(v1, shuffle<2, 3, 0, 1>(v1));
-}
-__forceinline const avxi vreduce_min(const avxi &v)
-{
- avxi v1 = vreduce_min4(v);
- return min(v1, shuffle<1, 0>(v1));
-}
-
-__forceinline const avxi vreduce_max2(const avxi &v)
-{
- return max(v, shuffle<1, 0, 3, 2>(v));
-}
-__forceinline const avxi vreduce_max4(const avxi &v)
-{
- avxi v1 = vreduce_max2(v);
- return max(v1, shuffle<2, 3, 0, 1>(v1));
-}
-__forceinline const avxi vreduce_max(const avxi &v)
-{
- avxi v1 = vreduce_max4(v);
- return max(v1, shuffle<1, 0>(v1));
-}
-
-__forceinline const avxi vreduce_add2(const avxi &v)
-{
- return v + shuffle<1, 0, 3, 2>(v);
-}
-__forceinline const avxi vreduce_add4(const avxi &v)
-{
- avxi v1 = vreduce_add2(v);
- return v1 + shuffle<2, 3, 0, 1>(v1);
-}
-__forceinline const avxi vreduce_add(const avxi &v)
-{
- avxi v1 = vreduce_add4(v);
- return v1 + shuffle<1, 0>(v1);
-}
-
-__forceinline int reduce_min(const avxi &v)
-{
- return extract<0>(extract<0>(vreduce_min(v)));
-}
-__forceinline int reduce_max(const avxi &v)
-{
- return extract<0>(extract<0>(vreduce_max(v)));
-}
-__forceinline int reduce_add(const avxi &v)
-{
- return extract<0>(extract<0>(vreduce_add(v)));
-}
-
-__forceinline uint32_t select_min(const avxi &v)
-{
- return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const avxi &v)
-{
- return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const avxb &valid, const avxi &v)
-{
- const avxi a = select(valid, v, avxi(pos_inf));
- return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const avxb &valid, const avxi &v)
-{
- const avxi a = select(valid, v, avxi(neg_inf));
- return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Output Operators
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_avxi(const char *label, const avxi &a)
-{
- printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/util/color.h b/intern/cycles/util/color.h
index 537f8ab6771..93e984120f2 100644
--- a/intern/cycles/util/color.h
+++ b/intern/cycles/util/color.h
@@ -228,28 +228,27 @@ ccl_device float3 xyY_to_xyz(float x, float y, float Y)
* exp = exponent, encoded as uint32_t
* e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
*/
-template<unsigned exp, unsigned e2coeff> ccl_device_inline ssef fastpow(const ssef &arg)
+template<unsigned exp, unsigned e2coeff> ccl_device_inline float4 fastpow(const float4 &arg)
{
- ssef ret;
- ret = arg * cast(ssei(e2coeff));
- ret = ssef(cast(ret));
- ret = ret * cast(ssei(exp));
- ret = cast(ssei(ret));
+ float4 ret = arg * cast(make_int4(e2coeff));
+ ret = make_float4(cast(ret));
+ ret = ret * cast(make_int4(exp));
+ ret = cast(make_int4(ret));
return ret;
}
/* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
-ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x)
+ccl_device_inline float4 improve_5throot_solution(const float4 &old_result, const float4 &x)
{
- ssef approx2 = old_result * old_result;
- ssef approx4 = approx2 * approx2;
- ssef t = x / approx4;
- ssef summ = madd(ssef(4.0f), old_result, t);
- return summ * ssef(1.0f / 5.0f);
+ float4 approx2 = old_result * old_result;
+ float4 approx4 = approx2 * approx2;
+ float4 t = x / approx4;
+ float4 summ = madd(make_float4(4.0f), old_result, t);
+ return summ * make_float4(1.0f / 5.0f);
}
/* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
-ccl_device_inline ssef fastpow24(const ssef &arg)
+ccl_device_inline float4 fastpow24(const float4 &arg)
{
/* max, avg and |avg| errors were calculated in gcc without FMA instructions
* The final precision should be better than powf in glibc */
@@ -257,9 +256,10 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
/* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */
/* 0x3F4CCCCD = 4/5 */
/* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
- ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
- ssef arg2 = arg * arg;
- ssef arg4 = arg2 * arg2;
+ float4 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(
+ arg); // error max = 0.17 avg = 0.0018 |avg| = 0.05
+ float4 arg2 = arg * arg;
+ float4 arg4 = arg2 * arg2;
/* error max = 0.018 avg = 0.0031 |avg| = 0.0031 */
x = improve_5throot_solution(x, arg4);
@@ -271,12 +271,12 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
return x * (x * x);
}
-ccl_device ssef color_srgb_to_linear(const ssef &c)
+ccl_device float4 color_srgb_to_linear(const float4 &c)
{
- sseb cmp = c < ssef(0.04045f);
- ssef lt = max(c * ssef(1.0f / 12.92f), ssef(0.0f));
- ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f / 1.055f); /* fma */
- ssef gte = fastpow24(gtebase);
+ int4 cmp = c < make_float4(0.04045f);
+ float4 lt = max(c * make_float4(1.0f / 12.92f), make_float4(0.0f));
+ float4 gtebase = (c + make_float4(0.055f)) * make_float4(1.0f / 1.055f); /* fma */
+ float4 gte = fastpow24(gtebase);
return select(cmp, lt, gte);
}
#endif /* __KERNEL_SSE2__ */
@@ -302,10 +302,8 @@ ccl_device float4 color_linear_to_srgb_v4(float4 c)
ccl_device float4 color_srgb_to_linear_v4(float4 c)
{
#ifdef __KERNEL_SSE2__
- ssef r_ssef;
- float4 &r = (float4 &)r_ssef;
- r = c;
- r_ssef = color_srgb_to_linear(r_ssef);
+ float4 r = c;
+ r = color_srgb_to_linear(r);
r.w = c.w;
return r;
#else
diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h
index c668638eb02..5665dd4c075 100644
--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -154,17 +154,17 @@ ccl_device_inline half float_to_half_display(const float f)
ccl_device_inline half4 float4_to_half4_display(const float4 f)
{
-#ifdef __KERNEL_SSE2__
+#ifdef __KERNEL_SSE__
/* CPU: SSE and AVX. */
- ssef x = min(max(load4f(f), 0.0f), 65504.0f);
+ float4 x = min(max(f, make_float4(0.0f)), make_float4(65504.0f));
# ifdef __KERNEL_AVX2__
- ssei rpack = _mm_cvtps_ph(x, 0);
+ int4 rpack = int4(_mm_cvtps_ph(x, 0));
# else
- ssei absolute = cast(x) & 0x7FFFFFFF;
- ssei Z = absolute + 0xC8000000;
- ssei result = andnot(absolute < 0x38800000, Z);
- ssei rshift = (result >> 13) & 0x7FFF;
- ssei rpack = _mm_packs_epi32(rshift, rshift);
+ int4 absolute = cast(x) & make_int4(0x7FFFFFFF);
+ int4 Z = absolute + make_int4(0xC8000000);
+ int4 result = andnot(absolute < make_int4(0x38800000), Z);
+ int4 rshift = (result >> 13) & make_int4(0x7FFF);
+ int4 rpack = int4(_mm_packs_epi32(rshift, rshift));
# endif
half4 h;
_mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack));
diff --git a/intern/cycles/util/hash.h b/intern/cycles/util/hash.h
index 4f83f331229..74210ff020e 100644
--- a/intern/cycles/util/hash.h
+++ b/intern/cycles/util/hash.h
@@ -222,7 +222,7 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)
/* SSE Versions Of Jenkins Lookup3 Hash Functions */
-#ifdef __KERNEL_SSE2__
+#ifdef __KERNEL_SSE__
# define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k))))
# define mix(a, b, c) \
@@ -265,10 +265,10 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)
c -= rot(b, 24); \
}
-ccl_device_inline ssei hash_ssei(ssei kx)
+ccl_device_inline int4 hash_int4(int4 kx)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (1 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (1 << 2) + 13);
a += kx;
final(a, b, c);
@@ -276,10 +276,10 @@ ccl_device_inline ssei hash_ssei(ssei kx)
return c;
}
-ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
+ccl_device_inline int4 hash_int4_2(int4 kx, int4 ky)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (2 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (2 << 2) + 13);
b += ky;
a += kx;
@@ -288,10 +288,10 @@ ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
return c;
}
-ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
+ccl_device_inline int4 hash_int4_3(int4 kx, int4 ky, int4 kz)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (3 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (3 << 2) + 13);
c += kz;
b += ky;
@@ -301,10 +301,10 @@ ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
return c;
}
-ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
+ccl_device_inline int4 hash_int4_4(int4 kx, int4 ky, int4 kz, int4 kw)
{
- ssei a, b, c;
- a = b = c = ssei(0xdeadbeef + (4 << 2) + 13);
+ int4 a, b, c;
+ a = b = c = make_int4(0xdeadbeef + (4 << 2) + 13);
a += kx;
b += ky;
@@ -317,11 +317,11 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
return c;
}
-# if defined(__KERNEL_AVX__)
-ccl_device_inline avxi hash_avxi(avxi kx)
+# if defined(__KERNEL_AVX2__)
+ccl_device_inline vint8 hash_int8(vint8 kx)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (1 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (1 << 2) + 13);
a += kx;
final(a, b, c);
@@ -329,10 +329,10 @@ ccl_device_inline avxi hash_avxi(avxi kx)
return c;
}
-ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
+ccl_device_inline vint8 hash_int8_2(vint8 kx, vint8 ky)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (2 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (2 << 2) + 13);
b += ky;
a += kx;
@@ -341,10 +341,10 @@ ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
return c;
}
-ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
+ccl_device_inline vint8 hash_int8_3(vint8 kx, vint8 ky, vint8 kz)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (3 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (3 << 2) + 13);
c += kz;
b += ky;
@@ -354,10 +354,10 @@ ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
return c;
}
-ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
+ccl_device_inline vint8 hash_int8_4(vint8 kx, vint8 ky, vint8 kz, vint8 kw)
{
- avxi a, b, c;
- a = b = c = avxi(0xdeadbeef + (4 << 2) + 13);
+ vint8 a, b, c;
+ a = b = c = make_vint8(0xdeadbeef + (4 << 2) + 13);
a += kx;
b += ky;
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 3a2e0e074a2..0fbe7a67a4f 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -532,12 +532,14 @@ CCL_NAMESPACE_END
#include "util/math_int2.h"
#include "util/math_int3.h"
#include "util/math_int4.h"
+#include "util/math_int8.h"
#include "util/math_float2.h"
-#include "util/math_float3.h"
#include "util/math_float4.h"
#include "util/math_float8.h"
+#include "util/math_float3.h"
+
#include "util/rect.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h
index 542dad93467..ad806d0f08a 100644
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -10,55 +10,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float2 operator-(const float2 &a);
-ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator*(const float2 &a, float f);
-ccl_device_inline float2 operator*(float f, const float2 &a);
-ccl_device_inline float2 operator/(float f, const float2 &a);
-ccl_device_inline float2 operator/(const float2 &a, float f);
-ccl_device_inline float2 operator/(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator+(const float2 &a, const float f);
-ccl_device_inline float2 operator+(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator-(const float2 &a, const float f);
-ccl_device_inline float2 operator-(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator+=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator*=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator*=(float2 &a, float f);
-ccl_device_inline float2 operator/=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator/=(float2 &a, float f);
-
-ccl_device_inline bool operator==(const float2 &a, const float2 &b);
-ccl_device_inline bool operator!=(const float2 &a, const float2 &b);
-
-ccl_device_inline bool is_zero(const float2 &a);
-ccl_device_inline float average(const float2 &a);
-ccl_device_inline float distance(const float2 &a, const float2 &b);
-ccl_device_inline float dot(const float2 &a, const float2 &b);
-ccl_device_inline float cross(const float2 &a, const float2 &b);
-ccl_device_inline float len(const float2 a);
-ccl_device_inline float2 normalize(const float2 &a);
-ccl_device_inline float2 normalize_len(const float2 &a, float *t);
-ccl_device_inline float2 safe_normalize(const float2 &a);
-ccl_device_inline float2 min(const float2 &a, const float2 &b);
-ccl_device_inline float2 max(const float2 &a, const float2 &b);
-ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx);
-ccl_device_inline float2 fabs(const float2 &a);
-ccl_device_inline float2 as_float2(const float4 &a);
-ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
-ccl_device_inline float2 floor(const float2 &a);
-#endif /* !__KERNEL_METAL__ */
-
-ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float2 zero_float2()
{
return make_float2(0.0f, 0.0f);
@@ -75,63 +26,63 @@ ccl_device_inline float2 operator-(const float2 &a)
return make_float2(-a.x, -a.y);
}
-ccl_device_inline float2 operator*(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator*(const float2 a, const float2 b)
{
return make_float2(a.x * b.x, a.y * b.y);
}
-ccl_device_inline float2 operator*(const float2 &a, float f)
+ccl_device_inline float2 operator*(const float2 a, float f)
{
return make_float2(a.x * f, a.y * f);
}
-ccl_device_inline float2 operator*(float f, const float2 &a)
+ccl_device_inline float2 operator*(float f, const float2 a)
{
return make_float2(a.x * f, a.y * f);
}
-ccl_device_inline float2 operator/(float f, const float2 &a)
+ccl_device_inline float2 operator/(float f, const float2 a)
{
return make_float2(f / a.x, f / a.y);
}
-ccl_device_inline float2 operator/(const float2 &a, float f)
+ccl_device_inline float2 operator/(const float2 a, float f)
{
float invf = 1.0f / f;
return make_float2(a.x * invf, a.y * invf);
}
-ccl_device_inline float2 operator/(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator/(const float2 a, const float2 b)
{
return make_float2(a.x / b.x, a.y / b.y);
}
-ccl_device_inline float2 operator+(const float2 &a, const float f)
+ccl_device_inline float2 operator+(const float2 a, const float2 b)
{
- return a + make_float2(f, f);
+ return make_float2(a.x + b.x, a.y + b.y);
}
-ccl_device_inline float2 operator+(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator+(const float2 a, const float f)
{
- return make_float2(a.x + b.x, a.y + b.y);
+ return a + make_float2(f, f);
}
-ccl_device_inline float2 operator-(const float2 &a, const float f)
+ccl_device_inline float2 operator-(const float2 a, const float2 b)
{
- return a - make_float2(f, f);
+ return make_float2(a.x - b.x, a.y - b.y);
}
-ccl_device_inline float2 operator-(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator-(const float2 a, const float f)
{
- return make_float2(a.x - b.x, a.y - b.y);
+ return a - make_float2(f, f);
}
-ccl_device_inline float2 operator+=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator+=(float2 &a, const float2 b)
{
return a = a + b;
}
-ccl_device_inline float2 operator*=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator*=(float2 &a, const float2 b)
{
return a = a * b;
}
@@ -141,7 +92,7 @@ ccl_device_inline float2 operator*=(float2 &a, float f)
return a = a * f;
}
-ccl_device_inline float2 operator/=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator/=(float2 &a, const float2 b)
{
return a = a / b;
}
@@ -152,74 +103,81 @@ ccl_device_inline float2 operator/=(float2 &a, float f)
return a = a * invf;
}
-ccl_device_inline bool operator==(const float2 &a, const float2 &b)
+ccl_device_inline bool operator==(const float2 a, const float2 b)
{
return (a.x == b.x && a.y == b.y);
}
-ccl_device_inline bool operator!=(const float2 &a, const float2 &b)
+ccl_device_inline bool operator!=(const float2 a, const float2 b)
{
return !(a == b);
}
-ccl_device_inline bool is_zero(const float2 &a)
+ccl_device_inline bool is_zero(const float2 a)
{
return (a.x == 0.0f && a.y == 0.0f);
}
-ccl_device_inline float average(const float2 &a)
+ccl_device_inline float average(const float2 a)
{
return (a.x + a.y) * (1.0f / 2.0f);
}
-ccl_device_inline float distance(const float2 &a, const float2 &b)
+ccl_device_inline float dot(const float2 a, const float2 b)
{
- return len(a - b);
+ return a.x * b.x + a.y * b.y;
}
+#endif
-ccl_device_inline float dot(const float2 &a, const float2 &b)
+ccl_device_inline float len(const float2 a)
{
- return a.x * b.x + a.y * b.y;
+ return sqrtf(dot(a, a));
}
-ccl_device_inline float cross(const float2 &a, const float2 &b)
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float distance(const float2 a, const float2 b)
+{
+ return len(a - b);
+}
+
+ccl_device_inline float cross(const float2 a, const float2 b)
{
return (a.x * b.y - a.y * b.x);
}
-ccl_device_inline float2 normalize(const float2 &a)
+ccl_device_inline float2 normalize(const float2 a)
{
return a / len(a);
}
-ccl_device_inline float2 normalize_len(const float2 &a, ccl_private float *t)
+ccl_device_inline float2 normalize_len(const float2 a, ccl_private float *t)
{
*t = len(a);
return a / (*t);
}
-ccl_device_inline float2 safe_normalize(const float2 &a)
+ccl_device_inline float2 safe_normalize(const float2 a)
{
float t = len(a);
return (t != 0.0f) ? a / t : a;
}
-ccl_device_inline float2 min(const float2 &a, const float2 &b)
+ccl_device_inline float2 min(const float2 a, const float2 b)
{
return make_float2(min(a.x, b.x), min(a.y, b.y));
}
-ccl_device_inline float2 max(const float2 &a, const float2 &b)
+ccl_device_inline float2 max(const float2 a, const float2 b)
{
return make_float2(max(a.x, b.x), max(a.y, b.y));
}
-ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx)
+ccl_device_inline float2 clamp(const float2 a, const float2 mn, const float2 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float2 fabs(const float2 &a)
+ccl_device_inline float2 fabs(const float2 a)
{
return make_float2(fabsf(a.x), fabsf(a.y));
}
@@ -229,28 +187,23 @@ ccl_device_inline float2 as_float2(const float4 &a)
return make_float2(a.x, a.y);
}
-ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t)
+ccl_device_inline float2 interp(const float2 a, const float2 b, float t)
{
return a + t * (b - a);
}
-ccl_device_inline float2 mix(const float2 &a, const float2 &b, float t)
+ccl_device_inline float2 mix(const float2 a, const float2 b, float t)
{
return a + t * (b - a);
}
-ccl_device_inline float2 floor(const float2 &a)
+ccl_device_inline float2 floor(const float2 a)
{
return make_float2(floorf(a.x), floorf(a.y));
}
#endif /* !__KERNEL_METAL__ */
-ccl_device_inline float len(const float2 a)
-{
- return sqrtf(dot(a, a));
-}
-
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float2();
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index eec7122b9dc..79ee86d9c82 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT3_H__
@@ -10,73 +11,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float3 operator-(const float3 &a);
-ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator*(const float3 &a, const float f);
-ccl_device_inline float3 operator*(const float f, const float3 &a);
-ccl_device_inline float3 operator/(const float f, const float3 &a);
-ccl_device_inline float3 operator/(const float3 &a, const float f);
-ccl_device_inline float3 operator/(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator+(const float3 &a, const float f);
-ccl_device_inline float3 operator+(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator-(const float3 &a, const float f);
-ccl_device_inline float3 operator-(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator+=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator-=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator*=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator*=(float3 &a, float f);
-ccl_device_inline float3 operator/=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator/=(float3 &a, float f);
-
-ccl_device_inline bool operator==(const float3 &a, const float3 &b);
-ccl_device_inline bool operator!=(const float3 &a, const float3 &b);
-
-ccl_device_inline float distance(const float3 &a, const float3 &b);
-ccl_device_inline float dot(const float3 &a, const float3 &b);
-ccl_device_inline float dot_xy(const float3 &a, const float3 &b);
-ccl_device_inline float3 cross(const float3 &a, const float3 &b);
-ccl_device_inline float3 normalize(const float3 &a);
-ccl_device_inline float3 min(const float3 &a, const float3 &b);
-ccl_device_inline float3 max(const float3 &a, const float3 &b);
-ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx);
-ccl_device_inline float3 fabs(const float3 &a);
-ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t);
-ccl_device_inline float3 rcp(const float3 &a);
-ccl_device_inline float3 sqrt(const float3 &a);
-ccl_device_inline float3 floor(const float3 &a);
-ccl_device_inline float3 ceil(const float3 &a);
-ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
-#endif /* !defined(__KERNEL_METAL__) */
-
-ccl_device_inline float reduce_min(float3 a);
-ccl_device_inline float reduce_max(float3 a);
-ccl_device_inline float len(const float3 a);
-ccl_device_inline float len_squared(const float3 a);
-
-ccl_device_inline float3 project(const float3 v, const float3 v_proj);
-
-ccl_device_inline float3 safe_normalize(const float3 a);
-ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_divide(const float3 a, const float3 b);
-ccl_device_inline float3 safe_divide(const float3 a, const float b);
-ccl_device_inline float3 interp(float3 a, float3 b, float t);
-ccl_device_inline float3 sqr(float3 a);
-
-ccl_device_inline bool is_zero(const float3 a);
-ccl_device_inline float reduce_add(const float3 a);
-ccl_device_inline float average(const float3 a);
-ccl_device_inline bool isequal(const float3 a, const float3 b);
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float3 zero_float3()
{
#ifdef __KERNEL_SSE__
@@ -109,7 +43,7 @@ ccl_device_inline float3 operator-(const float3 &a)
# endif
}
-ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator*(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, b.m128));
@@ -118,7 +52,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator*(const float3 &a, const float f)
+ccl_device_inline float3 operator*(const float3 a, const float f)
{
# ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
@@ -127,7 +61,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float f)
# endif
}
-ccl_device_inline float3 operator*(const float f, const float3 &a)
+ccl_device_inline float3 operator*(const float f, const float3 a)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
@@ -136,7 +70,7 @@ ccl_device_inline float3 operator*(const float f, const float3 &a)
# endif
}
-ccl_device_inline float3 operator/(const float f, const float3 &a)
+ccl_device_inline float3 operator/(const float f, const float3 a)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
@@ -145,7 +79,7 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)
# endif
}
-ccl_device_inline float3 operator/(const float3 &a, const float f)
+ccl_device_inline float3 operator/(const float3 a, const float f)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
@@ -154,7 +88,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
# endif
}
-ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator/(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, b.m128));
@@ -163,12 +97,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator+(const float3 &a, const float f)
-{
- return a + make_float3(f, f, f);
-}
-
-ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator+(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_add_ps(a.m128, b.m128));
@@ -177,12 +106,12 @@ ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator-(const float3 &a, const float f)
+ccl_device_inline float3 operator+(const float3 a, const float f)
{
- return a - make_float3(f, f, f);
+ return a + make_float3(f, f, f);
}
-ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator-(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_sub_ps(a.m128, b.m128));
@@ -191,17 +120,22 @@ ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator-(const float3 a, const float f)
+{
+ return a - make_float3(f, f, f);
+}
+
+ccl_device_inline float3 operator+=(float3 &a, const float3 b)
{
return a = a + b;
}
-ccl_device_inline float3 operator-=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator-=(float3 &a, const float3 b)
{
return a = a - b;
}
-ccl_device_inline float3 operator*=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator*=(float3 &a, const float3 b)
{
return a = a * b;
}
@@ -211,7 +145,7 @@ ccl_device_inline float3 operator*=(float3 &a, float f)
return a = a * f;
}
-ccl_device_inline float3 operator/=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator/=(float3 &a, const float3 b)
{
return a = a / b;
}
@@ -223,7 +157,7 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
}
# if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__))
-ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b)
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 b)
{
a = float3(a) * b;
return a;
@@ -235,7 +169,7 @@ ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f)
return a;
}
-ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b)
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 b)
{
a = float3(a) / b;
return a;
@@ -248,7 +182,7 @@ ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)
}
# endif
-ccl_device_inline bool operator==(const float3 &a, const float3 &b)
+ccl_device_inline bool operator==(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
@@ -257,17 +191,12 @@ ccl_device_inline bool operator==(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
+ccl_device_inline bool operator!=(const float3 a, const float3 b)
{
return !(a == b);
}
-ccl_device_inline float distance(const float3 &a, const float3 &b)
-{
- return len(a - b);
-}
-
-ccl_device_inline float dot(const float3 &a, const float3 &b)
+ccl_device_inline float dot(const float3 a, const float3 b)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
@@ -276,26 +205,62 @@ ccl_device_inline float dot(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
+#endif
+
+ccl_device_inline float dot_xy(const float3 a, const float3 b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-# else
+#else
return a.x * b.x + a.y * b.y;
-# endif
+#endif
+}
+
+ccl_device_inline float len(const float3 a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
+#else
+ return sqrtf(dot(a, a));
+#endif
+}
+
+ccl_device_inline float reduce_min(float3 a)
+{
+ return min(min(a.x, a.y), a.z);
+}
+
+ccl_device_inline float reduce_max(float3 a)
+{
+ return max(max(a.x, a.y), a.z);
+}
+
+ccl_device_inline float len_squared(const float3 a)
+{
+ return dot(a, a);
+}
+
+#ifndef __KERNEL_METAL__
+
+ccl_device_inline float distance(const float3 a, const float3 b)
+{
+ return len(a - b);
}
-ccl_device_inline float3 cross(const float3 &a, const float3 &b)
+ccl_device_inline float3 cross(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
- return float3(shuffle<1, 2, 0, 3>(
- msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+ const float4 x = float4(a.m128);
+ const float4 y = shuffle<1, 2, 0, 3>(float4(b.m128));
+ const float4 z = float4(_mm_mul_ps(shuffle<1, 2, 0, 3>(float4(a.m128)), float4(b.m128)));
+
+ return float3(shuffle<1, 2, 0, 3>(msub(x, y, z)).m128);
# else
return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
# endif
}
-ccl_device_inline float3 normalize(const float3 &a)
+ccl_device_inline float3 normalize(const float3 a)
{
# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
@@ -305,7 +270,7 @@ ccl_device_inline float3 normalize(const float3 &a)
# endif
}
-ccl_device_inline float3 min(const float3 &a, const float3 &b)
+ccl_device_inline float3 min(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_min_ps(a.m128, b.m128));
@@ -314,7 +279,7 @@ ccl_device_inline float3 min(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 max(const float3 &a, const float3 &b)
+ccl_device_inline float3 max(const float3 a, const float3 b)
{
# ifdef __KERNEL_SSE__
return float3(_mm_max_ps(a.m128, b.m128));
@@ -323,12 +288,12 @@ ccl_device_inline float3 max(const float3 &a, const float3 &b)
# endif
}
-ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
+ccl_device_inline float3 clamp(const float3 a, const float3 mn, const float3 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float3 fabs(const float3 &a)
+ccl_device_inline float3 fabs(const float3 a)
{
# ifdef __KERNEL_SSE__
# ifdef __KERNEL_NEON__
@@ -342,7 +307,7 @@ ccl_device_inline float3 fabs(const float3 &a)
# endif
}
-ccl_device_inline float3 sqrt(const float3 &a)
+ccl_device_inline float3 sqrt(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_sqrt_ps(a));
@@ -351,7 +316,7 @@ ccl_device_inline float3 sqrt(const float3 &a)
# endif
}
-ccl_device_inline float3 floor(const float3 &a)
+ccl_device_inline float3 floor(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_floor_ps(a));
@@ -360,7 +325,7 @@ ccl_device_inline float3 floor(const float3 &a)
# endif
}
-ccl_device_inline float3 ceil(const float3 &a)
+ccl_device_inline float3 ceil(const float3 a)
{
# ifdef __KERNEL_SSE__
return float3(_mm_ceil_ps(a));
@@ -369,12 +334,12 @@ ccl_device_inline float3 ceil(const float3 &a)
# endif
}
-ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
+ccl_device_inline float3 mix(const float3 a, const float3 b, float t)
{
return a + t * (b - a);
}
-ccl_device_inline float3 rcp(const float3 &a)
+ccl_device_inline float3 rcp(const float3 a)
{
# ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
@@ -399,33 +364,6 @@ ccl_device_inline float3 log(float3 v)
return make_float3(logf(v.x), logf(v.y), logf(v.z));
}
-#endif /* !__KERNEL_METAL__ */
-
-ccl_device_inline float reduce_min(float3 a)
-{
- return min(min(a.x, a.y), a.z);
-}
-
-ccl_device_inline float reduce_max(float3 a)
-{
- return max(max(a.x, a.y), a.z);
-}
-
-ccl_device_inline float len(const float3 a)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
-#else
- return sqrtf(dot(a, a));
-#endif
-}
-
-ccl_device_inline float len_squared(const float3 a)
-{
- return dot(a, a);
-}
-
-#if !defined(__KERNEL_METAL__)
ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
{
float3 unit_normal = normalize(normal);
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index c2721873037..301d2d789c0 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT4_H__
@@ -10,85 +11,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float4 operator-(const float4 &a);
-ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator*(const float4 &a, float f);
-ccl_device_inline float4 operator*(float f, const float4 &a);
-ccl_device_inline float4 operator/(const float4 &a, float f);
-ccl_device_inline float4 operator/(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+(const float4 &a, const float f);
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator-(const float4 &a, const float f);
-ccl_device_inline float4 operator-(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, float f);
-ccl_device_inline float4 operator/=(float4 &a, float f);
-
-ccl_device_inline int4 operator<(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b);
-ccl_device_inline bool operator==(const float4 &a, const float4 &b);
-
-ccl_device_inline float distance(const float4 &a, const float4 &b);
-ccl_device_inline float dot(const float4 &a, const float4 &b);
-ccl_device_inline float len_squared(const float4 &a);
-ccl_device_inline float4 rcp(const float4 &a);
-ccl_device_inline float4 sqrt(const float4 &a);
-ccl_device_inline float4 sqr(const float4 &a);
-ccl_device_inline float4 cross(const float4 &a, const float4 &b);
-ccl_device_inline bool is_zero(const float4 &a);
-ccl_device_inline float average(const float4 &a);
-ccl_device_inline float len(const float4 &a);
-ccl_device_inline float4 normalize(const float4 &a);
-ccl_device_inline float4 safe_normalize(const float4 &a);
-ccl_device_inline float4 min(const float4 &a, const float4 &b);
-ccl_device_inline float4 max(const float4 &a, const float4 &b);
-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx);
-ccl_device_inline float4 fabs(const float4 &a);
-ccl_device_inline float4 floor(const float4 &a);
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
-#endif /* !__KERNEL_METAL__*/
-
-ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
-ccl_device_inline float4 safe_divide(const float4 a, const float b);
-
-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b);
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b);
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b);
-
-# ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b);
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
-# endif
-#endif /* __KERNEL_SSE__ */
-
-ccl_device_inline float reduce_min(const float4 a);
-ccl_device_inline float reduce_max(const float4 a);
-ccl_device_inline float reduce_add(const float4 a);
-
-ccl_device_inline bool isequal(const float4 a, const float4 b);
-
-#ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-#endif /* !__KERNEL_GPU__ */
-
-/*******************************************************************************
- * Definition.
- */
-
ccl_device_inline float4 zero_float4()
{
#ifdef __KERNEL_SSE__
@@ -103,6 +25,16 @@ ccl_device_inline float4 one_float4()
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
}
+ccl_device_inline int4 cast(const float4 a)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_castps_si128(a));
+#else
+ return make_int4(
+ __float_as_int(a.x), __float_as_int(a.y), __float_as_int(a.z), __float_as_int(a.w));
+#endif
+}
+
#if !defined(__KERNEL_METAL__)
ccl_device_inline float4 operator-(const float4 &a)
{
@@ -114,7 +46,7 @@ ccl_device_inline float4 operator-(const float4 &a)
# endif
}
-ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator*(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_mul_ps(a.m128, b.m128));
@@ -123,7 +55,7 @@ ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator*(const float4 &a, float f)
+ccl_device_inline float4 operator*(const float4 a, float f)
{
# if defined(__KERNEL_SSE__)
return a * make_float4(f);
@@ -132,17 +64,17 @@ ccl_device_inline float4 operator*(const float4 &a, float f)
# endif
}
-ccl_device_inline float4 operator*(float f, const float4 &a)
+ccl_device_inline float4 operator*(float f, const float4 a)
{
return a * f;
}
-ccl_device_inline float4 operator/(const float4 &a, float f)
+ccl_device_inline float4 operator/(const float4 a, float f)
{
return a * (1.0f / f);
}
-ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator/(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_div_ps(a.m128, b.m128));
@@ -151,12 +83,7 @@ ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator+(const float4 &a, const float f)
-{
- return a + make_float4(f, f, f, f);
-}
-
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator+(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_add_ps(a.m128, b.m128));
@@ -165,12 +92,12 @@ ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator-(const float4 &a, const float f)
+ccl_device_inline float4 operator+(const float4 a, const float f)
{
- return a - make_float4(f, f, f, f);
+ return a + make_float4(f);
}
-ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator-(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return float4(_mm_sub_ps(a.m128, b.m128));
@@ -179,17 +106,22 @@ ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator-(const float4 a, const float f)
+{
+ return a - make_float4(f);
+}
+
+ccl_device_inline float4 operator+=(float4 &a, const float4 b)
{
return a = a + b;
}
-ccl_device_inline float4 operator-=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator-=(float4 &a, const float4 b)
{
return a = a - b;
}
-ccl_device_inline float4 operator*=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator*=(float4 &a, const float4 b)
{
return a = a * b;
}
@@ -204,7 +136,7 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
return a = a / f;
}
-ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
@@ -213,7 +145,7 @@ ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator>=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
@@ -222,7 +154,7 @@ ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<=(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
@@ -231,7 +163,7 @@ ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline bool operator==(const float4 &a, const float4 &b)
+ccl_device_inline bool operator==(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@@ -240,160 +172,148 @@ ccl_device_inline bool operator==(const float4 &a, const float4 &b)
# endif
}
-ccl_device_inline float distance(const float4 &a, const float4 &b)
-{
- return len(a - b);
-}
-
-ccl_device_inline float dot(const float4 &a, const float4 &b)
+ccl_device_inline const float4 operator^(const float4 a, const float4 b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- __m128 t = vmulq_f32(a, b);
- return vaddvq_f32(t);
-# else
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-# endif
+# ifdef __KERNEL_SSE__
+ return float4(_mm_xor_ps(a.m128, b.m128));
# else
- return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+ return make_float4(__uint_as_float(__float_as_uint(a.x) ^ __float_as_uint(b.x)),
+ __uint_as_float(__float_as_uint(a.y) ^ __float_as_uint(b.y)),
+ __uint_as_float(__float_as_uint(a.z) ^ __float_as_uint(b.z)),
+ __uint_as_float(__float_as_uint(a.w) ^ __float_as_uint(b.w)));
# endif
}
-ccl_device_inline float len_squared(const float4 &a)
-{
- return dot(a, a);
-}
-
-ccl_device_inline float4 rcp(const float4 &a)
+ccl_device_inline float4 min(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
- /* Don't use _mm_rcp_ps due to poor precision. */
- return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
+ return float4(_mm_min_ps(a.m128, b.m128));
# else
- return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
+ return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
# endif
}
-ccl_device_inline float4 sqrt(const float4 &a)
+ccl_device_inline float4 max(const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
- return float4(_mm_sqrt_ps(a.m128));
+ return float4(_mm_max_ps(a.m128, b.m128));
# else
- return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+ return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
# endif
}
-ccl_device_inline float4 sqr(const float4 &a)
+ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx)
{
- return a * a;
+ return min(max(a, mn), mx);
}
+#endif /* !__KERNEL_METAL__*/
-ccl_device_inline float4 cross(const float4 &a, const float4 &b)
+ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c)
{
-# ifdef __KERNEL_SSE__
- return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
- (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float4(vfmaq_f32(c, a, b));
+# elif defined(__KERNEL_AVX2__)
+ return float4(_mm_fmadd_ps(a, b, c));
# else
- return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
+ return a * b + c;
# endif
+#else
+ return a * b + c;
+#endif
}
-ccl_device_inline bool is_zero(const float4 &a)
+ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c)
{
-# ifdef __KERNEL_SSE__
- return a == zero_float4();
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
+ return float4(vfmaq_f32(vnegq_f32(c), a, b));
+# elif defined(__KERNEL_AVX2__)
+ return float4(_mm_fmsub_ps(a, b, c));
# else
- return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+ return a * b - c;
# endif
+#else
+ return a * b - c;
+#endif
}
-ccl_device_inline float average(const float4 &a)
+#ifdef __KERNEL_SSE__
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 b)
{
- return reduce_add(a) * 0.25f;
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128));
+# else
+ return float4(
+ _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+# endif
}
-ccl_device_inline float len(const float4 &a)
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a)
{
- return sqrtf(dot(a, a));
+ return float4(_mm_movelh_ps(a, a));
}
-ccl_device_inline float4 normalize(const float4 &a)
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a)
{
- return a / len(a);
+ return float4(_mm_movehl_ps(a, a));
}
-ccl_device_inline float4 safe_normalize(const float4 &a)
+# ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b)
{
- float t = len(a);
- return (t != 0.0f) ? a / t : a;
+ return float4(_mm_moveldup_ps(b));
}
-ccl_device_inline float4 min(const float4 &a, const float4 &b)
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b)
{
-# ifdef __KERNEL_SSE__
- return float4(_mm_min_ps(a.m128, b.m128));
-# else
- return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-# endif
+ return float4(_mm_movehdup_ps(b));
}
+# endif /* __KERNEL_SSE3__ */
-ccl_device_inline float4 max(const float4 &a, const float4 &b)
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 a, const float4 b)
{
-# ifdef __KERNEL_SSE__
- return float4(_mm_max_ps(a.m128, b.m128));
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
# else
- return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+ return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
# endif
}
-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
+template<size_t i0> __forceinline const float4 shuffle(const float4 b)
{
- return min(max(a, mn), mx);
+ return shuffle<i0, i0, i0, i0>(b);
}
-
-ccl_device_inline float4 fabs(const float4 &a)
+template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
- return float4(vabsq_f32(a));
-# else
- return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-# endif
-# else
- return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-# endif
-}
-
-ccl_device_inline float4 floor(const float4 &a)
-{
-# ifdef __KERNEL_SSE__
- return float4(_mm_floor_ps(a));
+# ifdef __KERNEL_NEON__
+ return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b));
# else
- return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+ return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)));
# endif
}
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b)
{
- return a + t * (b - a);
+ return float4(_mm_movelh_ps(a, b));
}
-ccl_device_inline float4 saturate(const float4 &a)
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b)
{
- return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+ return float4(_mm_movehl_ps(b, a));
}
-ccl_device_inline float4 exp(float4 v)
+template<size_t i> __forceinline float extract(const float4 a)
{
- return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
+ return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
}
-
-ccl_device_inline float4 log(float4 v)
+template<> __forceinline float extract<0>(const float4 a)
{
- return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
+ return _mm_cvtss_f32(a);
}
-
-#endif /* !__KERNEL_METAL__*/
+#endif
ccl_device_inline float reduce_add(const float4 a)
{
@@ -440,77 +360,192 @@ ccl_device_inline float reduce_max(const float4 a)
#endif
}
-ccl_device_inline bool isequal(const float4 a, const float4 b)
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float dot(const float4 a, const float4 b)
{
-#if defined(__KERNEL_METAL__)
- return all(a == b);
-#else
- return a == b;
-#endif
+# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ __m128 t = vmulq_f32(a, b);
+ return vaddvq_f32(t);
+# else
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
+# endif
+# else
+ return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+# endif
}
+#endif /* !defined(__KERNEL_METAL__) */
-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b)
+ccl_device_inline float len(const float4 a)
{
-# if defined(__KERNEL_NEON__)
- return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
+ return sqrtf(dot(a, a));
+}
+
+ccl_device_inline float len_squared(const float4 a)
+{
+ return dot(a, a);
+}
+
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float distance(const float4 a, const float4 b)
+{
+ return len(a - b);
+}
+
+ccl_device_inline float4 rcp(const float4 a)
+{
+# ifdef __KERNEL_SSE__
+ /* Don't use _mm_rcp_ps due to poor precision. */
+ return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
# else
- return float4(_mm_castsi128_ps(
- _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
+ return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
# endif
}
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b)
+ccl_device_inline float4 sqrt(const float4 a)
{
-# if defined(__KERNEL_NEON__)
- return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
+# ifdef __KERNEL_SSE__
+ return float4(_mm_sqrt_ps(a.m128));
# else
- return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+ return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
# endif
}
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
+ccl_device_inline float4 sqr(const float4 a)
{
- return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
+ return a * a;
}
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b)
+ccl_device_inline float4 cross(const float4 a, const float4 b)
{
- return float4(_mm_movelh_ps(a.m128, b.m128));
+# ifdef __KERNEL_SSE__
+ return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
+ (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
+# else
+ return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
+# endif
}
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b)
+ccl_device_inline bool is_zero(const float4 a)
{
- return float4(_mm_movehl_ps(b.m128, a.m128));
+# ifdef __KERNEL_SSE__
+ return a == zero_float4();
+# else
+ return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+# endif
}
-# ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b)
+ccl_device_inline float average(const float4 a)
{
- return float4(_mm_moveldup_ps(b));
+ return reduce_add(a) * 0.25f;
}
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b)
+ccl_device_inline float4 normalize(const float4 a)
{
- return float4(_mm_movehdup_ps(b));
+ return a / len(a);
+}
+
+ccl_device_inline float4 safe_normalize(const float4 a)
+{
+ float t = len(a);
+ return (t != 0.0f) ? a / t : a;
+}
+
+ccl_device_inline float4 fabs(const float4 a)
+{
+# if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
+ return float4(vabsq_f32(a));
+# else
+ return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+# endif
+# else
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+# endif
+}
+
+ccl_device_inline float4 floor(const float4 a)
+{
+# ifdef __KERNEL_SSE__
+# if defined(__KERNEL_NEON__)
+ return float4(vrndmq_f32(a));
+# else
+ return float4(_mm_floor_ps(a));
+# endif
+# else
+ return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+# endif
+}
+
+ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i)
+{
+# ifdef __KERNEL_SSE__
+ const float4 f = floor(x);
+ *i = int4(_mm_cvttps_epi32(f.m128));
+ return x - f;
+# else
+ float4 r;
+ r.x = floorfrac(x.x, &i->x);
+ r.y = floorfrac(x.y, &i->y);
+ r.z = floorfrac(x.z, &i->z);
+ r.w = floorfrac(x.w, &i->w);
+ return r;
+# endif
+}
+
+ccl_device_inline float4 mix(const float4 a, const float4 b, float t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline float4 mix(const float4 a, const float4 b, const float4 t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline float4 saturate(const float4 a)
+{
+ return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+}
+
+ccl_device_inline float4 exp(float4 v)
+{
+ return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.w));
+}
+
+ccl_device_inline float4 log(float4 v)
+{
+ return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.w));
+}
+
+#endif /* !__KERNEL_METAL__*/
+
+ccl_device_inline bool isequal(const float4 a, const float4 b)
+{
+#if defined(__KERNEL_METAL__)
+ return all(a == b);
+#else
+ return a == b;
+#endif
}
-# endif /* __KERNEL_SSE3__ */
-#endif /* __KERNEL_SSE__ */
#ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b)
+ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b)
{
# ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE41__
return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
+# else
+ return float4(
+ _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask), b)));
+# endif
# else
return make_float4(
(mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w);
# endif
}
-ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
+ccl_device_inline float4 mask(const int4 mask, const float4 a)
{
/* Replace elements of x with zero where mask isn't set. */
return select(mask, a, zero_float4());
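Editor's note (illustrative, not part of the patch): a minimal sketch of how the new pass-by-value float4 helpers above might be called on a CPU (non-GPU) build, assuming the Cycles util headers (util/types.h, util/math.h) are included; lerp_masked_example is a hypothetical helper, not an existing Cycles function.

ccl_device_inline float4 lerp_masked_example(const float4 a, const float4 b, const int4 m, const float t)
{
  /* madd() lowers to a fused multiply-add on AVX2/NEON and to a * b + c elsewhere. */
  const float4 blended = madd(b - a, make_float4(t), a);
  /* select() keeps lanes of `blended` where the mask is set and lanes of `a` otherwise. */
  return select(m, blended, a);
}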
diff --git a/intern/cycles/util/math_float8.h b/intern/cycles/util/math_float8.h
index b538cfbe70b..755a720a10b 100644
--- a/intern/cycles/util/math_float8.h
+++ b/intern/cycles/util/math_float8.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2022 Blender Foundation */
#ifndef __UTIL_MATH_FLOAT8_H__
@@ -10,193 +11,138 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-ccl_device_inline float8_t operator+(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator+(const float8_t a, const float f);
-ccl_device_inline float8_t operator+(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator-(const float8_t a);
-ccl_device_inline float8_t operator-(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator-(const float8_t a, const float f);
-ccl_device_inline float8_t operator-(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator*(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator*(const float8_t a, const float f);
-ccl_device_inline float8_t operator*(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator/(const float8_t a, const float8_t b);
-ccl_device_inline float8_t operator/(const float8_t a, float f);
-ccl_device_inline float8_t operator/(const float f, const float8_t a);
-
-ccl_device_inline float8_t operator+=(float8_t a, const float8_t b);
-
-ccl_device_inline float8_t operator*=(float8_t a, const float8_t b);
-ccl_device_inline float8_t operator*=(float8_t a, float f);
-
-ccl_device_inline float8_t operator/=(float8_t a, float f);
-
-ccl_device_inline bool operator==(const float8_t a, const float8_t b);
-
-ccl_device_inline float8_t rcp(const float8_t a);
-ccl_device_inline float8_t sqrt(const float8_t a);
-ccl_device_inline float8_t sqr(const float8_t a);
-ccl_device_inline bool is_zero(const float8_t a);
-ccl_device_inline float average(const float8_t a);
-ccl_device_inline float8_t min(const float8_t a, const float8_t b);
-ccl_device_inline float8_t max(const float8_t a, const float8_t b);
-ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx);
-ccl_device_inline float8_t fabs(const float8_t a);
-ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t);
-ccl_device_inline float8_t saturate(const float8_t a);
-
-ccl_device_inline float8_t safe_divide(const float8_t a, const float b);
-ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b);
-
-ccl_device_inline float reduce_min(const float8_t a);
-ccl_device_inline float reduce_max(const float8_t a);
-ccl_device_inline float reduce_add(const float8_t a);
-
-ccl_device_inline bool isequal(const float8_t a, const float8_t b);
-
-/*******************************************************************************
- * Definition.
- */
-
-ccl_device_inline float8_t zero_float8_t()
+ccl_device_inline vfloat8 zero_vfloat8()
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_setzero_ps());
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_setzero_ps());
#else
- return make_float8_t(0.0f);
+ return make_vfloat8(0.0f);
#endif
}
-ccl_device_inline float8_t one_float8_t()
+ccl_device_inline vfloat8 one_vfloat8()
{
- return make_float8_t(1.0f);
+ return make_vfloat8(1.0f);
}
-ccl_device_inline float8_t operator+(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator+(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_add_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_add_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
#endif
}
-ccl_device_inline float8_t operator+(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator+(const vfloat8 a, const float f)
{
- return a + make_float8_t(f);
+ return a + make_vfloat8(f);
}
-ccl_device_inline float8_t operator+(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator+(const float f, const vfloat8 a)
{
- return make_float8_t(f) + a;
+ return make_vfloat8(f) + a;
}
-ccl_device_inline float8_t operator-(const float8_t a)
+ccl_device_inline vfloat8 operator-(const vfloat8 a)
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
__m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
- return float8_t(_mm256_xor_ps(a.m256, mask));
+ return vfloat8(_mm256_xor_ps(a.m256, mask));
#else
- return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
+ return make_vfloat8(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
#endif
}
-ccl_device_inline float8_t operator-(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator-(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_sub_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_sub_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
#endif
}
-ccl_device_inline float8_t operator-(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator-(const vfloat8 a, const float f)
{
- return a - make_float8_t(f);
+ return a - make_vfloat8(f);
}
-ccl_device_inline float8_t operator-(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator-(const float f, const vfloat8 a)
{
- return make_float8_t(f) - a;
+ return make_vfloat8(f) - a;
}
-ccl_device_inline float8_t operator*(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator*(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_mul_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_mul_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
#endif
}
-ccl_device_inline float8_t operator*(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator*(const vfloat8 a, const float f)
{
- return a * make_float8_t(f);
+ return a * make_vfloat8(f);
}
-ccl_device_inline float8_t operator*(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator*(const float f, const vfloat8 a)
{
- return make_float8_t(f) * a;
+ return make_vfloat8(f) * a;
}
-ccl_device_inline float8_t operator/(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator/(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_div_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_div_ps(a.m256, b.m256));
#else
- return make_float8_t(
+ return make_vfloat8(
a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
#endif
}
-ccl_device_inline float8_t operator/(const float8_t a, const float f)
+ccl_device_inline vfloat8 operator/(const vfloat8 a, const float f)
{
- return a / make_float8_t(f);
+ return a / make_vfloat8(f);
}
-ccl_device_inline float8_t operator/(const float f, const float8_t a)
+ccl_device_inline vfloat8 operator/(const float f, const vfloat8 a)
{
- return make_float8_t(f) / a;
+ return make_vfloat8(f) / a;
}
-ccl_device_inline float8_t operator+=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator+=(vfloat8 a, const vfloat8 b)
{
return a = a + b;
}
-ccl_device_inline float8_t operator-=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator-=(vfloat8 a, const vfloat8 b)
{
return a = a - b;
}
-ccl_device_inline float8_t operator*=(float8_t a, const float8_t b)
+ccl_device_inline vfloat8 operator*=(vfloat8 a, const vfloat8 b)
{
return a = a * b;
}
-ccl_device_inline float8_t operator*=(float8_t a, float f)
+ccl_device_inline vfloat8 operator*=(vfloat8 a, float f)
{
return a = a * f;
}
-ccl_device_inline float8_t operator/=(float8_t a, float f)
+ccl_device_inline vfloat8 operator/=(vfloat8 a, float f)
{
return a = a / f;
}
-ccl_device_inline bool operator==(const float8_t a, const float8_t b)
+ccl_device_inline bool operator==(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
return (_mm256_movemask_ps(_mm256_castsi256_ps(
_mm256_cmpeq_epi32(_mm256_castps_si256(a.m256), _mm256_castps_si256(b.m256)))) &
0b11111111) == 0b11111111;
@@ -206,132 +152,180 @@ ccl_device_inline bool operator==(const float8_t a, const float8_t b)
#endif
}
-ccl_device_inline float8_t rcp(const float8_t a)
+ccl_device_inline const vfloat8 operator^(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_rcp_ps(a.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_xor_ps(a.m256, b.m256));
#else
- return make_float8_t(1.0f / a.a,
- 1.0f / a.b,
- 1.0f / a.c,
- 1.0f / a.d,
- 1.0f / a.e,
- 1.0f / a.f,
- 1.0f / a.g,
- 1.0f / a.h);
+ return make_vfloat8(__uint_as_float(__float_as_uint(a.a) ^ __float_as_uint(b.a)),
+ __uint_as_float(__float_as_uint(a.b) ^ __float_as_uint(b.b)),
+ __uint_as_float(__float_as_uint(a.c) ^ __float_as_uint(b.c)),
+ __uint_as_float(__float_as_uint(a.d) ^ __float_as_uint(b.d)),
+ __uint_as_float(__float_as_uint(a.e) ^ __float_as_uint(b.e)),
+ __uint_as_float(__float_as_uint(a.f) ^ __float_as_uint(b.f)),
+ __uint_as_float(__float_as_uint(a.g) ^ __float_as_uint(b.g)),
+ __uint_as_float(__float_as_uint(a.h) ^ __float_as_uint(b.h)));
#endif
}
-ccl_device_inline float8_t sqrt(const float8_t a)
+ccl_device_inline vfloat8 rcp(const vfloat8 a)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_sqrt_ps(a.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_rcp_ps(a.m256));
#else
- return make_float8_t(sqrtf(a.a),
- sqrtf(a.b),
- sqrtf(a.c),
- sqrtf(a.d),
- sqrtf(a.e),
- sqrtf(a.f),
- sqrtf(a.g),
- sqrtf(a.h));
+ return make_vfloat8(1.0f / a.a,
+ 1.0f / a.b,
+ 1.0f / a.c,
+ 1.0f / a.d,
+ 1.0f / a.e,
+ 1.0f / a.f,
+ 1.0f / a.g,
+ 1.0f / a.h);
#endif
}
-ccl_device_inline float8_t sqr(const float8_t a)
+ccl_device_inline vfloat8 sqrt(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_sqrt_ps(a.m256));
+#else
+ return make_vfloat8(sqrtf(a.a),
+ sqrtf(a.b),
+ sqrtf(a.c),
+ sqrtf(a.d),
+ sqrtf(a.e),
+ sqrtf(a.f),
+ sqrtf(a.g),
+ sqrtf(a.h));
+#endif
+}
+
+ccl_device_inline vfloat8 sqr(const vfloat8 a)
{
return a * a;
}
-ccl_device_inline bool is_zero(const float8_t a)
+ccl_device_inline bool is_zero(const vfloat8 a)
{
- return a == make_float8_t(0.0f);
+ return a == make_vfloat8(0.0f);
}
-ccl_device_inline float average(const float8_t a)
+ccl_device_inline float reduce_add(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ vfloat8 b(_mm256_hadd_ps(a.m256, a.m256));
+ vfloat8 h(_mm256_hadd_ps(b.m256, b.m256));
+ return h[0] + h[4];
+#else
+ return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
+#endif
+}
+
+ccl_device_inline float average(const vfloat8 a)
{
return reduce_add(a) / 8.0f;
}
-ccl_device_inline float8_t min(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 min(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_min_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_min_ps(a.m256, b.m256));
#else
- return make_float8_t(min(a.a, b.a),
- min(a.b, b.b),
- min(a.c, b.c),
- min(a.d, b.d),
- min(a.e, b.e),
- min(a.f, b.f),
- min(a.g, b.g),
- min(a.h, b.h));
+ return make_vfloat8(min(a.a, b.a),
+ min(a.b, b.b),
+ min(a.c, b.c),
+ min(a.d, b.d),
+ min(a.e, b.e),
+ min(a.f, b.f),
+ min(a.g, b.g),
+ min(a.h, b.h));
#endif
}
-ccl_device_inline float8_t max(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 max(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_max_ps(a.m256, b.m256));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_max_ps(a.m256, b.m256));
#else
- return make_float8_t(max(a.a, b.a),
- max(a.b, b.b),
- max(a.c, b.c),
- max(a.d, b.d),
- max(a.e, b.e),
- max(a.f, b.f),
- max(a.g, b.g),
- max(a.h, b.h));
+ return make_vfloat8(max(a.a, b.a),
+ max(a.b, b.b),
+ max(a.c, b.c),
+ max(a.d, b.d),
+ max(a.e, b.e),
+ max(a.f, b.f),
+ max(a.g, b.g),
+ max(a.h, b.h));
#endif
}
-ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx)
+ccl_device_inline vfloat8 clamp(const vfloat8 a, const vfloat8 mn, const vfloat8 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline float8_t fabs(const float8_t a)
+ccl_device_inline vfloat8 select(const vint8 mask, const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_blendv_ps(b, a, _mm256_castsi256_ps(mask)));
#else
- return make_float8_t(fabsf(a.a),
- fabsf(a.b),
- fabsf(a.c),
- fabsf(a.d),
- fabsf(a.e),
- fabsf(a.f),
- fabsf(a.g),
- fabsf(a.h));
+ return make_vfloat8((mask.a) ? a.a : b.a,
+ (mask.b) ? a.b : b.b,
+ (mask.c) ? a.c : b.c,
+ (mask.d) ? a.d : b.d,
+ (mask.e) ? a.e : b.e,
+ (mask.f) ? a.f : b.f,
+ (mask.g) ? a.g : b.g,
+ (mask.h) ? a.h : b.h);
#endif
}
-ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t)
+ccl_device_inline vfloat8 fabs(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#else
+ return make_vfloat8(fabsf(a.a),
+ fabsf(a.b),
+ fabsf(a.c),
+ fabsf(a.d),
+ fabsf(a.e),
+ fabsf(a.f),
+ fabsf(a.g),
+ fabsf(a.h));
+#endif
+}
+
+ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, float t)
+{
+ return a + t * (b - a);
+}
+
+ccl_device_inline vfloat8 mix(const vfloat8 a, const vfloat8 b, vfloat8 t)
{
return a + t * (b - a);
}
-ccl_device_inline float8_t saturate(const float8_t a)
+ccl_device_inline vfloat8 saturate(const vfloat8 a)
{
- return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f));
+ return clamp(a, make_vfloat8(0.0f), make_vfloat8(1.0f));
}
-ccl_device_inline float8_t exp(float8_t v)
+ccl_device_inline vfloat8 exp(vfloat8 v)
{
- return make_float8_t(
+ return make_vfloat8(
expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
}
-ccl_device_inline float8_t log(float8_t v)
+ccl_device_inline vfloat8 log(vfloat8 v)
{
- return make_float8_t(
+ return make_vfloat8(
logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
}
-ccl_device_inline float dot(const float8_t a, const float8_t b)
+ccl_device_inline float dot(const vfloat8 a, const vfloat8 b)
{
-#ifdef __KERNEL_AVX2__
- float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
+#ifdef __KERNEL_AVX__
+ vfloat8 t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
return t[0] + t[4];
#else
return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
@@ -339,62 +333,51 @@ ccl_device_inline float dot(const float8_t a, const float8_t b)
#endif
}
-ccl_device_inline float8_t pow(float8_t v, float e)
+ccl_device_inline vfloat8 pow(vfloat8 v, float e)
{
- return make_float8_t(powf(v.a, e),
- powf(v.b, e),
- powf(v.c, e),
- powf(v.d, e),
- powf(v.e, e),
- powf(v.f, e),
- powf(v.g, e),
- powf(v.h, e));
+ return make_vfloat8(powf(v.a, e),
+ powf(v.b, e),
+ powf(v.c, e),
+ powf(v.d, e),
+ powf(v.e, e),
+ powf(v.f, e),
+ powf(v.g, e),
+ powf(v.h, e));
}
-ccl_device_inline float reduce_min(const float8_t a)
+ccl_device_inline float reduce_min(const vfloat8 a)
{
return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
}
-ccl_device_inline float reduce_max(const float8_t a)
+ccl_device_inline float reduce_max(const vfloat8 a)
{
return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
}
-ccl_device_inline float reduce_add(const float8_t a)
-{
-#ifdef __KERNEL_AVX2__
- float8_t b(_mm256_hadd_ps(a.m256, a.m256));
- float8_t h(_mm256_hadd_ps(b.m256, b.m256));
- return h[0] + h[4];
-#else
- return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
-#endif
-}
-
-ccl_device_inline bool isequal(const float8_t a, const float8_t b)
+ccl_device_inline bool isequal(const vfloat8 a, const vfloat8 b)
{
return a == b;
}
-ccl_device_inline float8_t safe_divide(const float8_t a, const float b)
+ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const float b)
{
- return (b != 0.0f) ? a / b : make_float8_t(0.0f);
+ return (b != 0.0f) ? a / b : make_vfloat8(0.0f);
}
-ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b)
+ccl_device_inline vfloat8 safe_divide(const vfloat8 a, const vfloat8 b)
{
- return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f,
- (b.b != 0.0f) ? a.b / b.b : 0.0f,
- (b.c != 0.0f) ? a.c / b.c : 0.0f,
- (b.d != 0.0f) ? a.d / b.d : 0.0f,
- (b.e != 0.0f) ? a.e / b.e : 0.0f,
- (b.f != 0.0f) ? a.f / b.f : 0.0f,
- (b.g != 0.0f) ? a.g / b.g : 0.0f,
- (b.h != 0.0f) ? a.h / b.h : 0.0f);
+ return make_vfloat8((b.a != 0.0f) ? a.a / b.a : 0.0f,
+ (b.b != 0.0f) ? a.b / b.b : 0.0f,
+ (b.c != 0.0f) ? a.c / b.c : 0.0f,
+ (b.d != 0.0f) ? a.d / b.d : 0.0f,
+ (b.e != 0.0f) ? a.e / b.e : 0.0f,
+ (b.f != 0.0f) ? a.f / b.f : 0.0f,
+ (b.g != 0.0f) ? a.g / b.g : 0.0f,
+ (b.h != 0.0f) ? a.h / b.h : 0.0f);
}
-ccl_device_inline float8_t ensure_finite(float8_t v)
+ccl_device_inline vfloat8 ensure_finite(vfloat8 v)
{
v.a = ensure_finite(v.a);
v.b = ensure_finite(v.b);
@@ -408,12 +391,92 @@ ccl_device_inline float8_t ensure_finite(float8_t v)
return v;
}
-ccl_device_inline bool isfinite_safe(float8_t v)
+ccl_device_inline bool isfinite_safe(vfloat8 v)
{
return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
}
+ccl_device_inline vint8 cast(const vfloat8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_castps_si256(a));
+#else
+ return make_vint8(__float_as_int(a.a),
+ __float_as_int(a.b),
+ __float_as_int(a.c),
+ __float_as_int(a.d),
+ __float_as_int(a.e),
+ __float_as_int(a.f),
+ __float_as_int(a.g),
+ __float_as_int(a.h));
+#endif
+}
+
+#ifdef __KERNEL_SSE__
+ccl_device_forceinline float4 low(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return float4(_mm256_extractf128_ps(a.m256, 0));
+# else
+ return make_float4(a.e, a.f, a.g, a.h);
+# endif
+}
+ccl_device_forceinline float4 high(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return float4(_mm256_extractf128_ps(a.m256, 1));
+# else
+ return make_float4(a.a, a.b, a.c, a.d);
+# endif
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0)));
+# else
+ return make_vfloat8(a[i0], a[i1], a[i2], a[i3], a[i4 + 4], a[i5 + 4], a[i6 + 4], a[i7 + 4]);
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)));
+# else
+ return make_vfloat8(shuffle<i0, i1, i2, i3>(high(a), high(b)),
+ shuffle<i0, i1, i2, i3>(low(a), low(b)));
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+ return shuffle<i0, i1, i2, i3>(a, a);
+}
+template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a, const vfloat8 b)
+{
+ return shuffle<i0, i0, i0, i0>(a, b);
+}
+template<size_t i0> ccl_device_forceinline const vfloat8 shuffle(const vfloat8 a)
+{
+ return shuffle<i0>(a, a);
+}
+
+template<size_t i> ccl_device_forceinline float extract(const vfloat8 a)
+{
+# ifdef __KERNEL_AVX__
+ __m256 b = shuffle<i, i, i, i>(a).m256;
+ return _mm256_cvtss_f32(b);
+# else
+ return a[i];
+# endif
+}
+#endif
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_FLOAT8_H__ */
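Editor's note (illustrative, not part of the patch): two hedged usage sketches for the renamed vfloat8 helpers above, again assuming a CPU build with the Cycles util headers; the function names are hypothetical.

ccl_device_inline float mean_ratio_example(const vfloat8 num, const vfloat8 den)
{
  /* safe_divide() yields 0.0f in lanes where den is zero; average() is reduce_add() / 8. */
  return average(safe_divide(num, den));
}

ccl_device_inline vfloat8 toward_one_example(const vfloat8 a, const vfloat8 t)
{
  /* The new mix() overload with a vfloat8 weight blends per lane. */
  return mix(a, one_vfloat8(), t);
}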
diff --git a/intern/cycles/util/math_int2.h b/intern/cycles/util/math_int2.h
index f4d8a71221a..2df2ec5505b 100644
--- a/intern/cycles/util/math_int2.h
+++ b/intern/cycles/util/math_int2.h
@@ -10,23 +10,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline bool operator==(const int2 a, const int2 b);
-ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
-ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
-ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
-ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
-ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
-#endif /* !__KERNEL_METAL__ */
-
-/*******************************************************************************
- * Definition.
- */
-
#if !defined(__KERNEL_METAL__)
ccl_device_inline bool operator==(const int2 a, const int2 b)
{
diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h
index 48bffeaf553..b5b972ddfb5 100644
--- a/intern/cycles/util/math_int3.h
+++ b/intern/cycles/util/math_int3.h
@@ -10,21 +10,6 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline int3 min(int3 a, int3 b);
-ccl_device_inline int3 max(int3 a, int3 b);
-ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
-ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
-#endif /* !defined(__KERNEL_METAL__) */
-
-/*******************************************************************************
- * Definition.
- */
-
#if !defined(__KERNEL_METAL__)
ccl_device_inline int3 min(int3 a, int3 b)
{
@@ -44,7 +29,7 @@ ccl_device_inline int3 max(int3 a, int3 b)
# endif
}
-ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
+ccl_device_inline int3 clamp(const int3 a, int mn, int mx)
{
# ifdef __KERNEL_SSE__
return min(max(a, make_int3(mn)), make_int3(mx));
@@ -53,7 +38,7 @@ ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
# endif
}
-ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
+ccl_device_inline int3 clamp(const int3 a, int3 &mn, int mx)
{
# ifdef __KERNEL_SSE__
return min(max(a, mn), make_int3(mx));
@@ -62,22 +47,22 @@ ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
# endif
}
-ccl_device_inline bool operator==(const int3 &a, const int3 &b)
+ccl_device_inline bool operator==(const int3 a, const int3 b)
{
return a.x == b.x && a.y == b.y && a.z == b.z;
}
-ccl_device_inline bool operator!=(const int3 &a, const int3 &b)
+ccl_device_inline bool operator!=(const int3 a, const int3 b)
{
return !(a == b);
}
-ccl_device_inline bool operator<(const int3 &a, const int3 &b)
+ccl_device_inline bool operator<(const int3 a, const int3 b)
{
return a.x < b.x && a.y < b.y && a.z < b.z;
}
-ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
+ccl_device_inline int3 operator+(const int3 a, const int3 b)
{
# ifdef __KERNEL_SSE__
return int3(_mm_add_epi32(a.m128, b.m128));
@@ -86,7 +71,7 @@ ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
# endif
}
-ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
+ccl_device_inline int3 operator-(const int3 a, const int3 b)
{
# ifdef __KERNEL_SSE__
return int3(_mm_sub_epi32(a.m128, b.m128));
diff --git a/intern/cycles/util/math_int4.h b/intern/cycles/util/math_int4.h
index fbdada223cb..c6d767d7587 100644
--- a/intern/cycles/util/math_int4.h
+++ b/intern/cycles/util/math_int4.h
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
* Copyright 2011-2022 Blender Foundation */
#ifndef __UTIL_MATH_INT4_H__
@@ -10,30 +11,8 @@
CCL_NAMESPACE_BEGIN
-/*******************************************************************************
- * Declaration.
- */
-
#ifndef __KERNEL_GPU__
-ccl_device_inline int4 operator+(const int4 &a, const int4 &b);
-ccl_device_inline int4 operator+=(int4 &a, const int4 &b);
-ccl_device_inline int4 operator>>(const int4 &a, int i);
-ccl_device_inline int4 operator<<(const int4 &a, int i);
-ccl_device_inline int4 operator<(const int4 &a, const int4 &b);
-ccl_device_inline int4 operator>=(const int4 &a, const int4 &b);
-ccl_device_inline int4 operator&(const int4 &a, const int4 &b);
-ccl_device_inline int4 min(int4 a, int4 b);
-ccl_device_inline int4 max(int4 a, int4 b);
-ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx);
-ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b);
-#endif /* __KERNEL_GPU__ */
-
-/*******************************************************************************
- * Definition.
- */
-
-#ifndef __KERNEL_GPU__
-ccl_device_inline int4 operator+(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator+(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_add_epi32(a.m128, b.m128));
@@ -42,12 +21,26 @@ ccl_device_inline int4 operator+(const int4 &a, const int4 &b)
# endif
}
-ccl_device_inline int4 operator+=(int4 &a, const int4 &b)
+ccl_device_inline int4 operator+=(int4 &a, const int4 b)
{
return a = a + b;
}
-ccl_device_inline int4 operator>>(const int4 &a, int i)
+ccl_device_inline int4 operator-(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_sub_epi32(a.m128, b.m128));
+# else
+ return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+# endif
+}
+
+ccl_device_inline int4 operator-=(int4 &a, const int4 b)
+{
+ return a = a - b;
+}
+
+ccl_device_inline int4 operator>>(const int4 a, int i)
{
# ifdef __KERNEL_SSE__
return int4(_mm_srai_epi32(a.m128, i));
@@ -56,7 +49,7 @@ ccl_device_inline int4 operator>>(const int4 &a, int i)
# endif
}
-ccl_device_inline int4 operator<<(const int4 &a, int i)
+ccl_device_inline int4 operator<<(const int4 a, int i)
{
# ifdef __KERNEL_SSE__
return int4(_mm_slli_epi32(a.m128, i));
@@ -65,7 +58,7 @@ ccl_device_inline int4 operator<<(const int4 &a, int i)
# endif
}
-ccl_device_inline int4 operator<(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator<(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_cmplt_epi32(a.m128, b.m128));
@@ -74,7 +67,26 @@ ccl_device_inline int4 operator<(const int4 &a, const int4 &b)
# endif
}
-ccl_device_inline int4 operator>=(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator<(const int4 a, const int b)
+{
+ return a < make_int4(b);
+}
+
+ccl_device_inline int4 operator==(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_cmpeq_epi32(a.m128, b.m128));
+# else
+ return make_int4(a.x == b.x, a.y == b.y, a.z == b.z, a.w == b.w);
+# endif
+}
+
+ccl_device_inline int4 operator==(const int4 a, const int b)
+{
+ return a == make_int4(b);
+}
+
+ccl_device_inline int4 operator>=(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
@@ -83,7 +95,12 @@ ccl_device_inline int4 operator>=(const int4 &a, const int4 &b)
# endif
}
-ccl_device_inline int4 operator&(const int4 &a, const int4 &b)
+ccl_device_inline int4 operator>=(const int4 a, const int b)
+{
+ return a >= make_int4(b);
+}
+
+ccl_device_inline int4 operator&(const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_and_si128(a.m128, b.m128));
@@ -92,6 +109,97 @@ ccl_device_inline int4 operator&(const int4 &a, const int4 &b)
# endif
}
+ccl_device_inline int4 operator|(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_or_si128(a.m128, b.m128));
+# else
+ return make_int4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
+# endif
+}
+
+ccl_device_inline int4 operator^(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_SSE__
+ return int4(_mm_xor_si128(a.m128, b.m128));
+# else
+ return make_int4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
+# endif
+}
+
+ccl_device_inline int4 operator&(const int32_t a, const int4 b)
+{
+ return make_int4(a) & b;
+}
+
+ccl_device_inline int4 operator&(const int4 a, const int32_t b)
+{
+ return a & make_int4(b);
+}
+
+ccl_device_inline int4 operator|(const int32_t a, const int4 b)
+{
+ return make_int4(a) | b;
+}
+
+ccl_device_inline int4 operator|(const int4 a, const int32_t b)
+{
+ return a | make_int4(b);
+}
+
+ccl_device_inline int4 operator^(const int32_t a, const int4 b)
+{
+ return make_int4(a) ^ b;
+}
+
+ccl_device_inline int4 operator^(const int4 a, const int32_t b)
+{
+ return a ^ make_int4(b);
+}
+
+ccl_device_inline int4 &operator&=(int4 &a, const int4 b)
+{
+ return a = a & b;
+}
+ccl_device_inline int4 &operator&=(int4 &a, const int32_t b)
+{
+ return a = a & b;
+}
+
+ccl_device_inline int4 &operator|=(int4 &a, const int4 b)
+{
+ return a = a | b;
+}
+ccl_device_inline int4 &operator|=(int4 &a, const int32_t b)
+{
+ return a = a | b;
+}
+
+ccl_device_inline int4 &operator^=(int4 &a, const int4 b)
+{
+ return a = a ^ b;
+}
+ccl_device_inline int4 &operator^=(int4 &a, const int32_t b)
+{
+ return a = a ^ b;
+}
+
+ccl_device_inline int4 &operator<<=(int4 &a, const int32_t b)
+{
+ return a = a << b;
+}
+ccl_device_inline int4 &operator>>=(int4 &a, const int32_t b)
+{
+ return a = a >> b;
+}
+
+# ifdef __KERNEL_SSE__
+ccl_device_forceinline const int4 srl(const int4 a, const int32_t b)
+{
+ return int4(_mm_srli_epi32(a.m128, b));
+}
+# endif
+
ccl_device_inline int4 min(int4 a, int4 b)
{
# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
@@ -110,12 +218,12 @@ ccl_device_inline int4 max(int4 a, int4 b)
# endif
}
-ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx)
+ccl_device_inline int4 clamp(const int4 a, const int4 mn, const int4 mx)
{
return min(max(a, mn), mx);
}
-ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b)
+ccl_device_inline int4 select(const int4 mask, const int4 a, const int4 b)
{
# ifdef __KERNEL_SSE__
return int4(_mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)));
@@ -135,6 +243,52 @@ ccl_device_inline int4 load_int4(const int *v)
}
#endif /* __KERNEL_GPU__ */
+ccl_device_inline float4 cast(const int4 a)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_castsi128_ps(a));
+#else
+ return make_float4(
+ __int_as_float(a.x), __int_as_float(a.y), __int_as_float(a.z), __int_as_float(a.w));
+#endif
+}
+
+#ifdef __KERNEL_SSE__
+ccl_device_forceinline int4 andnot(const int4 a, const int4 b)
+{
+ return int4(_mm_andnot_si128(a.m128, b.m128));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline int4 shuffle(const int4 a)
+{
+# ifdef __KERNEL_NEON__
+ int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
+ return int4(vreinterpretq_m128i_s32(result));
+# else
+ return int4(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
+# endif
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline int4 shuffle(const int4 a, const int4 b)
+{
+# ifdef __KERNEL_NEON__
+ int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
+ vreinterpretq_s32_m128i(b));
+ return int4(vreinterpretq_m128i_s32(result));
+# else
+ return int4(_mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+# endif
+}
+
+template<size_t i0> ccl_device_forceinline int4 shuffle(const int4 b)
+{
+ return shuffle<i0, i0, i0, i0>(b);
+}
+#endif
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_INT4_H__ */
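Editor's note (illustrative, not part of the patch): a sketch of the new int4 comparison and bitwise operators, assuming a CPU (non-GPU) build with the Cycles util headers; the function names are hypothetical.

ccl_device_inline int4 replace_equal_example(const int4 v, const int needle, const int replacement)
{
  /* operator==(int4, int) produces a per-lane mask; select() takes a where the mask is set. */
  return select(v == needle, make_int4(replacement), v);
}

ccl_device_inline int4 wrap_pow2_example(const int4 i, const int size_pow2)
{
  /* With operator&(int4, int32_t), power-of-two index wrapping stays in SIMD registers. */
  return i & (size_pow2 - 1);
}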
diff --git a/intern/cycles/util/math_int8.h b/intern/cycles/util/math_int8.h
new file mode 100644
index 00000000000..d150b0b74ec
--- /dev/null
+++ b/intern/cycles/util/math_int8.h
@@ -0,0 +1,355 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef __UTIL_MATH_INT8_H__
+#define __UTIL_MATH_INT8_H__
+
+#ifndef __UTIL_MATH_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifndef __KERNEL_GPU__
+ccl_device_inline vint8 operator+(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_add_epi32(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator+=(vint8 &a, const vint8 b)
+{
+ return a = a + b;
+}
+
+ccl_device_inline vint8 operator-(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_sub_epi32(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator-=(vint8 &a, const vint8 b)
+{
+ return a = a - b;
+}
+
+ccl_device_inline vint8 operator>>(const vint8 a, int i)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_srai_epi32(a.m256, i));
+# else
+ return make_vint8(
+ a.a >> i, a.b >> i, a.c >> i, a.d >> i, a.e >> i, a.f >> i, a.g >> i, a.h >> i);
+# endif
+}
+
+ccl_device_inline vint8 operator<<(const vint8 a, int i)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_slli_epi32(a.m256, i));
+# else
+ return make_vint8(
+ a.a << i, a.b << i, a.c << i, a.d << i, a.e << i, a.f << i, a.g << i, a.h << i);
+# endif
+}
+
+ccl_device_inline vint8 operator<(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_cmpgt_epi32(b.m256, a.m256));
+# else
+ return make_vint8(
+ a.a < b.a, a.b < b.b, a.c < b.c, a.d < b.d, a.e < b.e, a.f < b.f, a.g < b.g, a.h < b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator<(const vint8 a, const int b)
+{
+ return a < make_vint8(b);
+}
+
+ccl_device_inline vint8 operator==(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_cmpeq_epi32(a.m256, b.m256));
+# else
+ return make_vint8(a.a == b.a,
+ a.b == b.b,
+ a.c == b.c,
+ a.d == b.d,
+ a.e == b.e,
+ a.f == b.f,
+ a.g == b.g,
+ a.h == b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator==(const vint8 a, const int b)
+{
+ return a == make_vint8(b);
+}
+
+ccl_device_inline vint8 operator>=(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(
+ _mm256_xor_si256(_mm256_set1_epi32(0xffffffff), _mm256_cmpgt_epi32(b.m256, a.m256)));
+# else
+ return make_vint8(a.a >= b.a,
+ a.b >= b.b,
+ a.c >= b.c,
+ a.d >= b.d,
+ a.e >= b.e,
+ a.f >= b.f,
+ a.g >= b.g,
+ a.h >= b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator>=(const vint8 a, const int b)
+{
+ return a >= make_vint8(b);
+}
+
+ccl_device_inline vint8 operator&(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_and_si256(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a & b.a, a.b & b.b, a.c & b.c, a.d & b.d, a.e & b.e, a.f & b.f, a.g & b.g, a.h & b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator|(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_or_si256(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a | b.a, a.b | b.b, a.c | b.c, a.d | b.d, a.e | b.e, a.f | b.f, a.g | b.g, a.h | b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator^(const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_xor_si256(a.m256, b.m256));
+# else
+ return make_vint8(
+ a.a ^ b.a, a.b ^ b.b, a.c ^ b.c, a.d ^ b.d, a.e ^ b.e, a.f ^ b.f, a.g ^ b.g, a.h ^ b.h);
+# endif
+}
+
+ccl_device_inline vint8 operator&(const int32_t a, const vint8 b)
+{
+ return make_vint8(a) & b;
+}
+
+ccl_device_inline vint8 operator&(const vint8 a, const int32_t b)
+{
+ return a & make_vint8(b);
+}
+
+ccl_device_inline vint8 operator|(const int32_t a, const vint8 b)
+{
+ return make_vint8(a) | b;
+}
+
+ccl_device_inline vint8 operator|(const vint8 a, const int32_t b)
+{
+ return a | make_vint8(b);
+}
+
+ccl_device_inline vint8 operator^(const int32_t a, const vint8 b)
+{
+ return make_vint8(a) ^ b;
+}
+
+ccl_device_inline vint8 operator^(const vint8 a, const int32_t b)
+{
+ return a ^ make_vint8(b);
+}
+
+ccl_device_inline vint8 &operator&=(vint8 &a, const vint8 b)
+{
+ return a = a & b;
+}
+ccl_device_inline vint8 &operator&=(vint8 &a, const int32_t b)
+{
+ return a = a & b;
+}
+
+ccl_device_inline vint8 &operator|=(vint8 &a, const vint8 b)
+{
+ return a = a | b;
+}
+ccl_device_inline vint8 &operator|=(vint8 &a, const int32_t b)
+{
+ return a = a | b;
+}
+
+ccl_device_inline vint8 &operator^=(vint8 &a, const vint8 b)
+{
+ return a = a ^ b;
+}
+ccl_device_inline vint8 &operator^=(vint8 &a, const int32_t b)
+{
+ return a = a ^ b;
+}
+
+ccl_device_inline vint8 &operator<<=(vint8 &a, const int32_t b)
+{
+ return a = a << b;
+}
+ccl_device_inline vint8 &operator>>=(vint8 &a, const int32_t b)
+{
+ return a = a >> b;
+}
+
+# ifdef __KERNEL_AVX__
+ccl_device_forceinline const vint8 srl(const vint8 a, const int32_t b)
+{
+ return vint8(_mm256_srli_epi32(a.m256, b));
+}
+# endif
+
+ccl_device_inline vint8 min(vint8 a, vint8 b)
+{
+# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX2__)
+ return vint8(_mm256_min_epi32(a.m256, b.m256));
+# else
+ return make_vint8(min(a.a, b.a),
+ min(a.b, b.b),
+ min(a.c, b.c),
+ min(a.d, b.d),
+ min(a.e, b.e),
+ min(a.f, b.f),
+ min(a.g, b.g),
+ min(a.h, b.h));
+# endif
+}
+
+ccl_device_inline vint8 max(vint8 a, vint8 b)
+{
+# if defined(__KERNEL_AVX__) && defined(__KERNEL_AVX2__)
+ return vint8(_mm256_max_epi32(a.m256, b.m256));
+# else
+ return make_vint8(max(a.a, b.a),
+ max(a.b, b.b),
+ max(a.c, b.c),
+ max(a.d, b.d),
+ max(a.e, b.e),
+ max(a.f, b.f),
+ max(a.g, b.g),
+ max(a.h, b.h));
+# endif
+}
+
+ccl_device_inline vint8 clamp(const vint8 a, const vint8 mn, const vint8 mx)
+{
+ return min(max(a, mn), mx);
+}
+
+ccl_device_inline vint8 select(const vint8 mask, const vint8 a, const vint8 b)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_castps_si256(_mm256_blendv_ps(
+ _mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))));
+# else
+ return make_vint8((mask.a) ? a.a : b.a,
+ (mask.b) ? a.b : b.b,
+ (mask.c) ? a.c : b.c,
+ (mask.d) ? a.d : b.d,
+ (mask.e) ? a.e : b.e,
+ (mask.f) ? a.f : b.f,
+ (mask.g) ? a.g : b.g,
+ (mask.h) ? a.h : b.h);
+# endif
+}
+
+ccl_device_inline vint8 load_vint8(const int *v)
+{
+# ifdef __KERNEL_AVX__
+ return vint8(_mm256_loadu_si256((__m256i *)v));
+# else
+ return make_vint8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+# endif
+}
+#endif /* __KERNEL_GPU__ */
+
+ccl_device_inline vfloat8 cast(const vint8 a)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_castsi256_ps(a));
+#else
+ return make_vfloat8(__int_as_float(a.a),
+ __int_as_float(a.b),
+ __int_as_float(a.c),
+ __int_as_float(a.d),
+ __int_as_float(a.e),
+ __int_as_float(a.f),
+ __int_as_float(a.g),
+ __int_as_float(a.h));
+#endif
+}
+
+#ifdef __KERNEL_AVX__
+template<size_t i> ccl_device_forceinline const vint8 shuffle(const vint8 a)
+{
+ return vint8(
+ _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i))));
+}
+
+template<size_t i0, size_t i1> ccl_device_forceinline const vint8 shuffle(const vint8 a)
+{
+ return vint8(_mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0)));
+}
+
+template<size_t i0, size_t i1>
+ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b)
+{
+ return vint8(_mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vint8 shuffle(const vint8 a)
+{
+ return vint8(
+ _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0))));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+ccl_device_forceinline const vint8 shuffle(const vint8 a, const vint8 b)
+{
+ return vint8(_mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+}
+
+template<> __forceinline const vint8 shuffle<0, 0, 2, 2>(const vint8 b)
+{
+ return vint8(_mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b))));
+}
+template<> __forceinline const vint8 shuffle<1, 1, 3, 3>(const vint8 b)
+{
+ return vint8(_mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b))));
+}
+template<> __forceinline const vint8 shuffle<0, 1, 0, 1>(const vint8 b)
+{
+ return vint8(_mm256_castps_si256(
+ _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b))))));
+}
+#endif
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INT8_H__ */
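Editor's note (illustrative, not part of the patch): a sketch of the new 8-wide integer type, assuming a CPU (non-GPU) build with the Cycles util headers; clamped_indices_example is a hypothetical helper.

ccl_device_inline vint8 clamped_indices_example(const int *v, const int lo, const int hi)
{
  /* load_vint8() is an unaligned load (scalar fill without AVX); clamp() is lane-wise min/max. */
  return clamp(load_vint8(v), make_vint8(lo), make_vint8(hi));
}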
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index aa28682f8c1..0727debf775 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -133,7 +133,9 @@ ccl_device_forceinline float ray_triangle_rcp(const float x)
ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
+ return madd(make_float4(a.x),
+ make_float4(b.x),
+ madd(make_float4(a.y), make_float4(b.y), make_float4(a.z) * make_float4(b.z)))[0];
#else
return a.x * b.x + a.y * b.y + a.z * b.z;
#endif
@@ -142,9 +144,10 @@ ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
- return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
- msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
- msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
+ return make_float3(
+ msub(make_float4(a.y), make_float4(b.z), make_float4(a.z) * make_float4(b.y))[0],
+ msub(make_float4(a.z), make_float4(b.x), make_float4(a.x) * make_float4(b.z))[0],
+ msub(make_float4(a.x), make_float4(b.y), make_float4(a.y) * make_float4(b.x))[0]);
#else
return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
#endif
diff --git a/intern/cycles/util/sseb.h b/intern/cycles/util/sseb.h
deleted file mode 100644
index 6f78299711e..00000000000
--- a/intern/cycles/util/sseb.h
+++ /dev/null
@@ -1,345 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_SSEB_H__
-#define __UTIL_SSEB_H__
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_SSE2__
-
-struct ssei;
-struct ssef;
-
-/*! 4-wide SSE bool type. */
-struct sseb {
- typedef sseb Mask; // mask type
- typedef ssei Int; // int type
- typedef ssef Float; // float type
-
- enum { size = 4 }; // number of SIMD elements
- union {
- __m128 m128;
- int32_t v[4];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline sseb()
- {
- }
- __forceinline sseb(const sseb &other)
- {
- m128 = other.m128;
- }
- __forceinline sseb &operator=(const sseb &other)
- {
- m128 = other.m128;
- return *this;
- }
-
- __forceinline sseb(const __m128 input) : m128(input)
- {
- }
- __forceinline operator const __m128 &(void) const
- {
- return m128;
- }
- __forceinline operator const __m128i(void) const
- {
- return _mm_castps_si128(m128);
- }
- __forceinline operator const __m128d(void) const
- {
- return _mm_castps_pd(m128);
- }
-
- __forceinline sseb(bool a)
- : m128(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)])
- {
- }
- __forceinline sseb(bool a, bool b)
- : m128(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)])
- {
- }
- __forceinline sseb(bool a, bool b, bool c, bool d)
- : m128(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)])
- {
- }
- __forceinline sseb(int mask)
- {
- assert(mask >= 0 && mask < 16);
- m128 = _mm_lookupmask_ps[mask];
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constants
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline sseb(FalseTy) : m128(_mm_setzero_ps())
- {
- }
- __forceinline sseb(TrueTy)
- : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline bool operator[](const size_t i) const
- {
- assert(i < 4);
- return (_mm_movemask_ps(m128) >> i) & 1;
- }
- __forceinline int32_t &operator[](const size_t i)
- {
- assert(i < 4);
- return v[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator!(const sseb &a)
-{
- return _mm_xor_ps(a, sseb(True));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator&(const sseb &a, const sseb &b)
-{
- return _mm_and_ps(a, b);
-}
-__forceinline const sseb operator|(const sseb &a, const sseb &b)
-{
- return _mm_or_ps(a, b);
-}
-__forceinline const sseb operator^(const sseb &a, const sseb &b)
-{
- return _mm_xor_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator&=(sseb &a, const sseb &b)
-{
- return a = a & b;
-}
-__forceinline const sseb operator|=(sseb &a, const sseb &b)
-{
- return a = a | b;
-}
-__forceinline const sseb operator^=(sseb &a, const sseb &b)
-{
- return a = a ^ b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator!=(const sseb &a, const sseb &b)
-{
- return _mm_xor_ps(a, b);
-}
-__forceinline const sseb operator==(const sseb &a, const sseb &b)
-{
- return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b));
-}
-
-__forceinline const sseb select(const sseb &m, const sseb &t, const sseb &f)
-{
-# if defined(__KERNEL_SSE41__)
- return _mm_blendv_ps(f, t, m);
-# else
- return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb unpacklo(const sseb &a, const sseb &b)
-{
- return _mm_unpacklo_ps(a, b);
-}
-__forceinline const sseb unpackhi(const sseb &a, const sseb &b)
-{
- return _mm_unpackhi_ps(a, b);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const sseb shuffle(const sseb &a)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a);
-# else
- return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
-# endif
-}
-
-# ifndef __KERNEL_NEON__
-template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a)
-{
- return _mm_movelh_ps(a, a);
-}
-
-template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a)
-{
- return _mm_movehl_ps(a, a);
-}
-# endif
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const sseb shuffle(const sseb &a, const sseb &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<int32x4_t, i0, i1, i2, i3>(a, b);
-# else
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-# endif
-}
-
-# ifndef __KERNEL_NEON__
-template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b)
-{
- return _mm_movelh_ps(a, b);
-}
-
-template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sseb &b)
-{
- return _mm_movehl_ps(b, a);
-}
-# endif
-
-# if defined(__KERNEL_SSE3__) && !defined(__KERNEL_NEON__)
-template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a)
-{
- return _mm_moveldup_ps(a);
-}
-template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a)
-{
- return _mm_movehdup_ps(a);
-}
-# endif
-
-# if defined(__KERNEL_SSE41__)
-template<size_t dst, size_t src, size_t clr>
-__forceinline const sseb insert(const sseb &a, const sseb &b)
-{
-# ifdef __KERNEL_NEON__
- sseb res = a;
- if (clr)
- res[dst] = 0;
- else
- res[dst] = b[src];
- return res;
-# else
- return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
-# endif
-}
-template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b)
-{
- return insert<dst, src, 0>(a, b);
-}
-template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b)
-{
- return insert<dst, 0>(a, sseb(b));
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reduction Operations
-////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_SSE41__)
-__forceinline uint32_t popcnt(const sseb &a)
-{
-# if defined(__KERNEL_NEON__)
- const int32x4_t mask = {1, 1, 1, 1};
- int32x4_t t = vandq_s32(vreinterpretq_s32_m128(a.m128), mask);
- return vaddvq_s32(t);
-# else
- return _mm_popcnt_u32(_mm_movemask_ps(a));
-# endif
-}
-# else
-__forceinline uint32_t popcnt(const sseb &a)
-{
- return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]);
-}
-# endif
-
-__forceinline bool reduce_and(const sseb &a)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) == -4;
-# else
- return _mm_movemask_ps(a) == 0xf;
-# endif
-}
-__forceinline bool reduce_or(const sseb &a)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(a.m128)) != 0x0;
-# else
- return _mm_movemask_ps(a) != 0x0;
-# endif
-}
-__forceinline bool all(const sseb &b)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == -4;
-# else
- return _mm_movemask_ps(b) == 0xf;
-# endif
-}
-__forceinline bool any(const sseb &b)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) != 0x0;
-# else
- return _mm_movemask_ps(b) != 0x0;
-# endif
-}
-__forceinline bool none(const sseb &b)
-{
-# if defined(__KERNEL_NEON__)
- return vaddvq_s32(vreinterpretq_s32_m128(b.m128)) == 0x0;
-# else
- return _mm_movemask_ps(b) == 0x0;
-# endif
-}
-
-__forceinline uint32_t movemask(const sseb &a)
-{
- return _mm_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_sseb(const char *label, const sseb &a)
-{
- printf("%s: %d %d %d %d\n", label, a[0], a[1], a[2], a[3]);
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
-#endif
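The removed sseb type is a 4-wide boolean mask whose central operation is the branchless select() seen above: _mm_blendv_ps when SSE4.1 is available, an and/or/andnot combination otherwise. A minimal standalone sketch of that pattern with plain SSE intrinsics (hypothetical helper name, not the Cycles vector API):

#include <xmmintrin.h>
#if defined(__SSE4_1__)
#  include <smmintrin.h>
#endif

static inline __m128 select4(const __m128 mask, const __m128 t, const __m128 f)
{
#if defined(__SSE4_1__)
  /* pick lanes of t where the mask sign bit is set, lanes of f elsewhere */
  return _mm_blendv_ps(f, t, mask);
#else
  /* SSE2-only fallback: (mask & t) | (~mask & f), assuming all-ones/all-zeros lanes */
  return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f));
#endif
}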
diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h
deleted file mode 100644
index 1e2bfa90354..00000000000
--- a/intern/cycles/util/ssef.h
+++ /dev/null
@@ -1,1090 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_SSEF_H__
-#define __UTIL_SSEF_H__
-
-#include <math.h>
-
-#include "util/ssei.h"
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_SSE2__
-
-struct sseb;
-struct ssef;
-
-/*! 4-wide SSE float type. */
-struct ssef {
- typedef sseb Mask; // mask type
- typedef ssei Int; // int type
- typedef ssef Float; // float type
-
- enum { size = 4 }; // number of SIMD elements
- union {
- __m128 m128;
- float f[4];
- int i[4];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline ssef()
- {
- }
- __forceinline ssef(const ssef &other)
- {
- m128 = other.m128;
- }
- __forceinline ssef &operator=(const ssef &other)
- {
- m128 = other.m128;
- return *this;
- }
-
- __forceinline ssef(const __m128 a) : m128(a)
- {
- }
- __forceinline operator const __m128 &() const
- {
- return m128;
- }
- __forceinline operator __m128 &()
- {
- return m128;
- }
-
- __forceinline ssef(float a) : m128(_mm_set1_ps(a))
- {
- }
- __forceinline ssef(float a, float b, float c, float d) : m128(_mm_setr_ps(a, b, c, d))
- {
- }
-
- __forceinline explicit ssef(const __m128i a) : m128(_mm_cvtepi32_ps(a))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Loads and Stores
- ////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_AVX__)
- static __forceinline ssef broadcast(const void *const a)
- {
- return _mm_broadcast_ss((float *)a);
- }
-# else
- static __forceinline ssef broadcast(const void *const a)
- {
- return _mm_set1_ps(*(float *)a);
- }
-# endif
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline const float &operator[](const size_t i) const
- {
- assert(i < 4);
- return f[i];
- }
- __forceinline float &operator[](const size_t i)
- {
- assert(i < 4);
- return f[i];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef cast(const __m128i &a)
-{
- return _mm_castsi128_ps(a);
-}
-__forceinline const ssef operator+(const ssef &a)
-{
- return a;
-}
-__forceinline const ssef operator-(const ssef &a)
-{
- return _mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-}
-__forceinline const ssef abs(const ssef &a)
-{
- return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
-}
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssef sign(const ssef &a)
-{
- return _mm_blendv_ps(ssef(1.0f), -ssef(1.0f), _mm_cmplt_ps(a, ssef(0.0f)));
-}
-# endif
-__forceinline const ssef signmsk(const ssef &a)
-{
- return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-}
-
-__forceinline const ssef rcp(const ssef &a)
-{
- const ssef r = _mm_rcp_ps(a.m128);
- return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-}
-__forceinline const ssef sqr(const ssef &a)
-{
- return _mm_mul_ps(a, a);
-}
-__forceinline const ssef mm_sqrt(const ssef &a)
-{
- return _mm_sqrt_ps(a.m128);
-}
-__forceinline const ssef rsqrt(const ssef &a)
-{
- const ssef r = _mm_rsqrt_ps(a.m128);
- return _mm_add_ps(
- _mm_mul_ps(_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f), r),
- _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set_ps(-0.5f, -0.5f, -0.5f, -0.5f)), r),
- _mm_mul_ps(r, r)));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef operator+(const ssef &a, const ssef &b)
-{
- return _mm_add_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator+(const ssef &a, const float &b)
-{
- return a + ssef(b);
-}
-__forceinline const ssef operator+(const float &a, const ssef &b)
-{
- return ssef(a) + b;
-}
-
-__forceinline const ssef operator-(const ssef &a, const ssef &b)
-{
- return _mm_sub_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator-(const ssef &a, const float &b)
-{
- return a - ssef(b);
-}
-__forceinline const ssef operator-(const float &a, const ssef &b)
-{
- return ssef(a) - b;
-}
-
-__forceinline const ssef operator*(const ssef &a, const ssef &b)
-{
- return _mm_mul_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator*(const ssef &a, const float &b)
-{
- return a * ssef(b);
-}
-__forceinline const ssef operator*(const float &a, const ssef &b)
-{
- return ssef(a) * b;
-}
-
-__forceinline const ssef operator/(const ssef &a, const ssef &b)
-{
- return _mm_div_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator/(const ssef &a, const float &b)
-{
- return a / ssef(b);
-}
-__forceinline const ssef operator/(const float &a, const ssef &b)
-{
- return ssef(a) / b;
-}
-
-__forceinline const ssef operator^(const ssef &a, const ssef &b)
-{
- return _mm_xor_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator^(const ssef &a, const ssei &b)
-{
- return _mm_xor_ps(a.m128, _mm_castsi128_ps(b.m128));
-}
-
-__forceinline const ssef operator&(const ssef &a, const ssef &b)
-{
- return _mm_and_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator&(const ssef &a, const ssei &b)
-{
- return _mm_and_ps(a.m128, _mm_castsi128_ps(b.m128));
-}
-
-__forceinline const ssef operator|(const ssef &a, const ssef &b)
-{
- return _mm_or_ps(a.m128, b.m128);
-}
-__forceinline const ssef operator|(const ssef &a, const ssei &b)
-{
- return _mm_or_ps(a.m128, _mm_castsi128_ps(b.m128));
-}
-
-__forceinline const ssef andnot(const ssef &a, const ssef &b)
-{
- return _mm_andnot_ps(a.m128, b.m128);
-}
-
-__forceinline const ssef min(const ssef &a, const ssef &b)
-{
- return _mm_min_ps(a.m128, b.m128);
-}
-__forceinline const ssef min(const ssef &a, const float &b)
-{
- return _mm_min_ps(a.m128, ssef(b));
-}
-__forceinline const ssef min(const float &a, const ssef &b)
-{
- return _mm_min_ps(ssef(a), b.m128);
-}
-
-__forceinline const ssef max(const ssef &a, const ssef &b)
-{
- return _mm_max_ps(a.m128, b.m128);
-}
-__forceinline const ssef max(const ssef &a, const float &b)
-{
- return _mm_max_ps(a.m128, ssef(b));
-}
-__forceinline const ssef max(const float &a, const ssef &b)
-{
- return _mm_max_ps(ssef(a), b.m128);
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline ssef mini(const ssef &a, const ssef &b)
-{
- const ssei ai = _mm_castps_si128(a);
- const ssei bi = _mm_castps_si128(b);
- const ssei ci = _mm_min_epi32(ai, bi);
- return _mm_castsi128_ps(ci);
-}
-# endif
-
-# if defined(__KERNEL_SSE41__)
-__forceinline ssef maxi(const ssef &a, const ssef &b)
-{
- const ssei ai = _mm_castps_si128(a);
- const ssei bi = _mm_castps_si128(b);
- const ssei ci = _mm_max_epi32(ai, bi);
- return _mm_castsi128_ps(ci);
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Ternary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmaq_f32(c, a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fmadd_ps(a, b, c);
-# else
- return a * b + c;
-# endif
-}
-__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmaq_f32(vnegq_f32(c), a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fmsub_ps(a, b, c);
-# else
- return a * b - c;
-# endif
-}
-__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmsq_f32(c, a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fnmadd_ps(a, b, c);
-# else
- return c - a * b;
-# endif
-}
-__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
-{
-# if defined(__KERNEL_NEON__)
- return vfmsq_f32(vnegq_f32(c), a, b);
-# elif defined(__KERNEL_AVX2__)
- return _mm_fnmsub_ps(a, b, c);
-# else
- return -a * b - c;
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef &operator+=(ssef &a, const ssef &b)
-{
- return a = a + b;
-}
-__forceinline ssef &operator+=(ssef &a, const float &b)
-{
- return a = a + b;
-}
-
-__forceinline ssef &operator-=(ssef &a, const ssef &b)
-{
- return a = a - b;
-}
-__forceinline ssef &operator-=(ssef &a, const float &b)
-{
- return a = a - b;
-}
-
-__forceinline ssef &operator*=(ssef &a, const ssef &b)
-{
- return a = a * b;
-}
-__forceinline ssef &operator*=(ssef &a, const float &b)
-{
- return a = a * b;
-}
-
-__forceinline ssef &operator/=(ssef &a, const ssef &b)
-{
- return a = a / b;
-}
-__forceinline ssef &operator/=(ssef &a, const float &b)
-{
- return a = a / b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator==(const ssef &a, const ssef &b)
-{
- return _mm_cmpeq_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator==(const ssef &a, const float &b)
-{
- return a == ssef(b);
-}
-__forceinline const sseb operator==(const float &a, const ssef &b)
-{
- return ssef(a) == b;
-}
-
-__forceinline const sseb operator!=(const ssef &a, const ssef &b)
-{
- return _mm_cmpneq_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator!=(const ssef &a, const float &b)
-{
- return a != ssef(b);
-}
-__forceinline const sseb operator!=(const float &a, const ssef &b)
-{
- return ssef(a) != b;
-}
-
-__forceinline const sseb operator<(const ssef &a, const ssef &b)
-{
- return _mm_cmplt_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator<(const ssef &a, const float &b)
-{
- return a < ssef(b);
-}
-__forceinline const sseb operator<(const float &a, const ssef &b)
-{
- return ssef(a) < b;
-}
-
-__forceinline const sseb operator>=(const ssef &a, const ssef &b)
-{
- return _mm_cmpnlt_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator>=(const ssef &a, const float &b)
-{
- return a >= ssef(b);
-}
-__forceinline const sseb operator>=(const float &a, const ssef &b)
-{
- return ssef(a) >= b;
-}
-
-__forceinline const sseb operator>(const ssef &a, const ssef &b)
-{
- return _mm_cmpnle_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator>(const ssef &a, const float &b)
-{
- return a > ssef(b);
-}
-__forceinline const sseb operator>(const float &a, const ssef &b)
-{
- return ssef(a) > b;
-}
-
-__forceinline const sseb operator<=(const ssef &a, const ssef &b)
-{
- return _mm_cmple_ps(a.m128, b.m128);
-}
-__forceinline const sseb operator<=(const ssef &a, const float &b)
-{
- return a <= ssef(b);
-}
-__forceinline const sseb operator<=(const float &a, const ssef &b)
-{
- return ssef(a) <= b;
-}
-
-__forceinline const ssef select(const sseb &m, const ssef &t, const ssef &f)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_blendv_ps(f, t, m);
-# else
- return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
-# endif
-}
-
-__forceinline const ssef select(const ssef &m, const ssef &t, const ssef &f)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_blendv_ps(f, t, m);
-# else
- return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
-# endif
-}
-
-__forceinline const ssef select(const int mask, const ssef &t, const ssef &f)
-{
-# if defined(__KERNEL_SSE41__) && \
- ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
- return _mm_blend_ps(f, t, mask);
-# else
- return select(sseb(mask), t, f);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Rounding Functions
-////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssef round_even(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndnq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
-# endif
-}
-__forceinline const ssef round_down(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndmq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
-# endif
-}
-__forceinline const ssef round_up(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndpq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
-# endif
-}
-__forceinline const ssef round_zero(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
-# endif
-}
-__forceinline const ssef floor(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndmq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
-# endif
-}
-__forceinline const ssef ceil(const ssef &a)
-{
-# ifdef __KERNEL_NEON__
- return vrndpq_f32(a);
-# else
- return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
-# endif
-}
-# else
-/* Non-SSE4.1 fallback, needed for floorfrac. */
-__forceinline const ssef floor(const ssef &a)
-{
- return _mm_set_ps(floorf(a.f[3]), floorf(a.f[2]), floorf(a.f[1]), floorf(a.f[0]));
-}
-# endif
-
-__forceinline ssei truncatei(const ssef &a)
-{
- return _mm_cvttps_epi32(a.m128);
-}
-
-__forceinline ssef floorfrac(const ssef &x, ssei *i)
-{
- ssef f = floor(x);
- *i = truncatei(f);
- return x - f;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Common Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef mix(const ssef &a, const ssef &b, const ssef &t)
-{
- return madd(t, b, (ssef(1.0f) - t) * a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef unpacklo(const ssef &a, const ssef &b)
-{
- return _mm_unpacklo_ps(a.m128, b.m128);
-}
-__forceinline ssef unpackhi(const ssef &a, const ssef &b)
-{
- return _mm_unpackhi_ps(a.m128, b.m128);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssef shuffle(const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128);
-# else
- return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-# endif
-}
-
-template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a)
-{
- return _mm_movelh_ps(a, a);
-}
-
-template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a)
-{
- return _mm_movehl_ps(a, a);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssef shuffle(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b);
-# else
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-# endif
-}
-
-template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- return shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b);
-# else
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
-# endif
-}
-
-# ifndef __KERNEL_NEON__
-template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b)
-{
- return _mm_movelh_ps(a, b);
-}
-
-template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const ssef &b)
-{
- return _mm_movehl_ps(b, a);
-}
-# endif
-
-# if defined(__KERNEL_SSSE3__)
-__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf)
-{
- return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf));
-}
-# endif
-
-# if defined(__KERNEL_SSE3__)
-template<> __forceinline const ssef shuffle<0, 0, 2, 2>(const ssef &b)
-{
- return _mm_moveldup_ps(b);
-}
-template<> __forceinline const ssef shuffle<1, 1, 3, 3>(const ssef &b)
-{
- return _mm_movehdup_ps(b);
-}
-# endif
-
-template<size_t i0> __forceinline const ssef shuffle(const ssef &b)
-{
- return shuffle<i0, i0, i0, i0>(b);
-}
-
-# if defined(__KERNEL_AVX__)
-__forceinline const ssef shuffle(const ssef &a, const ssei &shuf)
-{
- return _mm_permutevar_ps(a, shuf);
-}
-# endif
-
-template<size_t i> __forceinline float extract(const ssef &a)
-{
- return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
-}
-template<> __forceinline float extract<0>(const ssef &a)
-{
- return _mm_cvtss_f32(a);
-}
-
-# if defined(__KERNEL_SSE41__)
-template<size_t dst, size_t src, size_t clr>
-__forceinline const ssef insert(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_NEON__
- ssef res = a;
- if (clr)
- res[dst] = 0;
- else
- res[dst] = b[src];
- return res;
-# else
- return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
-# endif
-}
-template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b)
-{
- return insert<dst, src, 0>(a, b);
-}
-template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b)
-{
- return insert<dst, 0>(a, _mm_set_ss(b));
-}
-# else
-template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b)
-{
- ssef c = a;
- c[dst] = b;
- return c;
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Transpose
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline void transpose(const ssef &r0,
- const ssef &r1,
- const ssef &r2,
- const ssef &r3,
- ssef &c0,
- ssef &c1,
- ssef &c2,
- ssef &c3)
-{
- ssef l02 = unpacklo(r0, r2);
- ssef h02 = unpackhi(r0, r2);
- ssef l13 = unpacklo(r1, r3);
- ssef h13 = unpackhi(r1, r3);
- c0 = unpacklo(l02, l13);
- c1 = unpackhi(l02, l13);
- c2 = unpacklo(h02, h13);
- c3 = unpackhi(h02, h13);
-}
-
-__forceinline void transpose(
- const ssef &r0, const ssef &r1, const ssef &r2, const ssef &r3, ssef &c0, ssef &c1, ssef &c2)
-{
- ssef l02 = unpacklo(r0, r2);
- ssef h02 = unpackhi(r0, r2);
- ssef l13 = unpacklo(r1, r3);
- ssef h13 = unpackhi(r1, r3);
- c0 = unpacklo(l02, l13);
- c1 = unpackhi(l02, l13);
- c2 = unpacklo(h02, h13);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssef vreduce_min(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vdupq_n_f32(vminvq_f32(v));
-# else
- ssef h = min(shuffle<1, 0, 3, 2>(v), v);
- return min(shuffle<2, 3, 0, 1>(h), h);
-# endif
-}
-__forceinline const ssef vreduce_max(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vdupq_n_f32(vmaxvq_f32(v));
-# else
- ssef h = max(shuffle<1, 0, 3, 2>(v), v);
- return max(shuffle<2, 3, 0, 1>(h), h);
-# endif
-}
-__forceinline const ssef vreduce_add(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vdupq_n_f32(vaddvq_f32(v));
-# else
- ssef h = shuffle<1, 0, 3, 2>(v) + v;
- return shuffle<2, 3, 0, 1>(h) + h;
-# endif
-}
-
-__forceinline float reduce_min(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vminvq_f32(v);
-# else
- return _mm_cvtss_f32(vreduce_min(v));
-# endif
-}
-__forceinline float reduce_max(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vmaxvq_f32(v);
-# else
- return _mm_cvtss_f32(vreduce_max(v));
-# endif
-}
-__forceinline float reduce_add(const ssef &v)
-{
-# ifdef __KERNEL_NEON__
- return vaddvq_f32(v);
-# else
- return _mm_cvtss_f32(vreduce_add(v));
-# endif
-}
-
-__forceinline uint32_t select_min(const ssef &v)
-{
- return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const ssef &v)
-{
- return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const sseb &valid, const ssef &v)
-{
- const ssef a = select(valid, v, ssef(pos_inf));
- return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const sseb &valid, const ssef &v)
-{
- const ssef a = select(valid, v, ssef(neg_inf));
- return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-__forceinline uint32_t movemask(const ssef &a)
-{
- return _mm_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Memory load and store operations
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssef load4f(const float4 &a)
-{
-# ifdef __KERNEL_WITH_SSE_ALIGN__
- return _mm_load_ps(&a.x);
-# else
- return _mm_loadu_ps(&a.x);
-# endif
-}
-
-__forceinline ssef load4f(const float3 &a)
-{
-# ifdef __KERNEL_WITH_SSE_ALIGN__
- return _mm_load_ps(&a.x);
-# else
- return _mm_loadu_ps(&a.x);
-# endif
-}
-
-__forceinline ssef load4f(const void *const a)
-{
- return _mm_load_ps((float *)a);
-}
-
-__forceinline ssef load1f_first(const float a)
-{
- return _mm_set_ss(a);
-}
-
-__forceinline void store4f(void *ptr, const ssef &v)
-{
- _mm_store_ps((float *)ptr, v);
-}
-
-__forceinline ssef loadu4f(const void *const a)
-{
- return _mm_loadu_ps((float *)a);
-}
-
-__forceinline void storeu4f(void *ptr, const ssef &v)
-{
- _mm_storeu_ps((float *)ptr, v);
-}
-
-__forceinline void store4f(const sseb &mask, void *ptr, const ssef &f)
-{
-# if defined(__KERNEL_AVX__)
- _mm_maskstore_ps((float *)ptr, (__m128i)mask, f);
-# else
- *(ssef *)ptr = select(mask, f, *(ssef *)ptr);
-# endif
-}
-
-__forceinline ssef load4f_nt(void *ptr)
-{
-# if defined(__KERNEL_SSE41__)
- return _mm_castsi128_ps(_mm_stream_load_si128((__m128i *)ptr));
-# else
- return _mm_load_ps((float *)ptr);
-# endif
-}
-
-__forceinline void store4f_nt(void *ptr, const ssef &v)
-{
-# if defined(__KERNEL_SSE41__)
- _mm_stream_ps((float *)ptr, v);
-# else
- _mm_store_ps((float *)ptr, v);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Euclidean Space Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline float dot(const ssef &a, const ssef &b)
-{
- return reduce_add(a * b);
-}
-
-/* calculate shuffled cross product, useful when order of components does not matter */
-__forceinline ssef cross_zxy(const ssef &a, const ssef &b)
-{
- const ssef a0 = a;
- const ssef b0 = shuffle<1, 2, 0, 3>(b);
- const ssef a1 = shuffle<1, 2, 0, 3>(a);
- const ssef b1 = b;
- return msub(a0, b0, a1 * b1);
-}
-
-__forceinline ssef cross(const ssef &a, const ssef &b)
-{
- return shuffle<1, 2, 0, 3>(cross_zxy(a, b));
-}
-
-ccl_device_inline const ssef dot3_splat(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_dp_ps(a.m128, b.m128, 0x7f);
-# else
- ssef t = a * b;
- return ssef(((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2]);
-# endif
-}
-
-/* squared length taking only specified axes into account */
-template<size_t X, size_t Y, size_t Z, size_t W> ccl_device_inline float len_squared(const ssef &a)
-{
-# ifndef __KERNEL_SSE41__
- float4 &t = (float4 &)a;
- return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) +
- (W ? t.w * t.w : 0.0f);
-# else
- return extract<0>(
- ssef(_mm_dp_ps(a.m128, a.m128, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf)));
-# endif
-}
-
-ccl_device_inline float dot3(const ssef &a, const ssef &b)
-{
-# ifdef __KERNEL_SSE41__
- return extract<0>(ssef(_mm_dp_ps(a.m128, b.m128, 0x7f)));
-# else
- ssef t = a * b;
- return ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
-# endif
-}
-
-ccl_device_inline const ssef len3_squared_splat(const ssef &a)
-{
- return dot3_splat(a, a);
-}
-
-ccl_device_inline float len3_squared(const ssef &a)
-{
- return dot3(a, a);
-}
-
-ccl_device_inline float len3(const ssef &a)
-{
- return extract<0>(mm_sqrt(dot3_splat(a, a)));
-}
-
-/* SSE shuffle utility functions */
-
-# ifdef __KERNEL_SSSE3__
-
-/* faster version for SSSE3 */
-typedef ssei shuffle_swap_t;
-
-ccl_device_inline shuffle_swap_t shuffle_swap_identity()
-{
- return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-}
-
-ccl_device_inline shuffle_swap_t shuffle_swap_swap()
-{
- return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
-}
-
-ccl_device_inline const ssef shuffle_swap(const ssef &a, const shuffle_swap_t &shuf)
-{
- return cast(_mm_shuffle_epi8(cast(a), shuf));
-}
-
-# else
-
-/* somewhat slower version for SSE2 */
-typedef int shuffle_swap_t;
-
-ccl_device_inline shuffle_swap_t shuffle_swap_identity()
-{
- return 0;
-}
-
-ccl_device_inline shuffle_swap_t shuffle_swap_swap()
-{
- return 1;
-}
-
-ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf)
-{
- /* shuffle value must be a constant, so we need to branch */
- if (shuf)
- return shuffle<1, 0, 3, 2>(a);
- else
- return shuffle<3, 2, 1, 0>(a);
-}
-
-# endif
-
-# if defined(__KERNEL_SSE41__) && !defined(__KERNEL_NEON__)
-
-ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
- const shuffle_swap_t &shuf_identity,
- const shuffle_swap_t &shuf_swap,
- const float3 &idir,
- ssef idirsplat[3],
- shuffle_swap_t shufflexyz[3])
-{
- const __m128 idirsplat_raw[] = {_mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z)};
- idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn);
- idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn);
- idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn);
-
- const ssef signmask = cast(ssei(0x80000000));
- const ssef shuf_identity_f = cast(shuf_identity);
- const ssef shuf_swap_f = cast(shuf_swap);
-
- shufflexyz[0] = _mm_castps_si128(
- _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask)));
- shufflexyz[1] = _mm_castps_si128(
- _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask)));
- shufflexyz[2] = _mm_castps_si128(
- _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask)));
-}
-
-# else
-
-ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
- const shuffle_swap_t &shuf_identity,
- const shuffle_swap_t &shuf_swap,
- const float3 &idir,
- ssef idirsplat[3],
- shuffle_swap_t shufflexyz[3])
-{
- idirsplat[0] = ssef(idir.x) ^ pn;
- idirsplat[1] = ssef(idir.y) ^ pn;
- idirsplat[2] = ssef(idir.z) ^ pn;
-
- shufflexyz[0] = (idir.x >= 0) ? shuf_identity : shuf_swap;
- shufflexyz[1] = (idir.y >= 0) ? shuf_identity : shuf_swap;
- shufflexyz[2] = (idir.z >= 0) ? shuf_identity : shuf_swap;
-}
-
-# endif
-
-ccl_device_inline const ssef uint32_to_float(const ssei &in)
-{
- ssei a = _mm_srli_epi32(in, 16);
- ssei b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff));
- ssei c = _mm_or_si128(a, _mm_set1_epi32(0x53000000));
- ssef d = _mm_cvtepi32_ps(b);
- ssef e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000)));
- return _mm_add_ps(e, d);
-}
-
-template<size_t S1, size_t S2, size_t S3, size_t S4>
-ccl_device_inline const ssef set_sign_bit(const ssef &a)
-{
- return cast(cast(a) ^ ssei(S1 << 31, S2 << 31, S3 << 31, S4 << 31));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_ssef(const char *label, const ssef &a)
-{
- printf(
- "%s: %.8f %.8f %.8f %.8f\n", label, (double)a[0], (double)a[1], (double)a[2], (double)a[3]);
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
-#endif
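Besides the arithmetic operators, the removed ssef carried horizontal reductions built from lane shuffles (vreduce_add/reduce_add and the min/max variants above). A standalone sketch of the same two-shuffle horizontal add, written with plain SSE2 intrinsics and a hypothetical helper name rather than the Cycles float4 API:

#include <emmintrin.h>

static inline float reduce_add4(const __m128 v)
{
  /* first shuffle swaps lanes within each pair: (x, y, z, w) -> (y, x, w, z) */
  __m128 h = _mm_add_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)), v);
  /* second shuffle swaps the two halves, so every lane ends up holding x+y+z+w */
  h = _mm_add_ps(_mm_shuffle_ps(h, h, _MM_SHUFFLE(1, 0, 3, 2)), h);
  return _mm_cvtss_f32(h); /* extract lane 0 */
}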
diff --git a/intern/cycles/util/ssei.h b/intern/cycles/util/ssei.h
deleted file mode 100644
index 5caf44c967f..00000000000
--- a/intern/cycles/util/ssei.h
+++ /dev/null
@@ -1,633 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_SSEI_H__
-#define __UTIL_SSEI_H__
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __KERNEL_SSE2__
-
-struct sseb;
-struct ssef;
-
-/*! 4-wide SSE integer type. */
-struct ssei {
- typedef sseb Mask; // mask type
- typedef ssei Int; // int type
- typedef ssef Float; // float type
-
- enum { size = 4 }; // number of SIMD elements
- union {
- __m128i m128;
- int32_t i[4];
- }; // data
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Constructors, Assignment & Cast Operators
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline ssei()
- {
- }
- __forceinline ssei(const ssei &a)
- {
- m128 = a.m128;
- }
- __forceinline ssei &operator=(const ssei &a)
- {
- m128 = a.m128;
- return *this;
- }
-
- __forceinline ssei(const __m128i a) : m128(a)
- {
- }
- __forceinline operator const __m128i &(void) const
- {
- return m128;
- }
- __forceinline operator __m128i &(void)
- {
- return m128;
- }
-
- __forceinline ssei(const int a) : m128(_mm_set1_epi32(a))
- {
- }
- __forceinline ssei(int a, int b, int c, int d) : m128(_mm_setr_epi32(a, b, c, d))
- {
- }
-
- __forceinline explicit ssei(const __m128 a) : m128(_mm_cvtps_epi32(a))
- {
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- /// Array Access
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline const int32_t &operator[](const size_t index) const
- {
- assert(index < 4);
- return i[index];
- }
- __forceinline int32_t &operator[](const size_t index)
- {
- assert(index < 4);
- return i[index];
- }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssei cast(const __m128 &a)
-{
- return _mm_castps_si128(a);
-}
-__forceinline const ssei operator+(const ssei &a)
-{
- return a;
-}
-__forceinline const ssei operator-(const ssei &a)
-{
- return _mm_sub_epi32(_mm_setzero_si128(), a.m128);
-}
-# if defined(__KERNEL_SSSE3__)
-__forceinline const ssei abs(const ssei &a)
-{
- return _mm_abs_epi32(a.m128);
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const ssei operator+(const ssei &a, const ssei &b)
-{
- return _mm_add_epi32(a.m128, b.m128);
-}
-__forceinline const ssei operator+(const ssei &a, const int32_t &b)
-{
- return a + ssei(b);
-}
-__forceinline const ssei operator+(const int32_t &a, const ssei &b)
-{
- return ssei(a) + b;
-}
-
-__forceinline const ssei operator-(const ssei &a, const ssei &b)
-{
- return _mm_sub_epi32(a.m128, b.m128);
-}
-__forceinline const ssei operator-(const ssei &a, const int32_t &b)
-{
- return a - ssei(b);
-}
-__forceinline const ssei operator-(const int32_t &a, const ssei &b)
-{
- return ssei(a) - b;
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssei operator*(const ssei &a, const ssei &b)
-{
- return _mm_mullo_epi32(a.m128, b.m128);
-}
-__forceinline const ssei operator*(const ssei &a, const int32_t &b)
-{
- return a * ssei(b);
-}
-__forceinline const ssei operator*(const int32_t &a, const ssei &b)
-{
- return ssei(a) * b;
-}
-# endif
-
-__forceinline const ssei operator&(const ssei &a, const ssei &b)
-{
- return _mm_and_si128(a.m128, b.m128);
-}
-__forceinline const ssei operator&(const ssei &a, const int32_t &b)
-{
- return a & ssei(b);
-}
-__forceinline const ssei operator&(const int32_t &a, const ssei &b)
-{
- return ssei(a) & b;
-}
-
-__forceinline const ssei operator|(const ssei &a, const ssei &b)
-{
- return _mm_or_si128(a.m128, b.m128);
-}
-__forceinline const ssei operator|(const ssei &a, const int32_t &b)
-{
- return a | ssei(b);
-}
-__forceinline const ssei operator|(const int32_t &a, const ssei &b)
-{
- return ssei(a) | b;
-}
-
-__forceinline const ssei operator^(const ssei &a, const ssei &b)
-{
- return _mm_xor_si128(a.m128, b.m128);
-}
-__forceinline const ssei operator^(const ssei &a, const int32_t &b)
-{
- return a ^ ssei(b);
-}
-__forceinline const ssei operator^(const int32_t &a, const ssei &b)
-{
- return ssei(a) ^ b;
-}
-
-__forceinline const ssei operator<<(const ssei &a, const int32_t &n)
-{
- return _mm_slli_epi32(a.m128, n);
-}
-__forceinline const ssei operator>>(const ssei &a, const int32_t &n)
-{
- return _mm_srai_epi32(a.m128, n);
-}
-
-__forceinline const ssei andnot(const ssei &a, const ssei &b)
-{
- return _mm_andnot_si128(a.m128, b.m128);
-}
-__forceinline const ssei andnot(const sseb &a, const ssei &b)
-{
- return _mm_andnot_si128(cast(a.m128), b.m128);
-}
-__forceinline const ssei andnot(const ssei &a, const sseb &b)
-{
- return _mm_andnot_si128(a.m128, cast(b.m128));
-}
-
-__forceinline const ssei sra(const ssei &a, const int32_t &b)
-{
- return _mm_srai_epi32(a.m128, b);
-}
-__forceinline const ssei srl(const ssei &a, const int32_t &b)
-{
- return _mm_srli_epi32(a.m128, b);
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssei min(const ssei &a, const ssei &b)
-{
- return _mm_min_epi32(a.m128, b.m128);
-}
-__forceinline const ssei min(const ssei &a, const int32_t &b)
-{
- return min(a, ssei(b));
-}
-__forceinline const ssei min(const int32_t &a, const ssei &b)
-{
- return min(ssei(a), b);
-}
-
-__forceinline const ssei max(const ssei &a, const ssei &b)
-{
- return _mm_max_epi32(a.m128, b.m128);
-}
-__forceinline const ssei max(const ssei &a, const int32_t &b)
-{
- return max(a, ssei(b));
-}
-__forceinline const ssei max(const int32_t &a, const ssei &b)
-{
- return max(ssei(a), b);
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssei &operator+=(ssei &a, const ssei &b)
-{
- return a = a + b;
-}
-__forceinline ssei &operator+=(ssei &a, const int32_t &b)
-{
- return a = a + b;
-}
-
-__forceinline ssei &operator-=(ssei &a, const ssei &b)
-{
- return a = a - b;
-}
-__forceinline ssei &operator-=(ssei &a, const int32_t &b)
-{
- return a = a - b;
-}
-
-# if defined(__KERNEL_SSE41__)
-__forceinline ssei &operator*=(ssei &a, const ssei &b)
-{
- return a = a * b;
-}
-__forceinline ssei &operator*=(ssei &a, const int32_t &b)
-{
- return a = a * b;
-}
-# endif
-
-__forceinline ssei &operator&=(ssei &a, const ssei &b)
-{
- return a = a & b;
-}
-__forceinline ssei &operator&=(ssei &a, const int32_t &b)
-{
- return a = a & b;
-}
-
-__forceinline ssei &operator|=(ssei &a, const ssei &b)
-{
- return a = a | b;
-}
-__forceinline ssei &operator|=(ssei &a, const int32_t &b)
-{
- return a = a | b;
-}
-
-__forceinline ssei &operator^=(ssei &a, const ssei &b)
-{
- return a = a ^ b;
-}
-__forceinline ssei &operator^=(ssei &a, const int32_t &b)
-{
- return a = a ^ b;
-}
-
-__forceinline ssei &operator<<=(ssei &a, const int32_t &b)
-{
- return a = a << b;
-}
-__forceinline ssei &operator>>=(ssei &a, const int32_t &b)
-{
- return a = a >> b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const sseb operator==(const ssei &a, const ssei &b)
-{
- return _mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128));
-}
-__forceinline const sseb operator==(const ssei &a, const int32_t &b)
-{
- return a == ssei(b);
-}
-__forceinline const sseb operator==(const int32_t &a, const ssei &b)
-{
- return ssei(a) == b;
-}
-
-__forceinline const sseb operator!=(const ssei &a, const ssei &b)
-{
- return !(a == b);
-}
-__forceinline const sseb operator!=(const ssei &a, const int32_t &b)
-{
- return a != ssei(b);
-}
-__forceinline const sseb operator!=(const int32_t &a, const ssei &b)
-{
- return ssei(a) != b;
-}
-
-__forceinline const sseb operator<(const ssei &a, const ssei &b)
-{
- return _mm_castsi128_ps(_mm_cmplt_epi32(a.m128, b.m128));
-}
-__forceinline const sseb operator<(const ssei &a, const int32_t &b)
-{
- return a < ssei(b);
-}
-__forceinline const sseb operator<(const int32_t &a, const ssei &b)
-{
- return ssei(a) < b;
-}
-
-__forceinline const sseb operator>=(const ssei &a, const ssei &b)
-{
- return !(a < b);
-}
-__forceinline const sseb operator>=(const ssei &a, const int32_t &b)
-{
- return a >= ssei(b);
-}
-__forceinline const sseb operator>=(const int32_t &a, const ssei &b)
-{
- return ssei(a) >= b;
-}
-
-__forceinline const sseb operator>(const ssei &a, const ssei &b)
-{
- return _mm_castsi128_ps(_mm_cmpgt_epi32(a.m128, b.m128));
-}
-__forceinline const sseb operator>(const ssei &a, const int32_t &b)
-{
- return a > ssei(b);
-}
-__forceinline const sseb operator>(const int32_t &a, const ssei &b)
-{
- return ssei(a) > b;
-}
-
-__forceinline const sseb operator<=(const ssei &a, const ssei &b)
-{
- return !(a > b);
-}
-__forceinline const sseb operator<=(const ssei &a, const int32_t &b)
-{
- return a <= ssei(b);
-}
-__forceinline const sseb operator<=(const int32_t &a, const ssei &b)
-{
- return ssei(a) <= b;
-}
-
-__forceinline const ssei select(const sseb &m, const ssei &t, const ssei &f)
-{
-# ifdef __KERNEL_SSE41__
- return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
-# else
- return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f));
-# endif
-}
-
-__forceinline const ssei select(const int mask, const ssei &t, const ssei &f)
-{
-# if defined(__KERNEL_SSE41__) && \
- ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
- return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
-# else
- return select(sseb(mask), t, f);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssei unpacklo(const ssei &a, const ssei &b)
-{
- return _mm_unpacklo_epi32(a, b);
-}
-__forceinline ssei unpackhi(const ssei &a, const ssei &b)
-{
- return _mm_unpackhi_epi32(a, b);
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssei shuffle(const ssei &a)
-{
-# ifdef __KERNEL_NEON__
- int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a));
- return vreinterpretq_m128i_s32(result);
-# else
- return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
-# endif
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const ssei shuffle(const ssei &a, const ssei &b)
-{
-# ifdef __KERNEL_NEON__
- int32x4_t result = shuffle_neon<int32x4_t, i0, i1, i2, i3>(vreinterpretq_s32_m128i(a),
- vreinterpretq_s32_m128i(b));
- return vreinterpretq_m128i_s32(result);
-# else
- return _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-# endif
-}
-
-template<size_t i0> __forceinline const ssei shuffle(const ssei &b)
-{
- return shuffle<i0, i0, i0, i0>(b);
-}
-
-# if defined(__KERNEL_SSE41__)
-template<size_t src> __forceinline int extract(const ssei &b)
-{
- return _mm_extract_epi32(b, src);
-}
-template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b)
-{
- return _mm_insert_epi32(a, b, dst);
-}
-# else
-template<size_t src> __forceinline int extract(const ssei &b)
-{
- return b[src];
-}
-template<size_t dst> __forceinline const ssei insert(const ssei &a, const int32_t b)
-{
- ssei c = a;
- c[dst] = b;
- return c;
-}
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-# if defined(__KERNEL_SSE41__)
-__forceinline const ssei vreduce_min(const ssei &v)
-{
- ssei h = min(shuffle<1, 0, 3, 2>(v), v);
- return min(shuffle<2, 3, 0, 1>(h), h);
-}
-__forceinline const ssei vreduce_max(const ssei &v)
-{
- ssei h = max(shuffle<1, 0, 3, 2>(v), v);
- return max(shuffle<2, 3, 0, 1>(h), h);
-}
-__forceinline const ssei vreduce_add(const ssei &v)
-{
- ssei h = shuffle<1, 0, 3, 2>(v) + v;
- return shuffle<2, 3, 0, 1>(h) + h;
-}
-
-__forceinline int reduce_min(const ssei &v)
-{
-# ifdef __KERNEL_NEON__
- return vminvq_s32(vreinterpretq_s32_m128i(v));
-# else
- return extract<0>(vreduce_min(v));
-# endif
-}
-__forceinline int reduce_max(const ssei &v)
-{
-# ifdef __KERNEL_NEON__
- return vmaxvq_s32(vreinterpretq_s32_m128i(v));
-# else
- return extract<0>(vreduce_max(v));
-# endif
-}
-__forceinline int reduce_add(const ssei &v)
-{
-# ifdef __KERNEL_NEON__
- return vaddvq_s32(vreinterpretq_s32_m128i(v));
-# else
- return extract<0>(vreduce_add(v));
-# endif
-}
-
-__forceinline uint32_t select_min(const ssei &v)
-{
- return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const ssei &v)
-{
- return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const sseb &valid, const ssei &v)
-{
- const ssei a = select(valid, v, ssei((int)pos_inf));
- return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const sseb &valid, const ssei &v)
-{
- const ssei a = select(valid, v, ssei((int)neg_inf));
- return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-# else
-
-__forceinline int ssei_min(int a, int b)
-{
- return (a < b) ? a : b;
-}
-__forceinline int ssei_max(int a, int b)
-{
- return (a > b) ? a : b;
-}
-__forceinline int reduce_min(const ssei &v)
-{
- return ssei_min(ssei_min(v[0], v[1]), ssei_min(v[2], v[3]));
-}
-__forceinline int reduce_max(const ssei &v)
-{
- return ssei_max(ssei_max(v[0], v[1]), ssei_max(v[2], v[3]));
-}
-__forceinline int reduce_add(const ssei &v)
-{
- return v[0] + v[1] + v[2] + v[3];
-}
-
-# endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Memory load and store operations
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline ssei load4i(const void *const a)
-{
- return _mm_load_si128((__m128i *)a);
-}
-
-__forceinline void store4i(void *ptr, const ssei &v)
-{
- _mm_store_si128((__m128i *)ptr, v);
-}
-
-__forceinline void storeu4i(void *ptr, const ssei &v)
-{
- _mm_storeu_si128((__m128i *)ptr, v);
-}
-
-__forceinline void store4i(const sseb &mask, void *ptr, const ssei &i)
-{
-# if defined(__KERNEL_AVX__)
- _mm_maskstore_ps((float *)ptr, (__m128i)mask, _mm_castsi128_ps(i));
-# else
- *(ssei *)ptr = select(mask, i, *(ssei *)ptr);
-# endif
-}
-
-__forceinline ssei load4i_nt(void *ptr)
-{
-# if defined(__KERNEL_SSE41__)
- return _mm_stream_load_si128((__m128i *)ptr);
-# else
- return _mm_load_si128((__m128i *)ptr);
-# endif
-}
-
-__forceinline void store4i_nt(void *ptr, const ssei &v)
-{
-# if defined(__KERNEL_SSE41__)
- _mm_stream_ps((float *)ptr, _mm_castsi128_ps(v));
-# else
- _mm_store_si128((__m128i *)ptr, v);
-# endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_ssei(const char *label, const ssei &a)
-{
- printf("%s: %df %df %df %d\n", label, a[0], a[1], a[2], a[3]);
-}
-
-#endif
-
-CCL_NAMESPACE_END
-
-#endif
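The removed ssei kept both shift flavours: operator>> and sra() map to the arithmetic _mm_srai_epi32, while srl() maps to the logical _mm_srli_epi32; the two only differ for negative lanes. A standalone sketch of that difference with plain SSE2 intrinsics, not the Cycles integer-vector API:

#include <emmintrin.h>
#include <cstdio>

int main()
{
  const __m128i v = _mm_setr_epi32(-8, 8, -1, 1);
  int sra[4], srl[4];
  _mm_storeu_si128((__m128i *)sra, _mm_srai_epi32(v, 1)); /* sign-extending */
  _mm_storeu_si128((__m128i *)srl, _mm_srli_epi32(v, 1)); /* zero-filling */
  printf("sra: %d %d %d %d\n", sra[0], sra[1], sra[2], sra[3]); /* -4 4 -1 0 */
  printf("srl: %d %d %d %d\n", srl[0], srl[1], srl[2], srl[3]);
  return 0;
}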
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index cb985c65dd8..84116262437 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -102,7 +102,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
return projection_identity();
}
- memcpy(&tfmR, R, sizeof(R));
+ memcpy(&tfmR.x[0], R, sizeof(R));
return tfmR;
}
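The memcpy in the hunk above now targets the first scalar of the destination instead of the struct itself; the copied 4x4 array still fills all four rows because they are laid out contiguously. A standalone sketch of the same pattern with hypothetical minimal types (not the Cycles ProjectionTransform):

#include <cstring>

struct Row4 {
  float v[4];
  float &operator[](int i) { return v[i]; }
};
struct Proj { Row4 x, y, z, w; }; /* four contiguous rows */

static inline void copy_matrix(Proj &dst, const float R[4][4])
{
  /* &dst.x[0] points at the first float of the first row; one memcpy of
   * 16 floats fills the whole matrix. */
  std::memcpy(&dst.x[0], R, sizeof(float) * 16);
}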
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 24184dc7074..d7f95b7f296 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -63,17 +63,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
{
/* TODO(sergey): Disabled for now, causes crashes in certain cases. */
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
- ssef x, y, z, w, aa;
- aa = a.m128;
+ const float4 aa(a.m128);
- x = _mm_loadu_ps(&t->x.x);
- y = _mm_loadu_ps(&t->y.x);
- z = _mm_loadu_ps(&t->z.x);
- w = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
+ float4 x(_mm_loadu_ps(&t->x.x));
+ float4 y(_mm_loadu_ps(&t->y.x));
+ float4 z(_mm_loadu_ps(&t->z.x));
+ float4 w(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f));
- _MM_TRANSPOSE4_PS(x, y, z, w);
+ _MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128);
- ssef tmp = w;
+ float4 tmp = w;
tmp = madd(shuffle<2>(aa), z, tmp);
tmp = madd(shuffle<1>(aa), y, tmp);
tmp = madd(shuffle<0>(aa), x, tmp);
@@ -94,16 +93,16 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
ccl_device_inline float3 transform_direction(ccl_private const Transform *t, const float3 a)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
- ssef x, y, z, w, aa;
- aa = a.m128;
- x = _mm_loadu_ps(&t->x.x);
- y = _mm_loadu_ps(&t->y.x);
- z = _mm_loadu_ps(&t->z.x);
- w = _mm_setzero_ps();
+ const float4 aa(a.m128);
- _MM_TRANSPOSE4_PS(x, y, z, w);
+ float4 x(_mm_loadu_ps(&t->x.x));
+ float4 y(_mm_loadu_ps(&t->y.x));
+ float4 z(_mm_loadu_ps(&t->z.x));
+ float4 w(_mm_setzero_ps());
- ssef tmp = shuffle<2>(aa) * z;
+ _MM_TRANSPOSE4_PS(x.m128, y.m128, z.m128, w.m128);
+
+ float4 tmp = shuffle<2>(aa) * z;
tmp = madd(shuffle<1>(aa), y, tmp);
tmp = madd(shuffle<0>(aa), x, tmp);
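After the switch from ssef to float4, transform_point() and transform_direction() keep the same SSE strategy: load the three transform rows, transpose them into columns together with a constant fourth row, then accumulate the input over the columns with madd(); the point variant picks up the translation column, the direction variant uses a zero fourth row and drops it. A scalar sketch of what that accumulation computes, using hypothetical minimal types rather than the Cycles float3/float4 API:

struct V4 { float x, y, z, w; };
struct V3 { float x, y, z; };

/* rows[0..2] are the x/y/z rows of the 3x4 affine transform; the w component
 * of each row holds the translation. */
static inline V3 transform_point_scalar(const V4 rows[3], const V3 a)
{
  V3 r;
  r.x = rows[0].x * a.x + rows[0].y * a.y + rows[0].z * a.z + rows[0].w;
  r.y = rows[1].x * a.x + rows[1].y * a.y + rows[1].z * a.z + rows[1].w;
  r.z = rows[2].x * a.x + rows[2].y * a.y + rows[2].z * a.z + rows[2].w;
  return r;
}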
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
index bb410a6daef..2faac576d82 100644
--- a/intern/cycles/util/transform_inverse.h
+++ b/intern/cycles/util/transform_inverse.h
@@ -9,26 +9,33 @@ CCL_NAMESPACE_BEGIN
* Normally we don't use SSE41/AVX outside the kernel, but for this it's
* important to match exactly for ray tracing precision. */
-ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const float3 b_)
{
#if defined(__AVX2__) && defined(__KERNEL_SSE2__)
- const ssef sse_a = (const __m128 &)a;
- const ssef sse_b = (const __m128 &)b;
- const ssef r = shuffle<1, 2, 0, 3>(
- ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+ const __m128 a = (const __m128 &)a_;
+ const __m128 b = (const __m128 &)b_;
+ const __m128 a_shuffle = _mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(3, 0, 2, 1)));
+ const __m128 b_shuffle = _mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(3, 0, 2, 1)));
+ const __m128 r = _mm_castsi128_ps(
+ _mm_shuffle_epi32(_mm_castps_si128(_mm_fmsub_ps(a, b_shuffle, _mm_mul_ps(a_shuffle, b))),
+ _MM_SHUFFLE(3, 0, 2, 1)));
return (const float3 &)r;
#endif
- return cross(a, b);
+ return cross(a_, b_);
}
-ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_)
{
-#ifdef __SSE4_1__
- return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+ const __m128 a = (const __m128 &)a_;
+ const __m128 b = (const __m128 &)b_;
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
#endif
- return dot(a, b);
+ return dot(a_, b_);
}
ccl_device_forceinline Transform transform_inverse_impl(const Transform tfm)
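The rewritten transform_inverse_cross() keeps the shuffle-based cross product so the result stays bit-identical to the kernel's ray-tracing path: with s(v) = (v.y, v.z, v.x), cross(a, b) = s(a * s(b) - s(a) * b), which is what the fmsub plus the three _MM_SHUFFLE(3, 0, 2, 1) permutes compute. A scalar check of that identity with hypothetical minimal types, not the Cycles API:

#include <cstdio>

struct V3 { float x, y, z; };

static inline V3 yzx(const V3 v) { return {v.y, v.z, v.x}; }

static inline V3 cross_via_shuffle(const V3 a, const V3 b)
{
  const V3 sa = yzx(a), sb = yzx(b);
  const V3 t = {a.x * sb.x - sa.x * b.x,  /* a.x*b.y - a.y*b.x */
                a.y * sb.y - sa.y * b.y,  /* a.y*b.z - a.z*b.y */
                a.z * sb.z - sa.z * b.z}; /* a.z*b.x - a.x*b.z */
  return yzx(t); /* rotate back into (x, y, z) order */
}

int main()
{
  const V3 c = cross_via_shuffle({1, 0, 0}, {0, 1, 0});
  printf("%g %g %g\n", c.x, c.y, c.z); /* expect 0 0 1 */
  return 0;
}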
diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h
index 1ab6f76f9bc..cf7f35c4116 100644
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -97,6 +97,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_int2.h"
#include "util/types_int3.h"
#include "util/types_int4.h"
+#include "util/types_int8.h"
#include "util/types_uint2.h"
#include "util/types_uint3.h"
@@ -119,6 +120,7 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_int2_impl.h"
#include "util/types_int3_impl.h"
#include "util/types_int4_impl.h"
+#include "util/types_int8_impl.h"
#include "util/types_uint2_impl.h"
#include "util/types_uint3_impl.h"
@@ -129,16 +131,4 @@ ccl_device_inline void print_float(ccl_private const char *label, const float a)
#include "util/types_float4_impl.h"
#include "util/types_float8_impl.h"
-/* SSE types. */
-#ifndef __KERNEL_GPU__
-# include "util/sseb.h"
-# include "util/ssef.h"
-# include "util/ssei.h"
-# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
-# include "util/avxb.h"
-# include "util/avxf.h"
-# include "util/avxi.h"
-# endif
-#endif
-
#endif /* __UTIL_TYPES_H__ */
diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h
index 29fd632f08e..121141ddfd9 100644
--- a/intern/cycles/util/types_float8.h
+++ b/intern/cycles/util/types_float8.h
@@ -11,15 +11,15 @@
CCL_NAMESPACE_BEGIN
/* float8 is a reserved type in Metal that has not been implemented. For
- * that reason this is named float8_t and not using native vector types. */
+ * that reason this is named vfloat8 and not using native vector types. */
#ifdef __KERNEL_GPU__
-struct float8_t
+struct vfloat8
#else
-struct ccl_try_align(32) float8_t
+struct ccl_try_align(32) vfloat8
#endif
{
-#ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX__
union {
__m256 m256;
struct {
@@ -27,18 +27,18 @@ struct ccl_try_align(32) float8_t
};
};
- __forceinline float8_t();
- __forceinline float8_t(const float8_t &a);
- __forceinline explicit float8_t(const __m256 &a);
+ __forceinline vfloat8();
+ __forceinline vfloat8(const vfloat8 &a);
+ __forceinline explicit vfloat8(const __m256 &a);
__forceinline operator const __m256 &() const;
__forceinline operator __m256 &();
- __forceinline float8_t &operator=(const float8_t &a);
+ __forceinline vfloat8 &operator=(const vfloat8 &a);
-#else /* __KERNEL_AVX2__ */
+#else /* __KERNEL_AVX__ */
float a, b, c, d, e, f, g, h;
-#endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
__forceinline float operator[](int i) const;
@@ -46,8 +46,11 @@ struct ccl_try_align(32) float8_t
#endif
};
-ccl_device_inline float8_t make_float8_t(float f);
-ccl_device_inline float8_t
-make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h);
+ccl_device_inline vfloat8 make_vfloat8(float f);
+ccl_device_inline vfloat8
+make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h);
+ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b);
+
+ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a);
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h
index e8576cdaf70..9f42e0f663c 100644
--- a/intern/cycles/util/types_float8_impl.h
+++ b/intern/cycles/util/types_float8_impl.h
@@ -10,45 +10,45 @@
CCL_NAMESPACE_BEGIN
-#ifdef __KERNEL_AVX2__
-__forceinline float8_t::float8_t()
+#ifdef __KERNEL_AVX__
+__forceinline vfloat8::vfloat8()
{
}
-__forceinline float8_t::float8_t(const float8_t &f) : m256(f.m256)
+__forceinline vfloat8::vfloat8(const vfloat8 &f) : m256(f.m256)
{
}
-__forceinline float8_t::float8_t(const __m256 &f) : m256(f)
+__forceinline vfloat8::vfloat8(const __m256 &f) : m256(f)
{
}
-__forceinline float8_t::operator const __m256 &() const
+__forceinline vfloat8::operator const __m256 &() const
{
return m256;
}
-__forceinline float8_t::operator __m256 &()
+__forceinline vfloat8::operator __m256 &()
{
return m256;
}
-__forceinline float8_t &float8_t::operator=(const float8_t &f)
+__forceinline vfloat8 &vfloat8::operator=(const vfloat8 &f)
{
m256 = f.m256;
return *this;
}
-#endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX__ */
#ifndef __KERNEL_GPU__
-__forceinline float float8_t::operator[](int i) const
+__forceinline float vfloat8::operator[](int i) const
{
util_assert(i >= 0);
util_assert(i < 8);
return *(&a + i);
}
-__forceinline float &float8_t::operator[](int i)
+__forceinline float &vfloat8::operator[](int i)
{
util_assert(i >= 0);
util_assert(i < 8);
@@ -56,25 +56,50 @@ __forceinline float &float8_t::operator[](int i)
}
#endif
-ccl_device_inline float8_t make_float8_t(float f)
+ccl_device_inline vfloat8 make_vfloat8(float f)
{
-#ifdef __KERNEL_AVX2__
- float8_t r(_mm256_set1_ps(f));
+#ifdef __KERNEL_AVX__
+ vfloat8 r(_mm256_set1_ps(f));
#else
- float8_t r = {f, f, f, f, f, f, f, f};
+ vfloat8 r = {f, f, f, f, f, f, f, f};
#endif
return r;
}
-ccl_device_inline float8_t
-make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h)
+ccl_device_inline vfloat8
+make_vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
{
-#ifdef __KERNEL_AVX2__
- float8_t r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#ifdef __KERNEL_AVX__
+ vfloat8 r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
#else
- float8_t r = {a, b, c, d, e, f, g, h};
+ vfloat8 r = {a, b, c, d, e, f, g, h};
#endif
return r;
}
+ccl_device_inline vfloat8 make_vfloat8(const float4 a, const float4 b)
+{
+#ifdef __KERNEL_AVX__
+ return vfloat8(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1));
+#else
+ return make_vfloat8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
+#endif
+}
+
+ccl_device_inline void print_vfloat8(ccl_private const char *label, const vfloat8 a)
+{
+#ifdef __KERNEL_PRINTF__
+ printf("%s: %.8f %.8f %.8f %.8f %.8f %.8f %.8f %.8f\n",
+ label,
+ (double)a.a,
+ (double)a.b,
+ (double)a.c,
+ (double)a.d,
+ (double)a.e,
+ (double)a.f,
+ (double)a.g,
+ (double)a.h);
+#endif
+}
+
CCL_NAMESPACE_END
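
[Editor's note, not part of this patch: a minimal usage sketch of the renamed float8 helpers introduced above, assuming the usual Cycles entry header util/types.h is included and that make_float4 is available as in the existing util headers.]

  /* Build a vfloat8 from two float4 halves and print all eight lanes.
   * On AVX builds make_vfloat8(float4, float4) maps to _mm256_insertf128_ps,
   * otherwise the scalar fallback fills the eight members directly. */
  vfloat8 v = make_vfloat8(make_float4(0.0f, 1.0f, 2.0f, 3.0f),
                           make_float4(4.0f, 5.0f, 6.0f, 7.0f));
  vfloat8 s = make_vfloat8(1.0f);      /* broadcast a single value to all lanes */
  print_vfloat8("v", v);               /* only emits output when __KERNEL_PRINTF__ is defined */
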
diff --git a/intern/cycles/util/types_int8.h b/intern/cycles/util/types_int8.h
new file mode 100644
index 00000000000..8643ebe96ad
--- /dev/null
+++ b/intern/cycles/util/types_int8.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+struct vfloat8;
+
+#ifdef __KERNEL_GPU__
+struct vint8
+#else
+struct ccl_try_align(32) vint8
+#endif
+{
+#ifdef __KERNEL_AVX__
+ union {
+ __m256i m256;
+ struct {
+ int a, b, c, d, e, f, g, h;
+ };
+ };
+
+ __forceinline vint8();
+ __forceinline vint8(const vint8 &a);
+ __forceinline explicit vint8(const __m256i &a);
+
+ __forceinline operator const __m256i &() const;
+ __forceinline operator __m256i &();
+
+ __forceinline vint8 &operator=(const vint8 &a);
+#else /* __KERNEL_AVX__ */
+ int a, b, c, d, e, f, g, h;
+#endif /* __KERNEL_AVX__ */
+
+#ifndef __KERNEL_GPU__
+ __forceinline int operator[](int i) const;
+ __forceinline int &operator[](int i);
+#endif
+};
+
+ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h);
+ccl_device_inline vint8 make_vint8(int i);
+ccl_device_inline vint8 make_vint8(const vfloat8 f);
+ccl_device_inline vint8 make_vint8(const int4 a, const int4 b);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/types_int8_impl.h b/intern/cycles/util/types_int8_impl.h
new file mode 100644
index 00000000000..080bcaa6a2b
--- /dev/null
+++ b/intern/cycles/util/types_int8_impl.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#ifndef __UTIL_TYPES_H__
+# error "Do not include this file directly, include util/types.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+__forceinline vint8::vint8()
+{
+}
+
+__forceinline vint8::vint8(const vint8 &a) : m256(a.m256)
+{
+}
+
+__forceinline vint8::vint8(const __m256i &a) : m256(a)
+{
+}
+
+__forceinline vint8::operator const __m256i &() const
+{
+ return m256;
+}
+
+__forceinline vint8::operator __m256i &()
+{
+ return m256;
+}
+
+__forceinline vint8 &vint8::operator=(const vint8 &a)
+{
+ m256 = a.m256;
+ return *this;
+}
+#endif /* __KERNEL_AVX__ */
+
+#ifndef __KERNEL_GPU__
+__forceinline int vint8::operator[](int i) const
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+
+__forceinline int &vint8::operator[](int i)
+{
+ util_assert(i >= 0);
+ util_assert(i < 8);
+ return *(&a + i);
+}
+#endif
+
+ccl_device_inline vint8 make_vint8(int a, int b, int c, int d, int e, int f, int g, int h)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_set_epi32(h, g, f, e, d, c, b, a));
+#else
+ return {a, b, c, d, e, f, g, h};
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(int i)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_set1_epi32(i));
+#else
+ return make_vint8(i, i, i, i, i, i, i, i);
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(const vfloat8 f)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_cvtps_epi32(f.m256));
+#else
+ return make_vint8(
+ (int)f.a, (int)f.b, (int)f.c, (int)f.d, (int)f.e, (int)f.f, (int)f.g, (int)f.h);
+#endif
+}
+
+ccl_device_inline vint8 make_vint8(const int4 a, const int4 b)
+{
+#ifdef __KERNEL_AVX__
+ return vint8(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128), b.m128, 1));
+#else
+ return make_vint8(a.x, a.y, a.z, a.w, b.x, b.y, b.z, b.w);
+#endif
+}
+
+CCL_NAMESPACE_END
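
[Editor's note, not part of this patch: a minimal usage sketch of the new vint8 constructors, assuming util/types.h is included and make_int4 is available as in the existing util headers. As the hunk above shows, the float-to-int constructor uses _mm256_cvtps_epi32 on AVX and plain C casts in the scalar fallback.]

  /* Construct vint8 values with the helpers declared in types_int8.h. */
  vint8 ramp  = make_vint8(0, 1, 2, 3, 4, 5, 6, 7);          /* eight explicit lanes */
  vint8 ones  = make_vint8(1);                                /* broadcast */
  vint8 conv  = make_vint8(make_vfloat8(0.5f));               /* from vfloat8 */
  vint8 join  = make_vint8(make_int4(0, 1, 2, 3),
                           make_int4(4, 5, 6, 7));            /* from two int4 halves */
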