1 files changed, 137 insertions, 131 deletions
diff --git a/intern/cycles/kernel/svm/noise.h b/intern/cycles/kernel/svm/noise.h
index 31e77d87413..209195a03f1 100644
--- a/intern/cycles/kernel/svm/noise.h
+++ b/intern/cycles/kernel/svm/noise.h
@@ -39,11 +39,11 @@ ccl_device_noinline_cpu float perlin_1d(float x)
 }
 
 /* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check if
- * SSE is supported, that is, if __KERNEL_SSE2__ is defined. If it is not
+ * SSE is supported, that is, if __KERNEL_SSE__ is defined. If it is not
  * supported, we do a standard implementation, but if it is supported, we
  * do an implementation using SSE intrinsics.
  */
-#if !defined(__KERNEL_SSE2__)
+#if !defined(__KERNEL_SSE__)
 
 /* ** Standard Implementation ** */
 
@@ -250,18 +250,18 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
 
 /* SSE Bilinear Interpolation:
  *
- * The function takes two ssef inputs:
+ * The function takes two float4 inputs:
  * - p : Contains the values at the points (v0, v1, v2, v3).
  * - f : Contains the values (x, y, _, _). The third and fourth values are unused.
  *
  * The interpolation is done in two steps:
  * 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1).
  *    (v2, v3) is generated by moving v2 and v3 to the first and second
- *    places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ *    places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
  *    fourth values are unused.
  * 2. Interpolate g0 and g1 along the y axis to get the final value.
- *    g1 is generated by populating an ssef with the second value of g.
- *    Only the first value is important in the final ssef.
+ *    g1 is generated by populating a float4 with the second value of g.
+ *    Only the first value is important in the final float4.
  *
  * v1          v3          g1
  *  @ + + + + @            @                    y
@@ -272,27 +272,27 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
  * v0          v2          g0
  *
  */
-ccl_device_inline ssef bi_mix(ssef p, ssef f)
+ccl_device_inline float4 bi_mix(float4 p, float4 f)
 {
-  ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
+  float4 g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
   return mix(g, shuffle<1>(g), shuffle<1>(f));
 }
 
-ccl_device_inline ssef fade(const ssef &t)
+ccl_device_inline float4 fade(const float4 t)
 {
-  ssef a = madd(t, 6.0f, -15.0f);
-  ssef b = madd(t, a, 10.0f);
+  float4 a = madd(t, make_float4(6.0f), make_float4(-15.0f));
+  float4 b = madd(t, a, make_float4(10.0f));
   return (t * t) * (t * b);
 }
 
 /* Negate val if the nth bit of h is 1. */
 #  define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))
 
-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y)
 {
-  ssei h = hash & 7;
-  ssef u = select(h < 4, x, y);
-  ssef v = 2.0f * select(h < 4, y, x);
+  int4 h = hash & 7;
+  float4 u = select(h < 4, x, y);
+  float4 v = 2.0f * select(h < 4, y, x);
   return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
 }
 
@@ -310,28 +310,28 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
  */
 ccl_device_noinline_cpu float perlin_2d(float x, float y)
 {
-  ssei XY;
-  ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
-  ssef uv = fade(fxy);
+  int4 XY;
+  float4 fxy = floorfrac(make_float4(x, y, 0.0f, 0.0f), &XY);
+  float4 uv = fade(fxy);
 
-  ssei XY1 = XY + 1;
-  ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
-  ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
+  int4 XY1 = XY + make_int4(1);
+  int4 X = shuffle<0, 0, 0, 0>(XY, XY1);
+  int4 Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
 
-  ssei h = hash_ssei2(X, Y);
+  int4 h = hash_int4_2(X, Y);
 
-  ssef fxy1 = fxy - 1.0f;
-  ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
-  ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
+  float4 fxy1 = fxy - make_float4(1.0f);
+  float4 fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
+  float4 fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
 
-  ssef g = grad(h, fx, fy);
+  float4 g = grad(h, fx, fy);
 
   return extract<0>(bi_mix(g, uv));
 }
 
 /* SSE Trilinear Interpolation:
  *
- * The function takes three ssef inputs:
+ * The function takes three float4 inputs:
  * - p : Contains the values at the points (v0, v1, v2, v3).
  * - q : Contains the values at the points (v4, v5, v6, v7).
  * - f : Contains the values (x, y, z, _). The fourth value is unused.
@@ -340,11 +340,11 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
  * 1. Interpolate p and q along the x axis to get s (s0, s1, s2, s3).
  * 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1).
  *    (s2, s3) is generated by moving v2 and v3 to the first and second
- *    places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ *    places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
  *    fourth values are unused.
  * 3. Interpolate g0 and g1 along the z axis to get the final value.
- *    g1 is generated by populating an ssef with the second value of g.
- *    Only the first value is important in the final ssef.
+ *    g1 is generated by populating a float4 with the second value of g.
+ *    Only the first value is important in the final float4.
  *
  *   v3               v7
  *     @ + + + + + + @               s3 @
@@ -362,10 +362,10 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
  *          @ + + + + + + @                  @
  *        v0               v4                 s0
  */
-ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
+ccl_device_inline float4 tri_mix(float4 p, float4 q, float4 f)
 {
-  ssef s = mix(p, q, shuffle<0>(f));
-  ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
+  float4 s = mix(p, q, shuffle<0>(f));
+  float4 g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
   return mix(g, shuffle<1>(g), shuffle<2>(f));
 }
 
@@ -374,24 +374,24 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
  * supported, we do an SSE implementation, but if it is supported,
  * we do an implementation using AVX intrinsics.
  */
-#  if !defined(__KERNEL_AVX__)
+#  if !defined(__KERNEL_AVX2__)
 
-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y, const float4 z)
 {
-  ssei h = hash & 15;
-  ssef u = select(h < 8, x, y);
-  ssef vt = select((h == 12) | (h == 14), x, z);
-  ssef v = select(h < 4, y, vt);
+  int4 h = hash & 15;
+  float4 u = select(h < 8, x, y);
+  float4 vt = select((h == 12) | (h == 14), x, z);
+  float4 v = select(h < 4, y, vt);
   return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
 }
 
-ccl_device_inline ssef
-grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w)
+ccl_device_inline float4
+grad(const int4 hash, const float4 x, const float4 y, const float4 z, const float4 w)
 {
-  ssei h = hash & 31;
-  ssef u = select(h < 24, x, y);
-  ssef v = select(h < 16, y, z);
-  ssef s = select(h < 8, z, w);
+  int4 h = hash & 31;
+  float4 u = select(h < 24, x, y);
+  float4 v = select(h < 16, y, z);
+  float4 s = select(h < 8, z, w);
   return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
 }
 
@@ -401,7 +401,7 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &
  * between two trilinear interpolations.
  *
  */
-ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
+ccl_device_inline float4 quad_mix(float4 p, float4 q, float4 r, float4 s, float4 f)
 {
   return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
 }
@@ -427,23 +427,23 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
  */
 ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
 {
-  ssei XYZ;
-  ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
-  ssef uvw = fade(fxyz);
+  int4 XYZ;
+  float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
+  float4 uvw = fade(fxyz);
 
-  ssei XYZ1 = XYZ + 1;
-  ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
-  ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+  int4 XYZ1 = XYZ + make_int4(1);
+  int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+  int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
 
-  ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z);
-  ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z);
+  int4 h1 = hash_int4_3(shuffle<0>(XYZ), Y, Z);
+  int4 h2 = hash_int4_3(shuffle<0>(XYZ1), Y, Z);
 
-  ssef fxyz1 = fxyz - 1.0f;
-  ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
-  ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+  float4 fxyz1 = fxyz - make_float4(1.0f);
+  float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+  float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
 
-  ssef g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
-  ssef g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
+  float4 g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
+  float4 g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
 
   return extract<0>(tri_mix(g1, g2, uvw));
 }
@@ -481,29 +481,29 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
  */
 ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
 {
-  ssei XYZW;
-  ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
-  ssef uvws = fade(fxyzw);
+  int4 XYZW;
+  float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
+  float4 uvws = fade(fxyzw);
 
-  ssei XYZW1 = XYZW + 1;
-  ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
-  ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+  int4 XYZW1 = XYZW + make_int4(1);
+  int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+  int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
 
-  ssei h1 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
-  ssei h2 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
+  int4 h1 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
+  int4 h2 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
 
-  ssei h3 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
-  ssei h4 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
+  int4 h3 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
+  int4 h4 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
 
-  ssef fxyzw1 = fxyzw - 1.0f;
-  ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
-  ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+  float4 fxyzw1 = fxyzw - make_float4(1.0f);
+  float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+  float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
 
-  ssef g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
-  ssef g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
+  float4 g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
+  float4 g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
 
-  ssef g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
-  ssef g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
+  float4 g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
+  float4 g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
 
   return extract<0>(quad_mix(g1, g2, g3, g4, uvws));
 }
@@ -512,22 +512,22 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
 
 /* AVX Implementation */
 
-ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z)
+ccl_device_inline vfloat8 grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z)
 {
-  avxi h = hash & 15;
-  avxf u = select(h < 8, x, y);
-  avxf vt = select((h == 12) | (h == 14), x, z);
-  avxf v = select(h < 4, y, vt);
+  vint8 h = hash & 15;
+  vfloat8 u = select(h < 8, x, y);
+  vfloat8 vt = select((h == 12) | (h == 14), x, z);
+  vfloat8 v = select(h < 4, y, vt);
   return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
 }
 
-ccl_device_inline avxf
-grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w)
+ccl_device_inline vfloat8
+grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z, const vfloat8 w)
 {
-  avxi h = hash & 31;
-  avxf u = select(h < 24, x, y);
-  avxf v = select(h < 16, y, z);
-  avxf s = select(h < 8, z, w);
+  vint8 h = hash & 31;
+  vfloat8 u = select(h < 24, x, y);
+  vfloat8 v = select(h < 16, y, z);
+  vfloat8 s = select(h < 8, z, w);
   return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
 }
 
@@ -537,13 +537,13 @@ grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &
  * 1. Interpolate p and q along the w axis to get s.
  * 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final
  *    value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the
- *    low and high ssef from s.
+ *    low and high float4 from s.
  *
  */
-ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
+ccl_device_inline float4 quad_mix(vfloat8 p, vfloat8 q, float4 f)
 {
-  ssef fv = shuffle<3>(f);
-  avxf s = mix(p, q, avxf(fv, fv));
+  float4 fv = shuffle<3>(f);
+  vfloat8 s = mix(p, q, make_vfloat8(fv, fv));
   return tri_mix(low(s), high(s), f);
 }
 
@@ -565,25 +565,25 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
  */
 ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
 {
-  ssei XYZ;
-  ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
-  ssef uvw = fade(fxyz);
+  int4 XYZ;
+  float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
+  float4 uvw = fade(fxyz);
 
-  ssei XYZ1 = XYZ + 1;
-  ssei X = shuffle<0>(XYZ);
-  ssei X1 = shuffle<0>(XYZ1);
-  ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
-  ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+  int4 XYZ1 = XYZ + make_int4(1);
+  int4 X = shuffle<0>(XYZ);
+  int4 X1 = shuffle<0>(XYZ1);
+  int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+  int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
 
-  avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z));
+  vint8 h = hash_int8_3(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z));
 
-  ssef fxyz1 = fxyz - 1.0f;
-  ssef fx = shuffle<0>(fxyz);
-  ssef fx1 = shuffle<0>(fxyz1);
-  ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
-  ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+  float4 fxyz1 = fxyz - make_float4(1.0f);
+  float4 fx = shuffle<0>(fxyz);
+  float4 fx1 = shuffle<0>(fxyz1);
+  float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+  float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
 
-  avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz));
+  vfloat8 g = grad(h, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz));
 
   return extract<0>(tri_mix(low(g), high(g), uvw));
 }
@@ -617,31 +617,37 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
  */
 ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
 {
-  ssei XYZW;
-  ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
-  ssef uvws = fade(fxyzw);
-
-  ssei XYZW1 = XYZW + 1;
-  ssei X = shuffle<0>(XYZW);
-  ssei X1 = shuffle<0>(XYZW1);
-  ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
-  ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
-  ssei W = shuffle<3>(XYZW);
-  ssei W1 = shuffle<3>(XYZW1);
-
-  avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W));
-  avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1));
-
-  ssef fxyzw1 = fxyzw - 1.0f;
-  ssef fx = shuffle<0>(fxyzw);
-  ssef fx1 = shuffle<0>(fxyzw1);
-  ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
-  ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
-  ssef fw = shuffle<3>(fxyzw);
-  ssef fw1 = shuffle<3>(fxyzw1);
-
-  avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw));
-  avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1));
+  int4 XYZW;
+  float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
+  float4 uvws = fade(fxyzw);
+
+  int4 XYZW1 = XYZW + make_int4(1);
+  int4 X = shuffle<0>(XYZW);
+  int4 X1 = shuffle<0>(XYZW1);
+  int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+  int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+  int4 W = shuffle<3>(XYZW);
+  int4 W1 = shuffle<3>(XYZW1);
+
+  vint8 h1 = hash_int8_4(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W, W));
+  vint8 h2 = hash_int8_4(
+      make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W1, W1));
+
+  float4 fxyzw1 = fxyzw - make_float4(1.0f);
+  float4 fx = shuffle<0>(fxyzw);
+  float4 fx1 = shuffle<0>(fxyzw1);
+  float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+  float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+  float4 fw = shuffle<3>(fxyzw);
+  float4 fw1 = shuffle<3>(fxyzw1);
+
+  vfloat8 g1 = grad(
+      h1, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz), make_vfloat8(fw, fw));
+  vfloat8 g2 = grad(h2,
+                    make_vfloat8(fx, fx1),
+                    make_vfloat8(fy, fy),
+                    make_vfloat8(fz, fz),
+                    make_vfloat8(fw1, fw1));
 
   return extract<0>(quad_mix(g1, g2, uvws));
 }