git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/kernel/svm/svm_noise.h')
-rw-r--r--  intern/cycles/kernel/svm/svm_noise.h | 660
1 file changed, 490 insertions, 170 deletions
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index dd375af27e5..35b74fb4b3e 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -32,246 +32,566 @@
CCL_NAMESPACE_BEGIN
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssei quick_floor_sse(const ssef &x)
-{
- ssei b = truncatei(x);
- ssei isneg = cast((x < ssef(0.0f)).m128);
- return b + isneg; // unsaturated add 0xffffffff is the same as subtract -1
-}
-#endif
-
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssei hash_sse(const ssei &kx, const ssei &ky, const ssei &kz)
-{
-# define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k))))
-# define xor_rot(a, b, c) \
- do { \
- a = a ^ b; \
- a = a - rot(b, c); \
- } while (0)
-
- uint len = 3;
- ssei magic = ssei(0xdeadbeef + (len << 2) + 13);
- ssei a = magic + kx;
- ssei b = magic + ky;
- ssei c = magic + kz;
-
- xor_rot(c, b, 14);
- xor_rot(a, c, 11);
- xor_rot(b, a, 25);
- xor_rot(c, b, 16);
- xor_rot(a, c, 4);
- xor_rot(b, a, 14);
- xor_rot(c, b, 24);
-
- return c;
-# undef rot
-# undef xor_rot
-}
-#endif
+/* **** Perlin Noise **** */
-#if 0 // unused
-ccl_device int imod(int a, int b)
+ccl_device float fade(float t)
{
- a %= b;
- return a < 0 ? a + b : a;
+ return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
}
-ccl_device uint phash(int kx, int ky, int kz, int3 p)
+ccl_device_inline float negate_if(float val, int condition)
{
- return hash(imod(kx, p.x), imod(ky, p.y), imod(kz, p.z));
+ return (condition) ? -val : val;
}
-#endif
-#ifndef __KERNEL_SSE2__
-ccl_device float floorfrac(float x, int *i)
+ccl_device float grad1(int hash, float x)
{
- *i = quick_floor_to_int(x);
- return x - *i;
+ int h = hash & 15;
+ float g = 1 + (h & 7);
+ return negate_if(g, h & 8) * x;
}
-#else
-ccl_device_inline ssef floorfrac_sse(const ssef &x, ssei *i)
+
+ccl_device_noinline_cpu float perlin_1d(float x)
{
- *i = quick_floor_sse(x);
- return x - ssef(*i);
+ int X;
+ float fx = floorfrac(x, &X);
+ float u = fade(fx);
+
+ return mix(grad1(hash_uint(X), fx), grad1(hash_uint(X + 1), fx - 1.0f), u);
}
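A standalone sketch of the 1D evaluation above, useful for seeing how fade(), the gradient, and mix() fit together. hash32() is an illustrative stand-in for the kernel's hash_uint(), and the *_ref helpers mirror (but are not) the kernel functions:

#include <cmath>
#include <cstdint>
#include <cstdio>

static uint32_t hash32(uint32_t x) /* stand-in for hash_uint(); any decent integer hash works */
{
  x ^= x >> 16;
  x *= 0x7feb352dU;
  x ^= x >> 15;
  x *= 0x846ca68bU;
  x ^= x >> 16;
  return x;
}

static float fade_ref(float t)
{
  return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
}

static float grad1_ref(uint32_t hash, float x)
{
  int h = hash & 15;
  float g = 1.0f + (h & 7);      /* gradient magnitude 1..8 */
  return ((h & 8) ? -g : g) * x; /* random sign, as in negate_if() */
}

static float perlin_1d_ref(float x)
{
  int X = (int)std::floor(x);
  float fx = x - (float)X;
  float u = fade_ref(fx);
  float a = grad1_ref(hash32((uint32_t)X), fx);
  float b = grad1_ref(hash32((uint32_t)(X + 1)), fx - 1.0f);
  return a + u * (b - a); /* mix(a, b, u) */
}

int main()
{
  for (float x = 0.0f; x < 2.0f; x += 0.25f)
    std::printf("perlin_1d_ref(%.2f) = % .5f\n", x, perlin_1d_ref(x));
  return 0;
}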
-#endif
+/* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check
+ * whether SSE is available, that is, whether __KERNEL_SSE2__ is defined.
+ * If it is not, we fall back to a standard scalar implementation;
+ * otherwise, we use an implementation based on SSE intrinsics.
+ */
#ifndef __KERNEL_SSE2__
-ccl_device float fade(float t)
+
+/* ** Standard Implementation ** */
+
+/* Bilinear Interpolation:
+ *
+ * v2 v3
+ * @ + + + + @ y
+ * + + ^
+ * + + |
+ * + + |
+ * @ + + + + @ @------> x
+ * v0 v1
+ *
+ */
+ccl_device float bi_mix(float v0, float v1, float v2, float v3, float x, float y)
{
- return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
+ float x1 = 1.0f - x;
+ return (1.0f - y) * (v0 * x1 + v1 * x) + y * (v2 * x1 + v3 * x);
}
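The closed form above is just two nested linear interpolations, first along x on each edge and then along y. A quick standalone check, where mix_ref() is a local helper rather than a Cycles function:

#include <cassert>
#include <cmath>

static float mix_ref(float a, float b, float t)
{
  return (1.0f - t) * a + t * b;
}

int main()
{
  float v0 = 1.0f, v1 = 2.0f, v2 = 3.0f, v3 = 5.0f;
  float x = 0.25f, y = 0.75f;

  /* bi_mix() written out directly. */
  float x1 = 1.0f - x;
  float direct = (1.0f - y) * (v0 * x1 + v1 * x) + y * (v2 * x1 + v3 * x);

  /* Same value as two nested lerps: along x on each edge, then along y. */
  float nested = mix_ref(mix_ref(v0, v1, x), mix_ref(v2, v3, x), y);

  assert(std::fabs(direct - nested) < 1e-6f);
  return 0;
}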
-#else
-ccl_device_inline ssef fade_sse(const ssef *t)
+
+/* Trilinear Interpolation:
+ *
+ * v6 v7
+ * @ + + + + + + @
+ * +\ +\
+ * + \ + \
+ * + \ + \
+ * + \ v4 + \ v5
+ * + @ + + + +++ + @ z
+ * + + + + y ^
+ * v2 @ + +++ + + + @ v3 + \ |
+ * \ + \ + \ |
+ * \ + \ + \|
+ * \ + \ + +---------> x
+ * \+ \+
+ * @ + + + + + + @
+ * v0 v1
+ */
+ccl_device float tri_mix(float v0,
+ float v1,
+ float v2,
+ float v3,
+ float v4,
+ float v5,
+ float v6,
+ float v7,
+ float x,
+ float y,
+ float z)
{
- ssef a = madd(*t, ssef(6.0f), ssef(-15.0f));
- ssef b = madd(*t, a, ssef(10.0f));
- return ((*t) * (*t)) * ((*t) * b);
+ float x1 = 1.0f - x;
+ float y1 = 1.0f - y;
+ float z1 = 1.0f - z;
+ return z1 * (y1 * (v0 * x1 + v1 * x) + y * (v2 * x1 + v3 * x)) +
+ z * (y1 * (v4 * x1 + v5 * x) + y * (v6 * x1 + v7 * x));
}
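Equivalently, tri_mix() is a single blend along z of two bilinear interpolations, one for the bottom face (v0..v3) and one for the top face (v4..v7). A standalone check with local mix_ref()/bi_mix_ref() helpers (illustrative only, not Cycles code):

#include <cassert>
#include <cmath>

static float mix_ref(float a, float b, float t)
{
  return (1.0f - t) * a + t * b;
}

static float bi_mix_ref(float v0, float v1, float v2, float v3, float x, float y)
{
  return mix_ref(mix_ref(v0, v1, x), mix_ref(v2, v3, x), y);
}

int main()
{
  float v[8] = {1.0f, 2.0f, 3.0f, 5.0f, 8.0f, 13.0f, 21.0f, 34.0f};
  float x = 0.2f, y = 0.5f, z = 0.8f;

  /* tri_mix() written out directly. */
  float x1 = 1.0f - x, y1 = 1.0f - y, z1 = 1.0f - z;
  float direct = z1 * (y1 * (v[0] * x1 + v[1] * x) + y * (v[2] * x1 + v[3] * x)) +
                 z * (y1 * (v[4] * x1 + v[5] * x) + y * (v[6] * x1 + v[7] * x));

  /* Same value as a z-blend of the bottom face (v0..v3) and the top face (v4..v7). */
  float layered = mix_ref(bi_mix_ref(v[0], v[1], v[2], v[3], x, y),
                          bi_mix_ref(v[4], v[5], v[6], v[7], x, y),
                          z);

  assert(std::fabs(direct - layered) < 1e-4f);
  return 0;
}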
-#endif
-#ifndef __KERNEL_SSE2__
-ccl_device float nerp(float t, float a, float b)
+ccl_device float quad_mix(float v0,
+ float v1,
+ float v2,
+ float v3,
+ float v4,
+ float v5,
+ float v6,
+ float v7,
+ float v8,
+ float v9,
+ float v10,
+ float v11,
+ float v12,
+ float v13,
+ float v14,
+ float v15,
+ float x,
+ float y,
+ float z,
+ float w)
{
- return (1.0f - t) * a + t * b;
+ return mix(tri_mix(v0, v1, v2, v3, v4, v5, v6, v7, x, y, z),
+ tri_mix(v8, v9, v10, v11, v12, v13, v14, v15, x, y, z),
+ w);
}
-#else
-ccl_device_inline ssef nerp_sse(const ssef &t, const ssef &a, const ssef &b)
+
+ccl_device float grad2(int hash, float x, float y)
{
- ssef x1 = (ssef(1.0f) - t) * a;
- return madd(t, b, x1);
+ int h = hash & 7;
+ float u = h < 4 ? x : y;
+ float v = 2.0f * (h < 4 ? y : x);
+ return negate_if(u, h & 1) + negate_if(v, h & 2);
}
-#endif
-#ifndef __KERNEL_SSE2__
-ccl_device float grad(int hash, float x, float y, float z)
+ccl_device float grad3(int hash, float x, float y, float z)
{
- // use vectors pointing to the edges of the cube
int h = hash & 15;
float u = h < 8 ? x : y;
- float vt = ((h == 12) | (h == 14)) ? x : z;
+ float vt = ((h == 12) || (h == 14)) ? x : z;
float v = h < 4 ? y : vt;
- return ((h & 1) ? -u : u) + ((h & 2) ? -v : v);
+ return negate_if(u, h & 1) + negate_if(v, h & 2);
}
-#else
-ccl_device_inline ssef grad_sse(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
-{
- ssei c1 = ssei(1);
- ssei c2 = ssei(2);
- ssei h = hash & ssei(15); // h = hash & 15
+ccl_device float grad4(int hash, float x, float y, float z, float w)
+{
+ int h = hash & 31;
+ float u = h < 24 ? x : y;
+ float v = h < 16 ? y : z;
+ float s = h < 8 ? z : w;
+ return negate_if(u, h & 1) + negate_if(v, h & 2) + negate_if(s, h & 4);
+}
- sseb case_ux = h < ssei(8); // 0xffffffff if h < 8 else 0
+ccl_device_noinline_cpu float perlin_2d(float x, float y)
+{
+ int X;
+ int Y;
- ssef u = select(case_ux, x, y); // u = h<8 ? x : y
+ float fx = floorfrac(x, &X);
+ float fy = floorfrac(y, &Y);
- sseb case_vy = h < ssei(4); // 0xffffffff if h < 4 else 0
+ float u = fade(fx);
+ float v = fade(fy);
- sseb case_h12 = h == ssei(12); // 0xffffffff if h == 12 else 0
- sseb case_h14 = h == ssei(14); // 0xffffffff if h == 14 else 0
+ float r = bi_mix(grad2(hash_uint2(X, Y), fx, fy),
+ grad2(hash_uint2(X + 1, Y), fx - 1.0f, fy),
+ grad2(hash_uint2(X, Y + 1), fx, fy - 1.0f),
+ grad2(hash_uint2(X + 1, Y + 1), fx - 1.0f, fy - 1.0f),
+ u,
+ v);
- sseb case_vx = case_h12 | case_h14; // 0xffffffff if h == 12 or h == 14 else 0
+ return r;
+}
- ssef v = select(case_vy, y, select(case_vx, x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
+{
+ int X;
+ int Y;
+ int Z;
- ssei case_uneg = (h & c1) << 31; // 1<<31 if h&1 else 0
- ssef case_uneg_mask = cast(case_uneg); // -0.0 if h&1 else +0.0
- ssef ru = u ^ case_uneg_mask; // -u if h&1 else u (copy float sign)
+ float fx = floorfrac(x, &X);
+ float fy = floorfrac(y, &Y);
+ float fz = floorfrac(z, &Z);
- ssei case_vneg = (h & c2) << 30; // 2<<30 if h&2 else 0
- ssef case_vneg_mask = cast(case_vneg); // -0.0 if h&2 else +0.0
- ssef rv = v ^ case_vneg_mask; // -v if h&2 else v (copy float sign)
+ float u = fade(fx);
+ float v = fade(fy);
+ float w = fade(fz);
- ssef r = ru + rv; // ((h&1) ? -u : u) + ((h&2) ? -v : v)
+ float r = tri_mix(grad3(hash_uint3(X, Y, Z), fx, fy, fz),
+ grad3(hash_uint3(X + 1, Y, Z), fx - 1.0f, fy, fz),
+ grad3(hash_uint3(X, Y + 1, Z), fx, fy - 1.0f, fz),
+ grad3(hash_uint3(X + 1, Y + 1, Z), fx - 1.0f, fy - 1.0f, fz),
+ grad3(hash_uint3(X, Y, Z + 1), fx, fy, fz - 1.0f),
+ grad3(hash_uint3(X + 1, Y, Z + 1), fx - 1.0f, fy, fz - 1.0f),
+ grad3(hash_uint3(X, Y + 1, Z + 1), fx, fy - 1.0f, fz - 1.0f),
+ grad3(hash_uint3(X + 1, Y + 1, Z + 1), fx - 1.0f, fy - 1.0f, fz - 1.0f),
+ u,
+ v,
+ w);
return r;
}
-#endif
-#ifndef __KERNEL_SSE2__
-ccl_device float scale3(float result)
-{
- return 0.9820f * result;
-}
-#else
-ccl_device_inline ssef scale3_sse(const ssef &result)
-{
- return ssef(0.9820f) * result;
-}
-#endif
-
-#ifndef __KERNEL_SSE2__
-ccl_device_noinline_cpu float perlin(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
int X;
- float fx = floorfrac(x, &X);
int Y;
- float fy = floorfrac(y, &Y);
int Z;
+ int W;
+
+ float fx = floorfrac(x, &X);
+ float fy = floorfrac(y, &Y);
float fz = floorfrac(z, &Z);
+ float fw = floorfrac(w, &W);
float u = fade(fx);
float v = fade(fy);
- float w = fade(fz);
+ float t = fade(fz);
+ float s = fade(fw);
+
+ float r = quad_mix(
+ grad4(hash_uint4(X, Y, Z, W), fx, fy, fz, fw),
+ grad4(hash_uint4(X + 1, Y, Z, W), fx - 1.0f, fy, fz, fw),
+ grad4(hash_uint4(X, Y + 1, Z, W), fx, fy - 1.0f, fz, fw),
+ grad4(hash_uint4(X + 1, Y + 1, Z, W), fx - 1.0f, fy - 1.0f, fz, fw),
+ grad4(hash_uint4(X, Y, Z + 1, W), fx, fy, fz - 1.0f, fw),
+ grad4(hash_uint4(X + 1, Y, Z + 1, W), fx - 1.0f, fy, fz - 1.0f, fw),
+ grad4(hash_uint4(X, Y + 1, Z + 1, W), fx, fy - 1.0f, fz - 1.0f, fw),
+ grad4(hash_uint4(X + 1, Y + 1, Z + 1, W), fx - 1.0f, fy - 1.0f, fz - 1.0f, fw),
+ grad4(hash_uint4(X, Y, Z, W + 1), fx, fy, fz, fw - 1.0f),
+ grad4(hash_uint4(X + 1, Y, Z, W + 1), fx - 1.0f, fy, fz, fw - 1.0f),
+ grad4(hash_uint4(X, Y + 1, Z, W + 1), fx, fy - 1.0f, fz, fw - 1.0f),
+ grad4(hash_uint4(X + 1, Y + 1, Z, W + 1), fx - 1.0f, fy - 1.0f, fz, fw - 1.0f),
+ grad4(hash_uint4(X, Y, Z + 1, W + 1), fx, fy, fz - 1.0f, fw - 1.0f),
+ grad4(hash_uint4(X + 1, Y, Z + 1, W + 1), fx - 1.0f, fy, fz - 1.0f, fw - 1.0f),
+ grad4(hash_uint4(X, Y + 1, Z + 1, W + 1), fx, fy - 1.0f, fz - 1.0f, fw - 1.0f),
+ grad4(hash_uint4(X + 1, Y + 1, Z + 1, W + 1), fx - 1.0f, fy - 1.0f, fz - 1.0f, fw - 1.0f),
+ u,
+ v,
+ t,
+ s);
- float result;
-
- result = nerp(
- w,
- nerp(v,
- nerp(u,
- grad(hash_uint3(X, Y, Z), fx, fy, fz),
- grad(hash_uint3(X + 1, Y, Z), fx - 1.0f, fy, fz)),
- nerp(u,
- grad(hash_uint3(X, Y + 1, Z), fx, fy - 1.0f, fz),
- grad(hash_uint3(X + 1, Y + 1, Z), fx - 1.0f, fy - 1.0f, fz))),
- nerp(v,
- nerp(u,
- grad(hash_uint3(X, Y, Z + 1), fx, fy, fz - 1.0f),
- grad(hash_uint3(X + 1, Y, Z + 1), fx - 1.0f, fy, fz - 1.0f)),
- nerp(u,
- grad(hash_uint3(X, Y + 1, Z + 1), fx, fy - 1.0f, fz - 1.0f),
- grad(hash_uint3(X + 1, Y + 1, Z + 1), fx - 1.0f, fy - 1.0f, fz - 1.0f))));
- float r = scale3(result);
-
- /* can happen for big coordinates, things even out to 0.0 then anyway */
- return (isfinite(r)) ? r : 0.0f;
+ return r;
}
+
#else
-ccl_device_noinline float perlin(float x, float y, float z)
+
+/* ** SSE Implementation ** */
+
+/* SSE Bilinear Interpolation:
+ *
+ * The function takes two ssef inputs:
+ * - p : Contains the values at the points (v0, v1, v2, v3).
+ * - f : Contains the values (x, y, _, _). The third and fourth values are unused.
+ *
+ * The interpolation is done in two steps:
+ * 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1).
+ * (v2, v3) is generated by moving v2 and v3 to the first and second
+ * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ * fourth values are unused.
+ * 2. Interpolate g0 and g1 along the y axis to get the final value.
+ * g1 is generated by populating an ssef with the second value of g.
+ * Only the first value is important in the final ssef.
+ *
+ * v1 v3 g1
+ * @ + + + + @ @ y
+ * + + (1) + (2) ^
+ * + + ---> + ---> final |
+ * + + + |
+ * @ + + + + @ @ @------> x
+ * v0 v2 g0
+ *
+ */
+ccl_device_inline ssef bi_mix(ssef p, ssef f)
+{
+ ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
+ return mix(g, shuffle<1>(g), shuffle<1>(f));
+}
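A scalar emulation of those two steps, with float[4] arrays standing in for ssef lanes and mix4() standing in for the lane-wise mix(); the corner layout matches the packing used by perlin_2d() below, where lane k holds the corner with offset (0, 0), (0, 1), (1, 0), (1, 1). Illustrative only:

#include <cassert>
#include <cmath>

/* Lane-wise (1 - t) * a + t * b, emulating ssef mix(). */
static void mix4(const float a[4], const float b[4], const float t[4], float out[4])
{
  for (int i = 0; i < 4; i++)
    out[i] = (1.0f - t[i]) * a[i] + t[i] * b[i];
}

int main()
{
  /* p = (v0, v1, v2, v3); f = (x, y, _, _). */
  float p[4] = {1.0f, 2.0f, 3.0f, 5.0f};
  float x = 0.25f, y = 0.75f;

  /* Step 1: g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f)). */
  float p_2323[4] = {p[2], p[3], p[2], p[3]};
  float xxxx[4] = {x, x, x, x};
  float g[4];
  mix4(p, p_2323, xxxx, g); /* g[0] = mix(v0, v2, x), g[1] = mix(v1, v3, x) */

  /* Step 2: lane 0 of mix(g, shuffle<1>(g), shuffle<1>(f)). */
  float result = (1.0f - y) * g[0] + y * g[1];

  /* Reference: direct bilinear interpolation over the same corner layout. */
  float ref = (1.0f - y) * ((1.0f - x) * p[0] + x * p[2]) + y * ((1.0f - x) * p[1] + x * p[3]);
  assert(std::fabs(result - ref) < 1e-6f);
  return 0;
}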
+
+/* SSE Trilinear Interpolation:
+ *
+ * The function takes three ssef inputs:
+ * - p : Contains the values at the points (v0, v1, v2, v3).
+ * - q : Contains the values at the points (v4, v5, v6, v7).
+ * - f : Contains the values (x, y, z, _). The fourth value is unused.
+ *
+ * The interpolation is done in three steps:
+ * 1. Interpolate p and q along the x axis to get s (s0, s1, s2, s3).
+ * 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1).
+ * (s2, s3) is generated by moving s2 and s3 to the first and second
+ * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ * fourth values are unused.
+ * 3. Interpolate g0 and g1 along the z axis to get the final value.
+ * g1 is generated by populating an ssef with the second value of g.
+ * Only the first value is important in the final ssef.
+ *
+ * v3 v7
+ * @ + + + + + + @ s3 @
+ * +\ +\ +\
+ * + \ + \ + \
+ * + \ + \ + \ g1
+ * + \ v1 + \ v5 + \ s1 @
+ * + @ + + + +++ + @ + @ + z
+ * + + + + (1) + + (2) + (3) y ^
+ * v2 @ + +++ + + + @ v6 + ---> s2 @ + ---> + ---> final \ |
+ * \ + \ + \ + + \ |
+ * \ + \ + \ + + \|
+ * \ + \ + \ + @ +---------> x
+ * \+ \+ \+ g0
+ * @ + + + + + + @ @
+ * v0 v4 s0
+ */
+ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
+{
+ ssef s = mix(p, q, shuffle<0>(f));
+ ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
+ return mix(g, shuffle<1>(g), shuffle<2>(f));
+}
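The same kind of scalar emulation for the three steps above, checked against a direct trilinear sum over the eight corners. Here p holds the x = 0 corners and q the x = 1 corners, with lane (2*j + k) holding the corner with y offset j and z offset k, matching the packing used by perlin_3d() below. Illustrative only:

#include <cassert>
#include <cmath>

/* Lane-wise (1 - t) * a + t * b, emulating ssef mix(). */
static void mix4(const float a[4], const float b[4], const float t[4], float out[4])
{
  for (int i = 0; i < 4; i++)
    out[i] = (1.0f - t[i]) * a[i] + t[i] * b[i];
}

int main()
{
  float p[4] = {1.0f, 2.0f, 3.0f, 5.0f};    /* x = 0 corners */
  float q[4] = {8.0f, 13.0f, 21.0f, 34.0f}; /* x = 1 corners */
  float x = 0.3f, y = 0.6f, z = 0.9f;

  /* Step 1: s = mix(p, q, shuffle<0>(f)). */
  float xxxx[4] = {x, x, x, x}, s[4];
  mix4(p, q, xxxx, s);

  /* Step 2: g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f)). */
  float s_2323[4] = {s[2], s[3], s[2], s[3]}, yyyy[4] = {y, y, y, y}, g[4];
  mix4(s, s_2323, yyyy, g);

  /* Step 3: lane 0 of mix(g, shuffle<1>(g), shuffle<2>(f)). */
  float result = (1.0f - z) * g[0] + z * g[1];

  /* Reference: direct trilinear sum over the eight corners. */
  float ref = 0.0f;
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      for (int k = 0; k < 2; k++) {
        float c = (i == 0) ? p[2 * j + k] : q[2 * j + k];
        ref += (i ? x : 1.0f - x) * (j ? y : 1.0f - y) * (k ? z : 1.0f - z) * c;
      }
  assert(std::fabs(result - ref) < 1e-4f);
  return 0;
}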
+
+/* SSE Quadrilinear Interpolation:
+ *
+ * Quadrilinear interpolation is as simple as a linear interpolation
+ * between two trilinear interpolations.
+ *
+ */
+ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
+{
+ return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
+}
+
+ccl_device_inline ssef fade(const ssef &t)
+{
+ ssef a = madd(t, 6.0f, -15.0f);
+ ssef b = madd(t, a, 10.0f);
+ return (t * t) * (t * b);
+}
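The madd() form above expands to the same quintic 6t^5 - 15t^4 + 10t^3 as the scalar fade(). A standalone check, with madd emulated per lane as a * b + c:

#include <cassert>
#include <cmath>

/* Per-lane emulation of ssef madd(): a * b + c. */
static float madd_ref(float a, float b, float c)
{
  return a * b + c;
}

int main()
{
  for (float t = 0.0f; t <= 1.0f; t += 0.125f) {
    float a = madd_ref(t, 6.0f, -15.0f); /* 6t - 15 */
    float b = madd_ref(t, a, 10.0f);     /* 6t^2 - 15t + 10 */
    float sse_form = (t * t) * (t * b);  /* t^3 (6t^2 - 15t + 10) */
    float scalar_form = t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
    assert(std::fabs(sse_form - scalar_form) < 1e-6f);
  }
  return 0;
}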
+
+/* Negate val if the nth bit of h is 1. */
+# define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))
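The macro works by shifting bit n of the hash up to bit 31, the IEEE 754 sign bit, and XORing it into the float's bit pattern, so the value is negated exactly when that bit is set. A scalar emulation using memcpy for well-defined type punning (illustrative, not kernel code):

#include <cassert>
#include <cstdint>
#include <cstring>

/* Scalar emulation: bit n of h is moved to bit 31 (the sign bit) and XORed
 * into the float's bit pattern, flipping the sign iff bit n is set. */
static float negate_if_nth_bit_ref(float val, uint32_t h, int n)
{
  uint32_t bits;
  std::memcpy(&bits, &val, sizeof(bits));
  bits ^= ((h & (1u << n)) << (31 - n));
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}

int main()
{
  assert(negate_if_nth_bit_ref(2.5f, 0x1u, 0) == -2.5f); /* bit 0 set: negated */
  assert(negate_if_nth_bit_ref(2.5f, 0x2u, 0) == 2.5f);  /* bit 0 clear: unchanged */
  assert(negate_if_nth_bit_ref(2.5f, 0x4u, 2) == -2.5f); /* bit 2 set: negated */
  return 0;
}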
+
+ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
+{
+ ssei h = hash & 7;
+ ssef u = select(h < 4, x, y);
+ ssef v = 2.0f * select(h < 4, y, x);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
+}
+
+ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
+{
+ ssei h = hash & 15;
+ ssef u = select(h < 8, x, y);
+ ssef vt = select((h == 12) | (h == 14), x, z);
+ ssef v = select(h < 4, y, vt);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
+}
+
+ccl_device_inline ssef
+grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w)
+{
+ ssei h = hash & 31;
+ ssef u = select(h < 24, x, y);
+ ssef v = select(h < 16, y, z);
+ ssef s = select(h < 8, z, w);
+ return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
+}
+
+/* We use SSE to compute and interpolate 4 gradients at once:
+ *
+ * Point Offset from v0
+ * v0 (0, 0)
+ * v1 (0, 1)
+ * v2 (1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(V, V + 1))
+ * v3 (1, 1) ^
+ * | |__________| (0, 0, 1, 1) = shuffle<0, 0, 0, 0>(V, V + 1)
+ * | ^
+ * |__________________________|
+ *
+ */
+ccl_device_noinline float perlin_2d(float x, float y)
+{
+ ssei XY;
+ ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
+ ssef uv = fade(fxy);
+
+ ssei XY1 = XY + 1;
+ ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
+ ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
+
+ ssei h = hash_ssei2(X, Y);
+
+ ssef fxy1 = fxy - 1.0f;
+ ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
+ ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
+
+ ssef g = grad(h, fx, fy);
+
+ return extract<0>(bi_mix(g, uv));
+}
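A scalar emulation of the integer-lane packing used above, checking that lane k ends up holding the k-th corner offset listed in the comment. Plain int arrays stand in for ssei, and the example lattice coordinates are arbitrary:

#include <cassert>

int main()
{
  /* Example lattice coordinates; XY emulates the ssei (X, Y, _, _). */
  int XY[4] = {3, 7, 0, 0};
  int XY1[4] = {XY[0] + 1, XY[1] + 1, XY[2] + 1, XY[3] + 1};

  /* shuffle<0, 0, 0, 0>(XY, XY1) -> (XY[0], XY[0], XY1[0], XY1[0]). */
  int X[4] = {XY[0], XY[0], XY1[0], XY1[0]};

  /* shuffle<1, 1, 1, 1>(XY, XY1) -> (XY[1], XY[1], XY1[1], XY1[1]),
   * then shuffle<0, 2, 0, 2> of that -> (XY[1], XY1[1], XY[1], XY1[1]). */
  int t[4] = {XY[1], XY[1], XY1[1], XY1[1]};
  int Y[4] = {t[0], t[2], t[0], t[2]};

  /* Lane k now holds corner k: offsets (0, 0), (0, 1), (1, 0), (1, 1) from v0. */
  assert(X[0] == 3 && Y[0] == 7);
  assert(X[1] == 3 && Y[1] == 8);
  assert(X[2] == 4 && Y[2] == 7);
  assert(X[3] == 4 && Y[3] == 8);
  return 0;
}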
+
+/* We use SSE to compute and interpolate 4 gradients at once. Since we have 8
+ * gradients in 3D, we need to compute two sets of gradients at the points:
+ *
+ * Point Offset from v0
+ * v0 (0, 0, 0)
+ * v1 (0, 0, 1)
+ * v2 (0, 1, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1))
+ * v3 (0, 1, 1) ^
+ * | |__________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1)
+ * | ^
+ * |__________________________|
+ *
+ * Point Offset from v0
+ * v4 (1, 0, 0)
+ * v5 (1, 0, 1)
+ * v6 (1, 1, 0)
+ * v7 (1, 1, 1)
+ *
+ */
+ccl_device_noinline float perlin_3d(float x, float y, float z)
{
- ssef xyz = ssef(x, y, z, 0.0f);
ssei XYZ;
+ ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
+ ssef uvw = fade(fxyz);
- ssef fxyz = floorfrac_sse(xyz, &XYZ);
+ ssei XYZ1 = XYZ + 1;
+ ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+ ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
- ssef uvw = fade_sse(&fxyz);
- ssef u = shuffle<0>(uvw), v = shuffle<1>(uvw), w = shuffle<2>(uvw);
+ ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z);
+ ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z);
- ssei XYZ_ofc = XYZ + ssei(1);
- ssei vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc); // +0, +0, +1, +1
- ssei vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1
+ ssef fxyz1 = fxyz - 1.0f;
+ ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+ ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
- ssei h1 = hash_sse(shuffle<0>(XYZ), vdy, vdz); // hash directions 000, 001, 010, 011
- ssei h2 = hash_sse(shuffle<0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111
+ ssef g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
+ ssef g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
- ssef fxyz_ofc = fxyz - ssef(1.0f);
- ssef vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
- ssef vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));
+ return extract<0>(tri_mix(g1, g2, uvw));
+}
- ssef g1 = grad_sse(h1, shuffle<0>(fxyz), vfy, vfz);
- ssef g2 = grad_sse(h2, shuffle<0>(fxyz_ofc), vfy, vfz);
- ssef n1 = nerp_sse(u, g1, g2);
+/* We use SSE to compute and interpolate 4 gradients at once. Since we have 16
+ * gradients in 4D, we need to compute four sets of gradients at the points:
+ *
+ * Point Offset from v0
+ * v0 (0, 0, 0, 0)
+ * v1 (0, 0, 1, 0)
+ * v2 (0, 1, 0, 0) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1))
+ * v3 (0, 1, 1, 0) ^
+ * | |________| (0, 0, 1, 1) = shuffle<1, 1, 1, 1>(V, V + 1)
+ * | ^
+ * |_______________________|
+ *
+ * Point Offset from v0
+ * v4 (1, 0, 0, 0)
+ * v5 (1, 0, 1, 0)
+ * v6 (1, 1, 0, 0)
+ * v7 (1, 1, 1, 0)
+ *
+ * Point Offset from v0
+ * v8 (0, 0, 0, 1)
+ * v9 (0, 0, 1, 1)
+ * v10 (0, 1, 0, 1)
+ * v11 (0, 1, 1, 1)
+ *
+ * Point Offset from v0
+ * v12 (1, 0, 0, 1)
+ * v13 (1, 0, 1, 1)
+ * v14 (1, 1, 0, 1)
+ * v15 (1, 1, 1, 1)
+ *
+ */
+ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+{
+ ssei XYZW;
+ ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
+ ssef uvws = fade(fxyzw);
- ssef n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector
- ssef n2 = nerp_sse(
- v, n1, n1_half); // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
+ ssei XYZW1 = XYZW + 1;
+ ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+ ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
- ssef n2_second = shuffle<1>(n2); // extract b to a separate vector
- ssef result = nerp_sse(
- w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
+ ssei h1 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
+ ssei h2 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
- ssef r = scale3_sse(result);
+ ssei h3 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
+ ssei h4 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
- ssef infmask = cast(ssei(0x7f800000));
- ssef rinfmask = ((r & infmask) == infmask).m128; // 0xffffffff if r is inf/-inf/nan else 0
- ssef rfinite = andnot(rinfmask, r); // 0 if r is inf/-inf/nan else r
- return extract<0>(rfinite);
+ ssef fxyzw1 = fxyzw - 1.0f;
+ ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+ ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+
+ ssef g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
+ ssef g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
+
+ ssef g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
+ ssef g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
+
+ return extract<0>(quad_mix(g1, g2, g3, g4, uvws));
}
#endif
-/* perlin noise in range 0..1 */
-ccl_device float noise(float3 p)
+/* Remap the output of noise to a predictable range [-1, 1].
+ * The scale values were computed experimentally by the OSL developers.
+ */
+
+ccl_device_inline float noise_scale1(float result)
+{
+ return 0.2500f * result;
+}
+
+ccl_device_inline float noise_scale2(float result)
+{
+ return 0.6616f * result;
+}
+
+ccl_device_inline float noise_scale3(float result)
+{
+ return 0.9820f * result;
+}
+
+ccl_device_inline float noise_scale4(float result)
+{
+ return 0.8344f * result;
+}
+
+/* Safe Signed And Unsigned Noise */
+
+ccl_device_inline float snoise_1d(float p)
+{
+ float r = perlin_1d(p);
+ return isinf(r) ? 0.0f : noise_scale1(r);
+}
+
+ccl_device_inline float noise_1d(float p)
+{
+ return 0.5f * snoise_1d(p) + 0.5f;
+}
+
+ccl_device_inline float snoise_2d(float2 p)
+{
+ float r = perlin_2d(p.x, p.y);
+ return isinf(r) ? 0.0f : noise_scale2(r);
+}
+
+ccl_device_inline float noise_2d(float2 p)
+{
+ return 0.5f * snoise_2d(p) + 0.5f;
+}
+
+ccl_device_inline float snoise_3d(float3 p)
+{
+ float r = perlin_3d(p.x, p.y, p.z);
+ return isinf(r) ? 0.0f : noise_scale3(r);
+}
+
+ccl_device_inline float noise_3d(float3 p)
+{
+ return 0.5f * snoise_3d(p) + 0.5f;
+}
+
+ccl_device_inline float snoise_4d(float4 p)
{
- float r = perlin(p.x, p.y, p.z);
- return 0.5f * r + 0.5f;
+ float r = perlin_4d(p.x, p.y, p.z, p.w);
+ return isinf(r) ? 0.0f : noise_scale4(r);
}
-/* perlin noise in range -1..1 */
-ccl_device float snoise(float3 p)
+ccl_device_inline float noise_4d(float4 p)
{
- return perlin(p.x, p.y, p.z);
+ return 0.5f * snoise_4d(p) + 0.5f;
}
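A minimal sketch of the wrapper pattern above: guard against non-finite Perlin output (possible for very large coordinates, as the removed code noted), apply the experimentally determined scale, and remap [-1, 1] to [0, 1] for the unsigned variants. snoise_ref()/noise_ref() are illustrative stand-ins, not the kernel functions:

#include <cassert>
#include <cmath>
#include <limits>

/* raw stands in for a perlin_*() result; not Cycles code. */
static float snoise_ref(float raw)
{
  return std::isinf(raw) ? 0.0f : 0.9820f * raw; /* same guard + scale as snoise_3d() */
}

static float noise_ref(float raw)
{
  return 0.5f * snoise_ref(raw) + 0.5f; /* remap [-1, 1] to [0, 1] */
}

int main()
{
  assert(noise_ref(-1.0f) >= 0.0f && noise_ref(1.0f) <= 1.0f);
  /* Non-finite Perlin output falls back to the unsigned midpoint. */
  assert(noise_ref(std::numeric_limits<float>::infinity()) == 0.5f);
  return 0;
}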
/* cell noise */
@@ -293,7 +613,7 @@ ccl_device float3 cellnoise3(float3 p)
ssei ip_yxz = shuffle<1, 0, 2, 3>(ssei(ip.m128));
ssei ip_xyy = shuffle<0, 1, 1, 3>(ssei(ip.m128));
ssei ip_zzx = shuffle<2, 2, 0, 3>(ssei(ip.m128));
- ssei bits = hash_sse(ip_xyy, ip_yxz, ip_zzx);
+ ssei bits = hash_ssei3(ip_xyy, ip_yxz, ip_zzx);
return float3(uint32_to_float(bits) * ssef(1.0f / (float)0xFFFFFFFF));
#endif
}