diff options
Diffstat (limited to 'intern/cycles/kernel/svm/noise.h')
-rw-r--r-- | intern/cycles/kernel/svm/noise.h | 268 |
1 files changed, 137 insertions, 131 deletions
diff --git a/intern/cycles/kernel/svm/noise.h b/intern/cycles/kernel/svm/noise.h index 31e77d87413..209195a03f1 100644 --- a/intern/cycles/kernel/svm/noise.h +++ b/intern/cycles/kernel/svm/noise.h @@ -39,11 +39,11 @@ ccl_device_noinline_cpu float perlin_1d(float x) } /* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check if - * SSE is supported, that is, if __KERNEL_SSE2__ is defined. If it is not + * SSE is supported, that is, if __KERNEL_SSE__ is defined. If it is not * supported, we do a standard implementation, but if it is supported, we * do an implementation using SSE intrinsics. */ -#if !defined(__KERNEL_SSE2__) +#if !defined(__KERNEL_SSE__) /* ** Standard Implementation ** */ @@ -250,18 +250,18 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) /* SSE Bilinear Interpolation: * - * The function takes two ssef inputs: + * The function takes two float4 inputs: * - p : Contains the values at the points (v0, v1, v2, v3). * - f : Contains the values (x, y, _, _). The third and fourth values are unused. * * The interpolation is done in two steps: * 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1). * (v2, v3) is generated by moving v2 and v3 to the first and second - * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and + * places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and * fourth values are unused. * 2. Interpolate g0 and g1 along the y axis to get the final value. - * g1 is generated by populating an ssef with the second value of g. - * Only the first value is important in the final ssef. + * g1 is generated by populating an float4 with the second value of g. + * Only the first value is important in the final float4. * * v1 v3 g1 * @ + + + + @ @ y @@ -272,27 +272,27 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) * v0 v2 g0 * */ -ccl_device_inline ssef bi_mix(ssef p, ssef f) +ccl_device_inline float4 bi_mix(float4 p, float4 f) { - ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f)); + float4 g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f)); return mix(g, shuffle<1>(g), shuffle<1>(f)); } -ccl_device_inline ssef fade(const ssef &t) +ccl_device_inline float4 fade(const float4 t) { - ssef a = madd(t, 6.0f, -15.0f); - ssef b = madd(t, a, 10.0f); + float4 a = madd(t, make_float4(6.0f), make_float4(-15.0f)); + float4 b = madd(t, a, make_float4(10.0f)); return (t * t) * (t * b); } /* Negate val if the nth bit of h is 1. */ # define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n)))) -ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) +ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y) { - ssei h = hash & 7; - ssef u = select(h < 4, x, y); - ssef v = 2.0f * select(h < 4, y, x); + int4 h = hash & 7; + float4 u = select(h < 4, x, y); + float4 v = 2.0f * select(h < 4, y, x); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); } @@ -310,28 +310,28 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) */ ccl_device_noinline_cpu float perlin_2d(float x, float y) { - ssei XY; - ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY); - ssef uv = fade(fxy); + int4 XY; + float4 fxy = floorfrac(make_float4(x, y, 0.0f, 0.0f), &XY); + float4 uv = fade(fxy); - ssei XY1 = XY + 1; - ssei X = shuffle<0, 0, 0, 0>(XY, XY1); - ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1)); + int4 XY1 = XY + make_int4(1); + int4 X = shuffle<0, 0, 0, 0>(XY, XY1); + int4 Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1)); - ssei h = hash_ssei2(X, Y); + int4 h = hash_int4_2(X, Y); - ssef fxy1 = fxy - 1.0f; - ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1); - ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1)); + float4 fxy1 = fxy - make_float4(1.0f); + float4 fx = shuffle<0, 0, 0, 0>(fxy, fxy1); + float4 fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1)); - ssef g = grad(h, fx, fy); + float4 g = grad(h, fx, fy); return extract<0>(bi_mix(g, uv)); } /* SSE Trilinear Interpolation: * - * The function takes three ssef inputs: + * The function takes three float4 inputs: * - p : Contains the values at the points (v0, v1, v2, v3). * - q : Contains the values at the points (v4, v5, v6, v7). * - f : Contains the values (x, y, z, _). The fourth value is unused. @@ -340,11 +340,11 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y) * 1. Interpolate p and q along the x axis to get s (s0, s1, s2, s3). * 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1). * (s2, s3) is generated by moving v2 and v3 to the first and second - * places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and + * places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and * fourth values are unused. * 3. Interpolate g0 and g1 along the z axis to get the final value. - * g1 is generated by populating an ssef with the second value of g. - * Only the first value is important in the final ssef. + * g1 is generated by populating an float4 with the second value of g. + * Only the first value is important in the final float4. * * v3 v7 * @ + + + + + + @ s3 @ @@ -362,10 +362,10 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y) * @ + + + + + + @ @ * v0 v4 s0 */ -ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f) +ccl_device_inline float4 tri_mix(float4 p, float4 q, float4 f) { - ssef s = mix(p, q, shuffle<0>(f)); - ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f)); + float4 s = mix(p, q, shuffle<0>(f)); + float4 g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f)); return mix(g, shuffle<1>(g), shuffle<2>(f)); } @@ -374,24 +374,24 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f) * supported, we do an SSE implementation, but if it is supported, * we do an implementation using AVX intrinsics. */ -# if !defined(__KERNEL_AVX__) +# if !defined(__KERNEL_AVX2__) -ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z) +ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y, const float4 z) { - ssei h = hash & 15; - ssef u = select(h < 8, x, y); - ssef vt = select((h == 12) | (h == 14), x, z); - ssef v = select(h < 4, y, vt); + int4 h = hash & 15; + float4 u = select(h < 8, x, y); + float4 vt = select((h == 12) | (h == 14), x, z); + float4 v = select(h < 4, y, vt); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); } -ccl_device_inline ssef -grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w) +ccl_device_inline float4 +grad(const int4 hash, const float4 x, const float4 y, const float4 z, const float4 w) { - ssei h = hash & 31; - ssef u = select(h < 24, x, y); - ssef v = select(h < 16, y, z); - ssef s = select(h < 8, z, w); + int4 h = hash & 31; + float4 u = select(h < 24, x, y); + float4 v = select(h < 16, y, z); + float4 s = select(h < 8, z, w); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2); } @@ -401,7 +401,7 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef & * between two trilinear interpolations. * */ -ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) +ccl_device_inline float4 quad_mix(float4 p, float4 q, float4 r, float4 s, float4 f) { return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f)); } @@ -427,23 +427,23 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) */ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) { - ssei XYZ; - ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ); - ssef uvw = fade(fxyz); + int4 XYZ; + float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ); + float4 uvw = fade(fxyz); - ssei XYZ1 = XYZ + 1; - ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); - ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); + int4 XYZ1 = XYZ + make_int4(1); + int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); + int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); - ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z); - ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z); + int4 h1 = hash_int4_3(shuffle<0>(XYZ), Y, Z); + int4 h2 = hash_int4_3(shuffle<0>(XYZ1), Y, Z); - ssef fxyz1 = fxyz - 1.0f; - ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); - ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); + float4 fxyz1 = fxyz - make_float4(1.0f); + float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); + float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); - ssef g1 = grad(h1, shuffle<0>(fxyz), fy, fz); - ssef g2 = grad(h2, shuffle<0>(fxyz1), fy, fz); + float4 g1 = grad(h1, shuffle<0>(fxyz), fy, fz); + float4 g2 = grad(h2, shuffle<0>(fxyz1), fy, fz); return extract<0>(tri_mix(g1, g2, uvw)); } @@ -481,29 +481,29 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) */ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) { - ssei XYZW; - ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); - ssef uvws = fade(fxyzw); + int4 XYZW; + float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW); + float4 uvws = fade(fxyzw); - ssei XYZW1 = XYZW + 1; - ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1); - ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); + int4 XYZW1 = XYZW + make_int4(1); + int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1); + int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); - ssei h1 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW)); - ssei h2 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW)); + int4 h1 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW)); + int4 h2 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW)); - ssei h3 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1)); - ssei h4 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1)); + int4 h3 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1)); + int4 h4 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1)); - ssef fxyzw1 = fxyzw - 1.0f; - ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); - ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); + float4 fxyzw1 = fxyzw - make_float4(1.0f); + float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); + float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); - ssef g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw)); - ssef g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw)); + float4 g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw)); + float4 g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw)); - ssef g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1)); - ssef g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1)); + float4 g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1)); + float4 g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1)); return extract<0>(quad_mix(g1, g2, g3, g4, uvws)); } @@ -512,22 +512,22 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) /* AVX Implementation */ -ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z) +ccl_device_inline vfloat8 grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z) { - avxi h = hash & 15; - avxf u = select(h < 8, x, y); - avxf vt = select((h == 12) | (h == 14), x, z); - avxf v = select(h < 4, y, vt); + vint8 h = hash & 15; + vfloat8 u = select(h < 8, x, y); + vfloat8 vt = select((h == 12) | (h == 14), x, z); + vfloat8 v = select(h < 4, y, vt); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1); } -ccl_device_inline avxf -grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w) +ccl_device_inline vfloat8 +grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z, const vfloat8 w) { - avxi h = hash & 31; - avxf u = select(h < 24, x, y); - avxf v = select(h < 16, y, z); - avxf s = select(h < 8, z, w); + vint8 h = hash & 31; + vfloat8 u = select(h < 24, x, y); + vfloat8 v = select(h < 16, y, z); + vfloat8 s = select(h < 8, z, w); return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2); } @@ -537,13 +537,13 @@ grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf & * 1. Interpolate p and q along the w axis to get s. * 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final * value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the - * low and high ssef from s. + * low and high float4 from s. * */ -ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f) +ccl_device_inline float4 quad_mix(vfloat8 p, vfloat8 q, float4 f) { - ssef fv = shuffle<3>(f); - avxf s = mix(p, q, avxf(fv, fv)); + float4 fv = shuffle<3>(f); + vfloat8 s = mix(p, q, make_vfloat8(fv, fv)); return tri_mix(low(s), high(s), f); } @@ -565,25 +565,25 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f) */ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) { - ssei XYZ; - ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ); - ssef uvw = fade(fxyz); + int4 XYZ; + float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ); + float4 uvw = fade(fxyz); - ssei XYZ1 = XYZ + 1; - ssei X = shuffle<0>(XYZ); - ssei X1 = shuffle<0>(XYZ1); - ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); - ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); + int4 XYZ1 = XYZ + make_int4(1); + int4 X = shuffle<0>(XYZ); + int4 X1 = shuffle<0>(XYZ1); + int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1); + int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1)); - avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z)); + vint8 h = hash_int8_3(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z)); - ssef fxyz1 = fxyz - 1.0f; - ssef fx = shuffle<0>(fxyz); - ssef fx1 = shuffle<0>(fxyz1); - ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); - ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); + float4 fxyz1 = fxyz - make_float4(1.0f); + float4 fx = shuffle<0>(fxyz); + float4 fx1 = shuffle<0>(fxyz1); + float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1); + float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1)); - avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz)); + vfloat8 g = grad(h, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz)); return extract<0>(tri_mix(low(g), high(g), uvw)); } @@ -617,31 +617,37 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) */ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) { - ssei XYZW; - ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); - ssef uvws = fade(fxyzw); - - ssei XYZW1 = XYZW + 1; - ssei X = shuffle<0>(XYZW); - ssei X1 = shuffle<0>(XYZW1); - ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1); - ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); - ssei W = shuffle<3>(XYZW); - ssei W1 = shuffle<3>(XYZW1); - - avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W)); - avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1)); - - ssef fxyzw1 = fxyzw - 1.0f; - ssef fx = shuffle<0>(fxyzw); - ssef fx1 = shuffle<0>(fxyzw1); - ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); - ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); - ssef fw = shuffle<3>(fxyzw); - ssef fw1 = shuffle<3>(fxyzw1); - - avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw)); - avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1)); + int4 XYZW; + float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW); + float4 uvws = fade(fxyzw); + + int4 XYZW1 = XYZW + make_int4(1); + int4 X = shuffle<0>(XYZW); + int4 X1 = shuffle<0>(XYZW1); + int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1); + int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1)); + int4 W = shuffle<3>(XYZW); + int4 W1 = shuffle<3>(XYZW1); + + vint8 h1 = hash_int8_4(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W, W)); + vint8 h2 = hash_int8_4( + make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W1, W1)); + + float4 fxyzw1 = fxyzw - make_float4(1.0f); + float4 fx = shuffle<0>(fxyzw); + float4 fx1 = shuffle<0>(fxyzw1); + float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1); + float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1)); + float4 fw = shuffle<3>(fxyzw); + float4 fw1 = shuffle<3>(fxyzw1); + + vfloat8 g1 = grad( + h1, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz), make_vfloat8(fw, fw)); + vfloat8 g2 = grad(h2, + make_vfloat8(fx, fx1), + make_vfloat8(fy, fy), + make_vfloat8(fz, fz), + make_vfloat8(fw1, fw1)); return extract<0>(quad_mix(g1, g2, uvws)); } |