From 47c5898fa15b9e82ddfe28891029101eaf1e67d2 Mon Sep 17 00:00:00 2001
From: "Sv. Lockal"
Date: Sun, 12 Jan 2014 18:14:00 +0400
Subject: Cycles: SSE for Voronoi textures (targeted for Haswell CPUs)

Gives up to 15% speedup in scenes with Voronoi-based textures (up to 25%
with volumes) on Haswell. The performance change for other CPUs is much
smaller: 1-2%.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D203
---
 intern/cycles/kernel/svm/svm_noise.h | 90 ++++++++++++++++++++++--------------
 intern/cycles/util/util_simd.h       | 19 +++++++-
 2 files changed, 72 insertions(+), 37 deletions(-)

diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 6ad10b7c8db..d431934f6ce 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -32,24 +32,31 @@ CCL_NAMESPACE_BEGIN
 
+#ifndef __KERNEL_SSE2__
 ccl_device int quick_floor(float x)
 {
 	return float_to_int(x) - ((x < 0) ? 1 : 0);
 }
-
-#ifdef __KERNEL_SSE2__
-ccl_device_inline __m128i quick_floor_sse(const __m128 *x)
+#else
+ccl_device_inline __m128i quick_floor_sse(const __m128 &x)
 {
-	__m128i b = _mm_cvttps_epi32(*x);
-	__m128i isneg = _mm_castps_si128(_mm_cmplt_ps(*x, _mm_set1_ps(0.0f)));
+	__m128i b = _mm_cvttps_epi32(x);
+	__m128i isneg = _mm_castps_si128(_mm_cmplt_ps(x, _mm_set1_ps(0.0f)));
 	return _mm_add_epi32(b, isneg); // unsaturated add of 0xffffffff is the same as subtracting 1
 }
 #endif
 
+#ifndef __KERNEL_SSE2__
 ccl_device float bits_to_01(uint bits)
 {
 	return bits * (1.0f/(float)0xFFFFFFFF);
 }
+#else
+ccl_device_inline __m128 bits_to_01_sse(const __m128i &bits)
+{
+	return _mm_mul_ps(uint32_to_float(bits), _mm_set1_ps(1.0f/(float)0xFFFFFFFF));
+}
+#endif
 
 ccl_device uint hash(uint kx, uint ky, uint kz)
 {
@@ -81,16 +88,16 @@ ccl_device uint hash(uint kx, uint ky, uint kz)
 }
 
 #ifdef __KERNEL_SSE2__
-ccl_device_inline __m128i hash_sse(const __m128i *kx, const __m128i *ky, const __m128i *kz)
+ccl_device_inline __m128i hash_sse(const __m128i &kx, const __m128i &ky, const __m128i &kz)
 {
 #define rot(x,k) _mm_or_si128(_mm_slli_epi32((x), (k)), _mm_srli_epi32((x), 32-(k)))
 #define xor_rot(a, b, c) do {a = _mm_xor_si128(a, b); a = _mm_sub_epi32(a, rot(b, c));} while(0)
 
 	uint len = 3;
 	__m128i magic = _mm_set1_epi32(0xdeadbeef + (len << 2) + 13);
-	__m128i a = _mm_add_epi32(magic, *kx);
-	__m128i b = _mm_add_epi32(magic, *ky);
-	__m128i c = _mm_add_epi32(magic, *kz);
+	__m128i a = _mm_add_epi32(magic, kx);
+	__m128i b = _mm_add_epi32(magic, ky);
+	__m128i c = _mm_add_epi32(magic, kz);
 
 	xor_rot(c, b, 14);
 	xor_rot(a, c, 11);
@@ -126,10 +133,10 @@ ccl_device float floorfrac(float x, int* i)
 	return x - *i;
 }
 #else
-ccl_device_inline __m128 floorfrac_sse(const __m128 *x, __m128i *i)
+ccl_device_inline __m128 floorfrac_sse(const __m128 &x, __m128i *i)
 {
 	*i = quick_floor_sse(x);
-	return _mm_sub_ps(*x, _mm_cvtepi32_ps(*i));
+	return _mm_sub_ps(x, _mm_cvtepi32_ps(*i));
 }
 #endif
 
@@ -153,10 +160,10 @@ ccl_device float nerp(float t, float a, float b)
 	return (1.0f - t) * a + t * b;
 }
 #else
-ccl_device_inline __m128 nerp_sse(const __m128 *t, const __m128 *a, const __m128 *b)
+ccl_device_inline __m128 nerp_sse(const __m128 &t, const __m128 &a, const __m128 &b)
 {
-	__m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), *t), *a);
-	return fma(*t, *b, x1);
+	__m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), t), a);
+	return fma(t, b, x1);
 }
 #endif
 
@@ -171,16 +178,16 @@ ccl_device float grad(int hash, float x, float y, float z)
 	return ((h&1) ? -u : u) + ((h&2) ? -v : v);
 }
 #else
-ccl_device_inline __m128 grad_sse(const __m128i *hash, const __m128 *x, const __m128 *y, const __m128 *z)
+ccl_device_inline __m128 grad_sse(const __m128i &hash, const __m128 &x, const __m128 &y, const __m128 &z)
 {
 	__m128i c1 = _mm_set1_epi32(1);
 	__m128i c2 = _mm_set1_epi32(2);
 
-	__m128i h = _mm_and_si128(*hash, _mm_set1_epi32(15));      // h = hash & 15
+	__m128i h = _mm_and_si128(hash, _mm_set1_epi32(15));       // h = hash & 15
 
 	__m128i case_ux = _mm_cmplt_epi32(h, _mm_set1_epi32(8));   // 0xffffffff if h < 8 else 0
-	__m128 u = blend(_mm_castsi128_ps(case_ux), *x, *y);       // u = h<8 ? x : y
+	__m128 u = blend(_mm_castsi128_ps(case_ux), x, y);         // u = h<8 ? x : y
 
 	__m128i case_vy = _mm_cmplt_epi32(h, _mm_set1_epi32(4));   // 0xffffffff if h < 4 else 0
 
@@ -189,7 +196,7 @@ ccl_device_inline __m128 grad_sse(const __m128i *hash, const __m128 *x, const __
 	__m128i case_vx = _mm_or_si128(case_h12, case_h14);        // 0xffffffff if h == 12 or h == 14 else 0
 
-	__m128 v = blend(_mm_castsi128_ps(case_vy), *y, blend(_mm_castsi128_ps(case_vx), *x, *z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z
+	__m128 v = blend(_mm_castsi128_ps(case_vy), y, blend(_mm_castsi128_ps(case_vx), x, z));    // v = h<4 ? y : h == 12 || h == 14 ? x : z
 
 	__m128i case_uneg = _mm_slli_epi32(_mm_and_si128(h, c1), 31); // 1<<31 if h&1 else 0
 	__m128 case_uneg_mask = _mm_castsi128_ps(case_uneg);          // -0.0 if h&1 else +0.0
@@ -210,9 +217,9 @@ ccl_device float scale3(float result)
 	return 0.9820f * result;
 }
 #else
-ccl_device_inline __m128 scale3_sse(const __m128 *result)
+ccl_device_inline __m128 scale3_sse(const __m128 &result)
 {
-	return _mm_mul_ps(_mm_set1_ps(0.9820f), *result);
+	return _mm_mul_ps(_mm_set1_ps(0.9820f), result);
 }
 #endif
 
@@ -248,7 +255,7 @@ ccl_device_noinline float perlin(float x, float y, float z)
 	__m128 xyz = _mm_setr_ps(x, y, z, 0.0f);
 	__m128i XYZ;
 
-	__m128 fxyz = floorfrac_sse(&xyz, &XYZ);
+	__m128 fxyz = floorfrac_sse(xyz, &XYZ);
 
 	__m128 uvw = fade_sse(&fxyz);
 	__m128 u = broadcast<0>(uvw), v = broadcast<1>(uvw), w = broadcast<2>(uvw);
@@ -257,24 +264,24 @@ ccl_device_noinline float perlin(float x, float y, float z)
 	__m128i vp[] = {broadcast<0>(XYZ), broadcast<1>(XYZ), broadcast<2>(XYZ)};
 	__m128i vd[] = {_mm_add_epi32(vp[0], ci[0]), _mm_add_epi32(vp[1], ci[1]), _mm_add_epi32(vp[2], ci[2])};
 
-	__m128i h1 = hash_sse(vp, vd+1, vd+2);      // hash directions 000, 001, 010, 011 (vp[0] is not a typo, because vp[0]+0 == vp[0])
-	__m128i h2 = hash_sse(vd, vd+1, vd+2);      // hash directions 100, 101, 110, 111
+	__m128i h1 = hash_sse(vp[0], vd[1], vd[2]); // hash directions 000, 001, 010, 011 (vp[0] is not a typo, because vp[0]+0 == vp[0])
+	__m128i h2 = hash_sse(vd[0], vd[1], vd[2]); // hash directions 100, 101, 110, 111
 
 	__m128 cf[] = {_mm_setr_ps(1.0f, 1.0f, 1.0f, 1.0f), _mm_setr_ps(0.0f, 0.0f, 1.0f, 1.0f), _mm_setr_ps(0.0f, 1.0f, 0.0f, 1.0f)};
 	__m128 vf[] = {broadcast<0>(fxyz), broadcast<1>(fxyz), broadcast<2>(fxyz)};
 	__m128 vfd[] = {_mm_sub_ps(vf[0], cf[0]), _mm_sub_ps(vf[1], cf[1]), _mm_sub_ps(vf[2], cf[2])};
 
-	__m128 g1 = grad_sse(&h1, vf, vfd+1, vfd+2);     // vf is not a typo (same as above)
-	__m128 g2 = grad_sse(&h2, vfd, vfd+1, vfd+2);
-	__m128 n1 = nerp_sse(&u, &g1, &g2);
+	__m128 g1 = grad_sse(h1, vf[0], vfd[1], vfd[2]); // vf is not a typo (same as above)
+	__m128 g2 = grad_sse(h2, vfd[0], vfd[1], vfd[2]);
+	__m128 n1 = nerp_sse(u, g1, g2);
 
 	__m128 n1_half = _mm_movehl_ps(n1, n1);     // extract 2 floats to a separate vector
-	__m128 n2 = nerp_sse(&v, &n1, &n1_half);    // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
+	__m128 n2 = nerp_sse(v, n1, n1_half);       // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
 
 	__m128 n2_second = broadcast<1>(n2);           // extract b to a separate vector
-	__m128 result = nerp_sse(&w, &n2, &n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
+	__m128 result = nerp_sse(w, n2, n2_second);    // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
 
-	__m128 r = scale3_sse(&result);
+	__m128 r = scale3_sse(result);
 
 	__m128 infmask = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000));
 	__m128 rinfmask = _mm_cmpeq_ps(_mm_and_ps(r, infmask), infmask); // 0xffffffff if r is inf/-inf/nan else 0
@@ -318,20 +325,21 @@ ccl_device_noinline float perlin_periodic(float x, float y, float z, float3 pper
 #endif
 
 /* perlin noise in range 0..1 */
-ccl_device float noise(float3 p)
+ccl_device float noise(const float3 &p)
 {
 	float r = perlin(p.x, p.y, p.z);
 	return 0.5f*r + 0.5f;
 }
 
 /* perlin noise in range -1..1 */
-ccl_device float snoise(float3 p)
+ccl_device float snoise(const float3 &p)
 {
 	return perlin(p.x, p.y, p.z);
 }
 
 /* cell noise */
-ccl_device_noinline float cellnoise(float3 p)
+#ifndef __KERNEL_SSE2__
+ccl_device_noinline float cellnoise(const float3 &p)
 {
 	uint ix = quick_floor(p.x);
 	uint iy = quick_floor(p.y);
@@ -340,7 +348,7 @@ ccl_device_noinline float cellnoise(float3 p)
 	return bits_to_01(hash(ix, iy, iz));
 }
 
-ccl_device float3 cellnoise_color(float3 p)
+ccl_device float3 cellnoise_color(const float3 &p)
 {
 	float r = cellnoise(p);
 	float g = cellnoise(make_float3(p.y, p.x, p.z));
@@ -348,17 +356,29 @@ ccl_device float3 cellnoise_color(float3 p)
 
 	return make_float3(r, g, b);
 }
+#else
+ccl_device float3 cellnoise_color(const float3 &p)
+{
+	__m128i v_yxz = quick_floor_sse(_mm_setr_ps(p.y, p.x, p.z, 0.0f));
+	__m128i v_xyy = shuffle<1, 0, 0, 3>(v_yxz);
+	__m128i v_zzx = shuffle<2, 2, 1, 3>(v_yxz);
+	__m128 rgb = bits_to_01_sse(hash_sse(v_xyy, v_yxz, v_zzx));
+
+	float3 result = *(float3*)&rgb;
+	return result;
+}
+#endif
 
 #if 0 // unused
 /* periodic perlin noise in range 0..1 */
-ccl_device float pnoise(float3 p, float3 pperiod)
+ccl_device float pnoise(const float3 &p, const float3 &pperiod)
 {
 	float r = perlin_periodic(p.x, p.y, p.z, pperiod);
 	return 0.5f*r + 0.5f;
 }
 
 /* periodic perlin noise in range -1..1 */
-ccl_device float psnoise(float3 p, float3 pperiod)
+ccl_device float psnoise(const float3 &p, const float3 &pperiod)
 {
 	return perlin_periodic(p.x, p.y, p.z, pperiod);
 }
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index a6880e77054..c8f794adfb3 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -74,9 +74,14 @@ template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m
 	return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
 }
 
-template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& b)
+template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128 shuffle(const __m128& a)
 {
-	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> ccl_device_inline const __m128i shuffle(const __m128i& a)
+{
+	return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0));
 }
 
 /* Blend 2 vectors based on mask: (a[i] & mask[i]) | (b[i] & ~mask[i]) */
@@ -108,6 +113,16 @@ template<size_t N> ccl_device_inline const __m128i broadcast(const __m128i& a)
 	return _mm_shuffle_epi32(a, _MM_SHUFFLE(N, N, N, N));
 }
 
+ccl_device_inline const __m128 uint32_to_float(const __m128i &in)
+{
+	__m128i a = _mm_srli_epi32(in, 16);
+	__m128i b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff));
+	__m128i c = _mm_or_si128(a, _mm_set1_epi32(0x53000000));
+	__m128 d = _mm_cvtepi32_ps(b);
+	__m128 e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000)));
+	return _mm_add_ps(e, d);
+}
+
 #endif /* __KERNEL_SSE2__ */
 
 CCL_NAMESPACE_END
-- 
cgit v1.2.3
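
Note on quick_floor_sse(): _mm_cvttps_epi32 truncates toward zero, and _mm_cmplt_ps fills a lane with 0xffffffff (which is -1 as a signed 32-bit integer) wherever x < 0, so adding the mask subtracts 1 in exactly the negative lanes. A scalar model of the same trick (hypothetical name, illustration only):

	/* Scalar sketch of quick_floor_sse's mask trick. */
	static int quick_floor_scalar(float x)
	{
		int b = (int)x;                  /* truncate toward zero, like _mm_cvttps_epi32 */
		int isneg = (x < 0.0f) ? -1 : 0; /* the comparison mask, viewed as a signed int */
		return b + isneg;                /* e.g. -1.3f -> -1 + (-1) == -2 == floor(-1.3f) */
	}

This matches the scalar quick_floor() it pairs with, including the behavior at exact negative integers (both map -2.0f to -3, since the "is negative" adjustment is applied unconditionally for x < 0).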
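
Note on the sign handling in grad_sse(): the quoted hunk ends right after case_uneg_mask is built, but the visible lines show the idea: shift (h & 1) into the sign-bit position so the lane holds -0.0f or +0.0f, then (presumably via _mm_xor_ps, which lies outside the quoted context) XOR it with u to compute (h & 1) ? -u : u without a branch. A scalar model of that bit trick (hypothetical name; how the mask is applied in the real code is an assumption here):

	#include <cstdint>
	#include <cstring>

	/* Scalar sketch of the branchless sign flip. */
	static float flip_sign_if_odd(float u, int h)
	{
		uint32_t sign = (uint32_t)(h & 1) << 31; /* 0x80000000 when h is odd, else 0 */
		uint32_t bits;
		std::memcpy(&bits, &u, sizeof(bits));
		bits ^= sign;                            /* toggles only the sign bit */
		std::memcpy(&u, &bits, sizeof(u));
		return u;                                /* == (h & 1) ? -u : u */
	}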
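
Note on the new uint32_to_float() helper: SSE2's _mm_cvtepi32_ps converts signed 32-bit integers, so hash values with the top bit set would come out negative. The helper therefore splits each lane into 16-bit halves: the low half (b) converts exactly, while the high half (a) is OR-ed into the mantissa of the float 2^39 (bit pattern 0x53000000), so subtracting 2^39 again leaves exactly hi * 65536.0f; adding the two parts reproduces the unsigned value. A scalar sketch of the same arithmetic (hypothetical name, illustration only):

	#include <cstdint>
	#include <cstring>

	/* Scalar sketch of uint32_to_float's exponent trick. */
	static float uint32_to_float_scalar(uint32_t in)
	{
		uint32_t hi = in >> 16;          /* like _mm_srli_epi32(in, 16) */
		uint32_t lo = in & 0x0000ffffu;  /* like the _mm_and_si128 mask */

		/* hi | 0x53000000 is the float 2^39 + hi * 2^16 (hi fits in the
		 * 23-bit mantissa), so subtracting 2^39 leaves hi * 65536.0f exactly. */
		uint32_t c_bits = hi | 0x53000000u;
		uint32_t m_bits = 0x53000000u;
		float c, m;
		std::memcpy(&c, &c_bits, sizeof(c));
		std::memcpy(&m, &m_bits, sizeof(m));

		return (c - m) + (float)lo;      /* rounds the same way as (float)in */
	}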