diff options
author | Sv. Lockal <lockalsash@gmail.com> | 2014-04-03 23:34:53 +0400 |
---|---|---|
committer | Sv. Lockal <lockalsash@gmail.com> | 2014-04-03 23:35:10 +0400 |
commit | ab32a1807dd153723d26a7d53895ed071233dafc (patch) | |
tree | f4e27eac2ecab3353ff558f346c1be1ad780c41f /intern | |
parent | c019ae5ea3a8eb49010de76c14a359c4729bbcf0 (diff) |
Cycles: SSE optimization for Voronoi cells texture
Gives a 5-6% speedup for Caterpillar_PatazStudio.blend.
Reviewed By: brecht, dingto
Differential Revision: https://developer.blender.org/D419
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/kernel/svm/svm_noise.h | 14 |
-rw-r--r-- | intern/cycles/kernel/svm/svm_texture.h | 93 |
-rw-r--r-- | intern/cycles/kernel/svm/svm_voronoi.h | 19 |
-rw-r--r-- | intern/cycles/util/util_math.h | 4 |
-rw-r--r-- | intern/cycles/util/util_simd.h | 12 |
5 files changed, 115 insertions, 27 deletions
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h index 282ad191470..91dda8972f9 100644 --- a/intern/cycles/kernel/svm/svm_noise.h +++ b/intern/cycles/kernel/svm/svm_noise.h @@ -357,15 +357,13 @@ ccl_device float3 cellnoise_color(float3 p) return make_float3(r, g, b); } #else -ccl_device float3 cellnoise_color(const float3& p) +ccl_device __m128 cellnoise_color(const __m128& p) { - __m128i v_yxz = quick_floor_sse(_mm_setr_ps(p.y, p.x, p.z, 0.0f)); - __m128i v_xyy = shuffle<1, 0, 0, 3>(v_yxz); - __m128i v_zzx = shuffle<2, 2, 1, 3>(v_yxz); - __m128 rgb = bits_to_01_sse(hash_sse(v_xyy, v_yxz, v_zzx)); - - float3 result = *(float3*)&rgb; - return result; + __m128i ip = quick_floor_sse(p); + __m128i ip_yxz = shuffle<1, 0, 2, 3>(ip); + __m128i ip_xyy = shuffle<0, 1, 1, 3>(ip); + __m128i ip_zzx = shuffle<2, 2, 0, 3>(ip); + return bits_to_01_sse(hash_sse(ip_xyy, ip_yxz, ip_zzx)); } #endif diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h index 8ced8390b0b..5fd9204cbf6 100644 --- a/intern/cycles/kernel/svm/svm_texture.h +++ b/intern/cycles/kernel/svm/svm_texture.h @@ -18,6 +18,7 @@ CCL_NAMESPACE_BEGIN /* Voronoi Distances */ +#if 0 ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e) { #if 0 @@ -43,8 +44,7 @@ ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, } /* Voronoi / Worley like */ - -ccl_device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2) +ccl_device_inline float4 voronoi_Fn(float3 p, float e, int n1, int n2) { float da[4]; float3 pa[4]; @@ -119,7 +119,95 @@ ccl_device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2) return result; } +#endif + +ccl_device float voronoi_F1_distance(float3 p) +{ + /* returns squared distance in da */ + float da = 1e10f; + +#ifndef __KERNEL_SSE2__ + int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z); + + for (int xx = -1; xx <= 1; xx++) { 
+ for (int yy = -1; yy <= 1; yy++) { + for (int zz = -1; zz <= 1; zz++) { + float3 ip = make_float3(ix + xx, iy + yy, iz + zz); + float3 vp = ip + cellnoise_color(ip); + float d = len_squared(p - vp); + da = min(d, da); + } + } + } +#else + __m128 vec_p = load_m128(p); + __m128i xyzi = quick_floor_sse(vec_p); + + for (int xx = -1; xx <= 1; xx++) { + for (int yy = -1; yy <= 1; yy++) { + for (int zz = -1; zz <= 1; zz++) { + __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0))); + __m128 vp = _mm_add_ps(ip, cellnoise_color(ip)); + float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp)); + da = min(d, da); + } + } + } +#endif + + return da; +} + +ccl_device float3 voronoi_F1_color(float3 p) +{ + /* returns color of the nearest point */ + float da = 1e10f; + +#ifndef __KERNEL_SSE2__ + float3 pa; + int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z); + + for (int xx = -1; xx <= 1; xx++) { + for (int yy = -1; yy <= 1; yy++) { + for (int zz = -1; zz <= 1; zz++) { + float3 ip = make_float3(ix + xx, iy + yy, iz + zz); + float3 vp = ip + cellnoise_color(ip); + float d = len_squared(p - vp); + + if(d < da) { + da = d; + pa = vp; + } + } + } + } + + return cellnoise_color(pa); +#else + __m128 pa, vec_p = load_m128(p); + __m128i xyzi = quick_floor_sse(vec_p); + + for (int xx = -1; xx <= 1; xx++) { + for (int yy = -1; yy <= 1; yy++) { + for (int zz = -1; zz <= 1; zz++) { + __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0))); + __m128 vp = _mm_add_ps(ip, cellnoise_color(ip)); + float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp)); + + if(d < da) { + da = d; + pa = vp; + } + } + } + } + + __m128 color = cellnoise_color(pa); + return (float3 &)color; +#endif +} +#if 0 ccl_device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; } ccl_device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; } ccl_device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; } 
@@ -139,6 +227,7 @@ ccl_device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; } ccl_device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; } ccl_device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; } ccl_device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; } +#endif /* Noise Bases */ diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h index 7f597dc8bff..083a2f30e06 100644 --- a/intern/cycles/kernel/svm/svm_voronoi.h +++ b/intern/cycles/kernel/svm/svm_voronoi.h @@ -20,23 +20,16 @@ CCL_NAMESPACE_BEGIN ccl_device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float3 p) { - /* compute distance and point coordinate of 4 nearest neighbours */ - float4 dpa0 = voronoi_Fn(p, 1.0f, 0, -1); - - /* output */ - float fac; - float3 color; - if(coloring == NODE_VORONOI_INTENSITY) { - fac = fabsf(dpa0.w); - color = make_float3(fac, fac, fac); + /* compute squared distance to the nearest neighbour */ + float fac = voronoi_F1_distance(p); + return make_float4(fac, fac, fac, fac); } else { - color = cellnoise_color(float4_to_float3(dpa0)); - fac = average(color); + /* compute color of the nearest neighbour */ + float3 color = voronoi_F1_color(p); + return make_float4(color.x, color.y, color.z, average(color)); } - - return make_float4(color.x, color.y, color.z, fac); } ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index b57aa26dbc4..53ed6817258 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -163,11 +163,7 @@ ccl_device_inline float clamp(float a, float mn, float mx) ccl_device_inline int float_to_int(float f) { -#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER) - return _mm_cvtt_ss2si(_mm_load_ss(&f)); -#else return (int)f; -#endif } ccl_device_inline int floor_to_int(float f) diff 
--git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 486816cc5c0..679556ee59b 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -220,6 +220,18 @@ ccl_device_inline const __m128 dot3_splat(const __m128& a, const __m128& b) #endif } +/* squared length taking only specified axes into account */ +template<size_t X, size_t Y, size_t Z, size_t W> +ccl_device_inline float len_squared(const __m128& a) +{ +#ifndef __KERNEL_SSE41__ + float4& t = (float4 &)a; + return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) + (W ? t.w * t.w : 0.0f); +#else + return _mm_cvtss_f32(_mm_dp_ps(a, a, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf)); +#endif +} + ccl_device_inline float dot3(const __m128& a, const __m128& b) { #ifdef __KERNEL_SSE41__ |