diff options
author | Sv. Lockal <lockalsash@gmail.com> | 2014-04-03 23:34:53 +0400 |
---|---|---|
committer | Sv. Lockal <lockalsash@gmail.com> | 2014-04-03 23:35:10 +0400 |
commit | ab32a1807dd153723d26a7d53895ed071233dafc (patch) | |
tree | f4e27eac2ecab3353ff558f346c1be1ad780c41f /intern/cycles/kernel/svm/svm_texture.h | |
parent | c019ae5ea3a8eb49010de76c14a359c4729bbcf0 (diff) |
Cycles: SSE optimization for Voronoi cells texture
Gives 5-6% speedup for Caterpillar_PatazStudio.blend.
Reviewed By: brecht, dingto
Differential Revision: https://developer.blender.org/D419
Diffstat (limited to 'intern/cycles/kernel/svm/svm_texture.h')
-rw-r--r-- | intern/cycles/kernel/svm/svm_texture.h | 93 |
1 file changed, 91 insertions, 2 deletions
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h index 8ced8390b0b..5fd9204cbf6 100644 --- a/intern/cycles/kernel/svm/svm_texture.h +++ b/intern/cycles/kernel/svm/svm_texture.h @@ -18,6 +18,7 @@ CCL_NAMESPACE_BEGIN /* Voronoi Distances */ +#if 0 ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e) { #if 0 @@ -43,8 +44,7 @@ ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, } /* Voronoi / Worley like */ - -ccl_device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2) +ccl_device_inline float4 voronoi_Fn(float3 p, float e, int n1, int n2) { float da[4]; float3 pa[4]; @@ -119,7 +119,95 @@ ccl_device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2) return result; } +#endif + +ccl_device float voronoi_F1_distance(float3 p) +{ + /* returns squared distance in da */ + float da = 1e10f; + +#ifndef __KERNEL_SSE2__ + int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z); + + for (int xx = -1; xx <= 1; xx++) { + for (int yy = -1; yy <= 1; yy++) { + for (int zz = -1; zz <= 1; zz++) { + float3 ip = make_float3(ix + xx, iy + yy, iz + zz); + float3 vp = ip + cellnoise_color(ip); + float d = len_squared(p - vp); + da = min(d, da); + } + } + } +#else + __m128 vec_p = load_m128(p); + __m128i xyzi = quick_floor_sse(vec_p); + + for (int xx = -1; xx <= 1; xx++) { + for (int yy = -1; yy <= 1; yy++) { + for (int zz = -1; zz <= 1; zz++) { + __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0))); + __m128 vp = _mm_add_ps(ip, cellnoise_color(ip)); + float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp)); + da = min(d, da); + } + } + } +#endif + + return da; +} + +ccl_device float3 voronoi_F1_color(float3 p) +{ + /* returns color of the nearest point */ + float da = 1e10f; + +#ifndef __KERNEL_SSE2__ + float3 pa; + int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z); + + for (int xx = -1; xx 
<= 1; xx++) { + for (int yy = -1; yy <= 1; yy++) { + for (int zz = -1; zz <= 1; zz++) { + float3 ip = make_float3(ix + xx, iy + yy, iz + zz); + float3 vp = ip + cellnoise_color(ip); + float d = len_squared(p - vp); + + if(d < da) { + da = d; + pa = vp; + } + } + } + } + + return cellnoise_color(pa); +#else + __m128 pa, vec_p = load_m128(p); + __m128i xyzi = quick_floor_sse(vec_p); + + for (int xx = -1; xx <= 1; xx++) { + for (int yy = -1; yy <= 1; yy++) { + for (int zz = -1; zz <= 1; zz++) { + __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0))); + __m128 vp = _mm_add_ps(ip, cellnoise_color(ip)); + float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp)); + + if(d < da) { + da = d; + pa = vp; + } + } + } + } + + __m128 color = cellnoise_color(pa); + return (float3 &)color; +#endif +} +#if 0 ccl_device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; } ccl_device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; } ccl_device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; } @@ -139,6 +227,7 @@ ccl_device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; } ccl_device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; } ccl_device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; } ccl_device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; } +#endif /* Noise Bases */ |