Diffstat (limited to 'intern/cycles/kernel/svm')
-rw-r--r--   intern/cycles/kernel/svm/svm_image.h   |  10
-rw-r--r--   intern/cycles/kernel/svm/svm_noise.h   | 144
-rw-r--r--   intern/cycles/kernel/svm/svm_texture.h |  22
3 files changed, 88 insertions(+), 88 deletions(-)
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index daf7c6652d2..b34c101f5e7 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -134,8 +134,8 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 {
 #ifdef __KERNEL_CPU__
 #ifdef __KERNEL_SSE2__
-	__m128 r_m128;
-	float4 &r = (float4 &)r_m128;
+	ssef r_ssef;
+	float4 &r = (float4 &)r_ssef;
 	r = kernel_tex_image_interp(id, x, y);
 #else
 	float4 r = kernel_tex_image_interp(id, x, y);
@@ -318,14 +318,14 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 	float alpha = r.w;
 
 	if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
-		r_m128 = _mm_div_ps(r_m128, _mm_set1_ps(alpha));
+		r_ssef = r_ssef / ssef(alpha);
 		if(id >= TEX_NUM_FLOAT_IMAGES)
-			r_m128 = _mm_min_ps(r_m128, _mm_set1_ps(1.0f));
+			r_ssef = min(r_ssef, ssef(1.0f));
 		r.w = alpha;
 	}
 
 	if(srgb) {
-		r_m128 = color_srgb_to_scene_linear(r_m128);
+		r_ssef = color_srgb_to_scene_linear(r_ssef);
 		r.w = alpha;
 	}
 #else
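For readers unfamiliar with the wrapper types this commit switches to: ssef and ssei are thin structs around __m128/__m128i (modeled on embree-style SSE classes) whose overloaded operators compile to the same intrinsics that were previously written out by hand. A minimal sketch of the idiom, covering only the operations the svm_image.h hunks above use, with assumed semantics rather than the actual Cycles util_ssef.h:

#include <xmmintrin.h>

struct ssef {
	__m128 m128;

	ssef() {}
	ssef(const __m128& v) : m128(v) {}
	explicit ssef(float f) : m128(_mm_set1_ps(f)) {} // stands in for _mm_set1_ps at call sites
};

// "a / b" stands in for _mm_div_ps, so "r_ssef / ssef(alpha)" reads as
// the old _mm_div_ps(r_m128, _mm_set1_ps(alpha))
inline ssef operator/(const ssef& a, const ssef& b)
{
	return ssef(_mm_div_ps(a.m128, b.m128));
}

// min(a, b) stands in for _mm_min_ps(a, b)
inline ssef min(const ssef& a, const ssef& b)
{
	return ssef(_mm_min_ps(a.m128, b.m128));
}

With overloads like these, the plus lines above read like scalar float code while generating the same SSE2 instructions as the minus lines they replace.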
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 91dda8972f9..869341c81f4 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -38,11 +38,11 @@ ccl_device int quick_floor(float x)
 	return float_to_int(x) - ((x < 0) ? 1 : 0);
 }
 #else
-ccl_device_inline __m128i quick_floor_sse(const __m128& x)
+ccl_device_inline ssei quick_floor_sse(const ssef& x)
 {
-	__m128i b = _mm_cvttps_epi32(x);
-	__m128i isneg = _mm_castps_si128(_mm_cmplt_ps(x, _mm_set1_ps(0.0f)));
-	return _mm_add_epi32(b, isneg); // unsaturated add 0xffffffff is the same as subtract -1
+	ssei b = truncatei(x);
+	ssei isneg = cast((x < ssef(0.0f)).m128);
+	return b + isneg; // unsaturated add 0xffffffff is the same as subtract -1
 }
 #endif
 
@@ -52,9 +52,9 @@ ccl_device float bits_to_01(uint bits)
 	return bits * (1.0f/(float)0xFFFFFFFF);
 }
 #else
-ccl_device_inline __m128 bits_to_01_sse(const __m128i& bits)
+ccl_device_inline ssef bits_to_01_sse(const ssei& bits)
 {
-	return _mm_mul_ps(uint32_to_float(bits), _mm_set1_ps(1.0f/(float)0xFFFFFFFF));
+	return uint32_to_float(bits) * ssef(1.0f/(float)0xFFFFFFFF);
 }
 #endif
 
@@ -88,16 +88,16 @@ ccl_device uint hash(uint kx, uint ky, uint kz)
 }
 
 #ifdef __KERNEL_SSE2__
-ccl_device_inline __m128i hash_sse(const __m128i& kx, const __m128i& ky, const __m128i& kz)
+ccl_device_inline ssei hash_sse(const ssei& kx, const ssei& ky, const ssei& kz)
 {
-#define rot(x,k) _mm_or_si128(_mm_slli_epi32((x), (k)), _mm_srli_epi32((x), 32-(k)))
-#define xor_rot(a, b, c) do {a = _mm_xor_si128(a, b); a = _mm_sub_epi32(a, rot(b, c));} while(0)
+#define rot(x,k) (((x)<<(k)) | (srl(x, 32-(k))))
+#define xor_rot(a, b, c) do {a = a^b; a = a - rot(b, c);} while(0)
 
 	uint len = 3;
-	__m128i magic = _mm_set1_epi32(0xdeadbeef + (len << 2) + 13);
-	__m128i a = _mm_add_epi32(magic, kx);
-	__m128i b = _mm_add_epi32(magic, ky);
-	__m128i c = _mm_add_epi32(magic, kz);
+	ssei magic = ssei(0xdeadbeef + (len << 2) + 13);
+	ssei a = magic + kx;
+	ssei b = magic + ky;
+	ssei c = magic + kz;
 
 	xor_rot(c, b, 14);
 	xor_rot(a, c, 11);
@@ -133,10 +133,10 @@ ccl_device float floorfrac(float x, int* i)
 	return x - *i;
 }
 #else
-ccl_device_inline __m128 floorfrac_sse(const __m128& x, __m128i *i)
+ccl_device_inline ssef floorfrac_sse(const ssef& x, ssei *i)
 {
 	*i = quick_floor_sse(x);
-	return _mm_sub_ps(x, _mm_cvtepi32_ps(*i));
+	return x - ssef(*i);
 }
 #endif
 
@@ -146,11 +146,11 @@ ccl_device float fade(float t)
 {
 	return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
 }
 #else
-ccl_device_inline __m128 fade_sse(const __m128 *t)
+ccl_device_inline ssef fade_sse(const ssef *t)
 {
-	__m128 a = fma(*t, _mm_set1_ps(6.0f), _mm_set1_ps(-15.0f));
-	__m128 b = fma(*t, a, _mm_set1_ps(10.0f));
-	return _mm_mul_ps(_mm_mul_ps(*t, *t), _mm_mul_ps(*t, b));
+	ssef a = madd(*t, ssef(6.0f), ssef(-15.0f));
+	ssef b = madd(*t, a, ssef(10.0f));
+	return ((*t) * (*t)) * ((*t) * b);
 }
 #endif
@@ -160,10 +160,10 @@ ccl_device float nerp(float t, float a, float b)
 	return (1.0f - t) * a + t * b;
 }
 #else
-ccl_device_inline __m128 nerp_sse(const __m128& t, const __m128& a, const __m128& b)
+ccl_device_inline ssef nerp_sse(const ssef& t, const ssef& a, const ssef& b)
 {
-	__m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), t), a);
-	return fma(t, b, x1);
+	ssef x1 = (ssef(1.0f) - t) * a;
+	return madd(t, b, x1);
 }
 #endif
 
@@ -178,35 +178,35 @@ ccl_device float grad(int hash, float x, float y, float z)
 	return ((h&1) ? -u : u) + ((h&2) ? -v : v);
 }
 #else
-ccl_device_inline __m128 grad_sse(const __m128i& hash, const __m128& x, const __m128& y, const __m128& z)
+ccl_device_inline ssef grad_sse(const ssei& hash, const ssef& x, const ssef& y, const ssef& z)
 {
-	__m128i c1 = _mm_set1_epi32(1);
-	__m128i c2 = _mm_set1_epi32(2);
+	ssei c1 = ssei(1);
+	ssei c2 = ssei(2);
 
-	__m128i h = _mm_and_si128(hash, _mm_set1_epi32(15));       // h = hash & 15
+	ssei h = hash & ssei(15);                                  // h = hash & 15
 
-	__m128i case_ux = _mm_cmplt_epi32(h, _mm_set1_epi32(8));   // 0xffffffff if h < 8 else 0
+	sseb case_ux = h < ssei(8);                                // 0xffffffff if h < 8 else 0
 
-	__m128 u = blend(_mm_castsi128_ps(case_ux), x, y);         // u = h<8 ? x : y
+	ssef u = select(case_ux, x, y);                            // u = h<8 ? x : y
 
-	__m128i case_vy = _mm_cmplt_epi32(h, _mm_set1_epi32(4));   // 0xffffffff if h < 4 else 0
+	sseb case_vy = h < ssei(4);                                // 0xffffffff if h < 4 else 0
 
-	__m128i case_h12 = _mm_cmpeq_epi32(h, _mm_set1_epi32(12)); // 0xffffffff if h == 12 else 0
-	__m128i case_h14 = _mm_cmpeq_epi32(h, _mm_set1_epi32(14)); // 0xffffffff if h == 14 else 0
+	sseb case_h12 = h == ssei(12);                             // 0xffffffff if h == 12 else 0
+	sseb case_h14 = h == ssei(14);                             // 0xffffffff if h == 14 else 0
 
-	__m128i case_vx = _mm_or_si128(case_h12, case_h14);        // 0xffffffff if h == 12 or h == 14 else 0
+	sseb case_vx = case_h12 | case_h14;                        // 0xffffffff if h == 12 or h == 14 else 0
 
-	__m128 v = blend(_mm_castsi128_ps(case_vy), y, blend(_mm_castsi128_ps(case_vx), x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z
+	ssef v = select(case_vy, y, select(case_vx, x, z));        // v = h<4 ? y : h == 12 || h == 14 ? x : z
 
-	__m128i case_uneg = _mm_slli_epi32(_mm_and_si128(h, c1), 31); // 1<<31 if h&1 else 0
-	__m128 case_uneg_mask = _mm_castsi128_ps(case_uneg);          // -0.0 if h&1 else +0.0
-	__m128 ru = _mm_xor_ps(u, case_uneg_mask);                    // -u if h&1 else u (copy float sign)
+	ssei case_uneg = (h & c1) << 31;                              // 1<<31 if h&1 else 0
+	ssef case_uneg_mask = cast(case_uneg);                        // -0.0 if h&1 else +0.0
+	ssef ru = u ^ case_uneg_mask;                                 // -u if h&1 else u (copy float sign)
 
-	__m128i case_vneg = _mm_slli_epi32(_mm_and_si128(h, c2), 30); // 2<<30 if h&2 else 0
-	__m128 case_vneg_mask = _mm_castsi128_ps(case_vneg);          // -0.0 if h&2 else +0.0
-	__m128 rv = _mm_xor_ps(v, case_vneg_mask);                    // -v if h&2 else v (copy float sign)
+	ssei case_vneg = (h & c2) << 30;                              // 2<<30 if h&2 else 0
+	ssef case_vneg_mask = cast(case_vneg);                        // -0.0 if h&2 else +0.0
+	ssef rv = v ^ case_vneg_mask;                                 // -v if h&2 else v (copy float sign)
 
-	__m128 r = _mm_add_ps(ru, rv);                                // ((h&1) ? -u : u) + ((h&2) ? -v : v)
+	ssef r = ru + rv;                                             // ((h&1) ? -u : u) + ((h&2) ? -v : v)
 	return r;
 }
 #endif
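The grad_sse() rewrite above leans on two further conventions of the wrapper types: comparisons between ssei values return an sseb lane mask, and select(mask, a, b) replaces the old hand-rolled blend(). A minimal sketch of how these could map to SSE2 (assumed implementation; the real types live in the Cycles util headers):

#include <emmintrin.h>

struct sseb {
	__m128 m128;
	sseb(const __m128& v) : m128(v) {}
};

struct ssef {
	__m128 m128;
	ssef(const __m128& v) : m128(v) {}
};

struct ssei {
	__m128i m128;
	ssei(const __m128i& v) : m128(v) {}
	explicit ssei(int i) : m128(_mm_set1_epi32(i)) {}
};

// integer compare produces a 0xffffffff/0 lane mask; reinterpret it as
// float lanes so it can mask ssef values directly
inline sseb operator<(const ssei& a, const ssei& b)
{
	return sseb(_mm_castsi128_ps(_mm_cmplt_epi32(a.m128, b.m128)));
}

// SSE2 has no blendv, so select() is the classic (mask & a) | (~mask & b)
inline ssef select(const sseb& mask, const ssef& a, const ssef& b)
{
	return ssef(_mm_or_ps(_mm_and_ps(mask.m128, a.m128),
	                      _mm_andnot_ps(mask.m128, b.m128)));
}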
@@ -217,9 +217,9 @@ ccl_device float scale3(float result)
 	return 0.9820f * result;
 }
 #else
-ccl_device_inline __m128 scale3_sse(const __m128& result)
+ccl_device_inline ssef scale3_sse(const ssef& result)
 {
-	return _mm_mul_ps(_mm_set1_ps(0.9820f), result);
+	return ssef(0.9820f) * result;
 }
 #endif
 
@@ -252,41 +252,41 @@ ccl_device_noinline float perlin(float x, float y, float z)
 #else
 ccl_device_noinline float perlin(float x, float y, float z)
 {
-	__m128 xyz = _mm_setr_ps(x, y, z, 0.0f);
-	__m128i XYZ;
+	ssef xyz = ssef(x, y, z, 0.0f);
+	ssei XYZ;
 
-	__m128 fxyz = floorfrac_sse(xyz, &XYZ);
+	ssef fxyz = floorfrac_sse(xyz, &XYZ);
 
-	__m128 uvw = fade_sse(&fxyz);
-	__m128 u = broadcast<0>(uvw), v = broadcast<1>(uvw), w = broadcast<2>(uvw);
+	ssef uvw = fade_sse(&fxyz);
+	ssef u = shuffle<0>(uvw), v = shuffle<1>(uvw), w = shuffle<2>(uvw);
 
-	__m128i XYZ_ofc = _mm_add_epi32(XYZ, _mm_set1_epi32(1));
-	__m128i vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc);                      // +0, +0, +1, +1
-	__m128i vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1
+	ssei XYZ_ofc = XYZ + ssei(1);
+	ssei vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc);                      // +0, +0, +1, +1
+	ssei vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1
 
-	__m128i h1 = hash_sse(broadcast<0>(XYZ), vdy, vdz);     // hash directions 000, 001, 010, 011
-	__m128i h2 = hash_sse(broadcast<0>(XYZ_ofc), vdy, vdz); // hash directions 100, 101, 110, 111
+	ssei h1 = hash_sse(shuffle<0>(XYZ), vdy, vdz);          // hash directions 000, 001, 010, 011
+	ssei h2 = hash_sse(shuffle<0>(XYZ_ofc), vdy, vdz);      // hash directions 100, 101, 110, 111
 
-	__m128 fxyz_ofc = _mm_sub_ps(fxyz, _mm_set1_ps(1.0f));
-	__m128 vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
-	__m128 vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));
+	ssef fxyz_ofc = fxyz - ssef(1.0f);
+	ssef vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
+	ssef vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));
 
-	__m128 g1 = grad_sse(h1, broadcast<0>(fxyz), vfy, vfz);
-	__m128 g2 = grad_sse(h2, broadcast<0>(fxyz_ofc), vfy, vfz);
-	__m128 n1 = nerp_sse(u, g1, g2);
+	ssef g1 = grad_sse(h1, shuffle<0>(fxyz), vfy, vfz);
+	ssef g2 = grad_sse(h2, shuffle<0>(fxyz_ofc), vfy, vfz);
+	ssef n1 = nerp_sse(u, g1, g2);
 
-	__m128 n1_half = shuffle<2, 3, 2, 3>(n1); // extract 2 floats to a separate vector
-	__m128 n2 = nerp_sse(v, n1, n1_half);     // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
+	ssef n1_half = shuffle<2, 3, 2, 3>(n1);   // extract 2 floats to a separate vector
+	ssef n2 = nerp_sse(v, n1, n1_half);       // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
 
-	__m128 n2_second = broadcast<1>(n2);        // extract b to a separate vector
-	__m128 result = nerp_sse(w, n2, n2_second); // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
+	ssef n2_second = shuffle<1>(n2);            // extract b to a separate vector
+	ssef result = nerp_sse(w, n2, n2_second);   // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
 
-	__m128 r = scale3_sse(result);
+	ssef r = scale3_sse(result);
 
-	__m128 infmask = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000));
-	__m128 rinfmask = _mm_cmpeq_ps(_mm_and_ps(r, infmask), infmask); // 0xffffffff if r is inf/-inf/nan else 0
-	__m128 rfinite = _mm_andnot_ps(rinfmask, r);                     // 0 if r is inf/-inf/nan else r
-	return _mm_cvtss_f32(rfinite);
+	ssef infmask = cast(ssei(0x7f800000));
+	ssef rinfmask = ((r & infmask) == infmask).m128; // 0xffffffff if r is inf/-inf/nan else 0
+	ssef rfinite = andnot(rinfmask, r);              // 0 if r is inf/-inf/nan else r
+	return extract<0>(rfinite);
 }
 #endif
 
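The tail of the vectorized perlin() above guards against non-finite results: a float whose exponent bits (0x7f800000) are all ones is inf or nan, so (r & infmask) == infmask flags exactly those lanes and andnot() zeroes them. A scalar restatement of the same test, as a sketch for clarity only (not code from this commit):

#include <cstdint>
#include <cstring>

inline float force_finite(float r)
{
	uint32_t bits;
	std::memcpy(&bits, &r, sizeof(bits));              // reinterpret float bits, scalar style
	const uint32_t infmask = 0x7f800000u;              // exponent bits, as in the diff
	bool is_inf_or_nan = (bits & infmask) == infmask;  // the "== infmask" test
	return is_inf_or_nan ? 0.0f : r;                   // what andnot(rinfmask, r) achieves
}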
@@ -357,12 +357,12 @@ ccl_device float3 cellnoise_color(float3 p)
 	return make_float3(r, g, b);
 }
 #else
-ccl_device __m128 cellnoise_color(const __m128& p)
+ccl_device ssef cellnoise_color(const ssef& p)
 {
-	__m128i ip = quick_floor_sse(p);
-	__m128i ip_yxz = shuffle<1, 0, 2, 3>(ip);
-	__m128i ip_xyy = shuffle<0, 1, 1, 3>(ip);
-	__m128i ip_zzx = shuffle<2, 2, 0, 3>(ip);
+	ssei ip = quick_floor_sse(p);
+	ssei ip_yxz = shuffle<1, 0, 2, 3>(ip);
+	ssei ip_xyy = shuffle<0, 1, 1, 3>(ip);
+	ssei ip_zzx = shuffle<2, 2, 0, 3>(ip);
 	return bits_to_01_sse(hash_sse(ip_xyy, ip_yxz, ip_zzx));
 }
 #endif
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
index 5fd9204cbf6..d97c85db36a 100644
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ b/intern/cycles/kernel/svm/svm_texture.h
@@ -140,15 +140,15 @@ ccl_device float voronoi_F1_distance(float3 p)
 		}
 	}
 #else
-	__m128 vec_p = load_m128(p);
-	__m128i xyzi = quick_floor_sse(vec_p);
+	ssef vec_p = load4f(p);
+	ssei xyzi = quick_floor_sse(vec_p);
 
 	for (int xx = -1; xx <= 1; xx++) {
 		for (int yy = -1; yy <= 1; yy++) {
 			for (int zz = -1; zz <= 1; zz++) {
-				__m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
-				__m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
-				float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+				ssef vp = ip + cellnoise_color(ip);
+				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
 				da = min(d, da);
 			}
 		}
@@ -184,15 +184,15 @@ ccl_device float3 voronoi_F1_color(float3 p)
 
 	return cellnoise_color(pa);
 #else
-	__m128 pa, vec_p = load_m128(p);
-	__m128i xyzi = quick_floor_sse(vec_p);
+	ssef pa, vec_p = load4f(p);
+	ssei xyzi = quick_floor_sse(vec_p);
 
 	for (int xx = -1; xx <= 1; xx++) {
 		for (int yy = -1; yy <= 1; yy++) {
 			for (int zz = -1; zz <= 1; zz++) {
-				__m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
-				__m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
-				float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+				ssef vp = ip + cellnoise_color(ip);
+				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
 
 				if(d < da) {
 					da = d;
@@ -202,7 +202,7 @@ ccl_device float3 voronoi_F1_color(float3 p)
 		}
 	}
 
-	__m128 color = cellnoise_color(pa);
+	ssef color = cellnoise_color(pa);
 	return (float3 &)color;
 #endif
 }
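Finally, the svm_texture.h hunks assume two conversions from the wrapper headers: an explicit ssef(ssei) constructor that converts integer lanes to float (replacing _mm_cvtepi32_ps), and load4f() loading a point into a vector register (replacing the old load_m128 helper). A minimal sketch of both; load4f's parameter is simplified to a plain float array here, since the Cycles float3 type is not shown in this diff:

#include <emmintrin.h>

struct ssei {
	__m128i m128;
	ssei(const __m128i& v) : m128(v) {}
	ssei(int a, int b, int c, int d) : m128(_mm_setr_epi32(a, b, c, d)) {} // replaces _mm_setr_epi32
};

inline ssei operator+(const ssei& a, const ssei& b)
{
	return ssei(_mm_add_epi32(a.m128, b.m128)); // replaces _mm_add_epi32
}

struct ssef {
	__m128 m128;
	ssef(const __m128& v) : m128(v) {}
	explicit ssef(const ssei& v) : m128(_mm_cvtepi32_ps(v.m128)) {} // int -> float lanes
};

// hypothetical stand-in for load4f(p); the real helper takes a Cycles float3
inline ssef load4f(const float p[4])
{
	return ssef(_mm_loadu_ps(p));
}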