diff options
Diffstat (limited to 'intern/cycles/util/math_float4.h')
-rw-r--r-- | intern/cycles/util/math_float4.h | 148 |
1 files changed, 94 insertions, 54 deletions
Review note: the added exp(), log() and pow() float4 overloads below take their fourth
component from v.w; the text as captured passed v.z twice, so the w lane mirrored z.

diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index ae9dfe75a9c..c2721873037 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -55,7 +55,8 @@ ccl_device_inline float4 floor(const float4 &a);
 ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
 #endif /* !__KERNEL_METAL__*/
 
-ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
+ccl_device_inline float4 safe_divide(const float4 a, const float b);
 
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
@@ -74,11 +75,14 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
 #  endif
 #endif /* __KERNEL_SSE__ */
 
+ccl_device_inline float reduce_min(const float4 a);
+ccl_device_inline float reduce_max(const float4 a);
+ccl_device_inline float reduce_add(const float4 a);
+
+ccl_device_inline bool isequal(const float4 a, const float4 b);
+
 #ifndef __KERNEL_GPU__
 ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-ccl_device_inline float4 reduce_min(const float4 &a);
-ccl_device_inline float4 reduce_max(const float4 &a);
-ccl_device_inline float4 reduce_add(const float4 &a);
 #endif /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -303,27 +307,9 @@ ccl_device_inline bool is_zero(const float4 &a)
 #  endif
 }
 
-ccl_device_inline float4 reduce_add(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vdupq_n_f32(vaddvq_f32(a)));
-#    elif defined(__KERNEL_SSE3__)
-  float4 h(_mm_hadd_ps(a.m128, a.m128));
-  return float4(_mm_hadd_ps(h.m128, h.m128));
-#    else
-  float4 h(shuffle<1, 0, 3, 2>(a) + a);
-  return shuffle<2, 3, 0, 1>(h) + h;
-#    endif
-#  else
-  float sum = (a.x + a.y) + (a.z + a.w);
-  return make_float4(sum, sum, sum, sum);
-#  endif
-}
-
 ccl_device_inline float average(const float4 &a)
 {
-  return reduce_add(a).x * 0.25f;
+  return reduce_add(a) * 0.25f;
 }
 
 ccl_device_inline float len(const float4 &a)
@@ -392,8 +378,77 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
   return a + t * (b - a);
 }
 
+ccl_device_inline float4 saturate(const float4 &a)
+{
+  return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+}
+
+ccl_device_inline float4 exp(float4 v)
+{
+  return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.w));
+}
+
+ccl_device_inline float4 log(float4 v)
+{
+  return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.w));
+}
+
 #endif /* !__KERNEL_METAL__*/
 
+ccl_device_inline float reduce_add(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_NEON__)
+  return vaddvq_f32(a);
+#  elif defined(__KERNEL_SSE3__)
+  float4 h(_mm_hadd_ps(a.m128, a.m128));
+  return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
+#  else
+  float4 h(shuffle<1, 0, 3, 2>(a) + a);
+  return _mm_cvtss_f32(shuffle<2, 3, 0, 1>(h) + h);
+#  endif
+#else
+  return a.x + a.y + a.z + a.w;
+#endif
+}
+
+ccl_device_inline float reduce_min(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_NEON__)
+  return vminvq_f32(a);
+#  else
+  float4 h = min(shuffle<1, 0, 3, 2>(a), a);
+  return _mm_cvtss_f32(min(shuffle<2, 3, 0, 1>(h), h));
+#  endif
+#else
+  return min(min(a.x, a.y), min(a.z, a.w));
+#endif
+}
+
+ccl_device_inline float reduce_max(const float4 a)
+{
+#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_NEON__)
+  return vmaxvq_f32(a);
+#  else
+  float4 h = max(shuffle<1, 0, 3, 2>(a), a);
+  return _mm_cvtss_f32(max(shuffle<2, 3, 0, 1>(h), h));
+#  endif
+#else
+  return max(max(a.x, a.y), max(a.z, a.w));
+#endif
+}
+
+ccl_device_inline bool isequal(const float4 a, const float4 b)
+{
+#if defined(__KERNEL_METAL__)
+  return all(a == b);
+#else
+  return a == b;
+#endif
+}
+
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &b)
@@ -461,34 +516,6 @@ ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
   return select(mask, a, zero_float4());
 }
 
-ccl_device_inline float4 reduce_min(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vdupq_n_f32(vminvq_f32(a)));
-#    else
-  float4 h = min(shuffle<1, 0, 3, 2>(a), a);
-  return min(shuffle<2, 3, 0, 1>(h), h);
-#    endif
-#  else
-  return make_float4(min(min(a.x, a.y), min(a.z, a.w)));
-#  endif
-}
-
-ccl_device_inline float4 reduce_max(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vdupq_n_f32(vmaxvq_f32(a)));
-#    else
-  float4 h = max(shuffle<1, 0, 3, 2>(a), a);
-  return max(shuffle<2, 3, 0, 1>(h), h);
-#    endif
-#  else
-  return make_float4(max(max(a.x, a.y), max(a.z, a.w)));
-#  endif
-}
-
 ccl_device_inline float4 load_float4(ccl_private const float *v)
 {
 #  ifdef __KERNEL_SSE__
@@ -500,17 +527,25 @@ ccl_device_inline float4 load_float4(ccl_private const float *v)
 
 #endif /* !__KERNEL_GPU__ */
 
-ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b)
+ccl_device_inline float4 safe_divide(const float4 a, const float b)
 {
   return (b != 0.0f) ? a / b : zero_float4();
 }
 
-ccl_device_inline bool isfinite4_safe(float4 v)
+ccl_device_inline float4 safe_divide(const float4 a, const float4 b)
+{
+  return make_float4((b.x != 0.0f) ? a.x / b.x : 0.0f,
+                     (b.y != 0.0f) ? a.y / b.y : 0.0f,
+                     (b.z != 0.0f) ? a.z / b.z : 0.0f,
+                     (b.w != 0.0f) ? a.w / b.w : 0.0f);
+}
+
+ccl_device_inline bool isfinite_safe(float4 v)
 {
   return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z) && isfinite_safe(v.w);
 }
 
-ccl_device_inline float4 ensure_finite4(float4 v)
+ccl_device_inline float4 ensure_finite(float4 v)
 {
   if (!isfinite_safe(v.x))
     v.x = 0.0f;
@@ -523,6 +558,11 @@ ccl_device_inline float4 ensure_finite4(float4 v)
   return v;
 }
 
+ccl_device_inline float4 pow(float4 v, float e)
+{
+  return make_float4(powf(v.x, e), powf(v.y, e), powf(v.z, e), powf(v.w, e));
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_MATH_FLOAT4_H__ */