From a0cc7bd961ef2cd501468dce08998992a88b3bed Mon Sep 17 00:00:00 2001
From: Lukas Stockner
Date: Sat, 6 Oct 2018 20:39:01 +0200
Subject: Cycles: Implement vectorized NLM kernels for faster CPU denoising

---
 intern/cycles/util/util_math.h            | 24 +++++++++++++++++++
 intern/cycles/util/util_math_fast.h       | 28 ++++++++++++++++++++++
 intern/cycles/util/util_math_float4.h     | 12 ++++++++++
 intern/cycles/util/util_math_int4.h       | 40 +++++++++++++++++++++++++++++++
 intern/cycles/util/util_types_int4.h      |  2 ++
 intern/cycles/util/util_types_int4_impl.h | 10 ++++++++
 6 files changed, 116 insertions(+)

(limited to 'intern/cycles/util')

diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 52aeb8d8599..eafae5f31c0 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -220,6 +220,30 @@ ccl_device_inline float __uint_as_float(uint i)
   u.i = i;
   return u.f;
 }
+
+ccl_device_inline int4 __float4_as_int4(float4 f)
+{
+#ifdef __KERNEL_SSE__
+  return int4(_mm_castps_si128(f.m128));
+#else
+  return make_int4(__float_as_int(f.x),
+                   __float_as_int(f.y),
+                   __float_as_int(f.z),
+                   __float_as_int(f.w));
+#endif
+}
+
+ccl_device_inline float4 __int4_as_float4(int4 i)
+{
+#ifdef __KERNEL_SSE__
+  return float4(_mm_castsi128_ps(i.m128));
+#else
+  return make_float4(__int_as_float(i.x),
+                     __int_as_float(i.y),
+                     __int_as_float(i.z),
+                     __int_as_float(i.w));
+#endif
+}
 #endif  /* __KERNEL_OPENCL__ */
 
 /* Versions of functions which are safe for fast math. */
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index d3960deb3b4..323d40058e5 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -58,6 +58,11 @@ ccl_device_inline float madd(const float a, const float b, const float c)
   return a * b + c;
 }
 
+ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
+{
+  return a * b + c;
+}
+
 /*
  * FAST & APPROXIMATE MATH
  *
@@ -438,6 +443,29 @@ ccl_device_inline float fast_expf(float x)
   return fast_exp2f(x / M_LN2_F);
 }
 
+#ifndef __KERNEL_GPU__
+ccl_device float4 fast_exp2f4(float4 x)
+{
+  const float4 one = make_float4(1.0f);
+  const float4 limit = make_float4(126.0f);
+  x = clamp(x, -limit, limit);
+  int4 m = make_int4(x);
+  x = one - (one - (x - make_float4(m)));
+  float4 r = make_float4(1.33336498402e-3f);
+  r = madd4(x, r, make_float4(9.810352697968e-3f));
+  r = madd4(x, r, make_float4(5.551834031939e-2f));
+  r = madd4(x, r, make_float4(0.2401793301105f));
+  r = madd4(x, r, make_float4(0.693144857883f));
+  r = madd4(x, r, make_float4(1.0f));
+  return __int4_as_float4(__float4_as_int4(r) + (m << 23));
+}
+
+ccl_device_inline float4 fast_expf4(float4 x)
+{
+  return fast_exp2f4(x / M_LN2_F);
+}
+#endif
+
 ccl_device_inline float fast_exp10(float x)
 {
   /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index aa7e56fefe9..105547098b5 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -38,6 +38,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b);
 ccl_device_inline float4 operator-(const float4& a, const float4& b);
 ccl_device_inline float4 operator+=(float4& a, const float4& b);
 ccl_device_inline float4 operator*=(float4& a, const float4& b);
+ccl_device_inline float4 operator*=(float4& a, float f);
 ccl_device_inline float4 operator/=(float4& a, float f);
 
 ccl_device_inline int4 operator<(const float4& a, const float4& b);
@@ -58,6 +59,7 @@ ccl_device_inline float4 normalize(const float4& a);
 ccl_device_inline float4 safe_normalize(const float4& a);
 ccl_device_inline float4 min(const float4& a, const float4& b);
 ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx);
 ccl_device_inline float4 fabs(const float4& a);
 #endif  /* !__KERNEL_OPENCL__*/
 
@@ -168,6 +170,11 @@ ccl_device_inline float4 operator*=(float4& a, const float4& b)
   return a = a * b;
 }
 
+ccl_device_inline float4 operator*=(float4& a, float f)
+{
+  return a = a * f;
+}
+
 ccl_device_inline float4 operator/=(float4& a, float f)
 {
   return a = a / f;
@@ -333,6 +340,11 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
 #endif
 }
 
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx)
+{
+  return min(max(a, mn), mx);
+}
+
 ccl_device_inline float4 fabs(const float4& a)
 {
 #ifdef __KERNEL_SSE__
diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h
index 79a8c0841e7..cde366b8c27 100644
--- a/intern/cycles/util/util_math_int4.h
+++ b/intern/cycles/util/util_math_int4.h
@@ -31,6 +31,10 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline int4 operator+(const int4& a, const int4& b);
 ccl_device_inline int4 operator+=(int4& a, const int4& b);
 ccl_device_inline int4 operator>>(const int4& a, int i);
+ccl_device_inline int4 operator<<(const int4& a, int i);
+ccl_device_inline int4 operator<(const int4& a, const int4& b);
+ccl_device_inline int4 operator>=(const int4& a, const int4& b);
+ccl_device_inline int4 operator&(const int4& a, const int4& b);
 ccl_device_inline int4 min(int4 a, int4 b);
 ccl_device_inline int4 max(int4 a, int4 b);
 ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx);
@@ -65,6 +69,42 @@ ccl_device_inline int4 operator>>(const int4& a, int i)
 #endif
 }
 
+ccl_device_inline int4 operator<<(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+  return int4(_mm_slli_epi32(a.m128, i));
+#else
+  return make_int4(a.x << i, a.y << i, a.z << i, a.w << i);
+#endif
+}
+
+ccl_device_inline int4 operator<(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+  return int4(_mm_cmplt_epi32(a.m128, b.m128));
+#else
+  return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+ccl_device_inline int4 operator>=(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+  return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
+#else
+  return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+ccl_device_inline int4 operator&(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+  return int4(_mm_and_si128(a.m128, b.m128));
+#else
+  return make_int4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
+#endif
+}
+
 ccl_device_inline int4 min(int4 a, int4 b)
 {
 #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h
index cdd0ecbdae5..4ef162f1ac6 100644
--- a/intern/cycles/util/util_types_int4.h
+++ b/intern/cycles/util/util_types_int4.h
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_GPU__
 struct float3;
+struct float4;
 
 struct ccl_try_align(16) int4 {
 #ifdef __KERNEL_SSE__
@@ -53,6 +54,7 @@ struct ccl_try_align(16) int4 {
 ccl_device_inline int4 make_int4(int i);
 ccl_device_inline int4 make_int4(int x, int y, int z, int w);
 ccl_device_inline int4 make_int4(const float3& f);
+ccl_device_inline int4 make_int4(const float4& f);
 ccl_device_inline void print_int4(const char *label, const int4& a);
 #endif  /* __KERNEL_GPU__ */
 
diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h
index 07cdc88f2dc..a62561709de 100644
--- a/intern/cycles/util/util_types_int4_impl.h
+++ b/intern/cycles/util/util_types_int4_impl.h
@@ -104,6 +104,16 @@ ccl_device_inline int4 make_int4(const float3& f)
   return a;
 }
 
+ccl_device_inline int4 make_int4(const float4& f)
+{
+#ifdef __KERNEL_SSE__
+  int4 a(_mm_cvtps_epi32(f.m128));
+#else
+  int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
+#endif
+  return a;
+}
+
 ccl_device_inline void print_int4(const char *label, const int4& a)
 {
   printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);
-- 
cgit v1.2.3
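Note on the fast_exp2f4()/fast_expf4() additions above: the final line scales the polynomial result r by 2^m by adding m directly to the IEEE-754 exponent bits of r, which is why x is clamped to +/-126 first (so the shifted exponent cannot overflow). The NLM denoiser presumably uses fast_expf4() to evaluate its exp(-distance) pixel weights four lanes at a time, but the kernel-side changes are not visible here since this diff is limited to intern/cycles/util. Below is a standalone scalar sketch of the same exponent trick, for illustration only: it is not Cycles code, it assumes 32-bit IEEE-754 floats, and the helper names (float_as_int, int_as_float, scale_by_pow2) are invented for the example.

/* Standalone illustration: scalar analogue of
 * __int4_as_float4(__float4_as_int4(r) + (m << 23)) in fast_exp2f4().
 * Adding m to the reinterpreted bits of r bumps the IEEE-754 exponent
 * field (bits 23..30), i.e. computes r * 2^m without a multiply. */
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

/* Bit reinterpretation, mirroring __float_as_int()/__int_as_float(). */
static int32_t float_as_int(float f) { int32_t i; std::memcpy(&i, &f, sizeof(i)); return i; }
static float   int_as_float(int32_t i) { float f; std::memcpy(&f, &i, sizeof(f)); return f; }

/* r * 2^m via the exponent-bit trick; valid while the exponent stays in
 * range, which the clamp to +/-126 in fast_exp2f4() guarantees. */
static float scale_by_pow2(float r, int m)
{
  return int_as_float(float_as_int(r) + (m << 23));
}

int main()
{
  for(int m = -12; m <= 12; m += 6) {
    const float r = 1.2345f;  /* a normalized value standing in for the polynomial result */
    std::printf("r * 2^%-3d  bit trick: %g   ldexp: %g\n",
                m, scale_by_pow2(r, m), std::ldexp(r, m));
  }
  return 0;
}

This trick is also what keeps the vectorized version branch-free: range reduction, the polynomial, and the 2^m scale are all plain float4/int4 arithmetic, which is why the patch adds make_int4(float4), the int4 left shift, and the float4/int4 bit casts. The int4 comparison and mask operators presumably serve the NLM kernel code itself, outside this path-limited diff.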