Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lukas Stockner <lukas.stockner@freenet.de> 2018-10-06 21:39:01 +0300
committer: Lukas Stockner <lukas.stockner@freenet.de> 2018-10-06 22:49:54 +0300
commit: a0cc7bd961ef2cd501468dce08998992a88b3bed (patch)
tree: 90a26871e42e7aafc7d17c0172b4513a501896f8 /intern/cycles/util
parent: 6e08b01bd9ab87f63d8176bf2ec0b4f073a92ee3 (diff)
Cycles: Implement vectorized NLM kernels for faster CPU denoising
Diffstat (limited to 'intern/cycles/util')
-rw-r--r-- intern/cycles/util/util_math.h 24
-rw-r--r-- intern/cycles/util/util_math_fast.h 28
-rw-r--r-- intern/cycles/util/util_math_float4.h 12
-rw-r--r-- intern/cycles/util/util_math_int4.h 40
-rw-r--r-- intern/cycles/util/util_types_int4.h 2
-rw-r--r-- intern/cycles/util/util_types_int4_impl.h 10
6 files changed, 116 insertions, 0 deletions
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 52aeb8d8599..eafae5f31c0 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -220,6 +220,30 @@ ccl_device_inline float __uint_as_float(uint i)
u.i = i;
return u.f;
}
+
+ccl_device_inline int4 __float4_as_int4(float4 f)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_castps_si128(f.m128));
+ #else
+ return make_int4(__float_as_int(f.x),
+ __float_as_int(f.y),
+ __float_as_int(f.z),
+ __float_as_int(f.w));
+#endif
+}
+
+ccl_device_inline float4 __int4_as_float4(int4 i)
+{
+#ifdef __KERNEL_SSE__
+ return float4(_mm_castsi128_ps(i.m128));
+#else
+ return make_float4(__int_as_float(i.x),
+ __int_as_float(i.y),
+ __int_as_float(i.z),
+ __int_as_float(i.w));
+#endif
+}
#endif /* __KERNEL_OPENCL__ */
/* Versions of functions which are safe for fast math. */
diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h
index d3960deb3b4..323d40058e5 100644
--- a/intern/cycles/util/util_math_fast.h
+++ b/intern/cycles/util/util_math_fast.h
@@ -58,6 +58,11 @@ ccl_device_inline float madd(const float a, const float b, const float c)
return a * b + c;
}
+ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
+{
+ return a * b + c;
+}
+
/*
* FAST & APPROXIMATE MATH
*
@@ -438,6 +443,29 @@ ccl_device_inline float fast_expf(float x)
return fast_exp2f(x / M_LN2_F);
}
+#ifndef __KERNEL_GPU__
+ccl_device float4 fast_exp2f4(float4 x)
+{
+ const float4 one = make_float4(1.0f);
+ const float4 limit = make_float4(126.0f);
+ x = clamp(x, -limit, limit);
+ int4 m = make_int4(x);
+ x = one - (one - (x - make_float4(m)));
+ float4 r = make_float4(1.33336498402e-3f);
+ r = madd4(x, r, make_float4(9.810352697968e-3f));
+ r = madd4(x, r, make_float4(5.551834031939e-2f));
+ r = madd4(x, r, make_float4(0.2401793301105f));
+ r = madd4(x, r, make_float4(0.693144857883f));
+ r = madd4(x, r, make_float4(1.0f));
+ return __int4_as_float4(__float4_as_int4(r) + (m << 23));
+}
+
+ccl_device_inline float4 fast_expf4(float4 x)
+{
+ return fast_exp2f4(x / M_LN2_F);
+}
+#endif
+
ccl_device_inline float fast_exp10(float x)
{
/* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index aa7e56fefe9..105547098b5 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -38,6 +38,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b);
ccl_device_inline float4 operator-(const float4& a, const float4& b);
ccl_device_inline float4 operator+=(float4& a, const float4& b);
ccl_device_inline float4 operator*=(float4& a, const float4& b);
+ccl_device_inline float4 operator*=(float4& a, float f);
ccl_device_inline float4 operator/=(float4& a, float f);
ccl_device_inline int4 operator<(const float4& a, const float4& b);
@@ -58,6 +59,7 @@ ccl_device_inline float4 normalize(const float4& a);
ccl_device_inline float4 safe_normalize(const float4& a);
ccl_device_inline float4 min(const float4& a, const float4& b);
ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx);
ccl_device_inline float4 fabs(const float4& a);
#endif /* !__KERNEL_OPENCL__*/
@@ -168,6 +170,11 @@ ccl_device_inline float4 operator*=(float4& a, const float4& b)
return a = a * b;
}
+ccl_device_inline float4 operator*=(float4& a, float f)
+{
+ return a = a * f;
+}
+
ccl_device_inline float4 operator/=(float4& a, float f)
{
return a = a / f;
@@ -333,6 +340,11 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
#endif
}
+ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx)
+{
+ return min(max(a, mn), mx);
+}
+
ccl_device_inline float4 fabs(const float4& a)
{
#ifdef __KERNEL_SSE__
diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h
index 79a8c0841e7..cde366b8c27 100644
--- a/intern/cycles/util/util_math_int4.h
+++ b/intern/cycles/util/util_math_int4.h
@@ -31,6 +31,10 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline int4 operator+(const int4& a, const int4& b);
ccl_device_inline int4 operator+=(int4& a, const int4& b);
ccl_device_inline int4 operator>>(const int4& a, int i);
+ccl_device_inline int4 operator<<(const int4& a, int i);
+ccl_device_inline int4 operator<(const int4& a, const int4& b);
+ccl_device_inline int4 operator>=(const int4& a, const int4& b);
+ccl_device_inline int4 operator&(const int4& a, const int4& b);
ccl_device_inline int4 min(int4 a, int4 b);
ccl_device_inline int4 max(int4 a, int4 b);
ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx);
@@ -65,6 +69,42 @@ ccl_device_inline int4 operator>>(const int4& a, int i)
#endif
}
+ccl_device_inline int4 operator<<(const int4& a, int i)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_slli_epi32(a.m128, i));
+#else
+ return make_int4(a.x << i, a.y << i, a.z << i, a.w << i);
+#endif
+}
+
+ccl_device_inline int4 operator<(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_cmplt_epi32(a.m128, b.m128));
+#else
+ return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
+#endif
+}
+
+ccl_device_inline int4 operator>=(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128)));
+#else
+ return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
+#endif
+}
+
+ccl_device_inline int4 operator&(const int4& a, const int4& b)
+{
+#ifdef __KERNEL_SSE__
+ return int4(_mm_and_si128(a.m128, b.m128));
+#else
+ return make_int4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
+#endif
+}
+
ccl_device_inline int4 min(int4 a, int4 b)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h
index cdd0ecbdae5..4ef162f1ac6 100644
--- a/intern/cycles/util/util_types_int4.h
+++ b/intern/cycles/util/util_types_int4.h
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
#ifndef __KERNEL_GPU__
struct float3;
+struct float4;
struct ccl_try_align(16) int4 {
#ifdef __KERNEL_SSE__
@@ -53,6 +54,7 @@ struct ccl_try_align(16) int4 {
ccl_device_inline int4 make_int4(int i);
ccl_device_inline int4 make_int4(int x, int y, int z, int w);
ccl_device_inline int4 make_int4(const float3& f);
+ccl_device_inline int4 make_int4(const float4& f);
ccl_device_inline void print_int4(const char *label, const int4& a);
#endif /* __KERNEL_GPU__ */
diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h
index 07cdc88f2dc..a62561709de 100644
--- a/intern/cycles/util/util_types_int4_impl.h
+++ b/intern/cycles/util/util_types_int4_impl.h
@@ -104,6 +104,16 @@ ccl_device_inline int4 make_int4(const float3& f)
return a;
}
+ccl_device_inline int4 make_int4(const float4& f)
+{
+#ifdef __KERNEL_SSE__
+ int4 a(_mm_cvtps_epi32(f.m128));
+#else
+ int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w};
+#endif
+ return a;
+}
+
ccl_device_inline void print_int4(const char *label, const int4& a)
{
printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w);