Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/intern
diff options
context:
space:
mode:
author: Sergey Sharybin <sergey.vfx@gmail.com> 2016-10-12 15:23:29 +0300
committer: Sergey Sharybin <sergey.vfx@gmail.com> 2016-10-12 15:43:00 +0300
commit: e588106d459207f04d28cfc3456355343d413446 (patch)
tree: 3e70269b489f40145e0d46c0502a289317990c0f /intern
parent: 42aeb608e75ec976c0bb3d91ca14b49371e43e6d (diff)
Cycles: Use more SSE intrinsics for float3 type
This gives about a 5% speedup on AVX2 kernels (other kernels still have SSE disabled for math operations) and resolves the slowdown of the koro scene mentioned in the previous commit. The title says it all, actually. This commit also changes the affected functions to take float3 by const reference. This should make MSVC happier without breaking OpenCL, because it is only done in areas which are ifdef-ed for non-OpenCL. Another patch inspired by Maxym Dmytrychenko, thanks!
Diffstat (limited to 'intern')
-rw-r--r-- intern/cycles/util/util_math.h 78
-rw-r--r-- intern/cycles/util/util_types.h 15
2 files changed, 75 insertions, 18 deletions
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 89a882d9b9d..c98407b1f77 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -424,53 +424,87 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)
#ifndef __KERNEL_OPENCL__
-ccl_device_inline float3 operator-(const float3 a)
+ccl_device_inline float3 operator-(const float3& a)
{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#else
return make_float3(-a.x, -a.y, -a.z);
+#endif
}
-ccl_device_inline float3 operator*(const float3 a, const float3 b)
+ccl_device_inline float3 operator*(const float3& a, const float3& b)
{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_mul_ps(a.m128,b.m128));
+#else
return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+#endif
}
-ccl_device_inline float3 operator*(const float3 a, float f)
+ccl_device_inline float3 operator*(const float3& a, const float f)
{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
+#else
return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
}
-ccl_device_inline float3 operator*(float f, const float3 a)
+ccl_device_inline float3 operator*(const float f, const float3& a)
{
+#ifdef __KERNEL_SSE__
+ return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
+#else
return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
}
-ccl_device_inline float3 operator/(float f, const float3 a)
+ccl_device_inline float3 operator/(const float f, const float3& a)
{
- return make_float3(f/a.x, f/a.y, f/a.z);
+#ifdef __KERNEL_SSE__
+ __m128 rc = _mm_rcp_ps(a.m128);
+ return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#else
+ return make_float3(f / a.x, f / a.y, f / a.z);
+#endif
}
-ccl_device_inline float3 operator/(const float3 a, float f)
+ccl_device_inline float3 operator/(const float3& a, const float f)
{
float invf = 1.0f/f;
- return make_float3(a.x*invf, a.y*invf, a.z*invf);
+ return a * invf;
}
-ccl_device_inline float3 operator/(const float3 a, const float3 b)
+ccl_device_inline float3 operator/(const float3& a, const float3& b)
{
- return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+#ifdef __KERNEL_SSE__
+ __m128 rc = _mm_rcp_ps(b.m128);
+ return float3(_mm_mul_ps(a, rc));
+#else
+ return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+#endif
}
-ccl_device_inline float3 operator+(const float3 a, const float3 b)
+ccl_device_inline float3 operator+(const float3& a, const float3& b)
{
- return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+#ifdef __KERNEL_SSE__
+ return float3(_mm_add_ps(a.m128, b.m128));
+#else
+ return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+#endif
}
-ccl_device_inline float3 operator-(const float3 a, const float3 b)
+ccl_device_inline float3 operator-(const float3& a, const float3& b)
{
- return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+#ifdef __KERNEL_SSE__
+ return float3(_mm_sub_ps(a.m128, b.m128));
+#else
+ return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+#endif
}
-ccl_device_inline float3 operator+=(float3& a, const float3 b)
+ccl_device_inline float3 operator+=(float3& a, const float3& b)
{
return a = a + b;
}
@@ -505,6 +539,15 @@ ccl_device_inline float dot(const float3 a, const float3 b)
#endif
}
+ccl_device_inline float dot_xy(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
+#else
+ return a.x*b.x + a.y*b.y;
+#endif
+}
+
ccl_device_inline float dot(const float4 a, const float4 b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
@@ -563,13 +606,14 @@ ccl_device_inline float3 saturate3(float3 a)
ccl_device_inline float3 normalize_len(const float3 a, float *t)
{
*t = len(a);
- return a/(*t);
+ float x = 1.0f / *t;
+ return a*x;
}
ccl_device_inline float3 safe_normalize(const float3 a)
{
float t = len(a);
- return (t != 0.0f)? a/t: a;
+ return (t != 0.0f)? a * (1.0f/t) : a;
}
ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 6af65f88a02..a000fae4bd6 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 {
__forceinline int3(const __m128i a) : m128(a) {}
__forceinline operator const __m128i&(void) const { return m128; }
__forceinline operator __m128i&(void) { return m128; }
+
+ int3(const int3& a) { m128 = a.m128; }
+ int3& operator =(const int3& a) { m128 = a.m128; return *this; }
#else
int x, y, z, w;
#endif
@@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 {
__forceinline int4(const __m128i a) : m128(a) {}
__forceinline operator const __m128i&(void) const { return m128; }
__forceinline operator __m128i&(void) { return m128; }
+
+ int4(const int4& a) : m128(a.m128) {}
+ int4& operator=(const int4& a) { m128 = a.m128; return *this; }
#else
int x, y, z, w;
#endif
@@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 {
};
__forceinline float3() {}
- __forceinline float3(const __m128 a) : m128(a) {}
+ __forceinline float3(const __m128& a) : m128(a) {}
__forceinline operator const __m128&(void) const { return m128; }
__forceinline operator __m128&(void) { return m128; }
+
+ __forceinline float3(const float3& a) : m128(a.m128) {}
+ __forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
#else
float x, y, z, w;
#endif
@@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 {
__forceinline float4(const __m128 a) : m128(a) {}
__forceinline operator const __m128&(void) const { return m128; }
__forceinline operator __m128&(void) { return m128; }
+
+ __forceinline float4(const float4& a) : m128(a.m128) {}
+ __forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
+
#else
float x, y, z, w;
#endif