diff options
author | Sergey Sharybin <sergey.vfx@gmail.com> | 2016-10-12 15:23:29 +0300 |
---|---|---|
committer | Sergey Sharybin <sergey.vfx@gmail.com> | 2016-10-12 15:43:00 +0300 |
commit | e588106d459207f04d28cfc3456355343d413446 (patch) | |
tree | 3e70269b489f40145e0d46c0502a289317990c0f /intern | |
parent | 42aeb608e75ec976c0bb3d91ca14b49371e43e6d (diff) |
Cycles: Use more SSE intrinsics for float3 type
This gives about a 5% speedup on AVX2 kernels (other kernels still
have SSE disabled for math operations) and solves the slowdown
of the koro scene mentioned in the previous commit.
The title says it all, actually. This commit also contains
changes to pass float3 as a const reference in the affected functions.
This should make MSVC happier without breaking OpenCL, because it's
only done in areas which are compiled only for non-OpenCL targets (inside #ifndef __KERNEL_OPENCL__).
Another patch based on inspiration from Maxym Dmytrychenko, thanks!
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/util/util_math.h | 78 | ||||
-rw-r--r-- | intern/cycles/util/util_types.h | 15 |
2 files changed, 75 insertions, 18 deletions
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 89a882d9b9d..c98407b1f77 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -424,53 +424,87 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t) #ifndef __KERNEL_OPENCL__ -ccl_device_inline float3 operator-(const float3 a) +ccl_device_inline float3 operator-(const float3& a) { +#ifdef __KERNEL_SSE__ + return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#else return make_float3(-a.x, -a.y, -a.z); +#endif } -ccl_device_inline float3 operator*(const float3 a, const float3 b) +ccl_device_inline float3 operator*(const float3& a, const float3& b) { +#ifdef __KERNEL_SSE__ + return float3(_mm_mul_ps(a.m128,b.m128)); +#else return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); +#endif } -ccl_device_inline float3 operator*(const float3 a, float f) +ccl_device_inline float3 operator*(const float3& a, const float f) { +#ifdef __KERNEL_SSE__ + return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f))); +#else return make_float3(a.x*f, a.y*f, a.z*f); +#endif } -ccl_device_inline float3 operator*(float f, const float3 a) +ccl_device_inline float3 operator*(const float f, const float3& a) { +#ifdef __KERNEL_SSE__ + return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f))); +#else return make_float3(a.x*f, a.y*f, a.z*f); +#endif } -ccl_device_inline float3 operator/(float f, const float3 a) +ccl_device_inline float3 operator/(const float f, const float3& a) { - return make_float3(f/a.x, f/a.y, f/a.z); +#ifdef __KERNEL_SSE__ + __m128 rc = _mm_rcp_ps(a.m128); + return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); +#else + return make_float3(f / a.x, f / a.y, f / a.z); +#endif } -ccl_device_inline float3 operator/(const float3 a, float f) +ccl_device_inline float3 operator/(const float3& a, const float f) { float invf = 1.0f/f; - return make_float3(a.x*invf, a.y*invf, a.z*invf); + return a * invf; } -ccl_device_inline float3 operator/(const float3 a, const 
float3 b) +ccl_device_inline float3 operator/(const float3& a, const float3& b) { - return make_float3(a.x/b.x, a.y/b.y, a.z/b.z); +#ifdef __KERNEL_SSE__ + __m128 rc = _mm_rcp_ps(b.m128); + return float3(_mm_mul_ps(a, rc)); +#else + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +#endif } -ccl_device_inline float3 operator+(const float3 a, const float3 b) +ccl_device_inline float3 operator+(const float3& a, const float3& b) { - return make_float3(a.x+b.x, a.y+b.y, a.z+b.z); +#ifdef __KERNEL_SSE__ + return float3(_mm_add_ps(a.m128, b.m128)); +#else + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +#endif } -ccl_device_inline float3 operator-(const float3 a, const float3 b) +ccl_device_inline float3 operator-(const float3& a, const float3& b) { - return make_float3(a.x-b.x, a.y-b.y, a.z-b.z); +#ifdef __KERNEL_SSE__ + return float3(_mm_sub_ps(a.m128, b.m128)); +#else + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +#endif } -ccl_device_inline float3 operator+=(float3& a, const float3 b) +ccl_device_inline float3 operator+=(float3& a, const float3& b) { return a = a + b; } @@ -505,6 +539,15 @@ ccl_device_inline float dot(const float3 a, const float3 b) #endif } +ccl_device_inline float dot_xy(const float3& a, const float3& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b)); +#else + return a.x*b.x + a.y*b.y; +#endif +} + ccl_device_inline float dot(const float4 a, const float4 b) { #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) @@ -563,13 +606,14 @@ ccl_device_inline float3 saturate3(float3 a) ccl_device_inline float3 normalize_len(const float3 a, float *t) { *t = len(a); - return a/(*t); + float x = 1.0f / *t; + return a*x; } ccl_device_inline float3 safe_normalize(const float3 a) { float t = len(a); - return (t != 0.0f)? a/t: a; + return (t != 0.0f)? 
a * (1.0f/t) : a; } ccl_device_inline float3 safe_normalize_len(const float3 a, float *t) diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 6af65f88a02..a000fae4bd6 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 { __forceinline int3(const __m128i a) : m128(a) {} __forceinline operator const __m128i&(void) const { return m128; } __forceinline operator __m128i&(void) { return m128; } + + int3(const int3& a) { m128 = a.m128; } + int3& operator =(const int3& a) { m128 = a.m128; return *this; } #else int x, y, z, w; #endif @@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 { __forceinline int4(const __m128i a) : m128(a) {} __forceinline operator const __m128i&(void) const { return m128; } __forceinline operator __m128i&(void) { return m128; } + + int4(const int4& a) : m128(a.m128) {} + int4& operator=(const int4& a) { m128 = a.m128; return *this; } #else int x, y, z, w; #endif @@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 { }; __forceinline float3() {} - __forceinline float3(const __m128 a) : m128(a) {} + __forceinline float3(const __m128& a) : m128(a) {} __forceinline operator const __m128&(void) const { return m128; } __forceinline operator __m128&(void) { return m128; } + + __forceinline float3(const float3& a) : m128(a.m128) {} + __forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; } #else float x, y, z, w; #endif @@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 { __forceinline float4(const __m128 a) : m128(a) {} __forceinline operator const __m128&(void) const { return m128; } __forceinline operator __m128&(void) { return m128; } + + __forceinline float4(const float4& a) : m128(a.m128) {} + __forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; } + #else float x, y, z, w; #endif |