From a24fbf3323101cd35332161b12a04e687b5583e4 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Wed, 2 Aug 2017 02:23:03 +0200
Subject: Code refactor: add, remove, optimize various SSE functions.

* Remove some unnecessary SSE emulation defines.
* Use full precision float division so we can enable it.
* Add sqrt(), sqr(), fabs(), shuffle variations, mask().
* Optimize reduce_add(), select().

Differential Revision: https://developer.blender.org/D2764
---
 intern/cycles/util/util_math.h        |  1 +
 intern/cycles/util/util_math_float3.h | 20 +++-----
 intern/cycles/util/util_math_float4.h | 93 ++++++++++++++++++++++++++++-------
 intern/cycles/util/util_simd.h        | 52 ++------------------
 4 files changed, 86 insertions(+), 80 deletions(-)

(limited to 'intern/cycles/util')

diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index b719640b19c..4d51ec5570a 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b)
 #ifndef __KERNEL_GPU__
 using std::isfinite;
 using std::isnan;
+using std::sqrt;
 
 ccl_device_inline int abs(int x)
 {
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index bb04c4aa2d9..e73e5bc17a2 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f)
 
 ccl_device_inline float3 operator*(const float f, const float3& a)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
+#if defined(__KERNEL_SSE__)
 	return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
 #else
 	return make_float3(a.x*f, a.y*f, a.z*f);
@@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a)
 
 ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(a.m128);
-	return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
 #else
 	return make_float3(f / a.x, f / a.y, f / a.z);
 #endif
@@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f)
 
 ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(b.m128);
-	return float3(_mm_mul_ps(a, rc));
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(a.m128, b.m128));
 #else
 	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
 #endif
@@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
 ccl_device_inline float3 rcp(const float3& a)
 {
 #ifdef __KERNEL_SSE__
-	const float4 r(_mm_rcp_ps(a.m128));
-	return float3(_mm_sub_ps(_mm_add_ps(r, r),
-	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
 	return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
 #endif
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index d89121b3a1d..007b3fc5082 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -48,6 +48,8 @@ ccl_device_inline bool operator==(const float4& a, const float4& b);
 ccl_device_inline float dot(const float4& a, const float4& b);
 ccl_device_inline float len_squared(const float4& a);
 ccl_device_inline float4 rcp(const float4& a);
+ccl_device_inline float4 sqrt(const float4& a);
+ccl_device_inline float4 sqr(const float4& a);
 ccl_device_inline float4 cross(const float4& a, const float4& b);
 ccl_device_inline bool is_zero(const float4& a);
 ccl_device_inline float reduce_add(const float4& a);
@@ -57,14 +59,20 @@ ccl_device_inline float4 normalize(const float4& a);
 ccl_device_inline float4 safe_normalize(const float4& a);
 ccl_device_inline float4 min(const float4& a, const float4& b);
 ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 fabs(const float4& a);
 #endif /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4& b);
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b);
 
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b);
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b);
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b);
+
 # ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b);
 template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b);
@@ -77,9 +85,6 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b);
 ccl_device_inline float4 reduce_min(const float4& a);
 ccl_device_inline float4 reduce_max(const float4& a);
-# if 0
-ccl_device_inline float4 reduce_add(const float4& a);
-# endif
 #endif /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f)
 ccl_device_inline float4 operator/(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	return a * rcp(b);
+	return float4(_mm_div_ps(a.m128, b.m128));
 #else
 	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
 #endif
@@ -224,14 +229,30 @@ ccl_device_inline float len_squared(const float4& a)
 ccl_device_inline float4 rcp(const float4& a)
 {
 #ifdef __KERNEL_SSE__
-	float4 r(_mm_rcp_ps(a.m128));
-	return float4(_mm_sub_ps(_mm_add_ps(r, r),
-	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
 	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
 #endif
 }
 
+ccl_device_inline float4 sqrt(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_sqrt_ps(a.m128));
+#else
+	return make_float4(sqrtf(a.x),
+	                   sqrtf(a.y),
+	                   sqrtf(a.z),
+	                   sqrtf(a.w));
+#endif
+}
+
+ccl_device_inline float4 sqr(const float4& a)
+{
+	return a * a;
+}
+
 ccl_device_inline float4 cross(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
@@ -257,9 +278,13 @@ ccl_device_inline bool is_zero(const float4& a)
 ccl_device_inline float reduce_add(const float4& a)
 {
 #ifdef __KERNEL_SSE__
+# ifdef __KERNEL_SSE3__
+	float4 h(_mm_hadd_ps(a.m128, a.m128));
+	return _mm_cvtss_f32(_mm_hadd_ps(h.m128, h.m128));
+# else
 	float4 h(shuffle<1,0,3,2>(a) + a);
-	/* TODO(sergey): Investigate efficiency. */
-	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+# endif
 #else
 	return ((a.x + a.y) + (a.z + a.w));
 #endif
@@ -309,6 +334,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
 	                   max(a.w, b.w));
 #endif
 }
+
+ccl_device_inline float4 fabs(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+#else
+	return make_float4(fabsf(a.x),
+	                   fabsf(a.y),
+	                   fabsf(a.z),
+	                   fabsf(a.w));
+#endif
+}
 #endif /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
@@ -320,11 +357,28 @@ __forceinline const float4 shuffle(const float4& b)
 	                          _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
 }
 
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b)
+{
+	return float4(_mm_shuffle_ps(a.m128, b.m128,
+	                             _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+}
+
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
 {
 	return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
 }
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b)
+{
+	return float4(_mm_movelh_ps(a.m128, b.m128));
+}
+
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b)
+{
+	return float4(_mm_movehl_ps(b.m128, a.m128));
+}
+
 # ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
 {
@@ -344,9 +398,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	/* TODO(sergey): avoid cvt. */
-	return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a),
-	                        _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)));
+	return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
 #else
 	return make_float4((mask.x)? a.x: b.x,
 	                   (mask.y)? a.y: b.y,
@@ -355,6 +407,13 @@ ccl_device_inline float4 select(const int4& mask,
 #endif
 }
 
+ccl_device_inline float4 mask(const int4& mask,
+                              const float4& a)
+{
+	/* Replace elements of x with zero where mask isn't set. */
+	return select(mask, a, make_float4(0.0f));
+}
+
 ccl_device_inline float4 reduce_min(const float4& a)
 {
 #ifdef __KERNEL_SSE__
@@ -375,17 +434,15 @@ ccl_device_inline float4 reduce_max(const float4& a)
 #endif
 }
 
-#if 0
-ccl_device_inline float4 reduce_add(const float4& a)
+ccl_device_inline float4 load_float4(const float *v)
 {
 #ifdef __KERNEL_SSE__
-	float4 h = shuffle<1,0,3,2>(a) + a;
-	return shuffle<2,3,0,1>(h) + h;
+	return float4(_mm_loadu_ps(v));
 #else
-	return make_float4((a.x + a.y) + (a.z + a.w));
+	return make_float4(v[0], v[1], v[2], v[3]);
 #endif
 }
-#endif
+
 #endif /* !__KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 7d938a0fbca..66dd80420ae 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -347,12 +347,9 @@ __forceinline size_t __bscf(size_t& v)
 
 #endif /* _WIN32 */
 
-static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
-static const size_t BITSCAN_NO_BIT_SET_64 = 64;
+#if !(defined(__SSE4_1__) || defined(__SSE4_2__))
 
-#ifdef __KERNEL_SSE3__
-/* Emulation of SSE4 functions with SSE3 */
-# ifndef __KERNEL_SSE41__
+/* Emulation of SSE4 functions with SSE2 */
 
 #define _MM_FROUND_TO_NEAREST_INT 0x00
 #define _MM_FROUND_TO_NEG_INF 0x01
@@ -361,48 +358,31 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64;
 #define _MM_FROUND_CUR_DIRECTION 0x04
 
 #undef _mm_blendv_ps
-#define _mm_blendv_ps __emu_mm_blendv_ps
 __forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) {
   return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value));
 }
 
 #undef _mm_blend_ps
-#define _mm_blend_ps __emu_mm_blend_ps
 __forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) {
   assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
 }
 
 #undef _mm_blendv_epi8
-#define _mm_blendv_epi8 __emu_mm_blendv_epi8
 __forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) {
   return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
 }
 
-#undef _mm_mullo_epi32
-#define _mm_mullo_epi32 __emu_mm_mullo_epi32
-__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
-  __m128i rvalue;
-  char* _r = (char*)(&rvalue + 1);
-  char* _v = (char*)(& value + 1);
-  char* _i = (char*)(& input + 1);
-  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i));
-  return rvalue;
-}
-
 #undef _mm_min_epi32
-#define _mm_min_epi32 __emu_mm_min_epi32
 __forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) {
   return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
 }
 
 #undef _mm_max_epi32
-#define _mm_max_epi32 __emu_mm_max_epi32
 __forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) {
   return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
 }
 
 #undef _mm_extract_epi32
-#define _mm_extract_epi32 __emu_mm_extract_epi32
 __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
   switch ( index ) {
   case 0: return _mm_cvtsi128_si32(input);
@@ -414,24 +394,15 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
 }
 
 #undef _mm_insert_epi32
-#define _mm_insert_epi32 __emu_mm_insert_epi32
 __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) {
   assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value;
 }
 
-#undef _mm_extract_ps
-#define _mm_extract_ps __emu_mm_extract_ps
-__forceinline int _mm_extract_ps( __m128 input, const int index ) {
-  int32_t* ptr = (int32_t*)&input; return ptr[index];
-}
-
 #undef _mm_insert_ps
-#define _mm_insert_ps __emu_mm_insert_ps
 __forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index )
 { assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); }
 
 #undef _mm_round_ps
-#define _mm_round_ps __emu_mm_round_ps
 __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
 {
   switch ( flags )
@@ -444,22 +415,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
   return value;
 }
 
-# ifdef _M_X64
-#undef _mm_insert_epi64
-#define _mm_insert_epi64 __emu_mm_insert_epi64
-__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) {
-  assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value;
-}
-
-#undef _mm_extract_epi64
-#define _mm_extract_epi64 __emu_mm_extract_epi64
-__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) {
-  assert(size_t(index) < 2);
-  return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input));
-}
-# endif
-
-# endif
+#endif /* !(defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
 #undef _mm_fabs_ps
 #define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
@@ -494,8 +450,6 @@ ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
 #undef _mm_mask_ps
 #define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
 
-#endif
-
 #else  /* __KERNEL_SSE2__ */
 
 /* This section is for utility functions which operates on non-register data
-- 
cgit v1.2.3
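
For illustration only, and not part of the commit: a minimal standalone sketch of the precision difference behind the "use full precision float division" point in the log. The approximate-reciprocal path (_mm_rcp_ps plus one Newton-Raphson step) mirrors what the old rcp()/operator/ code did, and the _mm_div_ps path is what the patch switches to; the file name and test values are made up for the example. Build with something like: g++ -msse2 -O2 rcp_precision.cc && ./a.out

    /* rcp_precision.cc -- compare approximate reciprocal vs. full division. */
    #include <cstdio>
    #include <xmmintrin.h>  /* SSE: _mm_rcp_ps, _mm_div_ps, ... */

    int main()
    {
        const __m128 a = _mm_set_ps(7.0f, 3.0f, 1.5f, 0.1f);

        /* Old style: ~12-bit reciprocal estimate, refined once with
         * Newton-Raphson: r' = 2*r - a*r*r. */
        __m128 r = _mm_rcp_ps(a);
        __m128 approx = _mm_sub_ps(_mm_add_ps(r, r),
                                   _mm_mul_ps(_mm_mul_ps(r, r), a));

        /* New style: full precision IEEE division, as in the patched rcp(). */
        __m128 exact = _mm_div_ps(_mm_set1_ps(1.0f), a);

        float fa[4], fapprox[4], fexact[4];
        _mm_storeu_ps(fa, a);
        _mm_storeu_ps(fapprox, approx);
        _mm_storeu_ps(fexact, exact);

        for(int i = 0; i < 4; i++) {
            printf("1/%-4g  approx=%.9g  div=%.9g  diff=%g\n",
                   fa[i], fapprox[i], fexact[i], fapprox[i] - fexact[i]);
        }
        return 0;
    }

The remaining difference per lane is what the old TODO comments called "precision issues"; paying the cost of a real divide is what lets the SSE paths be enabled again.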
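Also for illustration only: a sketch of the two optimizations named in the log, reduce_add() via SSE3 horizontal adds and select() via SSE4.1 _mm_blendv_ps instead of and/andnot on a converted mask. The file and helper names here are invented for the example; only the intrinsic sequences mirror the patched code. Build with something like: g++ -msse4.1 -O2 reduce_select.cc && ./a.out

    /* reduce_select.cc -- horizontal add and per-lane select, as optimized above. */
    #include <cstdio>
    #include <pmmintrin.h>  /* SSE3: _mm_hadd_ps */
    #include <smmintrin.h>  /* SSE4.1: _mm_blendv_ps */

    /* Horizontal sum of the four lanes, as in the SSE3 path of reduce_add(). */
    static inline float reduce_add_sse3(__m128 a)
    {
        __m128 h = _mm_hadd_ps(a, a);            /* (x+y, z+w, x+y, z+w) */
        return _mm_cvtss_f32(_mm_hadd_ps(h, h)); /* x+y+z+w in the low lane */
    }

    /* Per-lane select: take a where the mask sign bit is set, else b,
     * matching the SSE4.1 path of select(). */
    static inline __m128 select_sse41(__m128i mask, __m128 a, __m128 b)
    {
        return _mm_blendv_ps(b, a, _mm_castsi128_ps(mask));
    }

    int main()
    {
        __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
        __m128 b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);
        __m128i mask = _mm_set_epi32(0, -1, 0, -1);  /* pick a in lanes 0 and 2 */

        printf("reduce_add = %g\n", reduce_add_sse3(a));  /* prints 10 */

        float out[4];
        _mm_storeu_ps(out, select_sse41(mask, a, b));
        printf("select     = %g %g %g %g\n",
               out[0], out[1], out[2], out[3]);  /* prints 1 20 3 40 */
        return 0;
    }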