diff options
author | Joseph Eagar <joeedh@gmail.com> | 2022-11-12 01:18:46 +0300 |
---|---|---|
committer | Joseph Eagar <joeedh@gmail.com> | 2022-11-12 01:18:46 +0300 |
commit | c29795452cc71cb9f5a571a4aff0f593a2d7acaf (patch) | |
tree | 53a46bb77f3102c545f7e55d3344e310b3bf6116 /intern/cycles/util/transform_inverse.h | |
parent | 9980fd0b8e1f3a07060316f28469f55a3f2fc0cd (diff) | |
parent | 03ccf37162d365f3fdc8d8cd0cd6e9ff314fec6e (diff) |
Merge branch 'master' into temp-sculpt-roll-mapping (branch: temp-sculpt-roll-mapping)
Diffstat (limited to 'intern/cycles/util/transform_inverse.h')
-rw-r--r-- | intern/cycles/util/transform_inverse.h | 27 |
1 files changed, 17 insertions, 10 deletions
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
index bb410a6daef..2faac576d82 100644
--- a/intern/cycles/util/transform_inverse.h
+++ b/intern/cycles/util/transform_inverse.h
@@ -9,26 +9,33 @@ CCL_NAMESPACE_BEGIN
  * Normally we don't use SSE41/AVX outside the kernel, but for this it's
  * important to match exactly for ray tracing precision. */
 
-ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const float3 b_)
 {
 #if defined(__AVX2__) && defined(__KERNEL_SSE2__)
-  const ssef sse_a = (const __m128 &)a;
-  const ssef sse_b = (const __m128 &)b;
-  const ssef r = shuffle<1, 2, 0, 3>(
-      ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+  const __m128 a = (const __m128 &)a_;
+  const __m128 b = (const __m128 &)b_;
+  const __m128 a_shuffle = _mm_castsi128_ps(
+      _mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(3, 0, 2, 1)));
+  const __m128 b_shuffle = _mm_castsi128_ps(
+      _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(3, 0, 2, 1)));
+  const __m128 r = _mm_castsi128_ps(
+      _mm_shuffle_epi32(_mm_castps_si128(_mm_fmsub_ps(a, b_shuffle, _mm_mul_ps(a_shuffle, b))),
+                        _MM_SHUFFLE(3, 0, 2, 1)));
   return (const float3 &)r;
 #endif
 
-  return cross(a, b);
+  return cross(a_, b_);
 }
 
-ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_)
 {
-#ifdef __SSE4_1__
-  return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+  const __m128 a = (const __m128 &)a_;
+  const __m128 b = (const __m128 &)b_;
+  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
 #endif
 
-  return dot(a, b);
+  return dot(a_, b_);
 }
 
 ccl_device_forceinline Transform transform_inverse_impl(const Transform tfm)