/* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ #pragma once CCL_NAMESPACE_BEGIN /* Custom cross and dot implementations that match Embree bit for bit. * Normally we don't use SSE41/AVX outside the kernel, but for this it's * important to match exactly for ray tracing precision. */ ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const float3 b_) { #if defined(__AVX2__) && defined(__KERNEL_SSE2__) const __m128 a = (const __m128 &)a_; const __m128 b = (const __m128 &)b_; const __m128 a_shuffle = _mm_castsi128_ps( _mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(3, 0, 2, 1))); const __m128 b_shuffle = _mm_castsi128_ps( _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(3, 0, 2, 1))); const __m128 r = _mm_castsi128_ps( _mm_shuffle_epi32(_mm_castps_si128(_mm_fmsub_ps(a, b_shuffle, _mm_mul_ps(a_shuffle, b))), _MM_SHUFFLE(3, 0, 2, 1))); return (const float3 &)r; #endif return cross(a_, b_); } ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_) { #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) const __m128 a = (const __m128 &)a_; const __m128 b = (const __m128 &)b_; return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); #endif return dot(a_, b_); } ccl_device_forceinline Transform transform_inverse_impl(const Transform tfm) { /* This implementation matches the one in Embree exactly, to ensure consistent * results with the ray intersection of instances. */ float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x); float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y); float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z); float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w); /* Compute determinant. */ float det = transform_inverse_dot(x, transform_inverse_cross(y, z)); if (det == 0.0f) { /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should * never be in this situation, but try to invert it anyway with tweak. * * This logic does not match Embree which would just give an invalid * matrix. A better solution would be to remove this and ensure any object * matrix is valid. */ x.x += 1e-8f; y.y += 1e-8f; z.z += 1e-8f; det = transform_inverse_dot(x, cross(y, z)); if (det == 0.0f) { det = FLT_MAX; } } /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */ const float3 inverse_x = transform_inverse_cross(y, z) / det; const float3 inverse_y = transform_inverse_cross(z, x) / det; const float3 inverse_z = transform_inverse_cross(x, y) / det; /* Compute translation and fill transform. */ Transform itfm; itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w)); itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w)); itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w)); return itfm; } CCL_NAMESPACE_END