From 230f9ade64844a50ea02461cfa005f364de09aa9 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Tue, 9 Aug 2022 14:28:19 +0200 Subject: Cycles: make transform inverse match Embree exactly Helps improve ray-tracing precision. This is a bit complicated as it requires different implementation depending on the CPU architecture. --- intern/cycles/util/CMakeLists.txt | 9 ++++ intern/cycles/util/transform.cpp | 4 +- intern/cycles/util/transform.h | 65 +++++++++++++---------------- intern/cycles/util/transform_avx2.cpp | 13 ++++++ intern/cycles/util/transform_inverse.h | 76 ++++++++++++++++++++++++++++++++++ intern/cycles/util/transform_sse41.cpp | 13 ++++++ 6 files changed, 141 insertions(+), 39 deletions(-) create mode 100644 intern/cycles/util/transform_avx2.cpp create mode 100644 intern/cycles/util/transform_inverse.h create mode 100644 intern/cycles/util/transform_sse41.cpp (limited to 'intern/cycles/util') diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 9bc9f00e142..f3fc7739f66 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -26,6 +26,8 @@ set(SRC thread.cpp time.cpp transform.cpp + transform_avx2.cpp + transform_sse41.cpp windows.cpp ) @@ -138,6 +140,13 @@ set(SRC_HEADERS xml.h ) +if(CXX_HAS_SSE) + set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") +endif() +if(CXX_HAS_AVX2) + set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") +endif() + include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp index 0b87e88871d..cb985c65dd8 100644 --- a/intern/cycles/util/transform.cpp +++ b/intern/cycles/util/transform.cpp @@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN /* Transform Inverse */ -static bool transform_matrix4_gj_inverse(float R[][4], float M[][4]) +static bool projection_matrix4_inverse(float R[][4], float M[][4]) { /* SPDX-License-Identifier: BSD-3-Clause * Adapted from code: @@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm) memcpy(R, &tfmR, sizeof(R)); memcpy(M, &tfm, sizeof(M)); - if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) { + if (UNLIKELY(!projection_matrix4_inverse(R, M))) { return projection_identity(); } diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h index 71164efbac1..24184dc7074 100644 --- a/intern/cycles/util/transform.h +++ b/intern/cycles/util/transform.h @@ -11,6 +11,10 @@ #include "util/math.h" #include "util/types.h" +#ifndef __KERNEL_GPU__ +# include "util/system.h" +#endif + CCL_NAMESPACE_BEGIN /* Affine transformation, stored as 4x3 matrix. */ @@ -38,6 +42,12 @@ typedef struct DecomposedTransform { float4 x, y, z, w; } DecomposedTransform; +CCL_NAMESPACE_END + +#include "util/transform_inverse.h" + +CCL_NAMESPACE_BEGIN + /* Functions */ #ifdef __KERNEL_METAL__ @@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t) #endif /* defined(__KERNEL_GPU_RAYTRACING__) */ } +#ifndef __KERNEL_GPU__ +void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm); +void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm); +#endif + ccl_device_inline Transform transform_inverse(const Transform tfm) { - /* This implementation matches the one in Embree exactly, to ensure consistent - * results with the ray intersection of instances. */ - float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x); - float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y); - float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z); - float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w); - - /* Compute determinant. */ - float det = dot(x, cross(y, z)); - - if (det == 0.0f) { - /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should - * never be in this situation, but try to invert it anyway with tweak. - * - * This logic does not match Embree which would just give an invalid - * matrix. A better solution would be to remove this and ensure any object - * matrix is valid. */ - x.x += 1e-8f; - y.y += 1e-8f; - z.z += 1e-8f; - - det = dot(x, cross(y, z)); - if (det == 0.0f) { - det = FLT_MAX; - } + /* Optimized transform implementations. */ +#ifndef __KERNEL_GPU__ + if (system_cpu_support_avx2()) { + Transform itfm; + transform_inverse_cpu_avx2(tfm, itfm); + return itfm; } + else if (system_cpu_support_sse41()) { + Transform itfm; + transform_inverse_cpu_sse41(tfm, itfm); + return itfm; + } +#endif - /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */ - const float3 inverse_x = cross(y, z) / det; - const float3 inverse_y = cross(z, x) / det; - const float3 inverse_z = cross(x, y) / det; - - /* Compute translation and fill transform. */ - Transform itfm; - itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w)); - itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w)); - itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w)); - - return itfm; + return transform_inverse_impl(tfm); } ccl_device_inline void transform_compose(ccl_private Transform *tfm, diff --git a/intern/cycles/util/transform_avx2.cpp b/intern/cycles/util/transform_avx2.cpp new file mode 100644 index 00000000000..57c160388e2 --- /dev/null +++ b/intern/cycles/util/transform_avx2.cpp @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#include "util/transform.h" + +CCL_NAMESPACE_BEGIN + +void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm) +{ + itfm = transform_inverse_impl(tfm); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h new file mode 100644 index 00000000000..07fd06c1467 --- /dev/null +++ b/intern/cycles/util/transform_inverse.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Custom cross and dot implementations that match Embree bit for bit. + * Normally we don't use SSE41/AVX outside the kernel, but for this it's + * important to match exactly for ray tracing precision. */ + +ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b) +{ +#ifdef __AVX2__ + const ssef sse_a = (const __m128 &)a; + const ssef sse_b = (const __m128 &)b; + const ssef r = shuffle<1, 2, 0, 3>( + ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b))); + return (const float3 &)r; +#endif + + return cross(a, b); +} + +ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b) +{ +#ifdef __SSE4_1__ + return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F)); +#endif + + return dot(a, b); +} + +ccl_device_inline Transform transform_inverse_impl(const Transform tfm) +{ + /* This implementation matches the one in Embree exactly, to ensure consistent + * results with the ray intersection of instances. */ + float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x); + float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y); + float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z); + float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w); + + /* Compute determinant. */ + float det = transform_inverse_dot(x, transform_inverse_cross(y, z)); + + if (det == 0.0f) { + /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should + * never be in this situation, but try to invert it anyway with tweak. + * + * This logic does not match Embree which would just give an invalid + * matrix. A better solution would be to remove this and ensure any object + * matrix is valid. */ + x.x += 1e-8f; + y.y += 1e-8f; + z.z += 1e-8f; + + det = transform_inverse_dot(x, cross(y, z)); + if (det == 0.0f) { + det = FLT_MAX; + } + } + + /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */ + const float3 inverse_x = transform_inverse_cross(y, z) / det; + const float3 inverse_y = transform_inverse_cross(z, x) / det; + const float3 inverse_z = transform_inverse_cross(x, y) / det; + + /* Compute translation and fill transform. */ + Transform itfm; + itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w)); + itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w)); + itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w)); + + return itfm; +} +CCL_NAMESPACE_END diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse41.cpp new file mode 100644 index 00000000000..8a698807a9c --- /dev/null +++ b/intern/cycles/util/transform_sse41.cpp @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#include "util/transform.h" + +CCL_NAMESPACE_BEGIN + +void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm) +{ + itfm = transform_inverse_impl(tfm); +} + +CCL_NAMESPACE_END -- cgit v1.2.3