Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrecht Van Lommel <brecht@blender.org>2022-08-09 15:28:19 +0300
committerBrecht Van Lommel <brecht@blender.org>2022-08-09 17:59:05 +0300
commit230f9ade64844a50ea02461cfa005f364de09aa9 (patch)
tree2e03313888537044cb0595986ac300a9c3e2ca0a /intern/cycles/util
parent286e535071c8f9a906c6c36b8dac0eda6384c79a (diff)
Cycles: make transform inverse match Embree exactly
Helps improve ray-tracing precision. This is a bit complicated as it requires different implementation depending on the CPU architecture.
Diffstat (limited to 'intern/cycles/util')
-rw-r--r--intern/cycles/util/CMakeLists.txt9
-rw-r--r--intern/cycles/util/transform.cpp4
-rw-r--r--intern/cycles/util/transform.h65
-rw-r--r--intern/cycles/util/transform_avx2.cpp13
-rw-r--r--intern/cycles/util/transform_inverse.h76
-rw-r--r--intern/cycles/util/transform_sse41.cpp13
6 files changed, 141 insertions, 39 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 9bc9f00e142..f3fc7739f66 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -26,6 +26,8 @@ set(SRC
thread.cpp
time.cpp
transform.cpp
+ transform_avx2.cpp
+ transform_sse41.cpp
windows.cpp
)
@@ -138,6 +140,13 @@ set(SRC_HEADERS
xml.h
)
+if(CXX_HAS_SSE)
+ set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+endif()
+if(CXX_HAS_AVX2)
+ set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+endif()
+
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index 0b87e88871d..cb985c65dd8 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN
/* Transform Inverse */
-static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
+static bool projection_matrix4_inverse(float R[][4], float M[][4])
{
/* SPDX-License-Identifier: BSD-3-Clause
* Adapted from code:
@@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
memcpy(R, &tfmR, sizeof(R));
memcpy(M, &tfm, sizeof(M));
- if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
+ if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
return projection_identity();
}
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 71164efbac1..24184dc7074 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -11,6 +11,10 @@
#include "util/math.h"
#include "util/types.h"
+#ifndef __KERNEL_GPU__
+# include "util/system.h"
+#endif
+
CCL_NAMESPACE_BEGIN
/* Affine transformation, stored as 4x3 matrix. */
@@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
float4 x, y, z, w;
} DecomposedTransform;
+CCL_NAMESPACE_END
+
+#include "util/transform_inverse.h"
+
+CCL_NAMESPACE_BEGIN
+
/* Functions */
#ifdef __KERNEL_METAL__
@@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
#endif /* defined(__KERNEL_GPU_RAYTRACING__) */
}
+#ifndef __KERNEL_GPU__
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
+#endif
+
ccl_device_inline Transform transform_inverse(const Transform tfm)
{
- /* This implementation matches the one in Embree exactly, to ensure consistent
- * results with the ray intersection of instances. */
- float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
- float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
- float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
- float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-
- /* Compute determinant. */
- float det = dot(x, cross(y, z));
-
- if (det == 0.0f) {
- /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
- * never be in this situation, but try to invert it anyway with tweak.
- *
- * This logic does not match Embree which would just give an invalid
- * matrix. A better solution would be to remove this and ensure any object
- * matrix is valid. */
- x.x += 1e-8f;
- y.y += 1e-8f;
- z.z += 1e-8f;
-
- det = dot(x, cross(y, z));
- if (det == 0.0f) {
- det = FLT_MAX;
- }
+ /* Optimized transform implementations. */
+#ifndef __KERNEL_GPU__
+ if (system_cpu_support_avx2()) {
+ Transform itfm;
+ transform_inverse_cpu_avx2(tfm, itfm);
+ return itfm;
}
+ else if (system_cpu_support_sse41()) {
+ Transform itfm;
+ transform_inverse_cpu_sse41(tfm, itfm);
+ return itfm;
+ }
+#endif
- /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
- const float3 inverse_x = cross(y, z) / det;
- const float3 inverse_y = cross(z, x) / det;
- const float3 inverse_z = cross(x, y) / det;
-
- /* Compute translation and fill transform. */
- Transform itfm;
- itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
- itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
- itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
-
- return itfm;
+ return transform_inverse_impl(tfm);
}
ccl_device_inline void transform_compose(ccl_private Transform *tfm,
diff --git a/intern/cycles/util/transform_avx2.cpp b/intern/cycles/util/transform_avx2.cpp
new file mode 100644
index 00000000000..57c160388e2
--- /dev/null
+++ b/intern/cycles/util/transform_avx2.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
+{
+ itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
new file mode 100644
index 00000000000..07fd06c1467
--- /dev/null
+++ b/intern/cycles/util/transform_inverse.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Custom cross and dot implementations that match Embree bit for bit.
+ * Normally we don't use SSE41/AVX outside the kernel, but for this it's
+ * important to match exactly for ray tracing precision. */
+
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+{
+#ifdef __AVX2__
+ const ssef sse_a = (const __m128 &)a;
+ const ssef sse_b = (const __m128 &)b;
+ const ssef r = shuffle<1, 2, 0, 3>(
+ ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+ return (const float3 &)r;
+#endif
+
+ return cross(a, b);
+}
+
+ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+{
+#ifdef __SSE4_1__
+ return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#endif
+
+ return dot(a, b);
+}
+
+ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
+{
+ /* This implementation matches the one in Embree exactly, to ensure consistent
+ * results with the ray intersection of instances. */
+ float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+ float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+ float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+ float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+ /* Compute determinant. */
+ float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
+
+ if (det == 0.0f) {
+ /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+ * never be in this situation, but try to invert it anyway with tweak.
+ *
+ * This logic does not match Embree which would just give an invalid
+ * matrix. A better solution would be to remove this and ensure any object
+ * matrix is valid. */
+ x.x += 1e-8f;
+ y.y += 1e-8f;
+ z.z += 1e-8f;
+
+ det = transform_inverse_dot(x, cross(y, z));
+ if (det == 0.0f) {
+ det = FLT_MAX;
+ }
+ }
+
+ /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+ const float3 inverse_x = transform_inverse_cross(y, z) / det;
+ const float3 inverse_y = transform_inverse_cross(z, x) / det;
+ const float3 inverse_z = transform_inverse_cross(x, y) / det;
+
+ /* Compute translation and fill transform. */
+ Transform itfm;
+ itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
+ itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
+ itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
+
+ return itfm;
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse41.cpp
new file mode 100644
index 00000000000..8a698807a9c
--- /dev/null
+++ b/intern/cycles/util/transform_sse41.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
+{
+ itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END