From 023eb2ea7c16a00272f83d564145e28aeb9ed2b7 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 21 Jul 2022 15:49:00 +0200 Subject: Cycles: more closely match some math and intersection operations in Embree This helps with debugging, and gives a slightly closer match between CPU and CUDA/HIP/Metal renders when it comes to ray tracing precision. --- intern/cycles/blender/curves.cpp | 2 +- intern/cycles/kernel/geom/object.h | 2 +- intern/cycles/kernel/geom/shader_data.h | 2 +- intern/cycles/util/math.h | 5 +++ intern/cycles/util/math_float3.h | 15 +++++-- intern/cycles/util/math_intersect.h | 35 +++++++-------- intern/cycles/util/transform.cpp | 19 +------- intern/cycles/util/transform.h | 79 ++++++++++++++++++--------------- 8 files changed, 80 insertions(+), 79 deletions(-) (limited to 'intern') diff --git a/intern/cycles/blender/curves.cpp b/intern/cycles/blender/curves.cpp index c4154bce022..04ba7c53878 100644 --- a/intern/cycles/blender/curves.cpp +++ b/intern/cycles/blender/curves.cpp @@ -55,7 +55,7 @@ static bool ObtainCacheParticleData( return false; Transform tfm = get_transform(b_ob->matrix_world()); - Transform itfm = transform_quick_inverse(tfm); + Transform itfm = transform_inverse(tfm); for (BL::Modifier &b_mod : b_ob->modifiers) { if ((b_mod.type() == b_mod.type_PARTICLE_SYSTEM) && diff --git a/intern/cycles/kernel/geom/object.h b/intern/cycles/kernel/geom/object.h index b15f6b5dda5..bef7d710159 100644 --- a/intern/cycles/kernel/geom/object.h +++ b/intern/cycles/kernel/geom/object.h @@ -86,7 +86,7 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals kg, Transform tfm = object_fetch_transform_motion(kg, object, time); if (itfm) - *itfm = transform_quick_inverse(tfm); + *itfm = transform_inverse(tfm); return tfm; } diff --git a/intern/cycles/kernel/geom/shader_data.h b/intern/cycles/kernel/geom/shader_data.h index 99b9289cb4a..5af89b45f20 100644 --- a/intern/cycles/kernel/geom/shader_data.h +++ b/intern/cycles/kernel/geom/shader_data.h @@ -18,7 +18,7 @@ ccl_device void shader_setup_object_transforms(KernelGlobals kg, { if (sd->object_flag & SD_OBJECT_MOTION) { sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time); - sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion); + sd->ob_itfm_motion = transform_inverse(sd->ob_tfm_motion); } } #endif diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index af2f1ea092d..8360ce05a56 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -511,6 +511,11 @@ ccl_device_inline float4 float3_to_float4(const float3 a) return make_float4(a.x, a.y, a.z, 1.0f); } +ccl_device_inline float4 float3_to_float4(const float3 a, const float w) +{ + return make_float4(a.x, a.y, a.z, w); +} + ccl_device_inline float inverse_lerp(float a, float b, float x) { return (x - a) / (b - a); diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index c02b4cdbf0d..c408eadf195 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -147,8 +147,11 @@ ccl_device_inline float3 operator/(const float f, const float3 &a) ccl_device_inline float3 operator/(const float3 &a, const float f) { - float invf = 1.0f / f; - return a * invf; +# if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(a.m128, _mm_set1_ps(f))); +# else + return make_float3(a.x / f, a.y / f, a.z / f); +# endif } ccl_device_inline float3 operator/(const float3 &a, const float3 &b) @@ -284,8 +287,12 @@ ccl_device_inline float dot_xy(const float3 &a, const float3 &b) ccl_device_inline float3 cross(const float3 &a, const float3 &b) { - float3 r = make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); - return r; +# ifdef __KERNEL_SSE__ + return float3(shuffle<1, 2, 0, 3>( + msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b)))); +# else + return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); +# endif } ccl_device_inline float3 normalize(const float3 &a) diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h index c5b1cd51030..3e5891b2507 100644 --- a/intern/cycles/util/math_intersect.h +++ b/intern/cycles/util/math_intersect.h @@ -105,10 +105,10 @@ ccl_device bool ray_disk_intersect(float3 ray_P, return false; } -ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, - float3 ray_dir, - float ray_tmin, - float ray_tmax, +ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P, + const float3 ray_D, + const float ray_tmin, + const float ray_tmax, const float3 tri_a, const float3 tri_b, const float3 tri_c, @@ -116,14 +116,13 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, ccl_private float *isect_v, ccl_private float *isect_t) { -#define dot3(a, b) dot(a, b) - const float3 P = ray_P; - const float3 dir = ray_dir; + /* This implementation matches the Plücker coordinates triangle intersection + * in Embree. */ /* Calculate vertices relative to ray origin. */ - const float3 v0 = tri_c - P; - const float3 v1 = tri_a - P; - const float3 v2 = tri_b - P; + const float3 v0 = tri_c - ray_P; + const float3 v1 = tri_a - ray_P; + const float3 v2 = tri_b - ray_P; /* Calculate triangle edges. */ const float3 e0 = v2 - v0; @@ -131,29 +130,29 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, const float3 e2 = v1 - v2; /* Perform edge tests. */ - const float U = dot(cross(v2 + v0, e0), ray_dir); - const float V = dot(cross(v0 + v1, e1), ray_dir); - const float W = dot(cross(v1 + v2, e2), ray_dir); + const float U = dot(cross(v2 + v0, e0), ray_D); + const float V = dot(cross(v0 + v1, e1), ray_D); + const float W = dot(cross(v1 + v2, e2), ray_D); + const float eps = FLT_EPSILON * fabsf(U + V + W); const float minUVW = min(U, min(V, W)); const float maxUVW = max(U, max(V, W)); - if (minUVW < 0.0f && maxUVW > 0.0f) { + if (!(minUVW >= -eps || maxUVW <= eps)) { return false; } /* Calculate geometry normal and denominator. */ const float3 Ng1 = cross(e1, e0); - // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0); const float3 Ng = Ng1 + Ng1; - const float den = dot3(Ng, dir); + const float den = dot(Ng, ray_D); /* Avoid division by 0. */ if (UNLIKELY(den == 0.0f)) { return false; } /* Perform depth test. */ - const float T = dot3(v0, Ng); + const float T = dot(v0, Ng); const float t = T / den; if (!(t >= ray_tmin && t <= ray_tmax)) { return false; @@ -163,8 +162,6 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, *isect_v = V / den; *isect_t = t; return true; - -#undef dot3 } /* Tests for an intersection between a ray and a quad defined by diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp index 0bf5de57a20..0b87e88871d 100644 --- a/intern/cycles/util/transform.cpp +++ b/intern/cycles/util/transform.cpp @@ -99,15 +99,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm) memcpy(M, &tfm, sizeof(M)); if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) { - /* matrix is degenerate (e.g. 0 scale on some axis), ideally we should - * never be in this situation, but try to invert it anyway with tweak */ - M[0][0] += 1e-8f; - M[1][1] += 1e-8f; - M[2][2] += 1e-8f; - - if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) { - return projection_identity(); - } + return projection_identity(); } memcpy(&tfmR, R, sizeof(R)); @@ -115,16 +107,9 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm) return tfmR; } -Transform transform_inverse(const Transform &tfm) -{ - ProjectionTransform projection(tfm); - return projection_to_transform(projection_inverse(projection)); -} - Transform transform_transposed_inverse(const Transform &tfm) { - ProjectionTransform projection(tfm); - ProjectionTransform iprojection = projection_inverse(projection); + ProjectionTransform iprojection(transform_inverse(tfm)); return projection_to_transform(projection_transpose(iprojection)); } diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h index a460581d1f3..71164efbac1 100644 --- a/intern/cycles/util/transform.h +++ b/intern/cycles/util/transform.h @@ -63,10 +63,10 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f _MM_TRANSPOSE4_PS(x, y, z, w); - ssef tmp = shuffle<0>(aa) * x; - tmp = madd(shuffle<1>(aa), y, tmp); + ssef tmp = w; tmp = madd(shuffle<2>(aa), z, tmp); - tmp += w; + tmp = madd(shuffle<1>(aa), y, tmp); + tmp = madd(shuffle<0>(aa), x, tmp); return float3(tmp.m128); #elif defined(__KERNEL_METAL__) @@ -93,9 +93,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con _MM_TRANSPOSE4_PS(x, y, z, w); - ssef tmp = shuffle<0>(aa) * x; + ssef tmp = shuffle<2>(aa) * z; tmp = madd(shuffle<1>(aa), y, tmp); - tmp = madd(shuffle<2>(aa), z, tmp); + tmp = madd(shuffle<0>(aa), x, tmp); return float3(tmp.m128); #elif defined(__KERNEL_METAL__) @@ -312,7 +312,6 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val t->z[column] = value.z; } -Transform transform_inverse(const Transform &a); Transform transform_transposed_inverse(const Transform &a); ccl_device_inline bool transform_uniform_scale(const Transform &tfm, float &scale) @@ -392,39 +391,47 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t) #endif /* defined(__KERNEL_GPU_RAYTRACING__) */ } -ccl_device_inline Transform transform_quick_inverse(Transform M) +ccl_device_inline Transform transform_inverse(const Transform tfm) { - /* possible optimization: can we avoid doing this altogether and construct - * the inverse matrix directly from negated translation, transposed rotation, - * scale can be inverted but what about shearing? */ - Transform R; - float det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) + - M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z); + /* This implementation matches the one in Embree exactly, to ensure consistent + * results with the ray intersection of instances. */ + float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x); + float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y); + float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z); + float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w); + + /* Compute determinant. */ + float det = dot(x, cross(y, z)); + if (det == 0.0f) { - M.x.x += 1e-8f; - M.y.y += 1e-8f; - M.z.z += 1e-8f; - det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) + - M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z); + /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should + * never be in this situation, but try to invert it anyway with tweak. + * + * This logic does not match Embree which would just give an invalid + * matrix. A better solution would be to remove this and ensure any object + * matrix is valid. */ + x.x += 1e-8f; + y.y += 1e-8f; + z.z += 1e-8f; + + det = dot(x, cross(y, z)); + if (det == 0.0f) { + det = FLT_MAX; + } } - det = (det != 0.0f) ? 1.0f / det : 0.0f; - - float3 Rx = det * make_float3(M.z.z * M.y.y - M.z.y * M.y.z, - M.z.y * M.x.z - M.z.z * M.x.y, - M.y.z * M.x.y - M.y.y * M.x.z); - float3 Ry = det * make_float3(M.z.x * M.y.z - M.z.z * M.y.x, - M.z.z * M.x.x - M.z.x * M.x.z, - M.y.x * M.x.z - M.y.z * M.x.x); - float3 Rz = det * make_float3(M.z.y * M.y.x - M.z.x * M.y.y, - M.z.x * M.x.y - M.z.y * M.x.x, - M.y.y * M.x.x - M.y.x * M.x.y); - float3 T = -make_float3(M.x.w, M.y.w, M.z.w); - - R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T)); - R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T)); - R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T)); - - return R; + + /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */ + const float3 inverse_x = cross(y, z) / det; + const float3 inverse_y = cross(z, x) / det; + const float3 inverse_z = cross(x, y) / det; + + /* Compute translation and fill transform. */ + Transform itfm; + itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w)); + itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w)); + itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w)); + + return itfm; } ccl_device_inline void transform_compose(ccl_private Transform *tfm, -- cgit v1.2.3