diff options
author | Brecht Van Lommel <brecht@blender.org> | 2022-08-09 20:19:54 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2022-08-09 20:19:54 +0300 |
commit | 752fb5dd081233f4f6ec389503cc4e7bec15e772 (patch) | |
tree | f467cff8b7436acea1bd09b27b2869da56ab0f16 /intern/cycles | |
parent | 1e8e70a90cdafa94746dd8aaee886a5747ff9d9e (diff) | |
parent | 79f1cc601cdbcf142e1bf4c1966f64dcf93b030f (diff) |
Merge branch 'blender-v3.3-release'
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/bvh/util.h | 24 | ||||
-rw-r--r-- | intern/cycles/kernel/integrator/shade_surface.h | 58 | ||||
-rw-r--r-- | intern/cycles/util/CMakeLists.txt | 9 | ||||
-rw-r--r-- | intern/cycles/util/math_intersect.h | 92 | ||||
-rw-r--r-- | intern/cycles/util/system.cpp | 89 | ||||
-rw-r--r-- | intern/cycles/util/system.h | 11 | ||||
-rw-r--r-- | intern/cycles/util/transform.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/util/transform.h | 65 | ||||
-rw-r--r-- | intern/cycles/util/transform_avx2.cpp | 13 | ||||
-rw-r--r-- | intern/cycles/util/transform_inverse.h | 76 | ||||
-rw-r--r-- | intern/cycles/util/transform_sse41.cpp | 13 | ||||
-rw-r--r-- | intern/cycles/util/vector.h | 1 |
13 files changed, 342 insertions, 114 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 8b8ad1758ca..fbc30234dac 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -326,6 +326,7 @@ set(SRC_UTIL_HEADERS ../util/rect.h ../util/static_assert.h ../util/transform.h + ../util/transform_inverse.h ../util/texture.h ../util/types.h ../util/types_float2.h diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h index b67c9394bea..a57703a8b8c 100644 --- a/intern/cycles/kernel/bvh/util.h +++ b/intern/cycles/kernel/bvh/util.h @@ -33,6 +33,30 @@ ccl_device_forceinline float intersection_t_offset(const float t) return __uint_as_float(bits); } +/* Ray offset to avoid self intersection. + * + * This function can be used to compute a modified ray start position for rays + * leaving from a surface. This is from: + * "A Fast and Robust Method for Avoiding Self-Intersection" + * Ray Tracing Gems, chapter 6. + */ +ccl_device_inline float3 ray_offset(const float3 P, const float3 Ng) +{ + const float int_scale = 256.0f; + const int3 of_i = make_int3( + (int)(int_scale * Ng.x), (int)(int_scale * Ng.y), (int)(int_scale * Ng.z)); + + const float3 p_i = make_float3( + __int_as_float(__float_as_int(P.x) + ((P.x < 0) ? -of_i.x : of_i.x)), + __int_as_float(__float_as_int(P.y) + ((P.y < 0) ? -of_i.y : of_i.y)), + __int_as_float(__float_as_int(P.z) + ((P.z < 0) ? -of_i.z : of_i.z))); + const float origin = 1.0f / 32.0f; + const float float_scale = 1.0f / 65536.0f; + return make_float3(fabsf(P.x) < origin ? P.x + float_scale * Ng.x : p_i.x, + fabsf(P.y) < origin ? P.y + float_scale * Ng.y : p_i.y, + fabsf(P.z) < origin ? P.z + float_scale * Ng.z : p_i.z); +} + #ifndef __KERNEL_GPU__ ccl_device int intersections_compare(const void *a, const void *b) { diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h index 986f7f622fb..86dbfdd3edc 100644 --- a/intern/cycles/kernel/integrator/shade_surface.h +++ b/intern/cycles/kernel/integrator/shade_surface.h @@ -31,6 +31,52 @@ ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg, shader_setup_from_ray(kg, sd, &ray, &isect); } +ccl_device_forceinline float3 integrate_surface_ray_offset(KernelGlobals kg, + const ccl_private ShaderData *sd, + const float3 ray_P, + const float3 ray_D) +{ + /* No ray offset needed for other primitive types. */ + if (!(sd->type & PRIMITIVE_TRIANGLE)) { + return ray_P; + } + + /* Self intersection tests already account for the case where a ray hits the + * same primitive. However precision issues can still cause neighboring + * triangles to be hit. Here we test if the ray-triangle intersection with + * the same primitive would miss, implying that a neighbouring triangle would + * be hit instead. + * + * This relies on triangle intersection to be watertight, and the object inverse + * object transform to match the one used by ray intersection exactly. + * + * Potential improvements: + * - It appears this happens when either barycentric coordinates are small, + * or dot(sd->Ng, ray_D) is small. Detect such cases and skip test? + * - Instead of ray offset, can we tweak P to lie within the triangle? + */ + const uint tri_vindex = kernel_data_fetch(tri_vindex, sd->prim).w; + const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0), + tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1), + tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2); + + float3 local_ray_P = ray_P; + float3 local_ray_D = ray_D; + + if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + const Transform itfm = object_get_inverse_transform(kg, sd); + local_ray_P = transform_point(&itfm, local_ray_P); + local_ray_D = transform_direction(&itfm, local_ray_D); + } + + if (ray_triangle_intersect_self(local_ray_P, local_ray_D, tri_a, tri_b, tri_c)) { + return ray_P; + } + else { + return ray_offset(ray_P, sd->Ng); + } +} + #ifdef __HOLDOUT__ ccl_device_forceinline bool integrate_surface_holdout(KernelGlobals kg, ConstIntegratorState state, @@ -200,6 +246,10 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg, # endif } + if (ray.self.object != OBJECT_NONE) { + ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D); + } + /* Write shadow ray and associated state to global memory. */ integrator_state_write_shadow_ray(kg, shadow_state, &ray); // Save memory by storing the light and object indices in the shadow_isect @@ -328,8 +378,9 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce( } else { /* Setup ray with changed origin and direction. */ - INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P; - INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in); + const float3 D = normalize(bsdf_omega_in); + INTEGRATOR_STATE_WRITE(state, ray, P) = integrate_surface_ray_offset(kg, sd, sd->P, D); + INTEGRATOR_STATE_WRITE(state, ray, D) = D; INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f; INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ @@ -423,6 +474,9 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg, Ray ray ccl_optional_struct_init; ray.P = shadow_ray_offset(kg, sd, ao_D, &skip_self); ray.D = ao_D; + if (skip_self) { + ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D); + } ray.tmin = 0.0f; ray.tmax = kernel_data.integrator.ao_bounces_distance; ray.time = sd->time; diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index e70f65e124c..81a7607baab 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -26,6 +26,8 @@ set(SRC thread.cpp time.cpp transform.cpp + transform_avx2.cpp + transform_sse41.cpp windows.cpp ) @@ -137,6 +139,13 @@ set(SRC_HEADERS xml.h ) +if(CXX_HAS_SSE) + set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") +endif() +if(CXX_HAS_AVX2) + set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") +endif() + include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h index cc07cbe7745..aa28682f8c1 100644 --- a/intern/cycles/util/math_intersect.h +++ b/intern/cycles/util/math_intersect.h @@ -105,6 +105,51 @@ ccl_device bool ray_disk_intersect(float3 ray_P, return false; } +/* Custom rcp, cross and dot implementations that match Embree bit for bit. */ +ccl_device_forceinline float ray_triangle_rcp(const float x) +{ +#ifdef __KERNEL_NEON__ + /* Move scalar to vector register and do rcp. */ + __m128 a; + a[0] = x; + float32x4_t reciprocal = vrecpeq_f32(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return reciprocal[0]; +#elif defined(__KERNEL_SSE__) + const __m128 a = _mm_set_ss(x); + const __m128 r = _mm_rcp_ss(a); + +# ifdef __KERNEL_AVX2_ + return _mm_cvtss_f32(_mm_mul_ss(r, _mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); +# else + return _mm_cvtss_f32(_mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); +# endif +#else + return 1.0f / x; +#endif +} + +ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0]; +#else + return a.x * b.x + a.y * b.y + a.z * b.z; +#endif +} + +ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0], + msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0], + msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]); +#else + return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); +#endif +} + ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P, const float3 ray_D, const float ray_tmin, @@ -130,9 +175,9 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P, const float3 e2 = v1 - v2; /* Perform edge tests. */ - const float U = dot(cross(e0, v2 + v0), ray_D); - const float V = dot(cross(e1, v0 + v1), ray_D); - const float W = dot(cross(e2, v1 + v2), ray_D); + const float U = ray_triangle_dot(ray_triangle_cross(e0, v2 + v0), ray_D); + const float V = ray_triangle_dot(ray_triangle_cross(e1, v0 + v1), ray_D); + const float W = ray_triangle_dot(ray_triangle_cross(e2, v1 + v2), ray_D); const float UVW = U + V + W; const float eps = FLT_EPSILON * fabsf(UVW); @@ -144,7 +189,7 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P, } /* Calculate geometry normal and denominator. */ - const float3 Ng1 = cross(e1, e0); + const float3 Ng1 = ray_triangle_cross(e1, e0); const float3 Ng = Ng1 + Ng1; const float den = dot(Ng, ray_D); /* Avoid division by 0. */ @@ -159,13 +204,46 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P, return false; } - const float rcp_UVW = (fabsf(UVW) < 1e-18f) ? 0.0f : 1.0f / UVW; - *isect_u = min(U * rcp_UVW, 1.0f); - *isect_v = min(V * rcp_UVW, 1.0f); + const float rcp_uvw = (fabsf(UVW) < 1e-18f) ? 0.0f : ray_triangle_rcp(UVW); + *isect_u = min(U * rcp_uvw, 1.0f); + *isect_v = min(V * rcp_uvw, 1.0f); *isect_t = t; return true; } +ccl_device_forceinline bool ray_triangle_intersect_self(const float3 ray_P, + const float3 ray_D, + const float3 tri_a, + const float3 tri_b, + const float3 tri_c) +{ + /* Matches logic in ray_triangle_intersect, self intersection test to validate + * if a ray is going to hit self or might incorrectly hit a neighboring triangle. */ + + /* Calculate vertices relative to ray origin. */ + const float3 v0 = tri_a - ray_P; + const float3 v1 = tri_b - ray_P; + const float3 v2 = tri_c - ray_P; + + /* Calculate triangle edges. */ + const float3 e0 = v2 - v0; + const float3 e1 = v0 - v1; + const float3 e2 = v1 - v2; + + /* Perform edge tests. */ + const float U = ray_triangle_dot(ray_triangle_cross(v2 + v0, e0), ray_D); + const float V = ray_triangle_dot(ray_triangle_cross(v0 + v1, e1), ray_D); + const float W = ray_triangle_dot(ray_triangle_cross(v1 + v2, e2), ray_D); + + const float eps = FLT_EPSILON * fabsf(U + V + W); + const float minUVW = min(U, min(V, W)); + const float maxUVW = max(U, max(V, W)); + + /* Note the extended epsilon compared to ray_triangle_intersect, to account + * for intersections with neighboring triangles that have an epsilon. */ + return (minUVW >= eps || maxUVW <= -eps); +} + /* Tests for an intersection between a ray and a quad defined by * its midpoint, normal and sides. * If ellipse is true, hits outside the ellipse that's enclosed by the diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp index a13ad95b9fe..3183ac06f26 100644 --- a/intern/cycles/util/system.cpp +++ b/intern/cycles/util/system.cpp @@ -128,53 +128,42 @@ int system_cpu_bits() #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) struct CPUCapabilities { - bool x64; - bool mmx; - bool sse; bool sse2; bool sse3; - bool ssse3; bool sse41; - bool sse42; - bool sse4a; bool avx; - bool f16c; bool avx2; - bool xop; - bool fma3; - bool fma4; - bool bmi1; - bool bmi2; }; static CPUCapabilities &system_cpu_capabilities() { - static CPUCapabilities caps; + static CPUCapabilities caps = {}; static bool caps_init = false; if (!caps_init) { int result[4], num; - memset(&caps, 0, sizeof(caps)); - __cpuid(result, 0); num = result[0]; if (num >= 1) { __cpuid(result, 0x00000001); - caps.mmx = (result[3] & ((int)1 << 23)) != 0; - caps.sse = (result[3] & ((int)1 << 25)) != 0; - caps.sse2 = (result[3] & ((int)1 << 26)) != 0; - caps.sse3 = (result[2] & ((int)1 << 0)) != 0; + const bool sse = (result[3] & ((int)1 << 25)) != 0; + const bool sse2 = (result[3] & ((int)1 << 26)) != 0; + const bool sse3 = (result[2] & ((int)1 << 0)) != 0; + + const bool ssse3 = (result[2] & ((int)1 << 9)) != 0; + const bool sse41 = (result[2] & ((int)1 << 19)) != 0; + /* const bool sse42 = (result[2] & ((int)1 << 20)) != 0; */ - caps.ssse3 = (result[2] & ((int)1 << 9)) != 0; - caps.sse41 = (result[2] & ((int)1 << 19)) != 0; - caps.sse42 = (result[2] & ((int)1 << 20)) != 0; + const bool fma3 = (result[2] & ((int)1 << 12)) != 0; + const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0; + const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0; - caps.fma3 = (result[2] & ((int)1 << 12)) != 0; - caps.avx = false; - bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0; - bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0; + /* Simplify to combined capabilities for which we specialize kernels. */ + caps.sse2 = sse && sse2; + caps.sse3 = sse && sse2 && sse3 && ssse3; + caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41; if (os_uses_xsave_xrestore && cpu_avx_support) { // Check if the OS will save the YMM registers @@ -189,15 +178,18 @@ static CPUCapabilities &system_cpu_capabilities() # else xcr_feature_mask = 0; # endif - caps.avx = (xcr_feature_mask & 0x6) == 0x6; - } + const bool avx = (xcr_feature_mask & 0x6) == 0x6; + const bool f16c = (result[2] & ((int)1 << 29)) != 0; - caps.f16c = (result[2] & ((int)1 << 29)) != 0; + __cpuid(result, 0x00000007); + bool bmi1 = (result[1] & ((int)1 << 3)) != 0; + bool bmi2 = (result[1] & ((int)1 << 8)) != 0; + bool avx2 = (result[1] & ((int)1 << 5)) != 0; - __cpuid(result, 0x00000007); - caps.bmi1 = (result[1] & ((int)1 << 3)) != 0; - caps.bmi2 = (result[1] & ((int)1 << 8)) != 0; - caps.avx2 = (result[1] & ((int)1 << 5)) != 0; + caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx; + caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 && + bmi2; + } } caps_init = true; @@ -209,32 +201,31 @@ static CPUCapabilities &system_cpu_capabilities() bool system_cpu_support_sse2() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2; + return caps.sse2; } bool system_cpu_support_sse3() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3; + return caps.sse3; } bool system_cpu_support_sse41() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41; + return caps.sse41; } bool system_cpu_support_avx() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx; + return caps.avx; } bool system_cpu_support_avx2() { CPUCapabilities &caps = system_cpu_capabilities(); - return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c && - caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2; + return caps.avx2; } #else @@ -264,26 +255,6 @@ bool system_cpu_support_avx2() #endif -bool system_call_self(const vector<string> &args) -{ - /* Escape program and arguments in case they contain spaces. */ - string cmd = "\"" + Sysutil::this_program_path() + "\""; - - for (int i = 0; i < args.size(); i++) { - cmd += " \"" + args[i] + "\""; - } - -#ifdef _WIN32 - /* Use cmd /S to avoid issues with spaces in arguments. */ - cmd = "cmd /S /C \"" + cmd + " > nul \""; -#else - /* Quiet output. */ - cmd += " > /dev/null"; -#endif - - return (system(cmd.c_str()) == 0); -} - size_t system_physical_ram() { #ifdef _WIN32 diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h index 23dcfdd303a..2152b89ed24 100644 --- a/intern/cycles/util/system.h +++ b/intern/cycles/util/system.h @@ -4,15 +4,17 @@ #ifndef __UTIL_SYSTEM_H__ #define __UTIL_SYSTEM_H__ -#include "util/string.h" -#include "util/vector.h" +#include <stdint.h> +#include <stdlib.h> + +#include <string> CCL_NAMESPACE_BEGIN /* Get width in characters of the current console output. */ int system_console_width(); -string system_cpu_brand_string(); +std::string system_cpu_brand_string(); int system_cpu_bits(); bool system_cpu_support_sse2(); bool system_cpu_support_sse3(); @@ -22,9 +24,6 @@ bool system_cpu_support_avx2(); size_t system_physical_ram(); -/* Start a new process of the current application with the given arguments. */ -bool system_call_self(const vector<string> &args); - /* Get identifier of the currently running process. */ uint64_t system_self_process_id(); diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp index 0b87e88871d..cb985c65dd8 100644 --- a/intern/cycles/util/transform.cpp +++ b/intern/cycles/util/transform.cpp @@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN /* Transform Inverse */ -static bool transform_matrix4_gj_inverse(float R[][4], float M[][4]) +static bool projection_matrix4_inverse(float R[][4], float M[][4]) { /* SPDX-License-Identifier: BSD-3-Clause * Adapted from code: @@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm) memcpy(R, &tfmR, sizeof(R)); memcpy(M, &tfm, sizeof(M)); - if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) { + if (UNLIKELY(!projection_matrix4_inverse(R, M))) { return projection_identity(); } diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h index 71164efbac1..24184dc7074 100644 --- a/intern/cycles/util/transform.h +++ b/intern/cycles/util/transform.h @@ -11,6 +11,10 @@ #include "util/math.h" #include "util/types.h" +#ifndef __KERNEL_GPU__ +# include "util/system.h" +#endif + CCL_NAMESPACE_BEGIN /* Affine transformation, stored as 4x3 matrix. */ @@ -38,6 +42,12 @@ typedef struct DecomposedTransform { float4 x, y, z, w; } DecomposedTransform; +CCL_NAMESPACE_END + +#include "util/transform_inverse.h" + +CCL_NAMESPACE_BEGIN + /* Functions */ #ifdef __KERNEL_METAL__ @@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t) #endif /* defined(__KERNEL_GPU_RAYTRACING__) */ } +#ifndef __KERNEL_GPU__ +void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm); +void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm); +#endif + ccl_device_inline Transform transform_inverse(const Transform tfm) { - /* This implementation matches the one in Embree exactly, to ensure consistent - * results with the ray intersection of instances. */ - float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x); - float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y); - float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z); - float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w); - - /* Compute determinant. */ - float det = dot(x, cross(y, z)); - - if (det == 0.0f) { - /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should - * never be in this situation, but try to invert it anyway with tweak. - * - * This logic does not match Embree which would just give an invalid - * matrix. A better solution would be to remove this and ensure any object - * matrix is valid. */ - x.x += 1e-8f; - y.y += 1e-8f; - z.z += 1e-8f; - - det = dot(x, cross(y, z)); - if (det == 0.0f) { - det = FLT_MAX; - } + /* Optimized transform implementations. */ +#ifndef __KERNEL_GPU__ + if (system_cpu_support_avx2()) { + Transform itfm; + transform_inverse_cpu_avx2(tfm, itfm); + return itfm; } + else if (system_cpu_support_sse41()) { + Transform itfm; + transform_inverse_cpu_sse41(tfm, itfm); + return itfm; + } +#endif - /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */ - const float3 inverse_x = cross(y, z) / det; - const float3 inverse_y = cross(z, x) / det; - const float3 inverse_z = cross(x, y) / det; - - /* Compute translation and fill transform. */ - Transform itfm; - itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w)); - itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w)); - itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w)); - - return itfm; + return transform_inverse_impl(tfm); } ccl_device_inline void transform_compose(ccl_private Transform *tfm, diff --git a/intern/cycles/util/transform_avx2.cpp b/intern/cycles/util/transform_avx2.cpp new file mode 100644 index 00000000000..57c160388e2 --- /dev/null +++ b/intern/cycles/util/transform_avx2.cpp @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#include "util/transform.h" + +CCL_NAMESPACE_BEGIN + +void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm) +{ + itfm = transform_inverse_impl(tfm); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h new file mode 100644 index 00000000000..07fd06c1467 --- /dev/null +++ b/intern/cycles/util/transform_inverse.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Custom cross and dot implementations that match Embree bit for bit. + * Normally we don't use SSE41/AVX outside the kernel, but for this it's + * important to match exactly for ray tracing precision. */ + +ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b) +{ +#ifdef __AVX2__ + const ssef sse_a = (const __m128 &)a; + const ssef sse_b = (const __m128 &)b; + const ssef r = shuffle<1, 2, 0, 3>( + ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b))); + return (const float3 &)r; +#endif + + return cross(a, b); +} + +ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b) +{ +#ifdef __SSE4_1__ + return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F)); +#endif + + return dot(a, b); +} + +ccl_device_inline Transform transform_inverse_impl(const Transform tfm) +{ + /* This implementation matches the one in Embree exactly, to ensure consistent + * results with the ray intersection of instances. */ + float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x); + float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y); + float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z); + float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w); + + /* Compute determinant. */ + float det = transform_inverse_dot(x, transform_inverse_cross(y, z)); + + if (det == 0.0f) { + /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should + * never be in this situation, but try to invert it anyway with tweak. + * + * This logic does not match Embree which would just give an invalid + * matrix. A better solution would be to remove this and ensure any object + * matrix is valid. */ + x.x += 1e-8f; + y.y += 1e-8f; + z.z += 1e-8f; + + det = transform_inverse_dot(x, cross(y, z)); + if (det == 0.0f) { + det = FLT_MAX; + } + } + + /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */ + const float3 inverse_x = transform_inverse_cross(y, z) / det; + const float3 inverse_y = transform_inverse_cross(z, x) / det; + const float3 inverse_z = transform_inverse_cross(x, y) / det; + + /* Compute translation and fill transform. */ + Transform itfm; + itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w)); + itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w)); + itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w)); + + return itfm; +} +CCL_NAMESPACE_END diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse41.cpp new file mode 100644 index 00000000000..8a698807a9c --- /dev/null +++ b/intern/cycles/util/transform_sse41.cpp @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#include "util/transform.h" + +CCL_NAMESPACE_BEGIN + +void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm) +{ + itfm = transform_inverse_impl(tfm); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/vector.h b/intern/cycles/util/vector.h index 0056fb269ae..9e27997cf2c 100644 --- a/intern/cycles/util/vector.h +++ b/intern/cycles/util/vector.h @@ -10,7 +10,6 @@ #include "util/aligned_malloc.h" #include "util/guarded_allocator.h" -#include "util/types.h" CCL_NAMESPACE_BEGIN |