Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrecht Van Lommel <brecht@blender.org>2022-08-09 20:19:54 +0300
committerBrecht Van Lommel <brecht@blender.org>2022-08-09 20:19:54 +0300
commit752fb5dd081233f4f6ec389503cc4e7bec15e772 (patch)
treef467cff8b7436acea1bd09b27b2869da56ab0f16 /intern/cycles/util
parent1e8e70a90cdafa94746dd8aaee886a5747ff9d9e (diff)
parent79f1cc601cdbcf142e1bf4c1966f64dcf93b030f (diff)
Merge branch 'blender-v3.3-release'
Diffstat (limited to 'intern/cycles/util')
-rw-r--r--intern/cycles/util/CMakeLists.txt9
-rw-r--r--intern/cycles/util/math_intersect.h92
-rw-r--r--intern/cycles/util/system.cpp89
-rw-r--r--intern/cycles/util/system.h11
-rw-r--r--intern/cycles/util/transform.cpp4
-rw-r--r--intern/cycles/util/transform.h65
-rw-r--r--intern/cycles/util/transform_avx2.cpp13
-rw-r--r--intern/cycles/util/transform_inverse.h76
-rw-r--r--intern/cycles/util/transform_sse41.cpp13
-rw-r--r--intern/cycles/util/vector.h1
10 files changed, 261 insertions, 112 deletions
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index e70f65e124c..81a7607baab 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -26,6 +26,8 @@ set(SRC
thread.cpp
time.cpp
transform.cpp
+ transform_avx2.cpp
+ transform_sse41.cpp
windows.cpp
)
@@ -137,6 +139,13 @@ set(SRC_HEADERS
xml.h
)
+if(CXX_HAS_SSE)
+ set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+endif()
+if(CXX_HAS_AVX2)
+ set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+endif()
+
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index cc07cbe7745..aa28682f8c1 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -105,6 +105,51 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
return false;
}
+/* Custom rcp, cross and dot implementations that match Embree bit for bit. */
+ccl_device_forceinline float ray_triangle_rcp(const float x)
+{
+#ifdef __KERNEL_NEON__
+ /* Move scalar to vector register and do rcp. */
+ __m128 a;
+ a[0] = x;
+ float32x4_t reciprocal = vrecpeq_f32(a);
+ reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+ reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+ return reciprocal[0];
+#elif defined(__KERNEL_SSE__)
+ const __m128 a = _mm_set_ss(x);
+ const __m128 r = _mm_rcp_ss(a);
+
+# ifdef __KERNEL_AVX2_
+ return _mm_cvtss_f32(_mm_mul_ss(r, _mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
+# else
+ return _mm_cvtss_f32(_mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+# endif
+#else
+ return 1.0f / x;
+#endif
+}
+
+ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
+#else
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+#endif
+}
+
+ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+ return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
+ msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
+ msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
+#else
+ return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#endif
+}
+
ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
const float3 ray_D,
const float ray_tmin,
@@ -130,9 +175,9 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
const float3 e2 = v1 - v2;
/* Perform edge tests. */
- const float U = dot(cross(e0, v2 + v0), ray_D);
- const float V = dot(cross(e1, v0 + v1), ray_D);
- const float W = dot(cross(e2, v1 + v2), ray_D);
+ const float U = ray_triangle_dot(ray_triangle_cross(e0, v2 + v0), ray_D);
+ const float V = ray_triangle_dot(ray_triangle_cross(e1, v0 + v1), ray_D);
+ const float W = ray_triangle_dot(ray_triangle_cross(e2, v1 + v2), ray_D);
const float UVW = U + V + W;
const float eps = FLT_EPSILON * fabsf(UVW);
@@ -144,7 +189,7 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
}
/* Calculate geometry normal and denominator. */
- const float3 Ng1 = cross(e1, e0);
+ const float3 Ng1 = ray_triangle_cross(e1, e0);
const float3 Ng = Ng1 + Ng1;
const float den = dot(Ng, ray_D);
/* Avoid division by 0. */
@@ -159,13 +204,46 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
return false;
}
- const float rcp_UVW = (fabsf(UVW) < 1e-18f) ? 0.0f : 1.0f / UVW;
- *isect_u = min(U * rcp_UVW, 1.0f);
- *isect_v = min(V * rcp_UVW, 1.0f);
+ const float rcp_uvw = (fabsf(UVW) < 1e-18f) ? 0.0f : ray_triangle_rcp(UVW);
+ *isect_u = min(U * rcp_uvw, 1.0f);
+ *isect_v = min(V * rcp_uvw, 1.0f);
*isect_t = t;
return true;
}
+ccl_device_forceinline bool ray_triangle_intersect_self(const float3 ray_P,
+ const float3 ray_D,
+ const float3 tri_a,
+ const float3 tri_b,
+ const float3 tri_c)
+{
+ /* Matches logic in ray_triangle_intersect, self intersection test to validate
+ * if a ray is going to hit self or might incorrectly hit a neighboring triangle. */
+
+ /* Calculate vertices relative to ray origin. */
+ const float3 v0 = tri_a - ray_P;
+ const float3 v1 = tri_b - ray_P;
+ const float3 v2 = tri_c - ray_P;
+
+ /* Calculate triangle edges. */
+ const float3 e0 = v2 - v0;
+ const float3 e1 = v0 - v1;
+ const float3 e2 = v1 - v2;
+
+ /* Perform edge tests. */
+ const float U = ray_triangle_dot(ray_triangle_cross(v2 + v0, e0), ray_D);
+ const float V = ray_triangle_dot(ray_triangle_cross(v0 + v1, e1), ray_D);
+ const float W = ray_triangle_dot(ray_triangle_cross(v1 + v2, e2), ray_D);
+
+ const float eps = FLT_EPSILON * fabsf(U + V + W);
+ const float minUVW = min(U, min(V, W));
+ const float maxUVW = max(U, max(V, W));
+
+ /* Note the extended epsilon compared to ray_triangle_intersect, to account
+ * for intersections with neighboring triangles that have an epsilon. */
+ return (minUVW >= eps || maxUVW <= -eps);
+}
+
/* Tests for an intersection between a ray and a quad defined by
* its midpoint, normal and sides.
* If ellipse is true, hits outside the ellipse that's enclosed by the
diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp
index a13ad95b9fe..3183ac06f26 100644
--- a/intern/cycles/util/system.cpp
+++ b/intern/cycles/util/system.cpp
@@ -128,53 +128,42 @@ int system_cpu_bits()
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
struct CPUCapabilities {
- bool x64;
- bool mmx;
- bool sse;
bool sse2;
bool sse3;
- bool ssse3;
bool sse41;
- bool sse42;
- bool sse4a;
bool avx;
- bool f16c;
bool avx2;
- bool xop;
- bool fma3;
- bool fma4;
- bool bmi1;
- bool bmi2;
};
static CPUCapabilities &system_cpu_capabilities()
{
- static CPUCapabilities caps;
+ static CPUCapabilities caps = {};
static bool caps_init = false;
if (!caps_init) {
int result[4], num;
- memset(&caps, 0, sizeof(caps));
-
__cpuid(result, 0);
num = result[0];
if (num >= 1) {
__cpuid(result, 0x00000001);
- caps.mmx = (result[3] & ((int)1 << 23)) != 0;
- caps.sse = (result[3] & ((int)1 << 25)) != 0;
- caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
- caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
+ const bool sse = (result[3] & ((int)1 << 25)) != 0;
+ const bool sse2 = (result[3] & ((int)1 << 26)) != 0;
+ const bool sse3 = (result[2] & ((int)1 << 0)) != 0;
+
+ const bool ssse3 = (result[2] & ((int)1 << 9)) != 0;
+ const bool sse41 = (result[2] & ((int)1 << 19)) != 0;
+ /* const bool sse42 = (result[2] & ((int)1 << 20)) != 0; */
- caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
- caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
- caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
+ const bool fma3 = (result[2] & ((int)1 << 12)) != 0;
+ const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
+ const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
- caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
- caps.avx = false;
- bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
- bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
+ /* Simplify to combined capabilities for which we specialize kernels. */
+ caps.sse2 = sse && sse2;
+ caps.sse3 = sse && sse2 && sse3 && ssse3;
+ caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41;
if (os_uses_xsave_xrestore && cpu_avx_support) {
// Check if the OS will save the YMM registers
@@ -189,15 +178,18 @@ static CPUCapabilities &system_cpu_capabilities()
# else
xcr_feature_mask = 0;
# endif
- caps.avx = (xcr_feature_mask & 0x6) == 0x6;
- }
+ const bool avx = (xcr_feature_mask & 0x6) == 0x6;
+ const bool f16c = (result[2] & ((int)1 << 29)) != 0;
- caps.f16c = (result[2] & ((int)1 << 29)) != 0;
+ __cpuid(result, 0x00000007);
+ bool bmi1 = (result[1] & ((int)1 << 3)) != 0;
+ bool bmi2 = (result[1] & ((int)1 << 8)) != 0;
+ bool avx2 = (result[1] & ((int)1 << 5)) != 0;
- __cpuid(result, 0x00000007);
- caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
- caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
- caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
+ caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx;
+ caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 &&
+ bmi2;
+ }
}
caps_init = true;
@@ -209,32 +201,31 @@ static CPUCapabilities &system_cpu_capabilities()
bool system_cpu_support_sse2()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2;
+ return caps.sse2;
}
bool system_cpu_support_sse3()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3;
+ return caps.sse3;
}
bool system_cpu_support_sse41()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41;
+ return caps.sse41;
}
bool system_cpu_support_avx()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
+ return caps.avx;
}
bool system_cpu_support_avx2()
{
CPUCapabilities &caps = system_cpu_capabilities();
- return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c &&
- caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+ return caps.avx2;
}
#else
@@ -264,26 +255,6 @@ bool system_cpu_support_avx2()
#endif
-bool system_call_self(const vector<string> &args)
-{
- /* Escape program and arguments in case they contain spaces. */
- string cmd = "\"" + Sysutil::this_program_path() + "\"";
-
- for (int i = 0; i < args.size(); i++) {
- cmd += " \"" + args[i] + "\"";
- }
-
-#ifdef _WIN32
- /* Use cmd /S to avoid issues with spaces in arguments. */
- cmd = "cmd /S /C \"" + cmd + " > nul \"";
-#else
- /* Quiet output. */
- cmd += " > /dev/null";
-#endif
-
- return (system(cmd.c_str()) == 0);
-}
-
size_t system_physical_ram()
{
#ifdef _WIN32
diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h
index 23dcfdd303a..2152b89ed24 100644
--- a/intern/cycles/util/system.h
+++ b/intern/cycles/util/system.h
@@ -4,15 +4,17 @@
#ifndef __UTIL_SYSTEM_H__
#define __UTIL_SYSTEM_H__
-#include "util/string.h"
-#include "util/vector.h"
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <string>
CCL_NAMESPACE_BEGIN
/* Get width in characters of the current console output. */
int system_console_width();
-string system_cpu_brand_string();
+std::string system_cpu_brand_string();
int system_cpu_bits();
bool system_cpu_support_sse2();
bool system_cpu_support_sse3();
@@ -22,9 +24,6 @@ bool system_cpu_support_avx2();
size_t system_physical_ram();
-/* Start a new process of the current application with the given arguments. */
-bool system_call_self(const vector<string> &args);
-
/* Get identifier of the currently running process. */
uint64_t system_self_process_id();
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index 0b87e88871d..cb985c65dd8 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN
/* Transform Inverse */
-static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
+static bool projection_matrix4_inverse(float R[][4], float M[][4])
{
/* SPDX-License-Identifier: BSD-3-Clause
* Adapted from code:
@@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
memcpy(R, &tfmR, sizeof(R));
memcpy(M, &tfm, sizeof(M));
- if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
+ if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
return projection_identity();
}
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 71164efbac1..24184dc7074 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -11,6 +11,10 @@
#include "util/math.h"
#include "util/types.h"
+#ifndef __KERNEL_GPU__
+# include "util/system.h"
+#endif
+
CCL_NAMESPACE_BEGIN
/* Affine transformation, stored as 4x3 matrix. */
@@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
float4 x, y, z, w;
} DecomposedTransform;
+CCL_NAMESPACE_END
+
+#include "util/transform_inverse.h"
+
+CCL_NAMESPACE_BEGIN
+
/* Functions */
#ifdef __KERNEL_METAL__
@@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
#endif /* defined(__KERNEL_GPU_RAYTRACING__) */
}
+#ifndef __KERNEL_GPU__
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
+#endif
+
ccl_device_inline Transform transform_inverse(const Transform tfm)
{
- /* This implementation matches the one in Embree exactly, to ensure consistent
- * results with the ray intersection of instances. */
- float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
- float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
- float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
- float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-
- /* Compute determinant. */
- float det = dot(x, cross(y, z));
-
- if (det == 0.0f) {
- /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
- * never be in this situation, but try to invert it anyway with tweak.
- *
- * This logic does not match Embree which would just give an invalid
- * matrix. A better solution would be to remove this and ensure any object
- * matrix is valid. */
- x.x += 1e-8f;
- y.y += 1e-8f;
- z.z += 1e-8f;
-
- det = dot(x, cross(y, z));
- if (det == 0.0f) {
- det = FLT_MAX;
- }
+ /* Optimized transform implementations. */
+#ifndef __KERNEL_GPU__
+ if (system_cpu_support_avx2()) {
+ Transform itfm;
+ transform_inverse_cpu_avx2(tfm, itfm);
+ return itfm;
}
+ else if (system_cpu_support_sse41()) {
+ Transform itfm;
+ transform_inverse_cpu_sse41(tfm, itfm);
+ return itfm;
+ }
+#endif
- /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
- const float3 inverse_x = cross(y, z) / det;
- const float3 inverse_y = cross(z, x) / det;
- const float3 inverse_z = cross(x, y) / det;
-
- /* Compute translation and fill transform. */
- Transform itfm;
- itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
- itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
- itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
-
- return itfm;
+ return transform_inverse_impl(tfm);
}
ccl_device_inline void transform_compose(ccl_private Transform *tfm,
diff --git a/intern/cycles/util/transform_avx2.cpp b/intern/cycles/util/transform_avx2.cpp
new file mode 100644
index 00000000000..57c160388e2
--- /dev/null
+++ b/intern/cycles/util/transform_avx2.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
+{
+ itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
new file mode 100644
index 00000000000..07fd06c1467
--- /dev/null
+++ b/intern/cycles/util/transform_inverse.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Custom cross and dot implementations that match Embree bit for bit.
+ * Normally we don't use SSE41/AVX outside the kernel, but for this it's
+ * important to match exactly for ray tracing precision. */
+
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+{
+#ifdef __AVX2__
+ const ssef sse_a = (const __m128 &)a;
+ const ssef sse_b = (const __m128 &)b;
+ const ssef r = shuffle<1, 2, 0, 3>(
+ ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+ return (const float3 &)r;
+#endif
+
+ return cross(a, b);
+}
+
+ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+{
+#ifdef __SSE4_1__
+ return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#endif
+
+ return dot(a, b);
+}
+
+ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
+{
+ /* This implementation matches the one in Embree exactly, to ensure consistent
+ * results with the ray intersection of instances. */
+ float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+ float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+ float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+ float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+ /* Compute determinant. */
+ float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
+
+ if (det == 0.0f) {
+ /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+ * never be in this situation, but try to invert it anyway with tweak.
+ *
+ * This logic does not match Embree which would just give an invalid
+ * matrix. A better solution would be to remove this and ensure any object
+ * matrix is valid. */
+ x.x += 1e-8f;
+ y.y += 1e-8f;
+ z.z += 1e-8f;
+
+ det = transform_inverse_dot(x, cross(y, z));
+ if (det == 0.0f) {
+ det = FLT_MAX;
+ }
+ }
+
+ /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+ const float3 inverse_x = transform_inverse_cross(y, z) / det;
+ const float3 inverse_y = transform_inverse_cross(z, x) / det;
+ const float3 inverse_z = transform_inverse_cross(x, y) / det;
+
+ /* Compute translation and fill transform. */
+ Transform itfm;
+ itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
+ itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
+ itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
+
+ return itfm;
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse41.cpp
new file mode 100644
index 00000000000..8a698807a9c
--- /dev/null
+++ b/intern/cycles/util/transform_sse41.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
+{
+ itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/vector.h b/intern/cycles/util/vector.h
index 0056fb269ae..9e27997cf2c 100644
--- a/intern/cycles/util/vector.h
+++ b/intern/cycles/util/vector.h
@@ -10,7 +10,6 @@
#include "util/aligned_malloc.h"
#include "util/guarded_allocator.h"
-#include "util/types.h"
CCL_NAMESPACE_BEGIN