From 286e535071c8f9a906c6c36b8dac0eda6384c79a Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Mon, 8 Aug 2022 17:45:37 +0200
Subject: Cleanup: simplify CPU instruction checking

The performance of this will be slightly more important for upcoming changes.
Also removed an unused function and changed includes so these system.h can
be included in more places.
---
 intern/cycles/util/system.cpp | 89 +++++++++++++++----------------------------
 intern/cycles/util/system.h   | 11 +++---
 intern/cycles/util/vector.h   |  1 -
 3 files changed, 35 insertions(+), 66 deletions(-)

(limited to 'intern/cycles/util')

diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp
index a13ad95b9fe..3183ac06f26 100644
--- a/intern/cycles/util/system.cpp
+++ b/intern/cycles/util/system.cpp
@@ -128,53 +128,42 @@ int system_cpu_bits()
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
 
 struct CPUCapabilities {
-  bool x64;
-  bool mmx;
-  bool sse;
   bool sse2;
   bool sse3;
-  bool ssse3;
   bool sse41;
-  bool sse42;
-  bool sse4a;
   bool avx;
-  bool f16c;
   bool avx2;
-  bool xop;
-  bool fma3;
-  bool fma4;
-  bool bmi1;
-  bool bmi2;
 };
 
 static CPUCapabilities &system_cpu_capabilities()
 {
-  static CPUCapabilities caps;
+  static CPUCapabilities caps = {};
   static bool caps_init = false;
 
   if (!caps_init) {
     int result[4], num;
 
-    memset(&caps, 0, sizeof(caps));
-
     __cpuid(result, 0);
     num = result[0];
 
     if (num >= 1) {
       __cpuid(result, 0x00000001);
-      caps.mmx = (result[3] & ((int)1 << 23)) != 0;
-      caps.sse = (result[3] & ((int)1 << 25)) != 0;
-      caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
-      caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
+      const bool sse = (result[3] & ((int)1 << 25)) != 0;
+      const bool sse2 = (result[3] & ((int)1 << 26)) != 0;
+      const bool sse3 = (result[2] & ((int)1 << 0)) != 0;
+
+      const bool ssse3 = (result[2] & ((int)1 << 9)) != 0;
+      const bool sse41 = (result[2] & ((int)1 << 19)) != 0;
+      /* const bool sse42 = (result[2] & ((int)1 << 20)) != 0; */
 
-      caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
-      caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
-      caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
+      const bool fma3 = (result[2] & ((int)1 << 12)) != 0;
+      const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
+      const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
 
-      caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
-      caps.avx = false;
-      bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
-      bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
+      /* Simplify to combined capabilities for which we specialize kernels. */
+      caps.sse2 = sse && sse2;
+      caps.sse3 = sse && sse2 && sse3 && ssse3;
+      caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41;
 
       if (os_uses_xsave_xrestore && cpu_avx_support) {
         // Check if the OS will save the YMM registers
@@ -189,15 +178,18 @@ static CPUCapabilities &system_cpu_capabilities()
 #  else
         xcr_feature_mask = 0;
 #  endif
-        caps.avx = (xcr_feature_mask & 0x6) == 0x6;
-      }
+        const bool avx = (xcr_feature_mask & 0x6) == 0x6;
+        const bool f16c = (result[2] & ((int)1 << 29)) != 0;
 
-      caps.f16c = (result[2] & ((int)1 << 29)) != 0;
+        __cpuid(result, 0x00000007);
+        bool bmi1 = (result[1] & ((int)1 << 3)) != 0;
+        bool bmi2 = (result[1] & ((int)1 << 8)) != 0;
+        bool avx2 = (result[1] & ((int)1 << 5)) != 0;
 
-      __cpuid(result, 0x00000007);
-      caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
-      caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
-      caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
+        caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx;
+        caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 &&
+                    bmi2;
+      }
     }
 
     caps_init = true;
@@ -209,32 +201,31 @@ static CPUCapabilities &system_cpu_capabilities()
 bool system_cpu_support_sse2()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2;
+  return caps.sse2;
 }
 
 bool system_cpu_support_sse3()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3;
+  return caps.sse3;
 }
 
 bool system_cpu_support_sse41()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41;
+  return caps.sse41;
 }
 
 bool system_cpu_support_avx()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
+  return caps.avx;
 }
 
 bool system_cpu_support_avx2()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c &&
-         caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+  return caps.avx2;
 }
 #else
 
@@ -264,26 +255,6 @@ bool system_cpu_support_avx2()
 
 #endif
 
-bool system_call_self(const vector<string> &args)
-{
-  /* Escape program and arguments in case they contain spaces. */
-  string cmd = "\"" + Sysutil::this_program_path() + "\"";
-
-  for (int i = 0; i < args.size(); i++) {
-    cmd += " \"" + args[i] + "\"";
-  }
-
-#ifdef _WIN32
-  /* Use cmd /S to avoid issues with spaces in arguments. */
-  cmd = "cmd /S /C \"" + cmd + " > nul \"";
-#else
-  /* Quiet output. */
-  cmd += " > /dev/null";
-#endif
-
-  return (system(cmd.c_str()) == 0);
-}
-
 size_t system_physical_ram()
 {
 #ifdef _WIN32
diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h
index 23dcfdd303a..2152b89ed24 100644
--- a/intern/cycles/util/system.h
+++ b/intern/cycles/util/system.h
@@ -4,15 +4,17 @@
 #ifndef __UTIL_SYSTEM_H__
 #define __UTIL_SYSTEM_H__
 
-#include "util/string.h"
-#include "util/vector.h"
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <string>
 
 CCL_NAMESPACE_BEGIN
 
 /* Get width in characters of the current console output. */
 int system_console_width();
 
-string system_cpu_brand_string();
+std::string system_cpu_brand_string();
 int system_cpu_bits();
 bool system_cpu_support_sse2();
 bool system_cpu_support_sse3();
@@ -22,9 +24,6 @@ bool system_cpu_support_avx2();
 
 size_t system_physical_ram();
 
-/* Start a new process of the current application with the given arguments. */
-bool system_call_self(const vector<string> &args);
-
 /* Get identifier of the currently running process. */
 uint64_t system_self_process_id();
 
diff --git a/intern/cycles/util/vector.h b/intern/cycles/util/vector.h
index 0056fb269ae..9e27997cf2c 100644
--- a/intern/cycles/util/vector.h
+++ b/intern/cycles/util/vector.h
@@ -10,7 +10,6 @@
 
 #include "util/aligned_malloc.h"
 #include "util/guarded_allocator.h"
-#include "util/types.h"
 
 CCL_NAMESPACE_BEGIN
 
-- 
cgit v1.2.3


From 230f9ade64844a50ea02461cfa005f364de09aa9 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Tue, 9 Aug 2022 14:28:19 +0200
Subject: Cycles: make transform inverse match Embree exactly

Helps improve ray-tracing precision. This is a bit complicated as it requires
different implementation depending on the CPU architecture.
---
 intern/cycles/util/CMakeLists.txt      |  9 ++++
 intern/cycles/util/transform.cpp       |  4 +-
 intern/cycles/util/transform.h         | 65 +++++++++++++----------------
 intern/cycles/util/transform_avx2.cpp  | 13 ++++++
 intern/cycles/util/transform_inverse.h | 76 ++++++++++++++++++++++++++++++++++
 intern/cycles/util/transform_sse41.cpp | 13 ++++++
 6 files changed, 141 insertions(+), 39 deletions(-)
 create mode 100644 intern/cycles/util/transform_avx2.cpp
 create mode 100644 intern/cycles/util/transform_inverse.h
 create mode 100644 intern/cycles/util/transform_sse41.cpp

(limited to 'intern/cycles/util')

diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 9bc9f00e142..f3fc7739f66 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -26,6 +26,8 @@ set(SRC
   thread.cpp
   time.cpp
   transform.cpp
+  transform_avx2.cpp
+  transform_sse41.cpp
   windows.cpp
 )
 
@@ -138,6 +140,13 @@ set(SRC_HEADERS
   xml.h
 )
 
+if(CXX_HAS_SSE)
+  set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+endif()
+if(CXX_HAS_AVX2)
+  set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+endif()
+
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index 0b87e88871d..cb985c65dd8 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Transform Inverse */
 
-static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
+static bool projection_matrix4_inverse(float R[][4], float M[][4])
 {
   /* SPDX-License-Identifier: BSD-3-Clause
    * Adapted from code:
@@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
   memcpy(R, &tfmR, sizeof(R));
   memcpy(M, &tfm, sizeof(M));
 
-  if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
+  if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
     return projection_identity();
   }
 
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 71164efbac1..24184dc7074 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -11,6 +11,10 @@
 #include "util/math.h"
 #include "util/types.h"
 
+#ifndef __KERNEL_GPU__
+#  include "util/system.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* Affine transformation, stored as 4x3 matrix. */
@@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
   float4 x, y, z, w;
 } DecomposedTransform;
 
+CCL_NAMESPACE_END
+
+#include "util/transform_inverse.h"
+
+CCL_NAMESPACE_BEGIN
+
 /* Functions */
 
 #ifdef __KERNEL_METAL__
@@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
 #endif /* defined(__KERNEL_GPU_RAYTRACING__) */
 }
 
+#ifndef __KERNEL_GPU__
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
+#endif
+
 ccl_device_inline Transform transform_inverse(const Transform tfm)
 {
-  /* This implementation matches the one in Embree exactly, to ensure consistent
-   * results with the ray intersection of instances. */
-  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
-  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
-  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
-  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-
-  /* Compute determinant. */
-  float det = dot(x, cross(y, z));
-
-  if (det == 0.0f) {
-    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
-     * never be in this situation, but try to invert it anyway with tweak.
-     *
-     * This logic does not match Embree which would just give an invalid
-     * matrix. A better solution would be to remove this and ensure any object
-     * matrix is valid. */
-    x.x += 1e-8f;
-    y.y += 1e-8f;
-    z.z += 1e-8f;
-
-    det = dot(x, cross(y, z));
-    if (det == 0.0f) {
-      det = FLT_MAX;
-    }
+  /* Optimized transform implementations. */
+#ifndef __KERNEL_GPU__
+  if (system_cpu_support_avx2()) {
+    Transform itfm;
+    transform_inverse_cpu_avx2(tfm, itfm);
+    return itfm;
   }
+  else if (system_cpu_support_sse41()) {
+    Transform itfm;
+    transform_inverse_cpu_sse41(tfm, itfm);
+    return itfm;
+  }
+#endif
 
-  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
-  const float3 inverse_x = cross(y, z) / det;
-  const float3 inverse_y = cross(z, x) / det;
-  const float3 inverse_z = cross(x, y) / det;
-
-  /* Compute translation and fill transform. */
-  Transform itfm;
-  itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
-  itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
-  itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
-
-  return itfm;
+  return transform_inverse_impl(tfm);
 }
 
 ccl_device_inline void transform_compose(ccl_private Transform *tfm,
diff --git a/intern/cycles/util/transform_avx2.cpp b/intern/cycles/util/transform_avx2.cpp
new file mode 100644
index 00000000000..57c160388e2
--- /dev/null
+++ b/intern/cycles/util/transform_avx2.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
+{
+  itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
new file mode 100644
index 00000000000..07fd06c1467
--- /dev/null
+++ b/intern/cycles/util/transform_inverse.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Custom cross and dot implementations that match Embree bit for bit.
+ * Normally we don't use SSE41/AVX outside the kernel, but for this it's
+ * important to match exactly for ray tracing precision. */
+
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+{
+#ifdef __AVX2__
+  const ssef sse_a = (const __m128 &)a;
+  const ssef sse_b = (const __m128 &)b;
+  const ssef r = shuffle<1, 2, 0, 3>(
+      ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+  return (const float3 &)r;
+#endif
+
+  return cross(a, b);
+}
+
+ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+{
+#ifdef __SSE4_1__
+  return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#endif
+
+  return dot(a, b);
+}
+
+ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
+{
+  /* This implementation matches the one in Embree exactly, to ensure consistent
+   * results with the ray intersection of instances. */
+  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+  /* Compute determinant. */
+  float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
+
+  if (det == 0.0f) {
+    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+     * never be in this situation, but try to invert it anyway with tweak.
+     *
+     * This logic does not match Embree which would just give an invalid
+     * matrix. A better solution would be to remove this and ensure any object
+     * matrix is valid. */
+    x.x += 1e-8f;
+    y.y += 1e-8f;
+    z.z += 1e-8f;
+
+    det = transform_inverse_dot(x, cross(y, z));
+    if (det == 0.0f) {
+      det = FLT_MAX;
+    }
+  }
+
+  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+  const float3 inverse_x = transform_inverse_cross(y, z) / det;
+  const float3 inverse_y = transform_inverse_cross(z, x) / det;
+  const float3 inverse_z = transform_inverse_cross(x, y) / det;
+
+  /* Compute translation and fill transform. */
+  Transform itfm;
+  itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
+  itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
+  itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
+
+  return itfm;
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse41.cpp
new file mode 100644
index 00000000000..8a698807a9c
--- /dev/null
+++ b/intern/cycles/util/transform_sse41.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
+{
+  itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
-- 
cgit v1.2.3


From 79f1cc601cdbcf142e1bf4c1966f64dcf93b030f Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Mon, 18 Jul 2022 21:07:06 +0200
Subject: Cycles: improve ray tracing precision near triangle edges

Detect cases where a ray-intersection would miss the current triangle, which if
the intersection is strictly watertight, implies that a neighboring triangle would
incorrectly be hit instead.

When that is detected, apply a ray-offset. The idea being that we only want to
introduce potential error from ray offsets if we really need to.

This work for BVH2 and Embree, as we are able to match the ray-interesction
bit-for-bit, though doing so for Embree requires ugly hacks. Tiny differences
like fused-multiply-add or dot product intrinstics in matrix inversion and ray
intersection needed to be matched exactly, so this is fragile.

Unfortunately we're not able to do the same for OptiX or MetalRT, since those
implementations are unknown (and possibly impossible to match as hardware
instructions). Still artifacts are much reduced, though not eliminated.

Ref T97259

Differential Revision: https://developer.blender.org/D15559
---
 intern/cycles/util/math_intersect.h | 92 ++++++++++++++++++++++++++++++++++---
 1 file changed, 85 insertions(+), 7 deletions(-)

(limited to 'intern/cycles/util')

diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index cc07cbe7745..aa28682f8c1 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -105,6 +105,51 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
   return false;
 }
 
+/* Custom rcp, cross and dot implementations that match Embree bit for bit. */
+ccl_device_forceinline float ray_triangle_rcp(const float x)
+{
+#ifdef __KERNEL_NEON__
+  /* Move scalar to vector register and do rcp. */
+  __m128 a;
+  a[0] = x;
+  float32x4_t reciprocal = vrecpeq_f32(a);
+  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+  return reciprocal[0];
+#elif defined(__KERNEL_SSE__)
+  const __m128 a = _mm_set_ss(x);
+  const __m128 r = _mm_rcp_ss(a);
+
+#  ifdef __KERNEL_AVX2_
+  return _mm_cvtss_f32(_mm_mul_ss(r, _mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
+#  else
+  return _mm_cvtss_f32(_mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+#  endif
+#else
+  return 1.0f / x;
+#endif
+}
+
+ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+  return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
+#else
+  return a.x * b.x + a.y * b.y + a.z * b.z;
+#endif
+}
+
+ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+  return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
+                     msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
+                     msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
+#else
+  return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#endif
+}
+
 ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
                                                    const float3 ray_D,
                                                    const float ray_tmin,
@@ -130,9 +175,9 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
   const float3 e2 = v1 - v2;
 
   /* Perform edge tests. */
-  const float U = dot(cross(e0, v2 + v0), ray_D);
-  const float V = dot(cross(e1, v0 + v1), ray_D);
-  const float W = dot(cross(e2, v1 + v2), ray_D);
+  const float U = ray_triangle_dot(ray_triangle_cross(e0, v2 + v0), ray_D);
+  const float V = ray_triangle_dot(ray_triangle_cross(e1, v0 + v1), ray_D);
+  const float W = ray_triangle_dot(ray_triangle_cross(e2, v1 + v2), ray_D);
 
   const float UVW = U + V + W;
   const float eps = FLT_EPSILON * fabsf(UVW);
@@ -144,7 +189,7 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
   }
 
   /* Calculate geometry normal and denominator. */
-  const float3 Ng1 = cross(e1, e0);
+  const float3 Ng1 = ray_triangle_cross(e1, e0);
   const float3 Ng = Ng1 + Ng1;
   const float den = dot(Ng, ray_D);
   /* Avoid division by 0. */
@@ -159,13 +204,46 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
     return false;
   }
 
-  const float rcp_UVW = (fabsf(UVW) < 1e-18f) ? 0.0f : 1.0f / UVW;
-  *isect_u = min(U * rcp_UVW, 1.0f);
-  *isect_v = min(V * rcp_UVW, 1.0f);
+  const float rcp_uvw = (fabsf(UVW) < 1e-18f) ? 0.0f : ray_triangle_rcp(UVW);
+  *isect_u = min(U * rcp_uvw, 1.0f);
+  *isect_v = min(V * rcp_uvw, 1.0f);
   *isect_t = t;
   return true;
 }
 
+ccl_device_forceinline bool ray_triangle_intersect_self(const float3 ray_P,
+                                                        const float3 ray_D,
+                                                        const float3 tri_a,
+                                                        const float3 tri_b,
+                                                        const float3 tri_c)
+{
+  /* Matches logic in ray_triangle_intersect, self intersection test to validate
+   * if a ray is going to hit self or might incorrectly hit a neighboring triangle. */
+
+  /* Calculate vertices relative to ray origin. */
+  const float3 v0 = tri_a - ray_P;
+  const float3 v1 = tri_b - ray_P;
+  const float3 v2 = tri_c - ray_P;
+
+  /* Calculate triangle edges. */
+  const float3 e0 = v2 - v0;
+  const float3 e1 = v0 - v1;
+  const float3 e2 = v1 - v2;
+
+  /* Perform edge tests. */
+  const float U = ray_triangle_dot(ray_triangle_cross(v2 + v0, e0), ray_D);
+  const float V = ray_triangle_dot(ray_triangle_cross(v0 + v1, e1), ray_D);
+  const float W = ray_triangle_dot(ray_triangle_cross(v1 + v2, e2), ray_D);
+
+  const float eps = FLT_EPSILON * fabsf(U + V + W);
+  const float minUVW = min(U, min(V, W));
+  const float maxUVW = max(U, max(V, W));
+
+  /* Note the extended epsilon compared to ray_triangle_intersect, to account
+   * for intersections with neighboring triangles that have an epsilon. */
+  return (minUVW >= eps || maxUVW <= -eps);
+}
+
 /* Tests for an intersection between a ray and a quad defined by
  * its midpoint, normal and sides.
  * If ellipse is true, hits outside the ellipse that's enclosed by the
-- 
cgit v1.2.3