3 files changed, 165 insertions, 9 deletions
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index b67c9394bea..a57703a8b8c 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -33,6 +33,30 @@ ccl_device_forceinline float intersection_t_offset(const float t)
   return __uint_as_float(bits);
 }
 
+/* Offset a ray start position to avoid self intersection.
+ *
+ * For rays leaving a surface at P: coordinates of P close to zero get a small
+ * float offset along Ng, all others have their bit pattern shifted by an
+ * integer amount scaled by Ng. Method from Ray Tracing Gems, chapter 6:
+ * "A Fast and Robust Method for Avoiding Self-Intersection".
+ */
+ccl_device_inline float3 ray_offset(const float3 P, const float3 Ng)
+{
+  const float origin = 1.0f / 32.0f;
+  const float float_scale = 1.0f / 65536.0f;
+  const float int_scale = 256.0f;
+  /* Integer offset per axis, its sign is flipped for negative coordinates. */
+  const int ix = (int)(int_scale * Ng.x);
+  const int iy = (int)(int_scale * Ng.y);
+  const int iz = (int)(int_scale * Ng.z);
+  const float px = __int_as_float(__float_as_int(P.x) + ((P.x < 0) ? -ix : ix));
+  const float py = __int_as_float(__float_as_int(P.y) + ((P.y < 0) ? -iy : iy));
+  const float pz = __int_as_float(__float_as_int(P.z) + ((P.z < 0) ? -iz : iz));
+  return make_float3(fabsf(P.x) < origin ? P.x + float_scale * Ng.x : px,
+                     fabsf(P.y) < origin ? P.y + float_scale * Ng.y : py,
+                     fabsf(P.z) < origin ? P.z + float_scale * Ng.z : pz);
+}
+
 #ifndef __KERNEL_GPU__
 ccl_device int intersections_compare(const void *a, const void *b)
 {
diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h
index 70b20a93b6a..19b8946e865 100644
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -31,6 +31,52 @@ ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg,
   shader_setup_from_ray(kg, sd, &ray, &isect);
 }
 
+/* Return ray_P, or ray_P offset by ray_offset() when the ray would miss triangle sd->prim. */
+ccl_device_forceinline float3 integrate_surface_ray_offset(KernelGlobals kg,
+                                                           const ccl_private ShaderData *sd,
+                                                           const float3 ray_P,
+                                                           const float3 ray_D)
+{
+  /* Only triangles are tested, other primitive types need no ray offset. */
+  if (!(sd->type & PRIMITIVE_TRIANGLE)) {
+    return ray_P;
+  }
+
+  /* Self intersection tests already account for the case where a ray hits the same primitive.
+   * However precision issues can still cause neighboring triangles to be hit. Here we test if
+   * the ray-triangle intersection with the same primitive would miss, implying that a
+   * neighboring triangle would be hit instead.
+   *
+   * This relies on triangle intersection to be watertight, and the inverse object transform
+   * to match the one used by ray intersection exactly.
+   *
+   * Potential improvements:
+   * - It appears this happens when either barycentric coordinates are small, or
+   *   dot(sd->Ng, ray_D) is small. Detect such cases and skip test?
+   * - Instead of ray offset, can we tweak P to lie within the triangle?
+   */
+  const uint vert_offset = kernel_data_fetch(tri_vindex, sd->prim).w;
+  const packed_float3 v0 = kernel_data_fetch(tri_verts, vert_offset + 0);
+  const packed_float3 v1 = kernel_data_fetch(tri_verts, vert_offset + 1);
+  const packed_float3 v2 = kernel_data_fetch(tri_verts, vert_offset + 2);
+
+  /* Apply the inverse object transform to the ray, unless flagged as already applied. */
+  float3 P = ray_P;
+  float3 D = ray_D;
+  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+    const Transform itfm = object_get_inverse_transform(kg, sd);
+    P = transform_point(&itfm, P);
+    D = transform_direction(&itfm, D);
+  }
+
+  /* Keep the origin when the ray still passes the self intersection test,
+   * otherwise offset it using sd->Ng. */
+  if (ray_triangle_intersect_self(P, D, v0, v1, v2)) {
+    return ray_P;
+  }
+  return ray_offset(ray_P, sd->Ng);
+}
+
 #ifdef __HOLDOUT__
 ccl_device_forceinline bool integrate_surface_holdout(KernelGlobals kg,
                                                       ConstIntegratorState state,
@@ -200,6 +246,10 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
 #  endif
   }
 
+  if (ray.self.object != OBJECT_NONE) {
+    ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
+  }
+
   /* Write shadow ray and associated state to global memory. */
   integrator_state_write_shadow_ray(kg, shadow_state, &ray);
   // Save memory by storing the light and object indices in the shadow_isect
@@ -327,8 +377,9 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
   }
   else {
     /* Setup ray with changed origin and direction. */
-    INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
-    INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in);
+    const float3 D = normalize(bsdf_omega_in);
+    INTEGRATOR_STATE_WRITE(state, ray, P) = integrate_surface_ray_offset(kg, sd, sd->P, D);
+    INTEGRATOR_STATE_WRITE(state, ray, D) = D;
     INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
     INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
@@ -422,6 +473,9 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
   Ray ray ccl_optional_struct_init;
   ray.P = shadow_ray_offset(kg, sd, ao_D, &skip_self);
   ray.D = ao_D;
+  if (skip_self) {
+    ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
+  }
   ray.tmin = 0.0f;
   ray.tmax = kernel_data.integrator.ao_bounces_distance;
   ray.time = sd->time;
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index cc07cbe7745..aa28682f8c1 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -105,6 +105,51 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
   return false;
 }
 
+/* Custom rcp, cross and dot implementations that match Embree bit for bit. */
+ccl_device_forceinline float ray_triangle_rcp(const float x)
+{
+#ifdef __KERNEL_NEON__
+  /* Move scalar to vector register and do rcp. */
+  __m128 a;
+  a[0] = x;
+  float32x4_t reciprocal = vrecpeq_f32(a);
+  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+  return reciprocal[0];
+#elif defined(__KERNEL_SSE__)
+  const __m128 a = _mm_set_ss(x);
+  const __m128 r = _mm_rcp_ss(a);
+  /* Refine the estimate with one Newton-Raphson step, r * (2 - r * a). */
+#  ifdef __KERNEL_AVX2__
+  return _mm_cvtss_f32(_mm_mul_ss(r, _mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
+#  else
+  return _mm_cvtss_f32(_mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+#  endif
+#else
+  return 1.0f / x;
+#endif
+}
+
+ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+  return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
+#else /* Plain scalar dot product. */
+  return a.x * b.x + a.y * b.y + a.z * b.z;
+#endif /* __KERNEL_SSE41__ && __KERNEL_SSE__ */
+}
+
+ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+  return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
+                     msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
+                     msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
+#else /* Plain scalar cross product. */
+  return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#endif /* __KERNEL_SSE41__ && __KERNEL_SSE__ */
+}
+
 ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
                                                    const float3 ray_D,
                                                    const float ray_tmin,
@@ -130,9 +175,9 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
   const float3 e2 = v1 - v2;
 
   /* Perform edge tests. */
-  const float U = dot(cross(e0, v2 + v0), ray_D);
-  const float V = dot(cross(e1, v0 + v1), ray_D);
-  const float W = dot(cross(e2, v1 + v2), ray_D);
+  const float U = ray_triangle_dot(ray_triangle_cross(e0, v2 + v0), ray_D);
+  const float V = ray_triangle_dot(ray_triangle_cross(e1, v0 + v1), ray_D);
+  const float W = ray_triangle_dot(ray_triangle_cross(e2, v1 + v2), ray_D);
 
   const float UVW = U + V + W;
   const float eps = FLT_EPSILON * fabsf(UVW);
@@ -144,7 +189,7 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
   }
 
   /* Calculate geometry normal and denominator. */
-  const float3 Ng1 = cross(e1, e0);
+  const float3 Ng1 = ray_triangle_cross(e1, e0);
   const float3 Ng = Ng1 + Ng1;
   const float den = dot(Ng, ray_D);
   /* Avoid division by 0. */
@@ -159,13 +204,46 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
     return false;
   }
 
-  const float rcp_UVW = (fabsf(UVW) < 1e-18f) ? 0.0f : 1.0f / UVW;
-  *isect_u = min(U * rcp_UVW, 1.0f);
-  *isect_v = min(V * rcp_UVW, 1.0f);
+  const float rcp_uvw = (fabsf(UVW) < 1e-18f) ? 0.0f : ray_triangle_rcp(UVW);
+  *isect_u = min(U * rcp_uvw, 1.0f);
+  *isect_v = min(V * rcp_uvw, 1.0f);
   *isect_t = t;
   return true;
 }
 
+ccl_device_forceinline bool ray_triangle_intersect_self(const float3 ray_P,
+                                                        const float3 ray_D,
+                                                        const float3 tri_a,
+                                                        const float3 tri_b,
+                                                        const float3 tri_c)
+{
+  /* Matches logic in ray_triangle_intersect, self intersection test to validate
+   * if a ray is going to hit self or might incorrectly hit a neighboring triangle.
+   * Returns true if U, V and W are all >= eps or all <= -eps. */
+  /* Calculate vertices relative to ray origin. */
+  const float3 v0 = tri_a - ray_P;
+  const float3 v1 = tri_b - ray_P;
+  const float3 v2 = tri_c - ray_P;
+
+  /* Calculate triangle edges. */
+  const float3 e0 = v2 - v0;
+  const float3 e1 = v0 - v1;
+  const float3 e2 = v1 - v2;
+  /* Perform edge tests. NOTE(review): cross product operands are swapped compared
+   * to ray_triangle_intersect, which flips the sign of U, V and W; confirm intended. */
+  const float U = ray_triangle_dot(ray_triangle_cross(v2 + v0, e0), ray_D);
+  const float V = ray_triangle_dot(ray_triangle_cross(v0 + v1, e1), ray_D);
+  const float W = ray_triangle_dot(ray_triangle_cross(v1 + v2, e2), ray_D);
+
+  const float eps = FLT_EPSILON * fabsf(U + V + W);
+  const float minUVW = min(U, min(V, W));
+  const float maxUVW = max(U, max(V, W));
+
+  /* Note the extended epsilon compared to ray_triangle_intersect, to account
+   * for intersections with neighboring triangles that have an epsilon. */
+  return (minUVW >= eps || maxUVW <= -eps);
+}
+
 /* Tests for an intersection between a ray and a quad defined by
  * its midpoint, normal and sides.
  * If ellipse is true, hits outside the ellipse that's enclosed by the