20 files changed, 2109 insertions, 1427 deletions
diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h
new file mode 100644
index 00000000000..2d7d8c2d704
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/bvh.h
@@ -0,0 +1,582 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+/* CPU Embree implementation of ray-scene intersection. */
+
+#pragma once
+
+#include <embree3/rtcore_ray.h>
+#include <embree3/rtcore_scene.h>
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
+#include "kernel/bvh/types.h"
+#include "kernel/bvh/util.h"
+#include "kernel/geom/object.h"
+#include "kernel/integrator/state.h"
+#include "kernel/sample/lcg.h"
+
+#include "util/vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define EMBREE_IS_HAIR(x) (x & 1)
+
+/* Intersection context. */
+
+struct CCLIntersectContext {
+  typedef enum {
+    RAY_REGULAR = 0,
+    RAY_SHADOW_ALL = 1,
+    RAY_LOCAL = 2,
+    RAY_SSS = 3,
+    RAY_VOLUME_ALL = 4,
+  } RayType;
+
+  KernelGlobals kg;
+  RayType type;
+
+  /* For avoiding self intersections */
+  const Ray *ray;
+
+  /* for shadow rays */
+  Intersection *isect_s;
+  uint max_hits;
+  uint num_hits;
+  uint num_recorded_hits;
+  float throughput;
+  float max_t;
+  bool opaque_hit;
+
+  /* for SSS Rays: */
+  LocalIntersection *local_isect;
+  int local_object_id;
+  uint *lcg_state;
+
+  CCLIntersectContext(KernelGlobals kg_, RayType type_)
+  {
+    kg = kg_;
+    type = type_;
+    ray = NULL;
+    max_hits = 1;
+    num_hits = 0;
+    num_recorded_hits = 0;
+    throughput = 1.0f;
+    max_t = FLT_MAX;
+    opaque_hit = false;
+    isect_s = NULL;
+    local_isect = NULL;
+    local_object_id = -1;
+    lcg_state = NULL;
+  }
+};
+
+class IntersectContext {
+ public:
+  IntersectContext(CCLIntersectContext *ctx)
+  {
+    rtcInitIntersectContext(&context);
+    userRayExt = ctx;
+  }
+  RTCIntersectContext context;
+  CCLIntersectContext *userRayExt;
+};
+
+/* Utilities. */
+
+ccl_device_inline void kernel_embree_setup_ray(const Ray &ray,
+                                               RTCRay &rtc_ray,
+                                               const uint visibility)
+{
+  rtc_ray.org_x = ray.P.x;
+  rtc_ray.org_y = ray.P.y;
+  rtc_ray.org_z = ray.P.z;
+  rtc_ray.dir_x = ray.D.x;
+  rtc_ray.dir_y = ray.D.y;
+  rtc_ray.dir_z = ray.D.z;
+  rtc_ray.tnear = ray.tmin;
+  rtc_ray.tfar = ray.tmax;
+  rtc_ray.time = ray.time;
+  rtc_ray.mask = visibility;
+}
+
+ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
+                                                  RTCRayHit &rayhit,
+                                                  const uint visibility)
+{
+  kernel_embree_setup_ray(ray, rayhit.ray, visibility);
+  rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
+  rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
+}
+
+ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg,
+                                                          const RTCHit *hit,
+                                                          const Ray *ray)
+{
+  int object, prim;
+
+  if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {
+    object = hit->instID[0] / 2;
+    if ((ray->self.object == object) || (ray->self.light_object == object)) {
+      RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
+          rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
+      prim = hit->primID +
+             (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
+    }
+    else {
+      return false;
+    }
+  }
+  else {
+    object = hit->geomID / 2;
+    if ((ray->self.object == object) || (ray->self.light_object == object)) {
+      prim = hit->primID +
+             (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
+    }
+    else {
+      return false;
+    }
+  }
+
+  const bool is_hair = hit->geomID & 1;
+  if (is_hair) {
+    prim = kernel_data_fetch(curve_segments, prim).prim;
+  }
+
+  return intersection_skip_self_shadow(ray->self, object, prim);
+}
+
+ccl_device_inline void kernel_embree_convert_hit(KernelGlobals kg,
+                                                 const RTCRay *ray,
+                                                 const RTCHit *hit,
+                                                 Intersection *isect)
+{
+  isect->t = ray->tfar;
+  if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {
+    RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
+        rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
+    isect->prim = hit->primID +
+                  (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
+    isect->object = hit->instID[0] / 2;
+  }
+  else {
+    isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(
+                                    rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
+    isect->object = hit->geomID / 2;
+  }
+
+  const bool is_hair = hit->geomID & 1;
+  if (is_hair) {
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, isect->prim);
+    isect->type = segment.type;
+    isect->prim = segment.prim;
+    isect->u = hit->u;
+    isect->v = hit->v;
+  }
+  else {
+    isect->type = kernel_data_fetch(objects, isect->object).primitive_type;
+    isect->u = hit->u;
+    isect->v = hit->v;
+  }
+}
+
+ccl_device_inline void kernel_embree_convert_sss_hit(
+    KernelGlobals kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int object)
+{
+  isect->u = hit->u;
+  isect->v = hit->v;
+  isect->t = ray->tfar;
+  RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
+      rtcGetGeometry(kernel_data.device_bvh, object * 2));
+  isect->prim = hit->primID +
+                (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
+  isect->object = object;
+  isect->type = kernel_data_fetch(objects, object).primitive_type;
+}
+
+/* Ray filter functions. */
+
+/* This gets called by Embree at every valid ray/object intersection.
+ * Things like recording subsurface or shadow hits for later evaluation
+ * as well as filtering for volume objects happen here.
+ * Cycles' own BVH does that directly inside the traversal calls. */
+ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNArguments *args)
+{
+  /* Current implementation in Cycles assumes only single-ray intersection queries. */
+  assert(args->N == 1);
+
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  const KernelGlobalsCPU *kg = ctx->kg;
+  const Ray *cray = ctx->ray;
+
+  if (kernel_embree_is_self_intersection(kg, hit, cray)) {
+    *args->valid = 0;
+  }
+}
+
+/* This gets called by Embree at every valid ray/object intersection.
+ * Things like recording subsurface or shadow hits for later evaluation
+ * as well as filtering for volume objects happen here.
+ * Cycles' own BVH does that directly inside the traversal calls.
+ */
+ccl_device void kernel_embree_filter_occluded_func(const RTCFilterFunctionNArguments *args)
+{
+  /* Current implementation in Cycles assumes only single-ray intersection queries. */
+  assert(args->N == 1);
+
+  const RTCRay *ray = (RTCRay *)args->ray;
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  const KernelGlobalsCPU *kg = ctx->kg;
+  const Ray *cray = ctx->ray;
+
+  switch (ctx->type) {
+    case CCLIntersectContext::RAY_SHADOW_ALL: {
+      Intersection current_isect;
+      kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+      if (intersection_skip_self_shadow(cray->self, current_isect.object, current_isect.prim)) {
+        *args->valid = 0;
+        return;
+      }
+      /* If no transparent shadows or max number of hits exceeded, all light is blocked. */
+      const int flags = intersection_get_shader_flags(kg, current_isect.prim, current_isect.type);
+      if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->num_hits >= ctx->max_hits) {
+        ctx->opaque_hit = true;
+        return;
+      }
+
+      ++ctx->num_hits;
+
+      /* Always use baked shadow transparency for curves. */
+      if (current_isect.type & PRIMITIVE_CURVE) {
+        ctx->throughput *= intersection_curve_shadow_transparency(
+            kg, current_isect.object, current_isect.prim, current_isect.type, current_isect.u);
+
+        if (ctx->throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+          ctx->opaque_hit = true;
+          return;
+        }
+        else {
+          *args->valid = 0;
+          return;
+        }
+      }
+
+      /* Test if we need to record this transparent intersection. */
+      const uint max_record_hits = min(ctx->max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+      if (ctx->num_recorded_hits < max_record_hits || ray->tfar < ctx->max_t) {
+        /* If maximum number of hits was reached, replace the intersection with the
+         * highest distance. We want to find the N closest intersections. */
+        const uint num_recorded_hits = min(ctx->num_recorded_hits, max_record_hits);
+        uint isect_index = num_recorded_hits;
+        if (num_recorded_hits + 1 >= max_record_hits) {
+          float max_t = ctx->isect_s[0].t;
+          uint max_recorded_hit = 0;
+
+          for (uint i = 1; i < num_recorded_hits; ++i) {
+            if (ctx->isect_s[i].t > max_t) {
+              max_recorded_hit = i;
+              max_t = ctx->isect_s[i].t;
+            }
+          }
+
+          if (num_recorded_hits >= max_record_hits) {
+            isect_index = max_recorded_hit;
+          }
+
+          /* Limit the ray distance and stop counting hits beyond this.
+           * TODO: is there some way we can tell Embree to stop intersecting beyond
+           * this distance when max number of hits is reached?. Or maybe it will
+           * become irrelevant if we make max_hits a very high number on the CPU. */
+          ctx->max_t = max(current_isect.t, max_t);
+        }
+
+        ctx->isect_s[isect_index] = current_isect;
+      }
+
+      /* Always increase the number of recorded hits, even beyond the maximum,
+       * so that we can detect this and trace another ray if needed. */
+      ++ctx->num_recorded_hits;
+
+      /* This tells Embree to continue tracing. */
+      *args->valid = 0;
+      break;
+    }
+    case CCLIntersectContext::RAY_LOCAL:
+    case CCLIntersectContext::RAY_SSS: {
+      /* Check if it's hitting the correct object. */
+      Intersection current_isect;
+      if (ctx->type == CCLIntersectContext::RAY_SSS) {
+        kernel_embree_convert_sss_hit(kg, ray, hit, &current_isect, ctx->local_object_id);
+      }
+      else {
+        kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+        if (ctx->local_object_id != current_isect.object) {
+          /* This tells Embree to continue tracing. */
+          *args->valid = 0;
+          break;
+        }
+      }
+      if (intersection_skip_self_local(cray->self, current_isect.prim)) {
+        *args->valid = 0;
+        return;
+      }
+
+      /* No intersection information requested, just return a hit. */
+      if (ctx->max_hits == 0) {
+        break;
+      }
+
+      /* Ignore curves. */
+      if (EMBREE_IS_HAIR(hit->geomID)) {
+        /* This tells Embree to continue tracing. */
+        *args->valid = 0;
+        break;
+      }
+
+      LocalIntersection *local_isect = ctx->local_isect;
+      int hit_idx = 0;
+
+      if (ctx->lcg_state) {
+        /* See triangle_intersect_subsurface() for the native equivalent. */
+        for (int i = min((int)ctx->max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+          if (local_isect->hits[i].t == ray->tfar) {
+            /* This tells Embree to continue tracing. */
+            *args->valid = 0;
+            return;
+          }
+        }
+
+        local_isect->num_hits++;
+
+        if (local_isect->num_hits <= ctx->max_hits) {
+          hit_idx = local_isect->num_hits - 1;
+        }
+        else {
+          /* reservoir sampling: if we are at the maximum number of
+           * hits, randomly replace element or skip it */
+          hit_idx = lcg_step_uint(ctx->lcg_state) % local_isect->num_hits;
+
+          if (hit_idx >= ctx->max_hits) {
+            /* This tells Embree to continue tracing. */
+            *args->valid = 0;
+            return;
+          }
+        }
+      }
+      else {
+        /* Record closest intersection only. */
+        if (local_isect->num_hits && current_isect.t > local_isect->hits[0].t) {
+          *args->valid = 0;
+          return;
+        }
+
+        local_isect->num_hits = 1;
+      }
+
+      /* record intersection */
+      local_isect->hits[hit_idx] = current_isect;
+      local_isect->Ng[hit_idx] = normalize(make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z));
+      /* This tells Embree to continue tracing. */
+      *args->valid = 0;
+      break;
+    }
+    case CCLIntersectContext::RAY_VOLUME_ALL: {
+      /* Append the intersection to the end of the array. */
+      if (ctx->num_hits < ctx->max_hits) {
+        Intersection current_isect;
+        kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+        if (intersection_skip_self(cray->self, current_isect.object, current_isect.prim)) {
+          *args->valid = 0;
+          return;
+        }
+
+        Intersection *isect = &ctx->isect_s[ctx->num_hits];
+        ++ctx->num_hits;
+        *isect = current_isect;
+        /* Only primitives from volume object. */
+        uint tri_object = isect->object;
+        int object_flag = kernel_data_fetch(object_flag, tri_object);
+        if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+          --ctx->num_hits;
+        }
+        /* This tells Embree to continue tracing. */
+        *args->valid = 0;
+      }
+      break;
+    }
+    case CCLIntersectContext::RAY_REGULAR:
+    default:
+      if (kernel_embree_is_self_intersection(kg, hit, cray)) {
+        *args->valid = 0;
+        return;
+      }
+      break;
+  }
+}
+
+ccl_device void kernel_embree_filter_func_backface_cull(const RTCFilterFunctionNArguments *args)
+{
+  const RTCRay *ray = (RTCRay *)args->ray;
+  RTCHit *hit = (RTCHit *)args->hit;
+
+  /* Always ignore back-facing intersections. */
+  if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
+          make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
+    *args->valid = 0;
+    return;
+  }
+
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  const KernelGlobalsCPU *kg = ctx->kg;
+  const Ray *cray = ctx->ray;
+
+  if (kernel_embree_is_self_intersection(kg, hit, cray)) {
+    *args->valid = 0;
+  }
+}
+
+ccl_device void kernel_embree_filter_occluded_func_backface_cull(
+    const RTCFilterFunctionNArguments *args)
+{
+  const RTCRay *ray = (RTCRay *)args->ray;
+  RTCHit *hit = (RTCHit *)args->hit;
+
+  /* Always ignore back-facing intersections. */
+  if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
+          make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
+    *args->valid = 0;
+    return;
+  }
+
+  kernel_embree_filter_occluded_func(args);
+}
+
+/* Scene intersection. */
+
+ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg,
+                                                  ccl_private const Ray *ray,
+                                                  const uint visibility,
+                                                  ccl_private Intersection *isect)
+{
+  isect->t = ray->tmax;
+  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
+  IntersectContext rtc_ctx(&ctx);
+  RTCRayHit ray_hit;
+  ctx.ray = ray;
+  kernel_embree_setup_rayhit(*ray, ray_hit, visibility);
+  rtcIntersect1(kernel_data.device_bvh, &rtc_ctx.context, &ray_hit);
+  if (ray_hit.hit.geomID == RTC_INVALID_GEOMETRY_ID ||
+      ray_hit.hit.primID == RTC_INVALID_GEOMETRY_ID) {
+    return false;
+  }
+
+  kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect);
+  return true;
+}
+
+#ifdef __BVH_LOCAL__
+ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
+                                                        ccl_private const Ray *ray,
+                                                        ccl_private LocalIntersection *local_isect,
+                                                        int local_object,
+                                                        ccl_private uint *lcg_state,
+                                                        int max_hits)
+{
+  const bool has_bvh = !(kernel_data_fetch(object_flag, local_object) &
+                         SD_OBJECT_TRANSFORM_APPLIED);
+  CCLIntersectContext ctx(kg,
+                          has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
+  ctx.lcg_state = lcg_state;
+  ctx.max_hits = max_hits;
+  ctx.ray = ray;
+  ctx.local_isect = local_isect;
+  if (local_isect) {
+    local_isect->num_hits = 0;
+  }
+  ctx.local_object_id = local_object;
+  IntersectContext rtc_ctx(&ctx);
+  RTCRay rtc_ray;
+  kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_ALL_VISIBILITY);
+
+  /* If this object has its own BVH, use it. */
+  if (has_bvh) {
+    RTCGeometry geom = rtcGetGeometry(kernel_data.device_bvh, local_object * 2);
+    if (geom) {
+      float3 P = ray->P;
+      float3 dir = ray->D;
+      float3 idir = ray->D;
+      bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir);
+
+      rtc_ray.org_x = P.x;
+      rtc_ray.org_y = P.y;
+      rtc_ray.org_z = P.z;
+      rtc_ray.dir_x = dir.x;
+      rtc_ray.dir_y = dir.y;
+      rtc_ray.dir_z = dir.z;
+      rtc_ray.tnear = ray->tmin;
+      rtc_ray.tfar = ray->tmax;
+      RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom);
+      kernel_assert(scene);
+      if (scene) {
+        rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray);
+      }
+    }
+  }
+  else {
+    rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
+  }
+
+  /* rtcOccluded1 sets tfar to -inf if a hit was found. */
+  return (local_isect && local_isect->num_hits > 0) || (rtc_ray.tfar < 0);
+}
+#endif
+
+#ifdef __SHADOW_RECORD_ALL__
+ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
+                                                             IntegratorShadowStateCPU *state,
+                                                             ccl_private const Ray *ray,
+                                                             uint visibility,
+                                                             uint max_hits,
+                                                             ccl_private uint *num_recorded_hits,
+                                                             ccl_private float *throughput)
+{
+  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
+  Intersection *isect_array = (Intersection *)state->shadow_isect;
+  ctx.isect_s = isect_array;
+  ctx.max_hits = max_hits;
+  ctx.ray = ray;
+  IntersectContext rtc_ctx(&ctx);
+  RTCRay rtc_ray;
+  kernel_embree_setup_ray(*ray, rtc_ray, visibility);
+  rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
+
+  *num_recorded_hits = ctx.num_recorded_hits;
+  *throughput = ctx.throughput;
+  return ctx.opaque_hit;
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg,
+                                                         ccl_private const Ray *ray,
+                                                         ccl_private Intersection *isect,
+                                                         const uint max_hits,
+                                                         const uint visibility)
+{
+  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
+  ctx.isect_s = isect;
+  ctx.max_hits = max_hits;
+  ctx.num_hits = 0;
+  ctx.ray = ray;
+  IntersectContext rtc_ctx(&ctx);
+  RTCRay rtc_ray;
+  kernel_embree_setup_ray(*ray, rtc_ray, visibility);
+  rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
+  return ctx.num_hits;
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h
index 3bfc37e98ee..1e3e790ca1f 100644
--- a/intern/cycles/kernel/device/cpu/compat.h
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#define __KERNEL_CPU__
-
 /* Release kernel has too much false-positive maybe-uninitialized warnings,
  * which makes it possible to miss actual warnings.
  */
@@ -35,38 +33,4 @@ CCL_NAMESPACE_BEGIN
 
 #define kernel_assert(cond) assert(cond)
 
-/* Macros to handle different memory storage on different devices */
-
-#ifdef __KERNEL_SSE2__
-typedef vector3<sseb> sse3b;
-typedef vector3<ssef> sse3f;
-typedef vector3<ssei> sse3i;
-
-ccl_device_inline void print_sse3b(const char *label, sse3b &a)
-{
-  print_sseb(label, a.x);
-  print_sseb(label, a.y);
-  print_sseb(label, a.z);
-}
-
-ccl_device_inline void print_sse3f(const char *label, sse3f &a)
-{
-  print_ssef(label, a.x);
-  print_ssef(label, a.y);
-  print_ssef(label, a.z);
-}
-
-ccl_device_inline void print_sse3i(const char *label, sse3i &a)
-{
-  print_ssei(label, a.x);
-  print_ssei(label, a.y);
-  print_ssei(label, a.z);
-}
-
-#  if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
-typedef vector3<avxf> avx3f;
-#  endif
-
-#endif
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
index 309afae412e..f7f1a36b2a7 100644
--- a/intern/cycles/kernel/device/cpu/globals.h
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -9,6 +9,8 @@
 #include "kernel/types.h"
 #include "kernel/util/profiling.h"
 
+#include "util/guiding.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -43,9 +45,20 @@ typedef struct KernelGlobalsCPU {
 #ifdef __OSL__
   /* On the CPU, we also have the OSL globals here. Most data structures are shared
    * with SVM, the difference is in the shaders and object/mesh attributes. */
-  OSLGlobals *osl;
-  OSLShadingSystem *osl_ss;
-  OSLThreadData *osl_tdata;
+  OSLGlobals *osl = nullptr;
+  OSLShadingSystem *osl_ss = nullptr;
+  OSLThreadData *osl_tdata = nullptr;
+#endif
+
+#ifdef __PATH_GUIDING__
+  /* Pointers to global data structures. */
+  openpgl::cpp::SampleStorage *opgl_sample_data_storage = nullptr;
+  openpgl::cpp::Field *opgl_guiding_field = nullptr;
+
+  /* Local data structures owned by the thread. */
+  openpgl::cpp::PathSegmentStorage *opgl_path_segment_storage = nullptr;
+  openpgl::cpp::SurfaceSamplingDistribution *opgl_surface_sampling_distribution = nullptr;
+  openpgl::cpp::VolumeSamplingDistribution *opgl_volume_sampling_distribution = nullptr;
 #endif
 
   /* **** Run-time data ****  */
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
index 0e5f7b4a2fd..0d7c06f4fc6 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -34,7 +34,7 @@
 #    include "kernel/integrator/megakernel.h"
 
 #    include "kernel/film/adaptive_sampling.h"
-#    include "kernel/film/id_passes.h"
+#    include "kernel/film/cryptomatte_passes.h"
 #    include "kernel/film/read.h"
 
 #    include "kernel/bake/bake.h"
@@ -169,7 +169,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
   STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
   return false;
 #else
-  return kernel_adaptive_sampling_convergence_check(
+  return film_adaptive_sampling_convergence_check(
       kg, render_buffer, x, y, threshold, reset, offset, stride);
 #endif
 }
@@ -185,7 +185,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCP
 #ifdef KERNEL_STUB
   STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
 #else
-  kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+  film_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
 #endif
 }
 
@@ -200,7 +200,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCP
 #ifdef KERNEL_STUB
   STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
 #else
-  kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+  film_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
 #endif
 }
 
@@ -215,7 +215,7 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *
 #ifdef KERNEL_STUB
   STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
 #else
-  kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+  film_cryptomatte_post(kg, render_buffer, pixel_index);
 #endif
 }
 
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index e1ab802aa80..d7d2000775f 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -526,7 +526,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   bool converged = true;
 
   if (x < sw && y < sh) {
-    converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+    converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check(
         nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
   }
 
@@ -553,7 +553,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   if (y < sh) {
     ccl_gpu_kernel_call(
-        kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
+        film_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
   }
 }
 ccl_gpu_kernel_postfix
@@ -572,7 +572,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   if (x < sw) {
     ccl_gpu_kernel_call(
-        kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
+        film_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
   }
 }
 ccl_gpu_kernel_postfix
@@ -589,7 +589,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const int pixel_index = ccl_gpu_global_id_x();
 
   if (pixel_index < num_pixels) {
-    ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
+    ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index));
   }
 }
 ccl_gpu_kernel_postfix
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index c1df49c4f49..38cdcb572eb 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -23,22 +23,6 @@ CCL_NAMESPACE_BEGIN
  * and keep device specific code in compat.h */
 
 #ifdef __KERNEL_ONEAPI__
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-template<typename IsActiveOp>
-void cpu_serial_active_index_array_impl(const uint num_states,
-                                        ccl_global int *ccl_restrict indices,
-                                        ccl_global int *ccl_restrict num_indices,
-                                        IsActiveOp is_active_op)
-{
-  int write_index = 0;
-  for (int state_index = 0; state_index < num_states; state_index++) {
-    if (is_active_op(state_index))
-      indices[write_index++] = state_index;
-  }
-  *num_indices = write_index;
-  return;
-}
-#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
 
 template<typename IsActiveOp>
 void gpu_parallel_active_index_array_impl(const uint num_states,
@@ -182,18 +166,11 @@ __device__
                                          num_simd_groups, \
                                          simdgroup_offset)
 #elif defined(__KERNEL_ONEAPI__)
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-#    define gpu_parallel_active_index_array( \
-        blocksize, num_states, indices, num_indices, is_active_op) \
-      if (ccl_gpu_global_size_x() == 1) \
-        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
-      else \
-        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
-#  else
-#    define gpu_parallel_active_index_array( \
-        blocksize, num_states, indices, num_indices, is_active_op) \
-      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
-#  endif
+
+#  define gpu_parallel_active_index_array( \
+      blocksize, num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+
 #else
 
 #  define gpu_parallel_active_index_array( \
diff --git a/intern/cycles/kernel/device/metal/bvh.h b/intern/cycles/kernel/device/metal/bvh.h
new file mode 100644
index 00000000000..03faa3f020f
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/bvh.h
@@ -0,0 +1,360 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+/* MetalRT implementation of ray-scene intersection. */
+
+#pragma once
+
+#include "kernel/bvh/types.h"
+#include "kernel/bvh/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Payload types. */
+
+struct MetalRTIntersectionPayload {
+  RaySelfPrimitives self;
+  uint visibility;
+  float u, v;
+  int prim;
+  int type;
+#if defined(__METALRT_MOTION__)
+  float time;
+#endif
+};
+
+struct MetalRTIntersectionLocalPayload {
+  RaySelfPrimitives self;
+  uint local_object;
+  uint lcg_state;
+  short max_hits;
+  bool has_lcg_state;
+  bool result;
+  LocalIntersection local_isect;
+};
+
+struct MetalRTIntersectionShadowPayload {
+  RaySelfPrimitives self;
+  uint visibility;
+#if defined(__METALRT_MOTION__)
+  float time;
+#endif
+  int state;
+  float throughput;
+  short max_hits;
+  short num_hits;
+  short num_recorded_hits;
+  bool result;
+};
+
+/* Scene intersection. */
+
+ccl_device_intersect bool scene_intersect(KernelGlobals kg,
+                                          ccl_private const Ray *ray,
+                                          const uint visibility,
+                                          ccl_private Intersection *isect)
+{
+  if (!intersection_ray_valid(ray)) {
+    isect->t = ray->tmax;
+    isect->type = PRIMITIVE_NONE;
+    return false;
+  }
+
+#if defined(__KERNEL_DEBUG__)
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    isect->t = ray->tmax;
+    isect->type = PRIMITIVE_NONE;
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
+    isect->t = ray->tmax;
+    isect->type = PRIMITIVE_NONE;
+    kernel_assert(!"Invalid ift_default");
+    return false;
+  }
+#endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  if (!kernel_data.bvh.have_curves) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionPayload payload;
+  payload.self = ray->self;
+  payload.u = 0.0f;
+  payload.v = 0.0f;
+  payload.visibility = visibility;
+
+  typename metalrt_intersector_type::result_type intersection;
+
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+    /* No further intersector setup required: Default MetalRT behavior is any-hit. */
+  }
+  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    /* No further intersector setup required: Shadow ray early termination is controlled by the
+     * intersection handler */
+  }
+
+#if defined(__METALRT_MOTION__)
+  payload.time = ray->time;
+  intersection = metalrt_intersect.intersect(r,
+                                             metal_ancillaries->accel_struct,
+                                             ray_mask,
+                                             ray->time,
+                                             metal_ancillaries->ift_default,
+                                             payload);
+#else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload);
+#endif
+
+  if (intersection.type == intersection_type::none) {
+    isect->t = ray->tmax;
+    isect->type = PRIMITIVE_NONE;
+
+    return false;
+  }
+
+  isect->t = intersection.distance;
+
+  isect->prim = payload.prim;
+  isect->type = payload.type;
+  isect->object = intersection.user_instance_id;
+
+  isect->t = intersection.distance;
+  if (intersection.type == intersection_type::triangle) {
+    isect->u = intersection.triangle_barycentric_coord.x;
+    isect->v = intersection.triangle_barycentric_coord.y;
+  }
+  else {
+    isect->u = payload.u;
+    isect->v = payload.v;
+  }
+
+  return isect->type != PRIMITIVE_NONE;
+}
+
+#ifdef __BVH_LOCAL__
+ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
+                                                ccl_private const Ray *ray,
+                                                ccl_private LocalIntersection *local_isect,
+                                                int local_object,
+                                                ccl_private uint *lcg_state,
+                                                int max_hits)
+{
+  if (!intersection_ray_valid(ray)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    return false;
+  }
+
+#  if defined(__KERNEL_DEBUG__)
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_local)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    kernel_assert(!"Invalid ift_local");
+    return false;
+  }
+#  endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  if (!kernel_data.bvh.have_curves) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionLocalPayload payload;
+  payload.self = ray->self;
+  payload.local_object = local_object;
+  payload.max_hits = max_hits;
+  payload.local_isect.num_hits = 0;
+  if (lcg_state) {
+    payload.has_lcg_state = true;
+    payload.lcg_state = *lcg_state;
+  }
+  payload.result = false;
+
+  typename metalrt_intersector_type::result_type intersection;
+
+#  if defined(__METALRT_MOTION__)
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
+#  else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
+#  endif
+
+  if (lcg_state) {
+    *lcg_state = payload.lcg_state;
+  }
+  *local_isect = payload.local_isect;
+
+  return payload.result;
+}
+#endif
+
+#ifdef __SHADOW_RECORD_ALL__
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
+                                                     IntegratorShadowState state,
+                                                     ccl_private const Ray *ray,
+                                                     uint visibility,
+                                                     uint max_hits,
+                                                     ccl_private uint *num_recorded_hits,
+                                                     ccl_private float *throughput)
+{
+  if (!intersection_ray_valid(ray)) {
+    return false;
+  }
+
+#  if defined(__KERNEL_DEBUG__)
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_shadow)) {
+    kernel_assert(!"Invalid ift_shadow");
+    return false;
+  }
+#  endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  if (!kernel_data.bvh.have_curves) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionShadowPayload payload;
+  payload.self = ray->self;
+  payload.visibility = visibility;
+  payload.max_hits = max_hits;
+  payload.num_hits = 0;
+  payload.num_recorded_hits = 0;
+  payload.throughput = 1.0f;
+  payload.result = false;
+  payload.state = state;
+
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  typename metalrt_intersector_type::result_type intersection;
+
+#  if defined(__METALRT_MOTION__)
+  payload.time = ray->time;
+  intersection = metalrt_intersect.intersect(r,
+                                             metal_ancillaries->accel_struct,
+                                             ray_mask,
+                                             ray->time,
+                                             metal_ancillaries->ift_shadow,
+                                             payload);
+#  else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_shadow, payload);
+#  endif
+
+  *num_recorded_hits = payload.num_recorded_hits;
+  *throughput = payload.throughput;
+
+  return payload.result;
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
+                                                 ccl_private const Ray *ray,
+                                                 ccl_private Intersection *isect,
+                                                 const uint visibility)
+{
+  if (!intersection_ray_valid(ray)) {
+    return false;
+  }
+
+#  if defined(__KERNEL_DEBUG__)
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
+    kernel_assert(!"Invalid ift_default");
+    return false;
+  }
+#  endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  if (!kernel_data.bvh.have_curves) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionPayload payload;
+  payload.self = ray->self;
+  payload.visibility = visibility;
+
+  typename metalrt_intersector_type::result_type intersection;
+
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+#  if defined(__METALRT_MOTION__)
+  payload.time = ray->time;
+  intersection = metalrt_intersect.intersect(r,
+                                             metal_ancillaries->accel_struct,
+                                             ray_mask,
+                                             ray->time,
+                                             metal_ancillaries->ift_default,
+                                             payload);
+#  else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload);
+#  endif
+
+  if (intersection.type == intersection_type::none) {
+    return false;
+  }
+
+  isect->prim = payload.prim;
+  isect->type = payload.type;
+  isect->object = intersection.user_instance_id;
+
+  isect->t = intersection.distance;
+  if (intersection.type == intersection_type::triangle) {
+    isect->u = intersection.triangle_barycentric_coord.x;
+    isect->v = intersection.triangle_barycentric_coord.y;
+  }
+  else {
+    isect->u = payload.u;
+    isect->v = payload.v;
+  }
+
+  return isect->type != PRIMITIVE_NONE;
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 0ed52074a90..f689e93e5a2 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -29,24 +29,13 @@ using namespace metal::raytracing;
 
 /* Qualifiers */
 
+#define ccl_device
+#define ccl_device_inline ccl_device __attribute__((always_inline))
+#define ccl_device_forceinline ccl_device __attribute__((always_inline))
 #if defined(__KERNEL_METAL_APPLE__)
-
-/* Inline everything for Apple GPUs.
- * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
- * at the cost of longer compile times (~4.5 minutes on M1 Max). */
-
-#  define ccl_device __attribute__((always_inline))
-#  define ccl_device_inline __attribute__((always_inline))
-#  define ccl_device_forceinline __attribute__((always_inline))
-#  define ccl_device_noinline __attribute__((always_inline))
-
+#  define ccl_device_noinline ccl_device
 #else
-
-#  define ccl_device
-#  define ccl_device_inline ccl_device
-#  define ccl_device_forceinline ccl_device
 #  define ccl_device_noinline ccl_device __attribute__((noinline))
-
 #endif
 
 #define ccl_device_noinline_cpu ccl_device
@@ -189,35 +178,46 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
   } volume_write_lambda_pass{kg, this, state};
 
 /* make_type definitions with Metal style element initializers */
-#ifdef make_float2
-#  undef make_float2
-#endif
-#ifdef make_float3
-#  undef make_float3
-#endif
-#ifdef make_float4
-#  undef make_float4
-#endif
-#ifdef make_int2
-#  undef make_int2
-#endif
-#ifdef make_int3
-#  undef make_int3
-#endif
-#ifdef make_int4
-#  undef make_int4
-#endif
-#ifdef make_uchar4
-#  undef make_uchar4
-#endif
-
-#define make_float2(x, y) float2(x, y)
-#define make_float3(x, y, z) float3(x, y, z)
-#define make_float4(x, y, z, w) float4(x, y, z, w)
-#define make_int2(x, y) int2(x, y)
-#define make_int3(x, y, z) int3(x, y, z)
-#define make_int4(x, y, z, w) int4(x, y, z, w)
-#define make_uchar4(x, y, z, w) uchar4(x, y, z, w)
+ccl_device_forceinline float2 make_float2(const float x, const float y)
+{
+  return float2(x, y);
+}
+
+ccl_device_forceinline float3 make_float3(const float x, const float y, const float z)
+{
+  return float3(x, y, z);
+}
+
+ccl_device_forceinline float4 make_float4(const float x,
+                                          const float y,
+                                          const float z,
+                                          const float w)
+{
+  return float4(x, y, z, w);
+}
+
+ccl_device_forceinline int2 make_int2(const int x, const int y)
+{
+  return int2(x, y);
+}
+
+ccl_device_forceinline int3 make_int3(const int x, const int y, const int z)
+{
+  return int3(x, y, z);
+}
+
+ccl_device_forceinline int4 make_int4(const int x, const int y, const int z, const int w)
+{
+  return int4(x, y, z, w);
+}
+
+ccl_device_forceinline uchar4 make_uchar4(const uchar x,
+                                          const uchar y,
+                                          const uchar z,
+                                          const uchar w)
+{
+  return uchar4(x, y, z, w);
+}
 
 /* Math functions */
 
@@ -260,8 +260,6 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
 
 #ifdef __METALRT__
 
-#  define __KERNEL_GPU_RAYTRACING__
-
 #  if defined(__METALRT_MOTION__)
 #    define METALRT_TAGS instancing, instance_motion, primitive_motion
 #  else
diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h
index 99cb1e3826e..e75ec9cadec 100644
--- a/intern/cycles/kernel/device/metal/context_begin.h
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -34,21 +34,48 @@ class MetalKernelContext {
       kernel_assert(0);
       return 0;
     }
-
+    
+#ifdef __KERNEL_METAL_INTEL__
+    template<typename TextureType, typename CoordsType>
+    inline __attribute__((__always_inline__))
+    auto ccl_gpu_tex_object_read_intel_workaround(TextureType texture_array,
+                                                  const uint tid, const uint sid,
+                                                  CoordsType coords) const
+    {
+      switch(sid) {
+        default:
+        case 0: return texture_array[tid].tex.sample(sampler(address::repeat, filter::nearest), coords);
+        case 1: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::nearest), coords);
+        case 2: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::nearest), coords);
+        case 3: return texture_array[tid].tex.sample(sampler(address::repeat, filter::linear), coords);
+        case 4: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::linear), coords);
+        case 5: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::linear), coords);
+      }
+    }
+#endif
+    
     // texture2d
     template<>
     inline __attribute__((__always_inline__))
     float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const {
       const uint tid(tex);
       const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
       return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+#else
+      return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y));
+#endif
     }
     template<>
     inline __attribute__((__always_inline__))
     float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const {
       const uint tid(tex);
       const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
       return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+#else
+      return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y)).x;
+#endif
     }
 
     // texture3d
@@ -57,14 +84,22 @@ class MetalKernelContext {
     float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const {
       const uint tid(tex);
       const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
       return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
+#else
+      return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z));
+#endif
     }
     template<>
     inline __attribute__((__always_inline__))
     float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const {
       const uint tid(tex);
       const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
       return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
+#else
+      return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z)).x;
+#endif
     }
 #    include "kernel/device/gpu/image.h"
 
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
index 764c26dbe8f..8b69ee025cd 100644
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -1,41 +1,44 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2021-2022 Blender Foundation */
 
-/* Metal kernel entry points */
+/* Metal kernel entry points. */
 
 #include "kernel/device/metal/compat.h"
 #include "kernel/device/metal/globals.h"
 #include "kernel/device/metal/function_constants.h"
 #include "kernel/device/gpu/kernel.h"
 
-/* MetalRT intersection handlers */
+/* MetalRT intersection handlers. */
+
 #ifdef __METALRT__
 
-/* Return type for a bounding box intersection function. */
-struct BoundingBoxIntersectionResult
-{
+/* Intersection return types. */
+
+/* For a bounding box intersection function. */
+struct BoundingBoxIntersectionResult {
   bool accept [[accept_intersection]];
   bool continue_search [[continue_search]];
   float distance [[distance]];
 };
 
-/* Return type for a triangle intersection function. */
-struct TriangleIntersectionResult
-{
+/* For a triangle intersection function. */
+struct TriangleIntersectionResult {
   bool accept [[accept_intersection]];
-  bool continue_search  [[continue_search]];
+  bool continue_search [[continue_search]];
 };
 
 enum { METALRT_HIT_TRIANGLE, METALRT_HIT_BOUNDING_BOX };
 
-ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives& self,
+/* Utilities. */
+
+ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives &self,
                                               const int object,
                                               const int prim)
 {
   return (self.prim == prim) && (self.object == object);
 }
 
-ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives& self,
+ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives &self,
                                                      const int object,
                                                      const int prim)
 {
@@ -43,12 +46,14 @@ ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimi
          ((self.light_prim == prim) && (self.light_object == object));
 }
 
-ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives& self,
+ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives &self,
                                                     const int prim)
 {
   return (self.prim == prim);
 }
 
+/* Hit functions. */
+
 template<typename TReturn, uint intersection_type>
 TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
                           ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload,
@@ -58,7 +63,7 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
                           const float ray_tmax)
 {
   TReturn result;
-  
+
 #ifdef __BVH_LOCAL__
   uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
 
@@ -101,7 +106,8 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
   }
   else {
     if (payload.local_isect.num_hits && ray_tmax > payload.local_isect.hits[0].t) {
-      /* Record closest intersection only. Do not terminate ray here, since there is no guarantee about distance ordering in any-hit */
+      /* Record closest intersection only. Do not terminate ray here, since there is no guarantee
+       * about distance ordering in any-hit */
       result.accept = false;
       result.continue_search = true;
       return result;
@@ -116,8 +122,8 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
   isect->object = object;
   isect->type = kernel_data_fetch(objects, object).primitive_type;
 
-  isect->u = 1.0f - barycentrics.y - barycentrics.x;
-  isect->v = barycentrics.x;
+  isect->u = barycentrics.x;
+  isect->v = barycentrics.y;
 
   /* Record geometric normal */
   const uint tri_vindex = kernel_data_fetch(tri_vindex, isect->prim).w;
@@ -133,21 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
 #endif
 }
 
-[[intersection(triangle, triangle_data, METALRT_TAGS)]]
-TriangleIntersectionResult
-__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                       ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
-                                       uint instance_id [[user_instance_id]],
-                                       uint primitive_id [[primitive_id]],
-                                       float2 barycentrics [[barycentric_coord]],
-                                       float ray_tmax [[distance]])
+[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_local_hit_tri(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
+    uint instance_id [[user_instance_id]],
+    uint primitive_id [[primitive_id]],
+    float2 barycentrics [[barycentric_coord]],
+    float ray_tmax [[distance]])
 {
   return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
-            launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax);
+      launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax);
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
 {
   /* unused function */
@@ -175,23 +180,14 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   }
 #  endif
 
-  if (intersection_skip_self_shadow(payload.self, object, prim)) {
-    /* continue search */
-    return true;
-  }
-
-  float u = 0.0f, v = 0.0f;
+  const float u = barycentrics.x;
+  const float v = barycentrics.y;
   int type = 0;
   if (intersection_type == METALRT_HIT_TRIANGLE) {
-    u = 1.0f - barycentrics.y - barycentrics.x;
-    v = barycentrics.x;
     type = kernel_data_fetch(objects, object).primitive_type;
   }
 #  ifdef __HAIR__
   else {
-    u = barycentrics.x;
-    v = barycentrics.y;
-    
     const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
     type = segment.type;
     prim = segment.prim;
@@ -204,6 +200,11 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   }
 #  endif
 
+  if (intersection_skip_self_shadow(payload.self, object, prim)) {
+    /* continue search */
+    return true;
+  }
+
 #  ifndef __TRANSPARENT_SHADOWS__
   /* No transparent shadows support compiled in, make opaque. */
   payload.result = true;
@@ -215,7 +216,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   short num_recorded_hits = payload.num_recorded_hits;
 
   MetalKernelContext context(launch_params_metal);
-  
+
   /* If no transparent shadows, all light is blocked and we can stop immediately. */
   if (num_hits >= max_hits ||
       !(context.intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
@@ -223,11 +224,11 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
     /* terminate ray */
     return false;
   }
-  
+
   /* Always use baked shadow transparency for curves. */
   if (type & PRIMITIVE_CURVE) {
     float throughput = payload.throughput;
-    throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u);
+    throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, type, u);
     payload.throughput = throughput;
     payload.num_hits += 1;
 
@@ -240,10 +241,10 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
       return true;
     }
   }
-  
+
   payload.num_hits += 1;
   payload.num_recorded_hits += 1;
-  
+
   uint record_index = num_recorded_hits;
 
   const IntegratorShadowState state = payload.state;
@@ -278,7 +279,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
-  
+
   /* Continue tracing. */
 #  endif /* __TRANSPARENT_SHADOWS__ */
 #endif   /* __SHADOW_RECORD_ALL__ */
@@ -286,26 +287,25 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   return true;
 }
 
-[[intersection(triangle, triangle_data, METALRT_TAGS)]]
-TriangleIntersectionResult
-__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                            ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                            unsigned int object [[user_instance_id]],
-                                            unsigned int primitive_id [[primitive_id]],
-                                            float2 barycentrics [[barycentric_coord]],
-                                            float ray_tmax [[distance]])
+[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_shadow_all_hit_tri(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+    unsigned int object [[user_instance_id]],
+    unsigned int primitive_id [[primitive_id]],
+    float2 barycentrics [[barycentric_coord]],
+    float ray_tmax [[distance]])
 {
   uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
 
   TriangleIntersectionResult result;
   result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_TRIANGLE>(
-            launch_params_metal, payload, object, prim, barycentrics, ray_tmax);
+      launch_params_metal, payload, object, prim, barycentrics, ray_tmax);
   result.accept = !result.continue_search;
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
 {
   /* unused function */
@@ -317,15 +317,16 @@ __anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]
 }
 
 template<typename TReturnType, uint intersection_type>
-inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_params_metal,
-                                           ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
-                                           const uint object,
-                                           const uint prim,
-                                           const float u)
+inline TReturnType metalrt_visibility_test(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+    const uint object,
+    uint prim,
+    const float u)
 {
   TReturnType result;
-    
-#  ifdef __HAIR__
+
+#ifdef __HAIR__
   if (intersection_type == METALRT_HIT_BOUNDING_BOX) {
     /* Filter out curve endcaps. */
     if (u == 0.0f || u == 1.0f) {
@@ -334,15 +335,23 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
       return result;
     }
   }
-#  endif
+#endif
 
   uint visibility = payload.visibility;
-#  ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
   if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
     result.accept = false;
     result.continue_search = true;
     return result;
   }
+#endif
+
+  if (intersection_type == METALRT_HIT_TRIANGLE) {
+  }
+#  ifdef __HAIR__
+  else {
+    prim = kernel_data_fetch(curve_segments, prim).prim;
+  }
 #  endif
 
   /* Shadow ray early termination. */
@@ -371,16 +380,17 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
   return result;
 }
 
-[[intersection(triangle, triangle_data, METALRT_TAGS)]]
-TriangleIntersectionResult
-__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
-                                             unsigned int object [[user_instance_id]],
-                                             unsigned int primitive_id [[primitive_id]])
+[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_visibility_test_tri(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+    unsigned int object [[user_instance_id]],
+    unsigned int primitive_id [[primitive_id]])
 {
   uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
-  TriangleIntersectionResult result = metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
-            launch_params_metal, payload, object, prim, 0.0f);
+  TriangleIntersectionResult result =
+      metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
+          launch_params_metal, payload, object, prim, 0.0f);
   if (result.accept) {
     payload.prim = prim;
     payload.type = kernel_data_fetch(objects, object).primitive_type;
@@ -388,8 +398,7 @@ __anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
 {
   /* Unused function */
@@ -400,19 +409,21 @@ __anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance
   return result;
 }
 
+/* Primitive intersection functions. */
+
 #ifdef __HAIR__
-ccl_device_inline
-void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
-                                ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
-                                const uint object,
-                                const uint prim,
-                                const uint type,
-                                const float3 ray_origin,
-                                const float3 ray_direction,
-                                float time,
-                                const float ray_tmin,
-                                const float ray_tmax,
-                                thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_curve(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
 #  ifdef __VISIBILITY_FLAG__
   const uint visibility = payload.visibility;
@@ -421,25 +432,16 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
   }
 #  endif
 
-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the curve intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
   Intersection isect;
   isect.t = ray_tmax;
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
 
   MetalKernelContext context(launch_params_metal);
-  if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
+  if (context.curve_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
     result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
-                  launch_params_metal, payload, object, prim, isect.u);
+        launch_params_metal, payload, object, prim, isect.u);
     if (result.accept) {
-      result.distance = isect.t / len;
+      result.distance = isect.t;
       payload.u = isect.u;
       payload.v = isect.v;
       payload.prim = prim;
@@ -448,54 +450,41 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
   }
 }
 
-ccl_device_inline
-void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params_metal,
-                                       ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
-                                       const uint object,
-                                       const uint prim,
-                                       const uint type,
-                                       const float3 ray_origin,
-                                       const float3 ray_direction,
-                                       float time,
-                                       const float ray_tmin,
-                                       const float ray_tmax,
-                                       thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_curve_shadow(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
   const uint visibility = payload.visibility;
 
-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the curve intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
   Intersection isect;
   isect.t = ray_tmax;
-  /* Transform maximum distance into object space */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
 
   MetalKernelContext context(launch_params_metal);
-  if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
+  if (context.curve_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
     result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
-                launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
+        launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
     result.accept = !result.continue_search;
-
-    if (result.accept) {
-      result.distance = isect.t / len;
-    }
   }
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload
+                             [[payload]],
                              const uint object [[user_instance_id]],
                              const uint primitive_id [[primitive_id]],
-                             const float3 ray_origin [[origin]],
-                             const float3 ray_direction [[direction]],
+                             const float3 ray_P [[origin]],
+                             const float3 ray_D [[direction]],
                              const float ray_tmin [[min_distance]],
                              const float ray_tmax [[max_distance]])
 {
@@ -508,28 +497,36 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b
   result.distance = ray_tmax;
 
   if (segment.type & PRIMITIVE_CURVE_RIBBON) {
-    metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+    metalrt_intersection_curve(launch_params_metal,
+                               payload,
+                               object,
+                               segment.prim,
+                               segment.type,
+                               ray_P,
+                               ray_D,
 #  if defined(__METALRT_MOTION__)
                                payload.time,
 #  else
                                0.0f,
 #  endif
-                               ray_tmin, ray_tmax, result);
+                               ray_tmin,
+                               ray_tmax,
+                               result);
   }
 
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
-__intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                    const uint object [[user_instance_id]],
-                                    const uint primitive_id [[primitive_id]],
-                                    const float3 ray_origin [[origin]],
-                                    const float3 ray_direction [[direction]],
-                                    const float ray_tmin [[min_distance]],
-                                    const float ray_tmax [[max_distance]])
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
+__intersection__curve_ribbon_shadow(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+    const uint object [[user_instance_id]],
+    const uint primitive_id [[primitive_id]],
+    const float3 ray_P [[origin]],
+    const float3 ray_D [[direction]],
+    const float ray_tmin [[min_distance]],
+    const float ray_tmax [[max_distance]])
 {
   uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
   const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
@@ -540,57 +537,73 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me
   result.distance = ray_tmax;
 
   if (segment.type & PRIMITIVE_CURVE_RIBBON) {
-    metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+    metalrt_intersection_curve_shadow(launch_params_metal,
+                                      payload,
+                                      object,
+                                      segment.prim,
+                                      segment.type,
+                                      ray_P,
+                                      ray_D,
 #  if defined(__METALRT_MOTION__)
-                               payload.time,
+                                      payload.time,
 #  else
-                               0.0f,
+                                      0.0f,
 #  endif
-                               ray_tmin, ray_tmax, result);
+                                      ray_tmin,
+                                      ray_tmax,
+                                      result);
   }
 
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                          ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                          ray_data MetalKernelContext::MetalRTIntersectionPayload &payload
+                          [[payload]],
                           const uint object [[user_instance_id]],
                           const uint primitive_id [[primitive_id]],
-                          const float3 ray_origin [[origin]],
-                          const float3 ray_direction [[direction]],
+                          const float3 ray_P [[origin]],
+                          const float3 ray_D [[direction]],
                           const float ray_tmin [[min_distance]],
                           const float ray_tmax [[max_distance]])
 {
   uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
   const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
-    
+
   BoundingBoxIntersectionResult result;
   result.accept = false;
   result.continue_search = true;
   result.distance = ray_tmax;
-  metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+  metalrt_intersection_curve(launch_params_metal,
+                             payload,
+                             object,
+                             segment.prim,
+                             segment.type,
+                             ray_P,
+                             ray_D,
 #  if defined(__METALRT_MOTION__)
                              payload.time,
 #  else
                              0.0f,
 #  endif
-                             ray_tmin, ray_tmax, result);
+                             ray_tmin,
+                             ray_tmax,
+                             result);
 
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
-__intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                 ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                 const uint object [[user_instance_id]],
-                                 const uint primitive_id [[primitive_id]],
-                                 const float3 ray_origin [[origin]],
-                                 const float3 ray_direction [[direction]],
-                                 const float ray_tmin [[min_distance]],
-                                 const float ray_tmax [[max_distance]])
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
+__intersection__curve_all_shadow(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+    const uint object [[user_instance_id]],
+    const uint primitive_id [[primitive_id]],
+    const float3 ray_P [[origin]],
+    const float3 ray_D [[direction]],
+    const float ray_tmin [[min_distance]],
+    const float ray_tmax [[max_distance]])
 {
   uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
   const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
@@ -600,31 +613,39 @@ __intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal
   result.continue_search = true;
   result.distance = ray_tmax;
 
-  metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+  metalrt_intersection_curve_shadow(launch_params_metal,
+                                    payload,
+                                    object,
+                                    segment.prim,
+                                    segment.type,
+                                    ray_P,
+                                    ray_D,
 #  if defined(__METALRT_MOTION__)
-                             payload.time,
+                                    payload.time,
 #  else
-                             0.0f,
+                                    0.0f,
 #  endif
-                             ray_tmin, ray_tmax, result);
+                                    ray_tmin,
+                                    ray_tmax,
+                                    result);
 
   return result;
 }
 #endif /* __HAIR__ */
 
 #ifdef __POINTCLOUD__
-ccl_device_inline
-void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
-                                ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
-                                const uint object,
-                                const uint prim,
-                                const uint type,
-                                const float3 ray_origin,
-                                const float3 ray_direction,
-                                float time,
-                                const float ray_tmin,
-                                const float ray_tmax,
-                                thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_point(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
 #  ifdef __VISIBILITY_FLAG__
   const uint visibility = payload.visibility;
@@ -633,25 +654,16 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
   }
 #  endif
 
-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the point intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
   Intersection isect;
   isect.t = ray_tmax;
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
 
   MetalKernelContext context(launch_params_metal);
-  if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
+  if (context.point_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
     result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
-                  launch_params_metal, payload, object, prim, isect.u);
+        launch_params_metal, payload, object, prim, isect.u);
     if (result.accept) {
-      result.distance = isect.t / len;
+      result.distance = isect.t;
       payload.u = isect.u;
       payload.v = isect.v;
       payload.prim = prim;
@@ -660,56 +672,46 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
   }
 }
 
-ccl_device_inline
-void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params_metal,
-                                       ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
-                                       const uint object,
-                                       const uint prim,
-                                       const uint type,
-                                       const float3 ray_origin,
-                                       const float3 ray_direction,
-                                       float time,
-                                       const float ray_tmin,
-                                       const float ray_tmax,
-                                       thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_point_shadow(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
   const uint visibility = payload.visibility;
 
-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the point intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
   Intersection isect;
   isect.t = ray_tmax;
-  /* Transform maximum distance into object space */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
 
   MetalKernelContext context(launch_params_metal);
-  if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
+  if (context.point_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
     result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
-                launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
+        launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
     result.accept = !result.continue_search;
 
     if (result.accept) {
-      result.distance = isect.t / len;
+      result.distance = isect.t;
     }
   }
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
-                             const uint object [[user_instance_id]],
-                             const uint primitive_id [[primitive_id]],
-                             const float3 ray_origin [[origin]],
-                             const float3 ray_direction [[direction]],
-                             const float ray_tmin [[min_distance]],
-                             const float ray_tmax [[max_distance]])
+                      ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                      const uint object [[user_instance_id]],
+                      const uint primitive_id [[primitive_id]],
+                      const float3 ray_origin [[origin]],
+                      const float3 ray_direction [[direction]],
+                      const float ray_tmin [[min_distance]],
+                      const float ray_tmax [[max_distance]])
 {
   const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
   const int type = kernel_data_fetch(objects, object).primitive_type;
@@ -719,27 +721,35 @@ __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1
   result.continue_search = true;
   result.distance = ray_tmax;
 
-  metalrt_intersection_point(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
+  metalrt_intersection_point(launch_params_metal,
+                             payload,
+                             object,
+                             prim,
+                             type,
+                             ray_origin,
+                             ray_direction,
 #  if defined(__METALRT_MOTION__)
                              payload.time,
 #  else
                              0.0f,
 #  endif
-                             ray_tmin, ray_tmax, result);
+                             ray_tmin,
+                             ray_tmax,
+                             result);
 
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                    const uint object [[user_instance_id]],
-                                    const uint primitive_id [[primitive_id]],
-                                    const float3 ray_origin [[origin]],
-                                    const float3 ray_direction [[direction]],
-                                    const float ray_tmin [[min_distance]],
-                                    const float ray_tmax [[max_distance]])
+                             ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload
+                             [[payload]],
+                             const uint object [[user_instance_id]],
+                             const uint primitive_id [[primitive_id]],
+                             const float3 ray_origin [[origin]],
+                             const float3 ray_direction [[direction]],
+                             const float ray_tmin [[min_distance]],
+                             const float ray_tmax [[max_distance]])
 {
   const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
   const int type = kernel_data_fetch(objects, object).primitive_type;
@@ -749,13 +759,21 @@ __intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[b
   result.continue_search = true;
   result.distance = ray_tmax;
 
-  metalrt_intersection_point_shadow(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
+  metalrt_intersection_point_shadow(launch_params_metal,
+                                    payload,
+                                    object,
+                                    prim,
+                                    type,
+                                    ray_origin,
+                                    ray_direction,
 #  if defined(__METALRT_MOTION__)
-                             payload.time,
+                                    payload.time,
 #  else
-                             0.0f,
+                                    0.0f,
 #  endif
-                             ray_tmin, ray_tmax, result);
+                                    ray_tmin,
+                                    ray_tmax,
+                                    result);
 
   return result;
 }
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index 1b25259bcf5..dfaec65130c 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -10,6 +10,7 @@
 #define CCL_NAMESPACE_END
 
 #include <cstdint>
+#include <math.h>
 
 #ifndef __NODES_MAX_GROUP__
 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
@@ -30,7 +31,7 @@
 #define ccl_global
 #define ccl_always_inline __attribute__((always_inline))
 #define ccl_device_inline inline
-#define ccl_noinline
+#define ccl_noinline __attribute__((noinline))
 #define ccl_inline_constant const constexpr
 #define ccl_static_constant const
 #define ccl_device_forceinline __attribute__((always_inline))
@@ -54,18 +55,6 @@
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers)
 #define ccl_gpu_kernel_threads(block_num_threads)
 
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define KG_ND_ITEMS \
-  kg->nd_item_local_id_0 = item.get_local_id(0); \
-  kg->nd_item_local_range_0 = item.get_local_range(0); \
-  kg->nd_item_group_0 = item.get_group(0); \
-  kg->nd_item_group_range_0 = item.get_group_range(0); \
-  kg->nd_item_global_id_0 = item.get_global_id(0); \
-  kg->nd_item_global_range_0 = item.get_global_range(0);
-#else
-# define KG_ND_ITEMS
-#endif
-
 #define ccl_gpu_kernel_signature(name, ...) \
 void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
                           size_t kernel_global_size, \
@@ -75,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
       (kg); \
       cgh.parallel_for<class kernel_##name>( \
           sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
-          [=](sycl::nd_item<1> item) { \
-            KG_ND_ITEMS
+          [=](sycl::nd_item<1> item) {
 
 #define ccl_gpu_kernel_postfix \
           }); \
@@ -94,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
   } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
 
 /* GPU thread, block, grid size and index */
-#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
-#  define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
-#  define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
-#  define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
-#  define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
-#else
-#  define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
-#  define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
-#  define ccl_gpu_block_idx_x (kg->nd_item_group_0)
-#  define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
-#  define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
-#endif
+#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
 
+#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
+#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
 
 /* GPU warp synchronization */
-
 #define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
 #define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
 #ifdef __SYCL_DEVICE_ONLY__
@@ -149,25 +123,13 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
 /* clang-format on */
 
 /* Types */
+
 /* It's not possible to use sycl types like sycl::float3, sycl::int3, etc
- * because these types have different interfaces from blender version */
+ * because these types have different interfaces from blender version. */
 
 using uchar = unsigned char;
 using sycl::half;
 
-struct float3 {
-  float x, y, z;
-};
-
-ccl_always_inline float3 make_float3(float x, float y, float z)
-{
-  return {x, y, z};
-}
-ccl_always_inline float3 make_float3(float x)
-{
-  return {x, x, x};
-}
-
 /* math functions */
 #define fabsf(x) sycl::fabs((x))
 #define copysignf(x, y) sycl::copysign((x), (y))
@@ -186,21 +148,15 @@ ccl_always_inline float3 make_float3(float x)
 #define fmodf(x, y) sycl::fmod((x), (y))
 #define lgammaf(x) sycl::lgamma((x))
 
-#define __forceinline __attribute__((always_inline))
-
-/* Types */
-#include "util/half.h"
-#include "util/types.h"
-
-/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they
- * include oneAPI headers, which transitively include math.h headers which will cause redefinitions
- * of the math defines because math.h also uses them and having them defined before math.h include
- * is actually UB. */
-/* Use fast math functions - get them from sycl::native namespace for native math function
- * implementations */
 #define cosf(x) sycl::native::cos(((float)(x)))
 #define sinf(x) sycl::native::sin(((float)(x)))
 #define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y)))
 #define tanf(x) sycl::native::tan(((float)(x)))
 #define logf(x) sycl::native::log(((float)(x)))
 #define expf(x) sycl::native::exp(((float)(x)))
+
+#define __forceinline __attribute__((always_inline))
+
+/* Types */
+#include "util/half.h"
+#include "util/types.h"
diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
deleted file mode 100644
index 662068c0fed..00000000000
--- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2022 Intel Corporation */
-
-/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */
-DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
-DLL_INTERFACE_CALL(oneapi_free, void, void *)
-DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
-
-DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
-DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
-
-DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index)
-DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue)
-DLL_INTERFACE_CALL(
-    oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment)
-DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size)
-DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr)
-
-DLL_INTERFACE_CALL(
-    oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes)
-DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_usm_memset,
-                   bool,
-                   SyclQueue *queue,
-                   void *usm_ptr,
-                   unsigned char value,
-                   size_t num_bytes)
-
-DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue)
-
-/* Operation with Kernel globals structure - map of global/constant allocation - filled before
- * render/kernel execution As we don't know in cycles `sizeof` this - Cycles will manage just as
- * pointer. */
-DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size)
-DLL_INTERFACE_CALL(oneapi_set_global_memory,
-                   void,
-                   SyclQueue *queue,
-                   void *kernel_globals,
-                   const char *memory_name,
-                   void *memory_device_pointer)
-
-DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size,
-                   size_t,
-                   SyclQueue *queue,
-                   const DeviceKernel kernel,
-                   const size_t kernel_global_size)
-DLL_INTERFACE_CALL(oneapi_enqueue_kernel,
-                   bool,
-                   KernelContext *context,
-                   int kernel,
-                   size_t global_size,
-                   void **args)
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
index d60f4f135ba..116620eb725 100644
--- a/intern/cycles/kernel/device/oneapi/globals.h
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU {
 #undef KERNEL_DATA_ARRAY
   IntegratorStateGPU *integrator_state;
   const KernelData *__data;
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  size_t nd_item_local_id_0;
-  size_t nd_item_local_range_0;
-  size_t nd_item_group_0;
-  size_t nd_item_group_range_0;
-
-  size_t nd_item_global_id_0;
-  size_t nd_item_global_range_0;
-#endif
 } KernelGlobalsGPU;
 
 typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h
index 6681977a675..2417b8eac3b 100644
--- a/intern/cycles/kernel/device/oneapi/image.h
+++ b/intern/cycles/kernel/device/oneapi/image.h
@@ -81,10 +81,15 @@ ccl_device_inline float4 svm_image_texture_read_2d(int id, int x, int y)
     x = svm_image_texture_wrap_periodic(x, info.width);
     y = svm_image_texture_wrap_periodic(y, info.height);
   }
-  else {
+  else if (info.extension == EXTENSION_EXTEND) {
     x = svm_image_texture_wrap_clamp(x, info.width);
     y = svm_image_texture_wrap_clamp(y, info.height);
   }
+  else {
+    if (x < 0 || x >= info.width || y < 0 || y >= info.height) {
+      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    }
+  }
 
   return svm_image_texture_read(info, x, y, 0);
 }
@@ -99,11 +104,16 @@ ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z)
     y = svm_image_texture_wrap_periodic(y, info.height);
     z = svm_image_texture_wrap_periodic(z, info.depth);
   }
-  else {
+  else if (info.extension == EXTENSION_EXTEND) {
     x = svm_image_texture_wrap_clamp(x, info.width);
     y = svm_image_texture_wrap_clamp(y, info.height);
     z = svm_image_texture_wrap_clamp(z, info.depth);
   }
+  else {
+    if (x < 0 || x >= info.width || y < 0 || y >= info.height || z < 0 || z >= info.depth) {
+      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    }
+  }
 
   return svm_image_texture_read(info, x, y, z);
 }
@@ -128,12 +138,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float
 {
   const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
-  if (info.extension == EXTENSION_CLIP) {
-    if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-    }
-  }
-
   if (info.interpolation == INTERPOLATION_CLOSEST) {
     /* Closest interpolation. */
     int ix, iy;
@@ -315,12 +319,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, in
   }
 #endif
   else {
-    if (info.extension == EXTENSION_CLIP) {
-      if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
-        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-      }
-    }
-
     x *= info.width;
     y *= info.height;
     z *= info.depth;
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index 300e201600c..525ae288f0c 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -3,208 +3,79 @@
 
 #ifdef WITH_ONEAPI
 
-/* clang-format off */
 #  include "kernel.h"
 #  include <iostream>
 #  include <map>
 #  include <set>
 
-#  include <CL/sycl.hpp>
+#  include <sycl/sycl.hpp>
 
 #  include "kernel/device/oneapi/compat.h"
 #  include "kernel/device/oneapi/globals.h"
 #  include "kernel/device/oneapi/kernel_templates.h"
 
 #  include "kernel/device/gpu/kernel.h"
-/* clang-format on */
 
 static OneAPIErrorCallback s_error_cb = nullptr;
 static void *s_error_user_ptr = nullptr;
 
-static std::vector<sycl::device> oneapi_available_devices();
-
 void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
 {
   s_error_cb = cb;
   s_error_user_ptr = user_ptr;
 }
 
-void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
-{
-#  ifdef _DEBUG
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  sycl::info::device_type device_type =
-      queue->get_device().get_info<sycl::info::device::device_type>();
-  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
-  (void)usm_type;
-  assert(usm_type == sycl::usm::alloc::device ||
-         ((device_type == sycl::info::device_type::host ||
-           device_type == sycl::info::device_type::is_cpu || allow_host) &&
-          usm_type == sycl::usm::alloc::host));
-#  endif
-}
-
-bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
-{
-  bool finished_correct = true;
-  try {
-    std::vector<sycl::device> devices = oneapi_available_devices();
-    if (device_index < 0 || device_index >= devices.size()) {
-      return false;
-    }
-    sycl::queue *created_queue = new sycl::queue(devices[device_index],
-                                                 sycl::property::queue::in_order());
-    external_queue = reinterpret_cast<SyclQueue *>(created_queue);
-  }
-  catch (sycl::exception const &e) {
-    finished_correct = false;
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-  }
-  return finished_correct;
-}
-
-void oneapi_free_queue(SyclQueue *queue_)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  delete queue;
-}
-
-void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  return sycl::aligned_alloc_host(alignment, memory_size, *queue);
-}
-
-void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  return sycl::malloc_device(memory_size, *queue);
-}
-
-void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
+/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like
+ * memory allocations, memory transfers and execution of kernel with USM memory. */
+bool oneapi_run_test_kernel(SyclQueue *queue_)
 {
   assert(queue_);
   sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  oneapi_check_usm(queue_, usm_ptr, true);
-  sycl::free(usm_ptr, *queue);
-}
+  const size_t N = 8;
+  const size_t memory_byte_size = sizeof(int) * N;
 
-bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  oneapi_check_usm(queue_, dest, true);
-  oneapi_check_usm(queue_, src, true);
-  sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
-#  ifdef WITH_CYCLES_DEBUG
+  bool is_computation_correct = true;
   try {
-    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
-     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
-     */
-    mem_event.wait_and_throw();
-    return true;
-  }
-  catch (sycl::exception const &e) {
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-    return false;
-  }
-#  else
-  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
-  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
-  bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
-                             src_type == sycl::usm::alloc::device;
-  bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
-                                          src_type == sycl::usm::alloc::unknown;
-  /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
-   * may not wait until the end of the transfer before using the memory.
-   */
-  if (from_device_to_host || host_or_device_memop_with_offset)
-    mem_event.wait();
-  return true;
-#  endif
-}
+    int *A_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
 
-bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  oneapi_check_usm(queue_, usm_ptr, true);
-  sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
-#  ifdef WITH_CYCLES_DEBUG
-  try {
-    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
-     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
-     */
-    mem_event.wait_and_throw();
-    return true;
-  }
-  catch (sycl::exception const &e) {
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
+    for (size_t i = (size_t)0; i < N; i++) {
+      A_host[i] = rand() % 32;
     }
-    return false;
-  }
-#  else
-  (void)mem_event;
-  return true;
-#  endif
-}
 
-bool oneapi_queue_synchronize(SyclQueue *queue_)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  try {
-    queue->wait_and_throw();
-    return true;
-  }
-  catch (sycl::exception const &e) {
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-    return false;
-  }
-}
+    int *A_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
+    int *B_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
 
-/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
- * also trigger runtime compilation of all existing oneAPI kernels */
-bool oneapi_run_test_kernel(SyclQueue *queue_)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  size_t N = 8;
-  sycl::buffer<float, 1> A(N);
-  sycl::buffer<float, 1> B(N);
-
-  {
-    sycl::host_accessor A_host_acc(A, sycl::write_only);
-    for (size_t i = (size_t)0; i < N; i++)
-      A_host_acc[i] = rand() % 32;
-  }
+    queue->memcpy(A_device, A_host, memory_byte_size);
+    queue->wait_and_throw();
 
-  try {
     queue->submit([&](sycl::handler &cgh) {
-      sycl::accessor A_acc(A, cgh, sycl::read_only);
-      sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);
-
-      cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
+      cgh.parallel_for(N, [=](sycl::id<1> idx) { B_device[idx] = A_device[idx] + idx.get(0); });
     });
     queue->wait_and_throw();
 
-    sycl::host_accessor A_host_acc(A, sycl::read_only);
-    sycl::host_accessor B_host_acc(B, sycl::read_only);
+    int *B_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
+
+    queue->memcpy(B_host, B_device, memory_byte_size);
+    queue->wait_and_throw();
 
     for (size_t i = (size_t)0; i < N; i++) {
-      float result = A_host_acc[i] + B_host_acc[i];
-      (void)result;
+      const int expected_result = i + A_host[i];
+      if (B_host[i] != expected_result) {
+        is_computation_correct = false;
+        if (s_error_cb) {
+          s_error_cb(("Incorrect result in test kernel execution -  expected " +
+                      std::to_string(expected_result) + ", got " + std::to_string(B_host[i]))
+                         .c_str(),
+                     s_error_user_ptr);
+        }
+      }
     }
+
+    sycl::free(A_host, *queue);
+    sycl::free(B_host, *queue);
+    sycl::free(A_device, *queue);
+    sycl::free(B_device, *queue);
+    queue->wait_and_throw();
   }
   catch (sycl::exception const &e) {
     if (s_error_cb) {
@@ -213,63 +84,16 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
     return false;
   }
 
-  return true;
-}
-
-bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
-{
-  kernel_global_size = sizeof(KernelGlobalsGPU);
-
-  return true;
-}
-
-void oneapi_set_global_memory(SyclQueue *queue_,
-                              void *kernel_globals,
-                              const char *memory_name,
-                              void *memory_device_pointer)
-{
-  assert(queue_);
-  assert(kernel_globals);
-  assert(memory_name);
-  assert(memory_device_pointer);
-  KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
-  oneapi_check_usm(queue_, memory_device_pointer);
-  oneapi_check_usm(queue_, kernel_globals, true);
-
-  std::string matched_name(memory_name);
-
-/* This macro will change global ptr of KernelGlobals via name matching. */
-#  define KERNEL_DATA_ARRAY(type, name) \
-    else if (#name == matched_name) \
-    { \
-      globals->__##name = (type *)memory_device_pointer; \
-      return; \
-    }
-  if (false) {
-  }
-  else if ("integrator_state" == matched_name) {
-    globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
-    return;
-  }
-  KERNEL_DATA_ARRAY(KernelData, data)
-#  include "kernel/data_arrays.h"
-  else
-  {
-    std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
-              << std::endl;
-    assert(false);
-  }
-#  undef KERNEL_DATA_ARRAY
+  return is_computation_correct;
 }
 
 /* TODO: Move device information to OneapiDevice initialized on creation and use it. */
 /* TODO: Move below function to oneapi/queue.cpp. */
-size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
+size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
                                           const DeviceKernel kernel,
                                           const size_t kernel_global_size)
 {
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  assert(queue);
   (void)kernel_global_size;
   const static size_t preferred_work_group_size_intersect_shading = 32;
   const static size_t preferred_work_group_size_technical = 1024;
@@ -311,11 +135,63 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
       preferred_work_group_size = 512;
   }
 
-  const size_t limit_work_group_size =
-      queue->get_device().get_info<sycl::info::device::max_work_group_size>();
+  const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
+                                           ->get_device()
+                                           .get_info<sycl::info::device::max_work_group_size>();
+
   return std::min(limit_work_group_size, preferred_work_group_size);
 }
 
+bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
+{
+#  ifdef SYCL_SKIP_KERNELS_PRELOAD
+  (void)queue_;
+  (void)requested_features;
+#  else
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+
+  try {
+    sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
+        sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
+                                                           {queue->get_device()});
+
+    for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
+      const std::string &kernel_name = kernel_id.get_name();
+
+      /* NOTE(@nsirgien): Names in this conditions below should match names from
+       * oneapi_call macro in oneapi_enqueue_kernel below */
+      if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) &&
+          kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) {
+        continue;
+      }
+
+      if (((requested_features & KERNEL_FEATURE_MNEE) == 0) &&
+          kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) {
+        continue;
+      }
+
+      if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) &&
+          kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") !=
+              std::string::npos) {
+        continue;
+      }
+
+      sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle =
+          sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
+      sycl::build(one_kernel_bundle);
+    }
+  }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+    return false;
+  }
+#  endif
+  return true;
+}
+
 bool oneapi_enqueue_kernel(KernelContext *kernel_context,
                            int kernel,
                            size_t global_size,
@@ -354,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
     /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
      * we extend work size to fit uniformity requirements. */
     global_size = groups_count * local_size;
-
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-    if (queue->get_device().is_host()) {
-      global_size = 1;
-      local_size = 1;
-    }
-#  endif
   }
 
   /* Let the compiler throw an error if there are any kernels missing in this implementation. */
@@ -645,13 +514,9 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
         /* Unsupported kernels */
         case DEVICE_KERNEL_NUM:
         case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
-          assert(0);
-          return false;
+          kernel_assert(0);
+          break;
       }
-
-      /* Unknown kernel. */
-      assert(0);
-      return false;
     });
   }
   catch (sycl::exception const &e) {
@@ -668,247 +533,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
 #  endif
   return success;
 }
-
-static const int lowest_supported_driver_version_win = 1011660;
-static const int lowest_supported_driver_version_neo = 23570;
-
-static int parse_driver_build_version(const sycl::device &device)
-{
-  const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
-  int driver_build_version = 0;
-
-  size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
-  if (second_dot_position == std::string::npos) {
-    std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
-              << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
-              << " xx.xx.xxx.xxxx (Windows) for device \""
-              << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
-  }
-  else {
-    try {
-      size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
-      if (third_dot_position != std::string::npos) {
-        const std::string &third_number_substr = driver_version.substr(
-            second_dot_position + 1, third_dot_position - second_dot_position - 1);
-        const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
-        if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
-          driver_build_version = std::stoi(third_number_substr) * 10000 +
-                                 std::stoi(forth_number_substr);
-      }
-      else {
-        const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
-        driver_build_version = std::stoi(third_number_substr);
-      }
-    }
-    catch (std::invalid_argument &e) {
-      std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
-                << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
-                << " xx.xx.xxx.xxxx (Windows) for device \""
-                << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
-    }
-  }
-
-  return driver_build_version;
-}
-
-static std::vector<sycl::device> oneapi_available_devices()
-{
-  bool allow_all_devices = false;
-  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
-    allow_all_devices = true;
-
-    /* Host device is useful only for debugging at the moment
-     * so we hide this device with default build settings. */
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  bool allow_host = true;
-#  else
-  bool allow_host = false;
-#  endif
-
-  const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
-
-  std::vector<sycl::device> available_devices;
-  for (const sycl::platform &platform : oneapi_platforms) {
-    /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
-     */
-    if (platform.get_backend() == sycl::backend::opencl) {
-      continue;
-    }
-
-    const std::vector<sycl::device> &oneapi_devices =
-        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
-                                            platform.get_devices(sycl::info::device_type::gpu);
-
-    for (const sycl::device &device : oneapi_devices) {
-      if (allow_all_devices) {
-        /* still filter out host device if build doesn't support it. */
-        if (allow_host || !device.is_host()) {
-          available_devices.push_back(device);
-        }
-      }
-      else {
-        bool filter_out = false;
-
-        /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
-         * assuming they have either more than 96 Execution Units or not 7 threads per EU.
-         * Official support can be broaden to older and smaller GPUs once ready. */
-        if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
-          /* Filtered-out defaults in-case these values aren't available through too old L0
-           * runtime. */
-          int number_of_eus = 96;
-          int threads_per_eu = 7;
-          if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
-            number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
-          }
-          if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
-            threads_per_eu =
-                device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
-          }
-          /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
-          if (number_of_eus <= 96 && threads_per_eu == 7) {
-            filter_out = true;
-          }
-          /* if not already filtered out, check driver version. */
-          if (!filter_out) {
-            int driver_build_version = parse_driver_build_version(device);
-            if ((driver_build_version > 100000 &&
-                 driver_build_version < lowest_supported_driver_version_win) ||
-                (driver_build_version > 0 &&
-                 driver_build_version < lowest_supported_driver_version_neo)) {
-              filter_out = true;
-            }
-          }
-        }
-        else if (!allow_host && device.is_host()) {
-          filter_out = true;
-        }
-        else if (!allow_all_devices) {
-          filter_out = true;
-        }
-
-        if (!filter_out) {
-          available_devices.push_back(device);
-        }
-      }
-    }
-  }
-
-  return available_devices;
-}
-
-char *oneapi_device_capabilities()
-{
-  std::stringstream capabilities;
-
-  const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
-  for (const sycl::device &device : oneapi_devices) {
-    const std::string &name = device.get_info<sycl::info::device::name>();
-
-    capabilities << std::string("\t") << name << "\n";
-#  define WRITE_ATTR(attribute_name, attribute_variable) \
-    capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
-                 << "\n";
-#  define GET_NUM_ATTR(attribute) \
-    { \
-      size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
-      capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
-    }
-
-    GET_NUM_ATTR(vendor_id)
-    GET_NUM_ATTR(max_compute_units)
-    GET_NUM_ATTR(max_work_item_dimensions)
-
-    sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>();
-    WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
-    WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
-    WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
-
-    GET_NUM_ATTR(max_work_group_size)
-    GET_NUM_ATTR(max_num_sub_groups)
-    GET_NUM_ATTR(sub_group_independent_forward_progress)
-
-    GET_NUM_ATTR(preferred_vector_width_char)
-    GET_NUM_ATTR(preferred_vector_width_short)
-    GET_NUM_ATTR(preferred_vector_width_int)
-    GET_NUM_ATTR(preferred_vector_width_long)
-    GET_NUM_ATTR(preferred_vector_width_float)
-    GET_NUM_ATTR(preferred_vector_width_double)
-    GET_NUM_ATTR(preferred_vector_width_half)
-
-    GET_NUM_ATTR(native_vector_width_char)
-    GET_NUM_ATTR(native_vector_width_short)
-    GET_NUM_ATTR(native_vector_width_int)
-    GET_NUM_ATTR(native_vector_width_long)
-    GET_NUM_ATTR(native_vector_width_float)
-    GET_NUM_ATTR(native_vector_width_double)
-    GET_NUM_ATTR(native_vector_width_half)
-
-    size_t max_clock_frequency =
-        (size_t)(device.is_host() ? (size_t)0 :
-                                    device.get_info<sycl::info::device::max_clock_frequency>());
-    WRITE_ATTR("max_clock_frequency", max_clock_frequency)
-
-    GET_NUM_ATTR(address_bits)
-    GET_NUM_ATTR(max_mem_alloc_size)
-
-    /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
-     * supported so we always return false, even if device supports HW texture usage acceleration.
-     */
-    bool image_support = false;
-    WRITE_ATTR("image_support", (size_t)image_support)
-
-    GET_NUM_ATTR(max_parameter_size)
-    GET_NUM_ATTR(mem_base_addr_align)
-    GET_NUM_ATTR(global_mem_size)
-    GET_NUM_ATTR(local_mem_size)
-    GET_NUM_ATTR(error_correction_support)
-    GET_NUM_ATTR(profiling_timer_resolution)
-    GET_NUM_ATTR(is_available)
-
-#  undef GET_NUM_ATTR
-#  undef WRITE_ATTR
-    capabilities << "\n";
-  }
-
-  return ::strdup(capabilities.str().c_str());
-}
-
-void oneapi_free(void *p)
-{
-  if (p) {
-    ::free(p);
-  }
-}
-
-void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
-{
-  int num = 0;
-  std::vector<sycl::device> devices = oneapi_available_devices();
-  for (sycl::device &device : devices) {
-    const std::string &platform_name =
-        device.get_platform().get_info<sycl::info::platform::name>();
-    std::string name = device.get_info<sycl::info::device::name>();
-    std::string id = "ONEAPI_" + platform_name + "_" + name;
-    if (device.has(sycl::aspect::ext_intel_pci_address)) {
-      id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
-    }
-    (cb)(id.c_str(), name.c_str(), num, user_ptr);
-    num++;
-  }
-}
-
-size_t oneapi_get_memcapacity(SyclQueue *queue)
-{
-  return reinterpret_cast<sycl::queue *>(queue)
-      ->get_device()
-      .get_info<sycl::info::device::global_mem_size>();
-}
-
-size_t oneapi_get_compute_units_amount(SyclQueue *queue)
-{
-  return reinterpret_cast<sycl::queue *>(queue)
-      ->get_device()
-      .get_info<sycl::info::device::max_compute_units>();
-}
-
 #endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
index c5f853742ed..2bfc0b89c87 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -25,11 +25,6 @@ enum DeviceKernel : int;
 
 class SyclQueue;
 
-typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
-                                             const char *name,
-                                             int num,
-                                             void *user_ptr);
-
 typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
 
 struct KernelContext {
@@ -45,13 +40,17 @@ struct KernelContext {
 extern "C" {
 #  endif
 
-#  define DLL_INTERFACE_CALL(function, return_type, ...) \
-    CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
-#  include "kernel/device/oneapi/dll_interface_template.h"
-#  undef DLL_INTERFACE_CALL
-
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
+CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
+CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
+    SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
+                                                       int kernel,
+                                                       size_t global_size,
+                                                       void **args);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue,
+                                                     const unsigned int requested_features);
 #  ifdef __cplusplus
 }
 #  endif
-
 #endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h
index d8964d9b672..0ae925cf748 100644
--- a/intern/cycles/kernel/device/oneapi/kernel_templates.h
+++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h
@@ -80,7 +80,7 @@ void oneapi_call(
   (x, ##__VA_ARGS__)
 
 /* This template automatically casts entries in the void **args array to the types requested by the kernel func.
-   Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */
+ * Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */
 #define oneapi_template(...) \
   template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \
   void oneapi_call( \
diff --git a/intern/cycles/kernel/device/optix/bvh.h b/intern/cycles/kernel/device/optix/bvh.h
new file mode 100644
index 00000000000..6d81b44660c
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/bvh.h
@@ -0,0 +1,659 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+/* OptiX implementation of ray-scene intersection. */
+
+#pragma once
+
+#include "kernel/bvh/types.h"
+#include "kernel/bvh/util.h"
+
+#define OPTIX_DEFINE_ABI_VERSION_ONLY
+#include <optix_function_table.h>
+
+CCL_NAMESPACE_BEGIN
+
+/* Utilities. */
+
+template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
+{
+  return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
+}
+template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
+{
+  return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
+}
+
+template<typename T> ccl_device_forceinline T *get_payload_ptr_6()
+{
+  return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6());
+}
+
+ccl_device_forceinline int get_object_id()
+{
+#ifdef __OBJECT_MOTION__
+  /* Always get the instance ID from the TLAS
+   * There might be a motion transform node between TLAS and BLAS which does not have one. */
+  return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
+#else
+  return optixGetInstanceId();
+#endif
+}
+
+/* Hit/miss functions. */
+
+extern "C" __global__ void __miss__kernel_optix_miss()
+{
+  /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
+  optixSetPayload_5(PRIMITIVE_NONE);
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_local_hit()
+{
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves and points. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
+#ifdef __BVH_LOCAL__
+  const int object = get_object_id();
+  if (object != optixGetPayload_4() /* local_object */) {
+    /* Only intersect with matching object. */
+    return optixIgnoreIntersection();
+  }
+
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self_local(ray->self, prim)) {
+    return optixIgnoreIntersection();
+  }
+
+  const uint max_hits = optixGetPayload_5();
+  if (max_hits == 0) {
+    /* Special case for when no hit information is requested, just report that something was hit */
+    optixSetPayload_5(true);
+    return optixTerminateRay();
+  }
+
+  int hit = 0;
+  uint *const lcg_state = get_payload_ptr_0<uint>();
+  LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
+
+  if (lcg_state) {
+    for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+      if (optixGetRayTmax() == local_isect->hits[i].t) {
+        return optixIgnoreIntersection();
+      }
+    }
+
+    hit = local_isect->num_hits++;
+
+    if (local_isect->num_hits > max_hits) {
+      hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
+      if (hit >= max_hits) {
+        return optixIgnoreIntersection();
+      }
+    }
+  }
+  else {
+    if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
+      /* Record closest intersection only.
+       * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
+       */
+      return optixIgnoreIntersection();
+    }
+
+    local_isect->num_hits = 1;
+  }
+
+  Intersection *isect = &local_isect->hits[hit];
+  isect->t = optixGetRayTmax();
+  isect->prim = prim;
+  isect->object = get_object_id();
+  isect->type = kernel_data_fetch(objects, isect->object).primitive_type;
+
+  const float2 barycentrics = optixGetTriangleBarycentrics();
+  isect->u = barycentrics.x;
+  isect->v = barycentrics.y;
+
+  /* Record geometric normal. */
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
+  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0);
+  const float3 tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1);
+  const float3 tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+
+  /* Continue tracing (without this the trace call would return after the first hit). */
+  optixIgnoreIntersection();
+#endif
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
+{
+#ifdef __SHADOW_RECORD_ALL__
+  int prim = optixGetPrimitiveIndex();
+  const uint object = get_object_id();
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#  endif
+
+  float u = 0.0f, v = 0.0f;
+  int type = 0;
+  if (optixIsTriangleHit()) {
+    /* Triangle. */
+    const float2 barycentrics = optixGetTriangleBarycentrics();
+    u = barycentrics.x;
+    v = barycentrics.y;
+    type = kernel_data_fetch(objects, object).primitive_type;
+  }
+#  ifdef __HAIR__
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
+    /* Curve. */
+    u = __uint_as_float(optixGetAttribute_0());
+    v = __uint_as_float(optixGetAttribute_1());
+
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
+    type = segment.type;
+    prim = segment.prim;
+
+#    if OPTIX_ABI_VERSION < 55
+    /* Filter out curve end-caps. */
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+#    endif
+  }
+#  endif
+  else {
+    /* Point. */
+    type = kernel_data_fetch(objects, object).primitive_type;
+    u = 0.0f;
+    v = 0.0f;
+  }
+
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self_shadow(ray->self, object, prim)) {
+    return optixIgnoreIntersection();
+  }
+
+#  ifndef __TRANSPARENT_SHADOWS__
+  /* No transparent shadows support compiled in, make opaque. */
+  optixSetPayload_5(true);
+  return optixTerminateRay();
+#  else
+  const uint max_hits = optixGetPayload_3();
+  const uint num_hits_packed = optixGetPayload_2();
+  const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
+  const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
+
+  /* If no transparent shadows, all light is blocked and we can stop immediately. */
+  if (num_hits >= max_hits ||
+      !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
+    optixSetPayload_5(true);
+    return optixTerminateRay();
+  }
+
+  /* Always use baked shadow transparency for curves. */
+  if (type & PRIMITIVE_CURVE) {
+    float throughput = __uint_as_float(optixGetPayload_1());
+    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, type, u);
+    optixSetPayload_1(__float_as_uint(throughput));
+    optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
+
+    if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+      optixSetPayload_5(true);
+      return optixTerminateRay();
+    }
+    else {
+      /* Continue tracing. */
+      optixIgnoreIntersection();
+      return;
+    }
+  }
+
+  /* Record transparent intersection. */
+  optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
+
+  uint record_index = num_recorded_hits;
+
+  const IntegratorShadowState state = optixGetPayload_0();
+
+  const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+  if (record_index >= max_record_hits) {
+    /* If maximum number of hits reached, find a hit to replace. */
+    float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
+    uint max_recorded_hit = 0;
+
+    for (int i = 1; i < max_record_hits; i++) {
+      const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
+      if (isect_t > max_recorded_t) {
+        max_recorded_t = isect_t;
+        max_recorded_hit = i;
+      }
+    }
+
+    if (optixGetRayTmax() >= max_recorded_t) {
+      /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the
+       * current hit anymore. */
+      return;
+    }
+
+    record_index = max_recorded_hit;
+  }
+
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
+
+  /* Continue tracing. */
+  optixIgnoreIntersection();
+#  endif /* __TRANSPARENT_SHADOWS__ */
+#endif   /* __SHADOW_RECORD_ALL__ */
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_volume_test()
+{
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  const uint object = get_object_id();
+#ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  if ((kernel_data_fetch(object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
+    return optixIgnoreIntersection();
+  }
+
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self(ray->self, object, prim)) {
+    return optixIgnoreIntersection();
+  }
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+{
+#ifdef __HAIR__
+#  if OPTIX_ABI_VERSION < 55
+  if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) {
+    /* Filter out curve end-caps. */
+    const float u = __uint_as_float(optixGetAttribute_0());
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+  }
+#  endif
+#endif
+
+  const uint object = get_object_id();
+  const uint visibility = optixGetPayload_4();
+#ifdef __VISIBILITY_FLAG__
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  int prim = optixGetPrimitiveIndex();
+  if (optixIsTriangleHit()) {
+    /* Triangle. */
+  }
+#ifdef __HAIR__
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
+    /* Curve. */
+    prim = kernel_data_fetch(curve_segments, prim).prim;
+  }
+#endif
+
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+
+  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    if (intersection_skip_self_shadow(ray->self, object, prim)) {
+      return optixIgnoreIntersection();
+    }
+    else {
+      /* Shadow ray early termination. */
+      return optixTerminateRay();
+    }
+  }
+  else {
+    if (intersection_skip_self(ray->self, object, prim)) {
+      return optixIgnoreIntersection();
+    }
+  }
+}
+
+extern "C" __global__ void __closesthit__kernel_optix_hit()
+{
+  const int object = get_object_id();
+  const int prim = optixGetPrimitiveIndex();
+
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
+  optixSetPayload_4(object);
+
+  if (optixIsTriangleHit()) {
+    const float2 barycentrics = optixGetTriangleBarycentrics();
+    optixSetPayload_1(__float_as_uint(barycentrics.x));
+    optixSetPayload_2(__float_as_uint(barycentrics.y));
+    optixSetPayload_3(prim);
+    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
+  }
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
+    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
+    optixSetPayload_2(optixGetAttribute_1());
+    optixSetPayload_3(segment.prim);
+    optixSetPayload_5(segment.type);
+  }
+  else {
+    optixSetPayload_1(0);
+    optixSetPayload_2(0);
+    optixSetPayload_3(prim);
+    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
+  }
+}
+
+/* Custom primitive intersection functions. */
+
+#ifdef __HAIR__
+ccl_device_inline void optix_intersection_curve(const int prim, const int type)
+{
+  const int object = get_object_id();
+
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return;
+  }
+#  endif
+
+  const float3 ray_P = optixGetObjectRayOrigin();
+  const float3 ray_D = optixGetObjectRayDirection();
+  const float ray_tmin = optixGetRayTmin();
+
+#  ifdef __OBJECT_MOTION__
+  const float time = optixGetRayTime();
+#  else
+  const float time = 0.0f;
+#  endif
+
+  Intersection isect;
+  isect.t = optixGetRayTmax();
+
+  if (curve_intersect(NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
+    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
+    optixReportIntersection(isect.t,
+                            type & PRIMITIVE_ALL,
+                            __float_as_int(isect.u),  /* Attribute_0 */
+                            __float_as_int(isect.v)); /* Attribute_1 */
+  }
+}
+
+extern "C" __global__ void __intersection__curve_ribbon()
+{
+  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, optixGetPrimitiveIndex());
+  const int prim = segment.prim;
+  const int type = segment.type;
+  if (type & PRIMITIVE_CURVE_RIBBON) {
+    optix_intersection_curve(prim, type);
+  }
+}
+
+#endif
+
+#ifdef __POINTCLOUD__
+extern "C" __global__ void __intersection__point()
+{
+  const int prim = optixGetPrimitiveIndex();
+  const int object = get_object_id();
+  const int type = kernel_data_fetch(objects, object).primitive_type;
+
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return;
+  }
+#  endif
+
+  const float3 ray_P = optixGetObjectRayOrigin();
+  const float3 ray_D = optixGetObjectRayDirection();
+  const float ray_tmin = optixGetRayTmin();
+
+#  ifdef __OBJECT_MOTION__
+  const float time = optixGetRayTime();
+#  else
+  const float time = 0.0f;
+#  endif
+
+  Intersection isect;
+  isect.t = optixGetRayTmax();
+
+  if (point_intersect(NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
+    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
+    optixReportIntersection(isect.t, type & PRIMITIVE_ALL);
+  }
+}
+#endif
+
+/* Scene intersection. */
+
+ccl_device_intersect bool scene_intersect(KernelGlobals kg,
+                                          ccl_private const Ray *ray,
+                                          const uint visibility,
+                                          ccl_private Intersection *isect)
+{
+  uint p0 = 0;
+  uint p1 = 0;
+  uint p2 = 0;
+  uint p3 = 0;
+  uint p4 = visibility;
+  uint p5 = PRIMITIVE_NONE;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+
+  uint ray_mask = visibility & 0xFF;
+  uint ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    ray_flags |= OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
+  }
+
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             ray_mask,
+             ray_flags,
+             0, /* SBT offset for PG_HITD */
+             0,
+             0,
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  isect->t = __uint_as_float(p0);
+  isect->u = __uint_as_float(p1);
+  isect->v = __uint_as_float(p2);
+  isect->prim = p3;
+  isect->object = p4;
+  isect->type = p5;
+
+  return p5 != PRIMITIVE_NONE;
+}
+
+#ifdef __BVH_LOCAL__
+ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
+                                                ccl_private const Ray *ray,
+                                                ccl_private LocalIntersection *local_isect,
+                                                int local_object,
+                                                ccl_private uint *lcg_state,
+                                                int max_hits)
+{
+  uint p0 = pointer_pack_to_uint_0(lcg_state);
+  uint p1 = pointer_pack_to_uint_1(lcg_state);
+  uint p2 = pointer_pack_to_uint_0(local_isect);
+  uint p3 = pointer_pack_to_uint_1(local_isect);
+  uint p4 = local_object;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+
+  /* Is set to zero on miss or if ray is aborted, so can be used as return value. */
+  uint p5 = max_hits;
+
+  if (local_isect) {
+    local_isect->num_hits = 0; /* Initialize hit count to zero. */
+  }
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             0xFF,
+             /* Need to always call into __anyhit__kernel_optix_local_hit. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             2, /* SBT offset for PG_HITL */
+             0,
+             0,
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  return p5;
+}
+#endif
+
+#ifdef __SHADOW_RECORD_ALL__
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
+                                                     IntegratorShadowState state,
+                                                     ccl_private const Ray *ray,
+                                                     uint visibility,
+                                                     uint max_hits,
+                                                     ccl_private uint *num_recorded_hits,
+                                                     ccl_private float *throughput)
+{
+  uint p0 = state;
+  uint p1 = __float_as_uint(1.0f); /* Throughput. */
+  uint p2 = 0;                     /* Number of hits. */
+  uint p3 = max_hits;
+  uint p4 = visibility;
+  uint p5 = false;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             1, /* SBT offset for PG_HITS */
+             0,
+             0,
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  *num_recorded_hits = uint16_unpack_from_uint_0(p2);
+  *throughput = __uint_as_float(p1);
+
+  return p5;
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
+                                                 ccl_private const Ray *ray,
+                                                 ccl_private Intersection *isect,
+                                                 const uint visibility)
+{
+  uint p0 = 0;
+  uint p1 = 0;
+  uint p2 = 0;
+  uint p3 = 0;
+  uint p4 = visibility;
+  uint p5 = PRIMITIVE_NONE;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_volume_test. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             3, /* SBT offset for PG_HITV */
+             0,
+             0,
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  isect->t = __uint_as_float(p0);
+  isect->u = __uint_as_float(p1);
+  isect->v = __uint_as_float(p2);
+  isect->prim = p3;
+  isect->object = p4;
+  isect->type = p5;
+
+  return p5 != PRIMITIVE_NONE;
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index aa4a6321a8b..1a11a533b7e 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -8,7 +8,6 @@
 #include <optix.h>
 
 #define __KERNEL_GPU__
-#define __KERNEL_GPU_RAYTRACING__
 #define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
 #define __KERNEL_OPTIX__
 #define CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
index 510f7cca5d6..6abb5aeacb9 100644
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -20,34 +20,6 @@
 #include "kernel/integrator/intersect_volume_stack.h"
 // clang-format on
 
-#define OPTIX_DEFINE_ABI_VERSION_ONLY
-#include <optix_function_table.h>
-
-template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
-{
-  return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
-}
-template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
-{
-  return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
-}
-
-template<typename T> ccl_device_forceinline T *get_payload_ptr_6()
-{
-  return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6());
-}
-
-ccl_device_forceinline int get_object_id()
-{
-#ifdef __OBJECT_MOTION__
-  /* Always get the instance ID from the TLAS
-   * There might be a motion transform node between TLAS and BLAS which does not have one. */
-  return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
-#else
-  return optixGetInstanceId();
-#endif
-}
-
 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
 {
   const int global_index = optixGetLaunchIndex().x;
@@ -84,411 +56,3 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_st
   integrator_intersect_volume_stack(nullptr, path_index);
 }
 
-extern "C" __global__ void __miss__kernel_optix_miss()
-{
-  /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */
-  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
-  optixSetPayload_5(PRIMITIVE_NONE);
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_local_hit()
-{
-#if defined(__HAIR__) || defined(__POINTCLOUD__)
-  if (!optixIsTriangleHit()) {
-    /* Ignore curves and points. */
-    return optixIgnoreIntersection();
-  }
-#endif
-
-#ifdef __BVH_LOCAL__
-  const int object = get_object_id();
-  if (object != optixGetPayload_4() /* local_object */) {
-    /* Only intersect with matching object. */
-    return optixIgnoreIntersection();
-  }
-
-  const int prim = optixGetPrimitiveIndex();
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-  if (intersection_skip_self_local(ray->self, prim)) {
-    return optixIgnoreIntersection();
-  }
-
-  const uint max_hits = optixGetPayload_5();
-  if (max_hits == 0) {
-    /* Special case for when no hit information is requested, just report that something was hit */
-    optixSetPayload_5(true);
-    return optixTerminateRay();
-  }
-
-  int hit = 0;
-  uint *const lcg_state = get_payload_ptr_0<uint>();
-  LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
-
-  if (lcg_state) {
-    for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
-      if (optixGetRayTmax() == local_isect->hits[i].t) {
-        return optixIgnoreIntersection();
-      }
-    }
-
-    hit = local_isect->num_hits++;
-
-    if (local_isect->num_hits > max_hits) {
-      hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
-      if (hit >= max_hits) {
-        return optixIgnoreIntersection();
-      }
-    }
-  }
-  else {
-    if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
-      /* Record closest intersection only.
-       * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
-       */
-      return optixIgnoreIntersection();
-    }
-
-    local_isect->num_hits = 1;
-  }
-
-  Intersection *isect = &local_isect->hits[hit];
-  isect->t = optixGetRayTmax();
-  isect->prim = prim;
-  isect->object = get_object_id();
-  isect->type = kernel_data_fetch(objects, isect->object).primitive_type;
-
-  const float2 barycentrics = optixGetTriangleBarycentrics();
-  isect->u = 1.0f - barycentrics.y - barycentrics.x;
-  isect->v = barycentrics.x;
-
-  /* Record geometric normal. */
-  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
-  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0);
-  const float3 tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1);
-  const float3 tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
-  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
-
-  /* Continue tracing (without this the trace call would return after the first hit). */
-  optixIgnoreIntersection();
-#endif
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
-{
-#ifdef __SHADOW_RECORD_ALL__
-  int prim = optixGetPrimitiveIndex();
-  const uint object = get_object_id();
-#  ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return optixIgnoreIntersection();
-  }
-#  endif
-
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-  if (intersection_skip_self_shadow(ray->self, object, prim)) {
-    return optixIgnoreIntersection();
-  }
-
-  float u = 0.0f, v = 0.0f;
-  int type = 0;
-  if (optixIsTriangleHit()) {
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    u = 1.0f - barycentrics.y - barycentrics.x;
-    v = barycentrics.x;
-    type = kernel_data_fetch(objects, object).primitive_type;
-  }
-#  ifdef __HAIR__
-  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
-    u = __uint_as_float(optixGetAttribute_0());
-    v = __uint_as_float(optixGetAttribute_1());
-
-    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
-    type = segment.type;
-    prim = segment.prim;
-
-#    if OPTIX_ABI_VERSION < 55
-    /* Filter out curve endcaps. */
-    if (u == 0.0f || u == 1.0f) {
-      return optixIgnoreIntersection();
-    }
-#    endif
-  }
-#  endif
-  else {
-    type = kernel_data_fetch(objects, object).primitive_type;
-    u = 0.0f;
-    v = 0.0f;
-  }
-
-#  ifndef __TRANSPARENT_SHADOWS__
-  /* No transparent shadows support compiled in, make opaque. */
-  optixSetPayload_5(true);
-  return optixTerminateRay();
-#  else
-  const uint max_hits = optixGetPayload_3();
-  const uint num_hits_packed = optixGetPayload_2();
-  const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
-  const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
-
-  /* If no transparent shadows, all light is blocked and we can stop immediately. */
-  if (num_hits >= max_hits ||
-      !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
-    optixSetPayload_5(true);
-    return optixTerminateRay();
-  }
-
-  /* Always use baked shadow transparency for curves. */
-  if (type & PRIMITIVE_CURVE) {
-    float throughput = __uint_as_float(optixGetPayload_1());
-    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
-    optixSetPayload_1(__float_as_uint(throughput));
-    optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
-
-    if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
-      optixSetPayload_5(true);
-      return optixTerminateRay();
-    }
-    else {
-      /* Continue tracing. */
-      optixIgnoreIntersection();
-      return;
-    }
-  }
-
-  /* Record transparent intersection. */
-  optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
-
-  uint record_index = num_recorded_hits;
-
-  const IntegratorShadowState state = optixGetPayload_0();
-
-  const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
-  if (record_index >= max_record_hits) {
-    /* If maximum number of hits reached, find a hit to replace. */
-    float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
-    uint max_recorded_hit = 0;
-
-    for (int i = 1; i < max_record_hits; i++) {
-      const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
-      if (isect_t > max_recorded_t) {
-        max_recorded_t = isect_t;
-        max_recorded_hit = i;
-      }
-    }
-
-    if (optixGetRayTmax() >= max_recorded_t) {
-      /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the
-       * current hit anymore. */
-      return;
-    }
-
-    record_index = max_recorded_hit;
-  }
-
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
-
-  /* Continue tracing. */
-  optixIgnoreIntersection();
-#  endif /* __TRANSPARENT_SHADOWS__ */
-#endif   /* __SHADOW_RECORD_ALL__ */
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_volume_test()
-{
-#if defined(__HAIR__) || defined(__POINTCLOUD__)
-  if (!optixIsTriangleHit()) {
-    /* Ignore curves. */
-    return optixIgnoreIntersection();
-  }
-#endif
-
-  const uint object = get_object_id();
-#ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return optixIgnoreIntersection();
-  }
-#endif
-
-  if ((kernel_data_fetch(object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
-    return optixIgnoreIntersection();
-  }
-
-  const int prim = optixGetPrimitiveIndex();
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-  if (intersection_skip_self(ray->self, object, prim)) {
-    return optixIgnoreIntersection();
-  }
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
-{
-#ifdef __HAIR__
-#  if OPTIX_ABI_VERSION < 55
-  if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) {
-    /* Filter out curve endcaps. */
-    const float u = __uint_as_float(optixGetAttribute_0());
-    if (u == 0.0f || u == 1.0f) {
-      return optixIgnoreIntersection();
-    }
-  }
-#  endif
-#endif
-
-  const uint object = get_object_id();
-  const uint visibility = optixGetPayload_4();
-#ifdef __VISIBILITY_FLAG__
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return optixIgnoreIntersection();
-  }
-#endif
-
-  const int prim = optixGetPrimitiveIndex();
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-
-  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    if (intersection_skip_self_shadow(ray->self, object, prim)) {
-      return optixIgnoreIntersection();
-    }
-    else {
-      /* Shadow ray early termination. */
-      return optixTerminateRay();
-    }
-  }
-  else {
-    if (intersection_skip_self(ray->self, object, prim)) {
-      return optixIgnoreIntersection();
-    }
-  }
-}
-
-extern "C" __global__ void __closesthit__kernel_optix_hit()
-{
-  const int object = get_object_id();
-  const int prim = optixGetPrimitiveIndex();
-
-  optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
-  optixSetPayload_4(object);
-
-  if (optixIsTriangleHit()) {
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x));
-    optixSetPayload_2(__float_as_uint(barycentrics.x));
-    optixSetPayload_3(prim);
-    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
-  }
-  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
-    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
-    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
-    optixSetPayload_2(optixGetAttribute_1());
-    optixSetPayload_3(segment.prim);
-    optixSetPayload_5(segment.type);
-  }
-  else {
-    optixSetPayload_1(0);
-    optixSetPayload_2(0);
-    optixSetPayload_3(prim);
-    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
-  }
-}
-
-#ifdef __HAIR__
-ccl_device_inline void optix_intersection_curve(const int prim, const int type)
-{
-  const int object = get_object_id();
-
-#  ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return;
-  }
-#  endif
-
-  float3 P = optixGetObjectRayOrigin();
-  float3 dir = optixGetObjectRayDirection();
-  float tmin = optixGetRayTmin();
-
-  /* The direction is not normalized by default, but the curve intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
-#  ifdef __OBJECT_MOTION__
-  const float time = optixGetRayTime();
-#  else
-  const float time = 0.0f;
-#  endif
-
-  Intersection isect;
-  isect.t = optixGetRayTmax();
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
-
-  if (curve_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) {
-    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
-    optixReportIntersection(isect.t / len,
-                            type & PRIMITIVE_ALL,
-                            __float_as_int(isect.u),  /* Attribute_0 */
-                            __float_as_int(isect.v)); /* Attribute_1 */
-  }
-}
-
-extern "C" __global__ void __intersection__curve_ribbon()
-{
-  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, optixGetPrimitiveIndex());
-  const int prim = segment.prim;
-  const int type = segment.type;
-  if (type & PRIMITIVE_CURVE_RIBBON) {
-    optix_intersection_curve(prim, type);
-  }
-}
-
-#endif
-
-#ifdef __POINTCLOUD__
-extern "C" __global__ void __intersection__point()
-{
-  const int prim = optixGetPrimitiveIndex();
-  const int object = get_object_id();
-  const int type = kernel_data_fetch(objects, object).primitive_type;
-
-#  ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return;
-  }
-#  endif
-
-  float3 P = optixGetObjectRayOrigin();
-  float3 dir = optixGetObjectRayDirection();
-  float tmin = optixGetRayTmin();
-
-  /* The direction is not normalized by default, the point intersection routine expects that. */
-  float len;
-  dir = normalize_len(dir, &len);
-
-#  ifdef __OBJECT_MOTION__
-  const float time = optixGetRayTime();
-#  else
-  const float time = 0.0f;
-#  endif
-
-  Intersection isect;
-  isect.t = optixGetRayTmax();
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX) {
-    isect.t *= len;
-  }
-
-  if (point_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) {
-    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
-    optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL);
-  }
-}
-#endif